forgot to separate phonemes by spaces for [redacted]

This commit is contained in:
mrq 2023-03-17 02:08:07 +00:00
parent d4c50967a6
commit 1b72d0bba0
2 changed files with 44 additions and 30 deletions

View File

@ -106,22 +106,24 @@
"ɜ": 61, "ɜ": 61,
"ᵻ": 62, "ᵻ": 62,
"ɾ": 63, "ɾ": 63,
"n̩": 64, "n\u0329": 64,
"ː": 65, "ː": 65,
"ˈ": 66, "ˈ": 66,
"d͡ʒ": 67, "ˌ": 67,
"aɪ": 68, "ʔ": 68,
"aʊ": 69, "d͡ʒ": 69,
"eɪ": 70, "aɪ": 70,
"oʊ": 71, "aʊ": 71,
"t͡ʃ": 72, "eɪ": 72,
"ɔɪ": 73, "oʊ": 73,
"ɔː": 74, "t͡ʃ": 74,
"uː": 75, "ɔɪ": 75,
"iː": 76, "ɔː": 76,
"ɑː": 77, "uː": 77,
"oː": 78, "iː": 78,
"ɜː": 79 "ɑː": 79,
"oː": 80,
"ɜː": 81
}, },
"merges": [ "merges": [
"a ɪ", "a ɪ",

View File

@ -1372,6 +1372,14 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
# implicitly segment # implicitly segment
if use_segment and not use_segments: if use_segment and not use_segments:
exists = True
for segment in result['segments']:
if os.path.exists(filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav")):
continue
exists = False
break
if not exists:
tmp = {} tmp = {}
tmp[filename] = result tmp[filename] = result
print(f"Audio not segmented, segmenting: {filename}") print(f"Audio not segmented, segmenting: {filename}")
@ -1444,10 +1452,11 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
# from vall_e.emb.g2p import encode as phonemize # from vall_e.emb.g2p import encode as phonemize
quantized = quantize( waveform, sample_rate ).cpu() quantized = quantize( waveform, sample_rate ).cpu()
torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
print("Quantized:", file) print("Quantized:", file)
torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}') tokens = tokenize_text(text, stringed=False, skip_specials=True)
open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(text) open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(" ".join( tokens ).replace(" \u02C8", "\u02C8"))
training_joined = "\n".join(lines['training']) training_joined = "\n".join(lines['training'])
validation_joined = "\n".join(lines['validation']) validation_joined = "\n".join(lines['validation'])
@ -1815,19 +1824,22 @@ def get_tokenizer_jsons( dir="./models/tokenizers/" ):
additionals = sorted([ f'{dir}/{d}' for d in os.listdir(dir) if d[-5:] == ".json" ]) if os.path.isdir(dir) else [] additionals = sorted([ f'{dir}/{d}' for d in os.listdir(dir) if d[-5:] == ".json" ]) if os.path.isdir(dir) else []
return relative_paths([ "./modules/tortoise-tts/tortoise/data/tokenizer.json" ] + additionals) return relative_paths([ "./modules/tortoise-tts/tortoise/data/tokenizer.json" ] + additionals)
def tokenize_text( text, stringed=True, skip_specials=False ):
    """Encode `text` with a VoiceBpeTokenizer and decode it back into tokens.

    When the module-level `tts` is set, its `tokenizer` is reused. Otherwise a
    standalone VoiceBpeTokenizer is built from `args.tokenizer_json`, falling
    back to the first path returned by `get_tokenizer_jsons()`.

    Args:
        text: The string to tokenize.
        stringed: When True (default), return a two-line string: `str()` of
            the encoded result, a newline, then `str()` of the decoded token
            list. When False, return the decoded token list itself.
        skip_specials: Passed through as `skip_special_tokens` to the
            underlying tokenizer's `decode`.

    Returns:
        A `str` when `stringed` is True, otherwise the list produced by
        splitting the decoded text on single spaces.
    """
    from tortoise.utils.tokenizer import VoiceBpeTokenizer

    if tts:
        # Bind the loaded model's tokenizer; a bare `tts.tokenizer` expression
        # would leave `tokenizer` undefined on this path.
        tokenizer = tts.tokenizer
    else:
        tokenizer = VoiceBpeTokenizer(args.tokenizer_json if args.tokenizer_json else get_tokenizer_jsons()[0])

    encoded = tokenizer.encode(text)
    # Forward the caller's `skip_specials` flag (the parameter name, not an
    # undefined `specials`).
    decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=skip_specials).split(" ")

    if stringed:
        return "\n".join([ str(encoded), str(decoded) ])

    return decoded
def get_dataset_list(dir="./training/"):
    """Return the sorted names of dataset folders found directly under `dir`.

    A dataset folder is a subdirectory of `dir` whose listing contains an
    entry named `train.txt`.

    Args:
        dir: Directory to scan. Defaults to "./training/".

    Returns:
        A sorted list of subdirectory names. Empty when `dir` is not an
        existing directory, mirroring how `get_tokenizer_jsons` treats a
        missing directory instead of raising from `os.listdir`.
    """
    if not os.path.isdir(dir):
        return []

    return sorted([
        d for d in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d))
    ])