cleaned up some prepare dataset code

This commit is contained in:
mrq 2023-03-17 01:24:02 +00:00
parent 0b62ccc112
commit d4c50967a6
2 changed files with 189 additions and 157 deletions

View File

@ -2,8 +2,11 @@
"version": "1.0", "version": "1.0",
"truncation": null, "truncation": null,
"padding": null, "padding": null,
"added_tokens": "normalizer": null,
[ "pre_tokenizer": null,
"post_processor": null,
"decoder": null,
"added_tokens": [
{ {
"id": 0, "id": 0,
"special": true, "special": true,
@ -32,20 +35,14 @@
"normalized": false "normalized": false
} }
], ],
"normalizer": null, "model": {
"pre_tokenizer": null,
"post_processor": null,
"decoder": null,
"model":
{
"type": "BPE", "type": "BPE",
"dropout": null, "dropout": null,
"unk_token": "[UNK]", "unk_token": "[UNK]",
"continuing_subword_prefix": null, "continuing_subword_prefix": null,
"end_of_word_suffix": null, "end_of_word_suffix": null,
"fuse_unk": false, "fuse_unk": false,
"vocab": "vocab": {
{
"[STOP]": 0, "[STOP]": 0,
"[UNK]": 1, "[UNK]": 1,
"[SPACE]": 2, "[SPACE]": 2,
@ -61,40 +58,39 @@
";": 12, ";": 12,
"?": 13, "?": 13,
"a": 14, "a": 14,
"aɪ": 15, "b": 15,
"aʊ": 16, "c": 16,
"b": 17, "d": 17,
"d": 18, "e": 18,
"d͡": 19, "f": 19,
"d͡ʒ": 20, "g": 20,
"e": 21, "h": 21,
"eɪ": 22, "i": 22,
"f": 23, "j": 23,
"h": 24, "k": 24,
"i": 25, "l": 25,
"j": 26, "m": 26,
"k": 27, "n": 27,
"l": 28, "o": 28,
"m": 29, "p": 29,
"n": 30, "q": 30,
"o": 31, "r": 31,
"oʊ": 32, "s": 32,
"p": 33, "t": 33,
"s": 34, "u": 34,
"t": 35, "v": 35,
"t͡": 36, "w": 36,
"t͡ʃ": 37, "x": 37,
"u": 38, "y": 38,
"v": 39, "z": 39,
"w": 40, "d͡": 41,
"z": 41, "t͡": 42,
"|": 42, "|": 43,
"æ": 43, "æ": 44,
"ð": 44, "ð": 45,
"ŋ": 45, "ŋ": 46,
"ɑ": 46, "ɑ": 47,
"ɔ": 47, "ɔ": 48,
"ɔɪ": 48,
"ə": 49, "ə": 49,
"ɚ": 50, "ɚ": 50,
"ɛ": 51, "ɛ": 51,
@ -112,27 +108,34 @@
"ɾ": 63, "ɾ": 63,
"n̩": 64, "n̩": 64,
"ː": 65, "ː": 65,
"ɔː": 66, "ˈ": 66,
"uː": 67, "d͡ʒ": 67,
"iː": 68, "aɪ": 68,
"ɑː": 69, "aʊ": 69,
"oː": 70, "eɪ": 70,
"ɜː": 71 "oʊ": 71,
"t͡ʃ": 72,
"ɔɪ": 73,
"ɔː": 74,
"uː": 75,
"iː": 76,
"ɑː": 77,
"oː": 78,
"ɜː": 79
}, },
"merges": "merges": [
[
"a ɪ", "a ɪ",
"a ʊ",
"d͡ ʒ",
"e ɪ", "e ɪ",
"o ʊ",
"t͡ ʃ",
ɪ", ɪ",
ː", "a ʊ",
"u ː", "o ʊ",
"d͡ ʒ",
"t͡ ʃ",
"i ː", "i ː",
"ɑ ː",
"o ː", "o ː",
"u ː",
"ɑ ː",
ː",
ː" ː"
] ]
} }

View File

@ -1187,7 +1187,8 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
# resample to the input rate, since it'll get resampled for training anyways # resample to the input rate, since it'll get resampled for training anyways
# this should also "help" increase throughput a bit when filling the dataloaders # this should also "help" increase throughput a bit when filling the dataloaders
waveform, sample_rate = resample(waveform, sample_rate, tts.input_sample_rate if tts is not None else 22050) waveform, sample_rate = resample(waveform, sample_rate, tts.input_sample_rate if tts is not None else 22050)
if waveform.shape[0] == 2:
waveform = waveform[:1]
torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16) torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
with open(infile, 'w', encoding="utf-8") as f: with open(infile, 'w', encoding="utf-8") as f:
@ -1254,6 +1255,10 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
messages.append(message) messages.append(message)
continue continue
sliced, _ = resample( sliced, sample_rate, 22050 ) sliced, _ = resample( sliced, sample_rate, 22050 )
if waveform.shape[0] == 2:
waveform = waveform[:1]
torchaudio.save(f"{indir}/audio/{file}", sliced, 22050, encoding="PCM_S", bits_per_sample=16) torchaudio.save(f"{indir}/audio/{file}", sliced, 22050, encoding="PCM_S", bits_per_sample=16)
segments +=1 segments +=1
@ -1261,15 +1266,8 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
messages.append(f"Sliced segments: {files} => {segments}.") messages.append(f"Sliced segments: {files} => {segments}.")
return "\n".join(messages) return "\n".join(messages)
""" # takes an LJSpeech-dataset-formatted .txt file and phonemize it
def phonemizer( text, language="eng" ): def phonemize_txt_file( path ):
transducer = make_g2p(language, f'{language}-ipa')
phones = transducer(text).output_string
ignored = [" "] + [ p for p in string.punctuation ]
return ["_" if p in ignored else p for p in phones]
"""
def phonemize_txt( path ):
with open(path, 'r', encoding='utf-8') as f: with open(path, 'r', encoding='utf-8') as f:
lines = f.readlines() lines = f.readlines()
@ -1291,39 +1289,62 @@ def phonemize_txt( path ):
return joined return joined
# takes an LJSpeech-dataset-formatted .txt (and phonemized .phn.txt from the above) and creates a JSON that should slot in as whisper.json
def _iter_ljspeech_fields( lines ):
    """Yield (audio, text) pairs from LJSpeech-style `audio|text[|...]` lines.

    Lines that are blank or that lack a `|` separator are skipped instead of
    raising IndexError. Only the first two fields are used, and the text
    field is stripped of surrounding whitespace.
    """
    for line in lines:
        fields = line.split("|")
        if len(fields) < 2:
            continue
        yield fields[0], fields[1].strip()

def create_dataset_json( path ):
    """Convert an LJSpeech-formatted .txt file into a sibling .json file.

    Reads `path` (one `audio|text` entry per line) and, if it exists, the
    matching `.phn.txt` file next to it (one `audio|phonemes` entry per
    line). Writes a JSON object keyed by the audio field, where each value
    holds 'text' and, when available, 'phonemes'.

    The output is written to `path` with its trailing `.txt` swapped for
    `.json`. If `path` does not end in `.txt`, the suffixes are appended
    instead, so the source file is never overwritten.

    Returns None; the result is only written to disk.
    """
    # Only swap the trailing extension: str.replace() would also rewrite any
    # ".txt" that happens to appear earlier in the path (e.g. a directory).
    base = path[:-len(".txt")] if path.endswith(".txt") else path
    phn_path = f"{base}.phn.txt"
    json_path = f"{base}.json"

    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # The phonemized file is optional; treat a missing one as "no phonemes"
    # rather than iterating over None.
    phonemes = []
    if os.path.exists(phn_path):
        with open(phn_path, 'r', encoding='utf-8') as f:
            phonemes = f.readlines()

    data = {}
    for audio, text in _iter_ljspeech_fields(lines):
        data[audio] = {
            'text': text
        }

    for audio, text in _iter_ljspeech_fields(phonemes):
        # Ignore phoneme lines with no matching transcription entry, so every
        # record in the output is guaranteed to carry a 'text' key.
        if audio in data:
            data[audio]['phonemes'] = text

    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(data, indent="\t"))
def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, progress=gr.Progress() ): def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, progress=gr.Progress() ):
indir = f'./training/{voice}/' indir = f'./training/{voice}/'
infile = f'{indir}/whisper.json' infile = f'{indir}/whisper.json'
messages = []
normalize = True
phonemize = args.tokenizer_json is not None and args.tokenizer_json[-8:] == "ipa.json"
if args.tts_backend == "vall-e":
phonemize = True
if not os.path.exists(infile): if not os.path.exists(infile):
raise Exception(f"Missing dataset: {infile}") raise Exception(f"Missing dataset: {infile}")
results = json.load(open(infile, 'r', encoding="utf-8")) results = json.load(open(infile, 'r', encoding="utf-8"))
lines = {
'training': [],
'validation': []
}
already_segmented = []
errored = 0 errored = 0
messages = []
normalize = True
phonemize = args.tokenizer_json is not None and args.tokenizer_json[-8:] == "ipa.json"
lines = { 'training': [], 'validation': [] }
segments = {}
if args.tts_backend == "vall-e":
phonemize = True
for filename in results: for filename in results:
use_segment = use_segments use_segment = use_segments
result = results[filename] result = results[filename]
language = LANGUAGES[result['language']] if result['language'] in LANGUAGES else None language = LANGUAGES[result['language']] if result['language'] in LANGUAGES else None
if language == "english": normalizer = EnglishTextNormalizer() if language and language == "english" else BasicTextNormalizer()
language = "en-us"
normalizer = None
if normalize:
normalizer = EnglishTextNormalizer() if language and language == "english" else BasicTextNormalizer()
# check if unsegmented text exceeds 200 characters # check if unsegmented text exceeds 200 characters
if not use_segment: if not use_segment:
@ -1349,84 +1370,84 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
messages.append(message) messages.append(message)
use_segment = True use_segment = True
segments = result['segments'] if use_segment else [{'text': result['text']}] # implicitly segment
if use_segment and not use_segments:
tmp = {}
tmp[filename] = result
print(f"Audio not segmented, segmenting: {filename}")
message = slice_dataset( voice, results=tmp )
print(message)
messages = messages + message.split("\n")
for segment in enumerate_progress(segments, desc="Parsing segments", progress=progress): if not use_segment:
file = filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav") if use_segment else filename segments[filename] = {
path = f'{indir}/audio/{file}' 'text': result['text'],
# segment when needed 'language': language,
if not os.path.exists(path) and filename not in already_segmented: 'normalizer': normalizer,
already_segmented.append(filename) 'phonemes': result['phonemes'] if 'phonemes' in result else None
}
else:
for segment in result['segments']:
segments[filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav")] = {
'text': segment['text'],
'language': language,
'normalizer': normalizer,
'phonemes': segment['phonemes'] if 'phonemes' in segment else None
}
tmp_results = {} for file in enumerate_progress(segments, desc="Parsing segments", progress=progress):
tmp_results[filename] = result result = segments[file]
print(f"Audio not segmented, segmenting: {filename}") path = f'{indir}/audio/{file}'
message = slice_dataset( voice, results=tmp_results )
print(message)
messages = messages + message.split("\n")
if not os.path.exists(path): text = result['text']
message = f"Missing source audio: {file}" language = result['language']
print(message) normalizer = result['normalizer']
messages.append(message) phonemes = result['phonemes']
errored += 1 if phonemize and phonemes is None:
continue phonemes = phonemizer( text, language=language if language != "english" else "en-us", strip=True, preserve_punctuation=True, with_stress=True, backend=args.phonemizer_backend )
if phonemize:
text = phonemes
text = segment['text'].strip() if len(text) > 200:
normalized_text = normalizer(text) if normalize else None message = f"Text length too long (200 < {len(text)}), skipping... {file}"
try: print(message)
phonemes = phonemizer( text, language=language, preserve_punctuation=True, strip=True ) if phonemize else None messages.append(message)
if phonemize: errored += 1
text = phonemes continue
except Exception as e:
print(e)
pass
waveform, sample_rate = torchaudio.load(path)
num_channels, num_frames = waveform.shape
duration = num_frames / sample_rate
if len(text) > 200: error = validate_waveform( waveform, sample_rate )
message = f"Text length too long (200 < {len(text)}), skipping... {file}" if error:
print(message) message = f"{error}, skipping... {file}"
messages.append(message) print(message)
errored += 1 messages.append(message)
continue errored += 1
continue
waveform, sample_rate = torchaudio.load(path) culled = len(text) < text_length
num_channels, num_frames = waveform.shape if not culled and audio_length > 0:
duration = num_frames / sample_rate culled = duration < audio_length
error = validate_waveform( waveform, sample_rate ) line = f'audio/{file}|{text}'
if error:
message = f"{error}, skipping... {file}"
print(message)
messages.append(message)
errored += 1
continue
lines['training' if not culled else 'validation'].append(line)
culled = len(text) < text_length if culled or args.tts_backend != "vall-e":
if not culled and audio_length > 0: continue
culled = duration < audio_length
line = f'audio/{file}|{text}' os.makedirs(f'{indir}/valle/', exist_ok=True)
lines['training' if not culled else 'validation'].append(line) from vall_e.emb.qnt import encode as quantize
# from vall_e.emb.g2p import encode as phonemize
if culled or args.tts_backend != "vall-e": quantized = quantize( waveform, sample_rate ).cpu()
continue print("Quantized:", file)
os.makedirs(f'{indir}/valle/', exist_ok=True) torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(text)
from vall_e.emb.qnt import encode as quantize
# from vall_e.emb.g2p import encode as phonemize
if waveform.shape[0] == 2:
waveform = waveform[:1]
quantized = quantize( waveform, sample_rate ).cpu()
torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
# phonemes = phonemizer(normalized_text)
open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(text)
training_joined = "\n".join(lines['training']) training_joined = "\n".join(lines['training'])
validation_joined = "\n".join(lines['validation']) validation_joined = "\n".join(lines['validation'])
@ -1803,9 +1824,9 @@ def tokenize_text( text ):
load_tts() load_tts()
encoded = tts.tokenizer.encode(text) encoded = tts.tokenizer.encode(text)
decoded = tts.tokenizer.tokenizer.decode(encoded, skip_special_tokens=False).replace(" ", "").replace("[SPACE]", " ") decoded = tts.tokenizer.tokenizer.decode(encoded, skip_special_tokens=False).split(" ")
return "\n".join([ str(encoded), decoded ]) return "\n".join([ str(encoded), str(decoded) ])
def get_dataset_list(dir="./training/"): def get_dataset_list(dir="./training/"):
return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d)) ]) return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d)) ])
@ -1928,6 +1949,8 @@ def setup_args():
'vocoder-model': VOCODERS[-1], 'vocoder-model': VOCODERS[-1],
'tokenizer-json': None, 'tokenizer-json': None,
'phonemizer-backend': 'espeak',
'whisper-backend': 'openai/whisper', 'whisper-backend': 'openai/whisper',
'whisper-model': "base", 'whisper-model': "base",
@ -1972,6 +1995,8 @@ def setup_args():
parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use") parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use")
parser.add_argument("--tokenizer-json", default=default_arguments['tokenizer-json'], help="Specifies which tokenizer json to use for tokenizing.") parser.add_argument("--tokenizer-json", default=default_arguments['tokenizer-json'], help="Specifies which tokenizer json to use for tokenizing.")
parser.add_argument("--phonemizer-backend", default=default_arguments['phonemizer-backend'], help="Specifies which phonemizer backend to use.")
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)") parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.") parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
@ -2037,6 +2062,8 @@ def get_default_settings( hypenated=True ):
'vocoder-model': args.vocoder_model, 'vocoder-model': args.vocoder_model,
'tokenizer-json': args.tokenizer_json, 'tokenizer-json': args.tokenizer_json,
'phonemizer-backend': args.phonemizer_backend,
'whisper-backend': args.whisper_backend, 'whisper-backend': args.whisper_backend,
'whisper-model': args.whisper_model, 'whisper-model': args.whisper_model,
@ -2081,6 +2108,8 @@ def update_args( **kwargs ):
args.vocoder_model = settings['vocoder_model'] args.vocoder_model = settings['vocoder_model']
args.tokenizer_json = settings['tokenizer_json'] args.tokenizer_json = settings['tokenizer_json']
args.phonemizer_backend = settings['phonemizer_backend']
args.whisper_backend = settings['whisper_backend'] args.whisper_backend = settings['whisper_backend']
args.whisper_model = settings['whisper_model'] args.whisper_model = settings['whisper_model']