From f9154c4db1869c12904478d35bea2f449f9f8880 Mon Sep 17 00:00:00 2001
From: mrq
Date: Thu, 16 Mar 2023 14:19:56 +0000
Subject: [PATCH] fixes

---
 modules/tortoise-tts |  2 +-
 src/utils.py         | 23 +++++++++++++----------
 src/webui.py         |  4 ++--
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/modules/tortoise-tts b/modules/tortoise-tts
index 1f674a4..e201746 160000
--- a/modules/tortoise-tts
+++ b/modules/tortoise-tts
@@ -1 +1 @@
-Subproject commit 1f674a468f4202ac47feb8fb3587dc5837f2af2b
+Subproject commit e201746eeb3f5be602ae3395df8344f231a5f0d4
diff --git a/src/utils.py b/src/utils.py
index 24cb6a5..ca63003 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -1214,7 +1214,7 @@ def slice_waveform( waveform, sample_rate, start, end, trim ):
 
 	return sliced, error
 
-def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, results=None ):
+def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, results=None, progress=gr.Progress() ):
 	indir = f'./training/{voice}/'
 	infile = f'{indir}/whisper.json'
 	messages = []
@@ -1269,12 +1269,14 @@ def phonemizer( text, language="eng" ):
 	return ["_" if p in ignored else p for p in phones]
 """
 
-def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, normalize=True ):
+def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, normalize=True, progress=gr.Progress() ):
 	indir = f'./training/{voice}/'
 	infile = f'{indir}/whisper.json'
 	messages = []
 
-	phonemize = phonemize=args.tokenizer_json[-8:] == "ipa.json"
+	phonemize = args.tokenizer_json is not None and args.tokenizer_json[-8:] == "ipa.json"
+	if args.tts_backend == "vall-e":
+		phonemize = True
 
 	if not os.path.exists(infile):
 		raise Exception(f"Missing dataset: {infile}")
@@ -1283,11 +1285,10 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, n
 
 	lines = {
 		'training': [],
-		'validation': [],
-		'recordings': [],
-		'supervisions': [],
+		'validation': []
 	}
 
+	already_segmented = []
 	errored = 0
 	for filename in results:
 
@@ -1328,11 +1329,13 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, n
 
 		segments = result['segments'] if use_segment else [{'text': result['text']}]
 
-		for segment in segments:
+		for segment in enumerate_progress(segments, desc="Parsing segments", progress=progress):
 			file = filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav") if use_segment else filename
 			path = f'{indir}/audio/{file}'
 			# segment when needed
-			if not os.path.exists(path):
+			if not os.path.exists(path) and filename not in already_segmented:
+				already_segmented.append(filename)
+
 				tmp_results = {}
 				tmp_results[filename] = result
 				print(f"Audio not segmented, segmenting: {filename}")
@@ -2360,9 +2363,9 @@ def update_tokenizer(tokenizer_json):
 	if hasattr(tts, "loading") and tts.loading:
 		raise Exception("TTS is still initializing...")
 
-	print(f"Loading model: {tokenizer_json}")
+	print(f"Loading tokenizer vocab: {tokenizer_json}")
 	tts.load_tokenizer_json(tokenizer_json)
-	print(f"Loaded model: {tts.tokenizer_json}")
+	print(f"Loaded tokenizer vocab: {tts.tokenizer_json}")
 
 	do_gc()
 
diff --git a/src/webui.py b/src/webui.py
index 6f6462a..fece74e 100755
--- a/src/webui.py
+++ b/src/webui.py
@@ -199,10 +199,10 @@ def prepare_dataset_proxy( voice, language, validation_text_length, validation_a
 	messages.append(message)
 
 	if slice_audio:
-		message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset )
+		message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, progress=progress )
 		messages.append(message)
 
-	message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length )
+	message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress )
 	messages.append(message)
 
 	return "\n".join(messages)