From 14779a5020d8b22b07ea7a8f7a81e965e584059f Mon Sep 17 00:00:00 2001 From: mrq Date: Mon, 6 Mar 2023 10:47:06 +0000 Subject: [PATCH] Added option to skip transcribing a file if it already exists in the output text file, because apparently whisperx will throw a "max files opened" error when using ROCm: it does not close some file descriptors when batch-transcribing (or something like that). So poor little me, who is retranscribing his Japanese dataset for the 305823042nd time, woke up to it only partially done. I am so mad: I have to wait another few hours for it to continue, when I was hoping to wake up to it done. --- src/utils.py | 22 +++++++++++++++++++++- src/webui.py | 7 ++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/utils.py b/src/utils.py index cfe5f97..047c90a 100755 --- a/src/utils.py +++ b/src/utils.py @@ -1037,7 +1037,7 @@ def whisper_transcribe( file, language=None ): return result -def prepare_dataset( files, outdir, language=None, progress=None ): +def prepare_dataset( files, outdir, language=None, skip_existings=False, progress=None ): unload_tts() global whisper_model @@ -1049,8 +1049,28 @@ def prepare_dataset( files, outdir, language=None, progress=None ): results = {} transcription = [] + previous_list = [] + if skip_existings and os.path.exists(f'{outdir}/train.txt'): + parsed_list = [] + with open(f'{outdir}/train.txt', 'r', encoding="utf-8") as f: + parsed_list = f.readlines() + + for line in parsed_list: + match = re.findall(r"^(.+?)_\d+\.wav$", line.split("|")[0]) + print(match) + if match is None or len(match) == 0: + continue + + if match[0] not in previous_list: + previous_list.append(f'{match[0]}.wav') + for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress): basename = os.path.basename(file) + + if basename in previous_list: + print(f"Skipping already parsed file: {basename}") + continue + result = whisper_transcribe(file, language=language) results[basename] = result print(f"Transcribed file: 
{file}, {len(result['segments'])} found.") diff --git a/src/webui.py b/src/webui.py index b04d57e..9baf5f1 100755 --- a/src/webui.py +++ b/src/webui.py @@ -185,8 +185,8 @@ def read_generate_settings_proxy(file, saveAs='.temp'): gr.update(visible=j is not None), ) -def prepare_dataset_proxy( voice, language, progress=gr.Progress(track_tqdm=True) ): - return prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/", language=language, progress=progress ) +def prepare_dataset_proxy( voice, language, skip_existings, progress=gr.Progress(track_tqdm=True) ): + return prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/", language=language, skip_existings=skip_existings, progress=progress ) def optimize_training_settings_proxy( *args, **kwargs ): tup = optimize_training_settings(*args, **kwargs) @@ -478,7 +478,8 @@ def setup_gradio(): with gr.Column(): dataset_settings = [ gr.Dropdown( choices=voice_list, label="Dataset Source", type="value", value=voice_list[0] if len(voice_list) > 0 else "" ), - gr.Textbox(label="Language", value="en") + gr.Textbox(label="Language", value="en"), + gr.Checkbox(label="Skip Already Transcribed", value=False) ] prepare_dataset_button = gr.Button(value="Prepare") with gr.Column():