Added an option to skip transcribing a file if it already exists in the output text file, because apparently whisperx will throw a "max files opened" error when using ROCm, since it does not close some file descriptors when you're batch-transcribing (or something like that). So poor little me, who's retranscribing his Japanese dataset for the 305823042nd time, woke up to it only partially done. I am so mad that I have to wait another few hours for it to continue when I was hoping to wake up to it done.
This commit is contained in:
parent
0e3bbc55f8
commit
14779a5020
22
src/utils.py
22
src/utils.py
|
@ -1037,7 +1037,7 @@ def whisper_transcribe( file, language=None ):
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def prepare_dataset( files, outdir, language=None, progress=None ):
|
def prepare_dataset( files, outdir, language=None, skip_existings=False, progress=None ):
|
||||||
unload_tts()
|
unload_tts()
|
||||||
|
|
||||||
global whisper_model
|
global whisper_model
|
||||||
|
@ -1049,8 +1049,28 @@ def prepare_dataset( files, outdir, language=None, progress=None ):
|
||||||
results = {}
|
results = {}
|
||||||
transcription = []
|
transcription = []
|
||||||
|
|
||||||
|
previous_list = []
|
||||||
|
if skip_existings and os.path.exists(f'{outdir}/train.txt'):
|
||||||
|
parsed_list = []
|
||||||
|
with open(f'{outdir}/train.txt', 'r', encoding="utf-8") as f:
|
||||||
|
parsed_list = f.readlines()
|
||||||
|
|
||||||
|
for line in parsed_list:
|
||||||
|
match = re.findall(r"^(.+?)_\d+\.wav$", line.split("|")[0])
|
||||||
|
print(match)
|
||||||
|
if match is None or len(match) == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if match[0] not in previous_list:
|
||||||
|
previous_list.append(f'{match[0]}.wav')
|
||||||
|
|
||||||
for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
|
for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
|
||||||
basename = os.path.basename(file)
|
basename = os.path.basename(file)
|
||||||
|
|
||||||
|
if basename in previous_list:
|
||||||
|
print(f"Skipping already parsed file: {basename}")
|
||||||
|
continue
|
||||||
|
|
||||||
result = whisper_transcribe(file, language=language)
|
result = whisper_transcribe(file, language=language)
|
||||||
results[basename] = result
|
results[basename] = result
|
||||||
print(f"Transcribed file: {file}, {len(result['segments'])} found.")
|
print(f"Transcribed file: {file}, {len(result['segments'])} found.")
|
||||||
|
|
|
@ -185,8 +185,8 @@ def read_generate_settings_proxy(file, saveAs='.temp'):
|
||||||
gr.update(visible=j is not None),
|
gr.update(visible=j is not None),
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_dataset_proxy( voice, language, progress=gr.Progress(track_tqdm=True) ):
|
def prepare_dataset_proxy( voice, language, skip_existings, progress=gr.Progress(track_tqdm=True) ):
|
||||||
return prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/", language=language, progress=progress )
|
return prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/", language=language, skip_existings=skip_existings, progress=progress )
|
||||||
|
|
||||||
def optimize_training_settings_proxy( *args, **kwargs ):
|
def optimize_training_settings_proxy( *args, **kwargs ):
|
||||||
tup = optimize_training_settings(*args, **kwargs)
|
tup = optimize_training_settings(*args, **kwargs)
|
||||||
|
@ -478,7 +478,8 @@ def setup_gradio():
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
dataset_settings = [
|
dataset_settings = [
|
||||||
gr.Dropdown( choices=voice_list, label="Dataset Source", type="value", value=voice_list[0] if len(voice_list) > 0 else "" ),
|
gr.Dropdown( choices=voice_list, label="Dataset Source", type="value", value=voice_list[0] if len(voice_list) > 0 else "" ),
|
||||||
gr.Textbox(label="Language", value="en")
|
gr.Textbox(label="Language", value="en"),
|
||||||
|
gr.Checkbox(label="Skip Already Transcribed", value=False)
|
||||||
]
|
]
|
||||||
prepare_dataset_button = gr.Button(value="Prepare")
|
prepare_dataset_button = gr.Button(value="Prepare")
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
|
|
Loading…
Reference in New Issue
Block a user