From ede9804b766134b04367862222809eb441aa000b Mon Sep 17 00:00:00 2001 From: mrq Date: Sat, 11 Mar 2023 21:41:35 +0000 Subject: [PATCH] added option to trim silence using torchaudio's VAD --- src/utils.py | 5 ++++- src/webui.py | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/utils.py b/src/utils.py index 883ffd9..743b873 100755 --- a/src/utils.py +++ b/src/utils.py @@ -1100,7 +1100,7 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non return f"Processed dataset to: {indir}" -def slice_dataset( voice, start_offset=0, end_offset=0 ): +def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0 ): indir = f'./training/{voice}/' infile = f'{indir}/whisper.json' @@ -1134,6 +1134,9 @@ def slice_dataset( voice, start_offset=0, end_offset=0 ): print(f"Invalid waveform segment ({segment['start']}:{segment['end']}): {file}, skipping...") continue + if trim_silence: + sliced = torchaudio.functional.vad( sliced, sampling_rate ) + torchaudio.save(f"{indir}/audio/{file}", sliced, sampling_rate) return f"Sliced segments: {files} => {segments}." diff --git a/src/webui.py b/src/webui.py index 6e6cc14..9dd6475 100755 --- a/src/webui.py +++ b/src/webui.py @@ -182,14 +182,14 @@ def read_generate_settings_proxy(file, saveAs='.temp'): gr.update(visible=j is not None), ) -def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=False) ): +def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=False) ): messages = [] message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress ) messages.append(message) if slice_audio: - message = slice_dataset( voice, start_offset=slice_start_offset, end_offset=slice_end_offset ) + message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset ) messages.append(message) message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length ) @@ -421,6 +421,7 @@ def setup_gradio(): with gr.Row(): DATASET_SETTINGS['skip'] = gr.Checkbox(label="Skip Already Transcribed", value=False) DATASET_SETTINGS['slice'] = gr.Checkbox(label="Slice Segments", value=False) + DATASET_SETTINGS['trim_silence'] = gr.Checkbox(label="Trim Silence", value=False) with gr.Row(): DATASET_SETTINGS['slice_start_offset'] = gr.Number(label="Slice Start Offset", value=0) DATASET_SETTINGS['slice_end_offset'] = gr.Number(label="Slice End Offset", value=0) @@ -763,6 +764,7 @@ def setup_gradio(): slice_dataset, inputs=[ DATASET_SETTINGS['voice'], + DATASET_SETTINGS['trim_silence'], DATASET_SETTINGS['slice_start_offset'], DATASET_SETTINGS['slice_end_offset'], ],