Added an option to trim silence from sliced audio segments using torchaudio's VAD (voice activity detection)

This commit is contained in:
mrq 2023-03-11 21:41:35 +00:00
parent dea2fa9caf
commit ede9804b76
2 changed files with 8 additions and 3 deletions

View File

@ -1100,7 +1100,7 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
return f"Processed dataset to: {indir}" return f"Processed dataset to: {indir}"
def slice_dataset( voice, start_offset=0, end_offset=0 ): def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0 ):
indir = f'./training/{voice}/' indir = f'./training/{voice}/'
infile = f'{indir}/whisper.json' infile = f'{indir}/whisper.json'
@ -1134,6 +1134,9 @@ def slice_dataset( voice, start_offset=0, end_offset=0 ):
print(f"Invalid waveform segment ({segment['start']}:{segment['end']}): {file}, skipping...") print(f"Invalid waveform segment ({segment['start']}:{segment['end']}): {file}, skipping...")
continue continue
if trim_silence:
sliced = torchaudio.functional.vad( sliced, sampling_rate )
torchaudio.save(f"{indir}/audio/{file}", sliced, sampling_rate) torchaudio.save(f"{indir}/audio/{file}", sliced, sampling_rate)
return f"Sliced segments: {files} => {segments}." return f"Sliced segments: {files} => {segments}."

View File

@ -182,14 +182,14 @@ def read_generate_settings_proxy(file, saveAs='.temp'):
gr.update(visible=j is not None), gr.update(visible=j is not None),
) )
def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=False) ): def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=False) ):
messages = [] messages = []
message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress ) message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress )
messages.append(message) messages.append(message)
if slice_audio: if slice_audio:
message = slice_dataset( voice, start_offset=slice_start_offset, end_offset=slice_end_offset ) message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset )
messages.append(message) messages.append(message)
message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length ) message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length )
@ -421,6 +421,7 @@ def setup_gradio():
with gr.Row(): with gr.Row():
DATASET_SETTINGS['skip'] = gr.Checkbox(label="Skip Already Transcribed", value=False) DATASET_SETTINGS['skip'] = gr.Checkbox(label="Skip Already Transcribed", value=False)
DATASET_SETTINGS['slice'] = gr.Checkbox(label="Slice Segments", value=False) DATASET_SETTINGS['slice'] = gr.Checkbox(label="Slice Segments", value=False)
DATASET_SETTINGS['trim_silence'] = gr.Checkbox(label="Trim Silence", value=False)
with gr.Row(): with gr.Row():
DATASET_SETTINGS['slice_start_offset'] = gr.Number(label="Slice Start Offset", value=0) DATASET_SETTINGS['slice_start_offset'] = gr.Number(label="Slice Start Offset", value=0)
DATASET_SETTINGS['slice_end_offset'] = gr.Number(label="Slice End Offset", value=0) DATASET_SETTINGS['slice_end_offset'] = gr.Number(label="Slice End Offset", value=0)
@ -763,6 +764,7 @@ def setup_gradio():
slice_dataset, slice_dataset,
inputs=[ inputs=[
DATASET_SETTINGS['voice'], DATASET_SETTINGS['voice'],
DATASET_SETTINGS['trim_silence'],
DATASET_SETTINGS['slice_start_offset'], DATASET_SETTINGS['slice_start_offset'],
DATASET_SETTINGS['slice_end_offset'], DATASET_SETTINGS['slice_end_offset'],
], ],