From 736cdc8926cf5ca8bfe8a9660d8066b1a743c4e2 Mon Sep 17 00:00:00 2001 From: mrq Date: Wed, 22 Mar 2023 20:38:58 +0000 Subject: [PATCH] disable diarization for whisperx as it's just a useless performance hit (I don't have anything that's multispeaker within the same audio file at the moment) --- src/utils.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/utils.py b/src/utils.py index 389d2af..80ae60d 100755 --- a/src/utils.py +++ b/src/utils.py @@ -1147,6 +1147,9 @@ def whisper_sanitize( results ): last_segment['text'] += segment['text'] last_segment['end'] = segment['end'] + for i in range(len(sanitized['segments'])): + sanitized['segments']['id'] = i + return sanitized def whisper_transcribe( file, language=None ): @@ -1263,10 +1266,19 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non if basename in results and skip_existings: print(f"Skipping already parsed file: {basename}") else: - result = whisper_transcribe(file, language=language) + try: + result = whisper_transcribe(file, language=language) + except Exception as e: + print("Failed to transcribe:", file) + continue results[basename] = result - # results[basename] = whisper_sanitize(results[basename]) + try: + sanitized = whisper_sanitize(results[basename]) + results[basename] = sanitized + except Exception as e: + print("Failed to sanitize:", basename, e) + pass waveform, sample_rate = torchaudio.load(file) # resample to the input rate, since it'll get resampled for training anyways