Disable diarization for whisperx, as it's just a useless performance hit (I don't have anything that's multi-speaker within the same audio file at the moment)

This commit is contained in:
mrq 2023-03-22 20:38:58 +00:00
parent aa5bdafb06
commit 736cdc8926

View File

@@ -1147,6 +1147,9 @@ def whisper_sanitize( results ):
last_segment['text'] += segment['text'] last_segment['text'] += segment['text']
last_segment['end'] = segment['end'] last_segment['end'] = segment['end']
for i in range(len(sanitized['segments'])):
sanitized['segments']['id'] = i
return sanitized return sanitized
def whisper_transcribe( file, language=None ): def whisper_transcribe( file, language=None ):
@@ -1263,10 +1266,19 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
if basename in results and skip_existings: if basename in results and skip_existings:
print(f"Skipping already parsed file: {basename}") print(f"Skipping already parsed file: {basename}")
else: else:
try:
result = whisper_transcribe(file, language=language) result = whisper_transcribe(file, language=language)
except Exception as e:
print("Failed to transcribe:", file)
continue
results[basename] = result results[basename] = result
# results[basename] = whisper_sanitize(results[basename]) try:
sanitized = whisper_sanitize(results[basename])
results[basename] = sanitized
except Exception as e:
print("Failed to sanitize:", basename, e)
pass
waveform, sample_rate = torchaudio.load(file) waveform, sample_rate = torchaudio.load(file)
# resample to the input rate, since it'll get resampled for training anyways # resample to the input rate, since it'll get resampled for training anyways