updated whisperX integration for use with the latest version (v3) (NOTE: you WILL need to also update whisperx if you pull this commit)

2023-06-09 02:41:29 +00:00 · 2023-06-09 02:41:29 +00:00 · e227ab8e08
commit e227ab8e08
parent 805d7d35e8
1 changed files with 3 additions and 64 deletions
--- a/src/utils.py
+++ b/src/utils.py
@ -177,8 +177,6 @@ webui = None
 voicefixer = None
 whisper_model = None
 whisper_vad = None
 whisper_diarize = None
 whisper_align_model = None
 training_state = None
@ -1999,8 +1997,6 @@ def whisper_sanitize( results ):
 def whisper_transcribe( file, language=None ):
 	# shouldn't happen, but it's for safety
 	global whisper_model
 	global whisper_vad
 	global whisper_diarize
 	global whisper_align_model
 	if not whisper_model:
@ -2035,37 +2031,12 @@ def whisper_transcribe( file, language=None ):
 	if args.whisper_backend == "m-bain/whisperx":
 		import whisperx
 		from whisperx.diarize import assign_word_speakers
 		device = "cuda" if get_device_name() == "cuda" else "cpu"
-		if whisper_vad:
+		result = whisper_model.transcribe(file, batch_size=args.whisper_batchsize)
 			# omits a considerable amount of the end
 			if args.whisper_batchsize > 1:
 				result = whisperx.transcribe_with_vad_parallel(whisper_model, file, whisper_vad, batch_size=args.whisper_batchsize, language=language, task="transcribe")
 			else:
 				result = whisperx.transcribe_with_vad(whisper_model, file, whisper_vad)
 			"""
 			result = whisperx.transcribe_with_vad(whisper_model, file, whisper_vad)
 			"""
 		else:
 			result = whisper_model.transcribe(file)
 		align_model, metadata = whisper_align_model
-		result_aligned = whisperx.align(result["segments"], align_model, metadata, file, device)
+		result_aligned = whisperx.align(result["segments"], align_model, metadata, file, device, return_char_alignments=False)
 		if whisper_diarize:
 			diarize_segments = whisper_diarize(file)
 			diarize_df = pd.DataFrame(diarize_segments.itertracks(yield_label=True))
 			diarize_df['start'] = diarize_df[0].apply(lambda x: x.start)
 			diarize_df['end'] = diarize_df[0].apply(lambda x: x.end)
 			# assumes each utterance is single speaker (needs fix)
 			result_segments, word_segments = assign_word_speakers(diarize_df, result_aligned["segments"], fill_nearest=True)
 			result_aligned["segments"] = result_segments
 			result_aligned["word_segments"] = word_segments
 		for i in range(len(result_aligned['segments'])):
 			del result_aligned['segments'][i]['word-segments']
 			del result_aligned['segments'][i]['char-segments']
 		result['segments'] = result_aligned['segments']
 		result['text'] = []
@ -3625,8 +3596,6 @@ def unload_voicefixer():
 def load_whisper_model(language=None, model_name=None, progress=None):
 	global whisper_model
 	global whisper_vad
 	global whisper_diarize
 	global whisper_align_model
 	if args.whisper_backend not in WHISPER_BACKENDS:
@ -3662,45 +3631,15 @@ def load_whisper_model(language=None, model_name=None, progress=None):
 	elif args.whisper_backend == "m-bain/whisperx":
 		import whisper, whisperx
 		device = "cuda" if get_device_name() == "cuda" else "cpu"
-		try:
+		whisper_model = whisperx.load_model(model_name, device)
 			whisper_model = whisperx.load_model(model_name, device)
 		except Exception as e:
 			whisper_model = whisper.load_model(model_name, device)
 		if not args.hf_token:
 			print("No huggingface token used, needs to be saved in environment variable, otherwise will throw error loading VAD model.")
 		try:
 			from pyannote.audio import Inference, Pipeline
 			whisper_vad = Inference(
 				"pyannote/segmentation",
 				pre_aggregation_hook=lambda segmentation: segmentation,
 				use_auth_token=args.hf_token,
 				device=torch.device(device),
 			)
 			# whisper_diarize = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",use_auth_token=args.hf_token)
 		except Exception as e:
 			pass
 		whisper_align_model = whisperx.load_align_model(model_name="WAV2VEC2_ASR_LARGE_LV60K_960H" if language=="en" else None, language_code=language, device=device)
 	print("Loaded Whisper model")
 def unload_whisper():
 	global whisper_model
 	global whisper_vad
 	global whisper_diarize
 	global whisper_align_model
 	if whisper_vad:
 		del whisper_vad
 		whisper_vad = None
 	if whisper_diarize:
 		del whisper_diarize
 		whisper_diarize = None
 	if whisper_align_model:
 		del whisper_align_model
 		whisper_align_model = None