From b90c164778c5f040caed3780ad933f46ac7eb10b Mon Sep 17 00:00:00 2001
From: mrq
Date: Sat, 11 Mar 2023 16:40:34 +0000
Subject: [PATCH] Farewell, parasite

---
 requirements.txt |  1 -
 src/utils.py     | 39 +++++++-------------------------------
 2 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5d14006..ef33ec9 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 git+https://github.com/openai/whisper.git
-git+https://github.com/m-bain/whisperx.git
 more-itertools
 ffmpeg-python
 
diff --git a/src/utils.py b/src/utils.py
index 01478ba..d016c64 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -39,9 +39,9 @@ from tortoise.utils.device import get_device_name, set_device_name, get_device_c
 
 MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
 
-WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v2"]
+WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
 WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
-WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]
+WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp"]
 VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
 
 GENERATE_SETTINGS_ARGS = None
@@ -1032,7 +1032,7 @@ def whisper_transcribe( file, language=None ):
 
 		return whisper_model.transcribe(file, language=language)
 
-	elif args.whisper_backend == "lightmare/whispercpp":
+	if args.whisper_backend == "lightmare/whispercpp":
 		res = whisper_model.transcribe(file)
 		segments = whisper_model.extract_text_and_timestamps( res )
 
@@ -1046,23 +1046,6 @@ def whisper_transcribe( file, language=None ):
 				'text': segment[2],
 			}
 			result['segments'].append(reparsed)
-
-		return result
-
-	# credit to https://git.ecker.tech/yqxtqymn for the busywork of getting this added
-	elif args.whisper_backend == "m-bain/whisperx":
-		import whisperx
-		device = "cuda" if get_device_name() == "cuda" else "cpu"
-		result = whisper_model.transcribe(file)
-		model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-		result_aligned = whisperx.align(result["segments"], model_a, metadata, file, device)
-
-		for i in range(len(result_aligned['segments'])):
-			del result_aligned['segments'][i]['word-segments']
-			del result_aligned['segments'][i]['char-segments']
-
-		result['segments'] = result_aligned['segments']
-
 		return result
 
 def prepare_dataset( files, outdir, language=None, skip_existings=False, slice_audio=False, progress=None ):
@@ -1072,9 +1055,6 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, slice_a
 	if whisper_model is None:
 		load_whisper_model(language=language)
 
-	if args.whisper_backend == "m-bain/whisperx" and slice_audio:
-		print("! CAUTION ! Slicing audio with whisperx is terrible. Please consider using a different whisper backend if you want to slice audio.")
-
 	os.makedirs(f'{outdir}/audio/', exist_ok=True)
 
 	results = {}
@@ -1708,7 +1688,7 @@ def setup_args():
 	parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
 	parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
 
-	parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp, m-bain/whisperx)")
+	parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
 	parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
 
 	parser.add_argument("--training-default-halfp", action='store_true', default=default_arguments['training-default-halfp'], help="Training default: halfp")
@@ -2069,12 +2049,13 @@ def unload_voicefixer():
 def load_whisper_model(language=None, model_name=None, progress=None):
 	global whisper_model
 
+	if model_name == "m-bain/whisperx":
+		print("WhisperX has been removed. Reverting to openai/whisper. Apologies for the inconvenience.")
+		model_name = "openai/whisper"
+
 	if args.whisper_backend not in WHISPER_BACKENDS:
 		raise Exception(f"unavailable backend: {args.whisper_backend}")
 
-	if args.whisper_backend != "m-bain/whisperx" and model_name == "large-v2":
-		raise Exception("large-v2 is only available for m-bain/whisperx backend")
-
 	if not model_name:
 		model_name = args.whisper_model
 	else:
@@ -2097,10 +2078,6 @@ def load_whisper_model(language=None, model_name=None, progress=None):
 
 		b_lang = language.encode('ascii')
 		whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang)
-	elif args.whisper_backend == "m-bain/whisperx":
-		import whisperx
-		device = "cuda" if get_device_name() == "cuda" else "cpu"
-		whisper_model = whisperx.load_model(model_name, device)
 
 	print("Loaded Whisper model")