forked from camenduru/ai-voice-cloning
Farewell, parasite
This commit is contained in:
parent
2424c455cb
commit
b90c164778
|
@ -1,5 +1,4 @@
|
||||||
git+https://github.com/openai/whisper.git
|
git+https://github.com/openai/whisper.git
|
||||||
git+https://github.com/m-bain/whisperx.git
|
|
||||||
|
|
||||||
more-itertools
|
more-itertools
|
||||||
ffmpeg-python
|
ffmpeg-python
|
||||||
|
|
39
src/utils.py
39
src/utils.py
|
@ -39,9 +39,9 @@ from tortoise.utils.device import get_device_name, set_device_name, get_device_c
|
||||||
|
|
||||||
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
||||||
|
|
||||||
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v2"]
|
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
|
||||||
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
|
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
|
||||||
WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]
|
WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp"]
|
||||||
VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
|
VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
|
||||||
|
|
||||||
GENERATE_SETTINGS_ARGS = None
|
GENERATE_SETTINGS_ARGS = None
|
||||||
|
@ -1032,7 +1032,7 @@ def whisper_transcribe( file, language=None ):
|
||||||
|
|
||||||
return whisper_model.transcribe(file, language=language)
|
return whisper_model.transcribe(file, language=language)
|
||||||
|
|
||||||
elif args.whisper_backend == "lightmare/whispercpp":
|
if args.whisper_backend == "lightmare/whispercpp":
|
||||||
res = whisper_model.transcribe(file)
|
res = whisper_model.transcribe(file)
|
||||||
segments = whisper_model.extract_text_and_timestamps( res )
|
segments = whisper_model.extract_text_and_timestamps( res )
|
||||||
|
|
||||||
|
@ -1046,23 +1046,6 @@ def whisper_transcribe( file, language=None ):
|
||||||
'text': segment[2],
|
'text': segment[2],
|
||||||
}
|
}
|
||||||
result['segments'].append(reparsed)
|
result['segments'].append(reparsed)
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
# credit to https://git.ecker.tech/yqxtqymn for the busywork of getting this added
|
|
||||||
elif args.whisper_backend == "m-bain/whisperx":
|
|
||||||
import whisperx
|
|
||||||
device = "cuda" if get_device_name() == "cuda" else "cpu"
|
|
||||||
result = whisper_model.transcribe(file)
|
|
||||||
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
|
|
||||||
result_aligned = whisperx.align(result["segments"], model_a, metadata, file, device)
|
|
||||||
|
|
||||||
for i in range(len(result_aligned['segments'])):
|
|
||||||
del result_aligned['segments'][i]['word-segments']
|
|
||||||
del result_aligned['segments'][i]['char-segments']
|
|
||||||
|
|
||||||
result['segments'] = result_aligned['segments']
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def prepare_dataset( files, outdir, language=None, skip_existings=False, slice_audio=False, progress=None ):
|
def prepare_dataset( files, outdir, language=None, skip_existings=False, slice_audio=False, progress=None ):
|
||||||
|
@ -1072,9 +1055,6 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, slice_a
|
||||||
if whisper_model is None:
|
if whisper_model is None:
|
||||||
load_whisper_model(language=language)
|
load_whisper_model(language=language)
|
||||||
|
|
||||||
if args.whisper_backend == "m-bain/whisperx" and slice_audio:
|
|
||||||
print("! CAUTION ! Slicing audio with whisperx is terrible. Please consider using a different whisper backend if you want to slice audio.")
|
|
||||||
|
|
||||||
os.makedirs(f'{outdir}/audio/', exist_ok=True)
|
os.makedirs(f'{outdir}/audio/', exist_ok=True)
|
||||||
|
|
||||||
results = {}
|
results = {}
|
||||||
|
@ -1708,7 +1688,7 @@ def setup_args():
|
||||||
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
|
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
|
||||||
|
|
||||||
parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
|
parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
|
||||||
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp, m-bain/whisperx)")
|
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
|
||||||
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
|
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
|
||||||
|
|
||||||
parser.add_argument("--training-default-halfp", action='store_true', default=default_arguments['training-default-halfp'], help="Training default: halfp")
|
parser.add_argument("--training-default-halfp", action='store_true', default=default_arguments['training-default-halfp'], help="Training default: halfp")
|
||||||
|
@ -2069,12 +2049,13 @@ def unload_voicefixer():
|
||||||
def load_whisper_model(language=None, model_name=None, progress=None):
|
def load_whisper_model(language=None, model_name=None, progress=None):
|
||||||
global whisper_model
|
global whisper_model
|
||||||
|
|
||||||
|
if model_name == "m-bain/whisperx":
|
||||||
|
print("WhisperX has been removed. Reverting to openai/whisper. Apologies for the inconvenience.")
|
||||||
|
model_name = "openai/whisper"
|
||||||
|
|
||||||
if args.whisper_backend not in WHISPER_BACKENDS:
|
if args.whisper_backend not in WHISPER_BACKENDS:
|
||||||
raise Exception(f"unavailable backend: {args.whisper_backend}")
|
raise Exception(f"unavailable backend: {args.whisper_backend}")
|
||||||
|
|
||||||
if args.whisper_backend != "m-bain/whisperx" and model_name == "large-v2":
|
|
||||||
raise Exception("large-v2 is only available for m-bain/whisperx backend")
|
|
||||||
|
|
||||||
if not model_name:
|
if not model_name:
|
||||||
model_name = args.whisper_model
|
model_name = args.whisper_model
|
||||||
else:
|
else:
|
||||||
|
@ -2097,10 +2078,6 @@ def load_whisper_model(language=None, model_name=None, progress=None):
|
||||||
|
|
||||||
b_lang = language.encode('ascii')
|
b_lang = language.encode('ascii')
|
||||||
whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang)
|
whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang)
|
||||||
elif args.whisper_backend == "m-bain/whisperx":
|
|
||||||
import whisperx
|
|
||||||
device = "cuda" if get_device_name() == "cuda" else "cpu"
|
|
||||||
whisper_model = whisperx.load_model(model_name, device)
|
|
||||||
|
|
||||||
print("Loaded Whisper model")
|
print("Loaded Whisper model")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user