forked from mrq/ai-voice-cloning
Added api_name for generation, added the whisperx backend, and moved the "Use Whisper.cpp" option into the whisper backend list
This commit is contained in:
parent
788a957f79
commit
0e3bbc55f8
|
@ -1,4 +1,6 @@
|
||||||
git+https://github.com/openai/whisper.git
|
git+https://github.com/openai/whisper.git
|
||||||
|
git+https://github.com/m-bain/whisperx.git
|
||||||
|
|
||||||
more-itertools
|
more-itertools
|
||||||
ffmpeg-python
|
ffmpeg-python
|
||||||
gradio
|
gradio
|
||||||
|
|
69
src/utils.py
69
src/utils.py
|
@ -37,8 +37,11 @@ from tortoise.utils.text import split_and_recombine_text
|
||||||
from tortoise.utils.device import get_device_name, set_device_name
|
from tortoise.utils.device import get_device_name, set_device_name
|
||||||
|
|
||||||
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
||||||
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
|
|
||||||
|
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v2"]
|
||||||
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
|
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
|
||||||
|
WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]
|
||||||
|
|
||||||
EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
|
EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
|
||||||
|
|
||||||
args = None
|
args = None
|
||||||
|
@ -1001,28 +1004,38 @@ def whisper_transcribe( file, language=None ):
|
||||||
if not whisper_model:
|
if not whisper_model:
|
||||||
load_whisper_model(language=language)
|
load_whisper_model(language=language)
|
||||||
|
|
||||||
if not args.whisper_cpp:
|
if args.whisper_backend == "openai/whisper":
|
||||||
if not language:
|
if not language:
|
||||||
language = None
|
language = None
|
||||||
|
|
||||||
return whisper_model.transcribe(file, language=language)
|
return whisper_model.transcribe(file, language=language)
|
||||||
|
|
||||||
res = whisper_model.transcribe(file)
|
elif args.whisper_backend == "lightmare/whispercpp":
|
||||||
segments = whisper_model.extract_text_and_timestamps( res )
|
res = whisper_model.transcribe(file)
|
||||||
|
segments = whisper_model.extract_text_and_timestamps( res )
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
'segments': []
|
'segments': []
|
||||||
}
|
|
||||||
for segment in segments:
|
|
||||||
reparsed = {
|
|
||||||
'start': segment[0] / 100.0,
|
|
||||||
'end': segment[1] / 100.0,
|
|
||||||
'text': segment[2],
|
|
||||||
}
|
}
|
||||||
result['segments'].append(reparsed)
|
for segment in segments:
|
||||||
|
reparsed = {
|
||||||
|
'start': segment[0] / 100.0,
|
||||||
|
'end': segment[1] / 100.0,
|
||||||
|
'text': segment[2],
|
||||||
|
}
|
||||||
|
result['segments'].append(reparsed)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# credit to https://git.ecker.tech/yqxtqymn for the busywork of getting this added
|
||||||
|
elif args.whisper_backend == "m-bain/whisperx":
|
||||||
|
import whisperx
|
||||||
|
device = "cuda" if get_device_name() == "cuda" else "cpu"
|
||||||
|
result = whisper_model.transcribe(file)
|
||||||
|
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
|
||||||
|
result_aligned = whisperx.align(result["segments"], model_a, metadata, file, device)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
def prepare_dataset( files, outdir, language=None, progress=None ):
|
def prepare_dataset( files, outdir, language=None, progress=None ):
|
||||||
unload_tts()
|
unload_tts()
|
||||||
|
@ -1425,8 +1438,8 @@ def setup_args():
|
||||||
'output-volume': 1,
|
'output-volume': 1,
|
||||||
|
|
||||||
'autoregressive-model': None,
|
'autoregressive-model': None,
|
||||||
|
'whisper-backend': 'openai/whisper',
|
||||||
'whisper-model': "base",
|
'whisper-model': "base",
|
||||||
'whisper-cpp': False,
|
|
||||||
|
|
||||||
'training-default-halfp': False,
|
'training-default-halfp': False,
|
||||||
'training-default-bnb': True,
|
'training-default-bnb': True,
|
||||||
|
@ -1464,8 +1477,8 @@ def setup_args():
|
||||||
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
|
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
|
||||||
|
|
||||||
parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
|
parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
|
||||||
|
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp, m-bain/whisperx)")
|
||||||
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
|
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
|
||||||
parser.add_argument("--whisper-cpp", default=default_arguments['whisper-cpp'], action='store_true', help="Leverages lightmare/whispercpp for transcription")
|
|
||||||
|
|
||||||
parser.add_argument("--training-default-halfp", action='store_true', default=default_arguments['training-default-halfp'], help="Training default: halfp")
|
parser.add_argument("--training-default-halfp", action='store_true', default=default_arguments['training-default-halfp'], help="Training default: halfp")
|
||||||
parser.add_argument("--training-default-bnb", action='store_true', default=default_arguments['training-default-bnb'], help="Training default: bnb")
|
parser.add_argument("--training-default-bnb", action='store_true', default=default_arguments['training-default-bnb'], help="Training default: bnb")
|
||||||
|
@ -1499,7 +1512,7 @@ def setup_args():
|
||||||
|
|
||||||
return args
|
return args
|
||||||
|
|
||||||
def update_args( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, defer_tts_load, prune_nonfinal_outputs, use_bigvgan_vocoder, device_override, sample_batch_size, concurrency_count, autocalculate_voice_chunk_duration_size, output_volume, autoregressive_model, whisper_model, whisper_cpp, training_default_halfp, training_default_bnb ):
|
def update_args( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, defer_tts_load, prune_nonfinal_outputs, use_bigvgan_vocoder, device_override, sample_batch_size, concurrency_count, autocalculate_voice_chunk_duration_size, output_volume, autoregressive_model, whisper_backend, whisper_model, training_default_halfp, training_default_bnb ):
|
||||||
global args
|
global args
|
||||||
|
|
||||||
args.listen = listen
|
args.listen = listen
|
||||||
|
@ -1523,8 +1536,8 @@ def update_args( listen, share, check_for_updates, models_from_local_only, low_v
|
||||||
args.output_volume = output_volume
|
args.output_volume = output_volume
|
||||||
|
|
||||||
args.autoregressive_model = autoregressive_model
|
args.autoregressive_model = autoregressive_model
|
||||||
|
args.whisper_backend = whisper_backend
|
||||||
args.whisper_model = whisper_model
|
args.whisper_model = whisper_model
|
||||||
args.whisper_cpp = whisper_cpp
|
|
||||||
|
|
||||||
args.training_default_halfp = training_default_halfp
|
args.training_default_halfp = training_default_halfp
|
||||||
args.training_default_bnb = training_default_bnb
|
args.training_default_bnb = training_default_bnb
|
||||||
|
@ -1555,8 +1568,8 @@ def save_args_settings():
|
||||||
'output-volume': args.output_volume,
|
'output-volume': args.output_volume,
|
||||||
|
|
||||||
'autoregressive-model': args.autoregressive_model,
|
'autoregressive-model': args.autoregressive_model,
|
||||||
|
'whisper-backend': args.whisper_backend,
|
||||||
'whisper-model': args.whisper_model,
|
'whisper-model': args.whisper_model,
|
||||||
'whisper-cpp': args.whisper_cpp,
|
|
||||||
|
|
||||||
'training-default-halfp': args.training_default_halfp,
|
'training-default-halfp': args.training_default_halfp,
|
||||||
'training-default-bnb': args.training_default_bnb,
|
'training-default-bnb': args.training_default_bnb,
|
||||||
|
@ -1762,6 +1775,12 @@ def unload_voicefixer():
|
||||||
def load_whisper_model(language=None, model_name=None, progress=None):
|
def load_whisper_model(language=None, model_name=None, progress=None):
|
||||||
global whisper_model
|
global whisper_model
|
||||||
|
|
||||||
|
if args.whisper_backend not in WHISPER_BACKENDS:
|
||||||
|
raise Exception(f"unavailable backend: {args.whisper_backend}")
|
||||||
|
|
||||||
|
if args.whisper_backend != "m-bain/whisperx" and model_name == "large-v2":
|
||||||
|
raise Exception("large-v2 is only available for m-bain/whisperx backend")
|
||||||
|
|
||||||
if not model_name:
|
if not model_name:
|
||||||
model_name = args.whisper_model
|
model_name = args.whisper_model
|
||||||
else:
|
else:
|
||||||
|
@ -1774,16 +1793,20 @@ def load_whisper_model(language=None, model_name=None, progress=None):
|
||||||
|
|
||||||
notify_progress(f"Loading Whisper model: {model_name}", progress)
|
notify_progress(f"Loading Whisper model: {model_name}", progress)
|
||||||
|
|
||||||
if args.whisper_cpp:
|
if args.whisper_backend == "openai/whisper":
|
||||||
|
import whisper
|
||||||
|
whisper_model = whisper.load_model(model_name)
|
||||||
|
elif args.whisper_backend == "lightmare/whispercpp":
|
||||||
from whispercpp import Whisper
|
from whispercpp import Whisper
|
||||||
if not language:
|
if not language:
|
||||||
language = 'auto'
|
language = 'auto'
|
||||||
|
|
||||||
b_lang = language.encode('ascii')
|
b_lang = language.encode('ascii')
|
||||||
whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang)
|
whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang)
|
||||||
else:
|
elif args.whisper_backend == "m-bain/whisperx":
|
||||||
import whisper
|
import whisperx
|
||||||
whisper_model = whisper.load_model(model_name)
|
device = "cuda" if get_device_name() == "cuda" else "cpu"
|
||||||
|
whisper_model = whisperx.load_model(model_name, device)
|
||||||
|
|
||||||
print("Loaded Whisper model")
|
print("Loaded Whisper model")
|
||||||
|
|
||||||
|
|
|
@ -589,10 +589,10 @@ def setup_gradio():
|
||||||
|
|
||||||
autoregressive_model_dropdown = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])
|
autoregressive_model_dropdown = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])
|
||||||
|
|
||||||
|
whisper_backend = gr.Dropdown(WHISPER_BACKENDS, label="Whisper Backends", value=args.whisper_backend)
|
||||||
whisper_model_dropdown = gr.Dropdown(WHISPER_MODELS, label="Whisper Model", value=args.whisper_model)
|
whisper_model_dropdown = gr.Dropdown(WHISPER_MODELS, label="Whisper Model", value=args.whisper_model)
|
||||||
use_whisper_cpp = gr.Checkbox(label="Use Whisper.cpp", value=args.whisper_cpp)
|
|
||||||
|
|
||||||
exec_inputs = exec_inputs + [ autoregressive_model_dropdown, whisper_model_dropdown, use_whisper_cpp, training_halfp, training_bnb ]
|
exec_inputs = exec_inputs + [ autoregressive_model_dropdown, whisper_backend, whisper_model_dropdown, training_halfp, training_bnb ]
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
autoregressive_models_update_button = gr.Button(value="Refresh Model List")
|
autoregressive_models_update_button = gr.Button(value="Refresh Model List")
|
||||||
|
@ -732,6 +732,7 @@ def setup_gradio():
|
||||||
submit_event = submit.click(run_generation,
|
submit_event = submit.click(run_generation,
|
||||||
inputs=input_settings,
|
inputs=input_settings,
|
||||||
outputs=[output_audio, source_sample, candidates_list, generation_results],
|
outputs=[output_audio, source_sample, candidates_list, generation_results],
|
||||||
|
api_name="generate",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,5 +3,6 @@ git pull
|
||||||
git submodule update --remote
|
git submodule update --remote
|
||||||
|
|
||||||
if python -m pip show whispercpp &>/dev/null; then python -m pip install -U git+https://git.ecker.tech/lightmare/whispercpp.py; fi
|
if python -m pip show whispercpp &>/dev/null; then python -m pip install -U git+https://git.ecker.tech/lightmare/whispercpp.py; fi
|
||||||
|
if python -m pip show whisperx &>/dev/null; then python -m pip install -U git+https://github.com/m-bain/whisperx.git; fi
|
||||||
|
|
||||||
deactivate
|
deactivate
|
Loading…
Reference in New Issue
Block a user