forked from mrq/ai-voice-cloning
added option to specify using BigVGAN as the vocoder for mrq/tortoise-tts
This commit is contained in:
parent 68f4858ce9
commit 740b5587df
@@ -1239,6 +1239,7 @@ def setup_args():
         'defer-tts-load': False,
         'device-override': None,
         'prune-nonfinal-outputs': True,
+        'use-bigvgan-vocoder': True,
         'concurrency-count': 2,
         'output-sample-rate': 44100,
         'output-volume': 1,

@@ -1274,6 +1275,7 @@ def setup_args():
     parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantly OOM on low chunk counts)")
     parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model")
     parser.add_argument("--prune-nonfinal-outputs", default=default_arguments['prune-nonfinal-outputs'], action='store_true', help="Deletes non-final output files on completing a generation")
+    parser.add_argument("--use-bigvgan-vocoder", default=default_arguments['use-bigvgan-vocoder'], action='store_true', help="Uses BigVGAN in place of the default vocoder")
     parser.add_argument("--device-override", default=default_arguments['device-override'], help="A device string to override pass through Torch")
     parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass")
     parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")

@@ -1314,7 +1316,7 @@ def setup_args():
 
     return args
 
-def update_args( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, defer_tts_load, prune_nonfinal_outputs, device_override, sample_batch_size, concurrency_count, output_sample_rate, output_volume, autoregressive_model, whisper_model, whisper_cpp, training_default_halfp, training_default_bnb ):
+def update_args( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, defer_tts_load, prune_nonfinal_outputs, use_bigvgan_vocoder, device_override, sample_batch_size, concurrency_count, output_sample_rate, output_volume, autoregressive_model, whisper_model, whisper_cpp, training_default_halfp, training_default_bnb ):
     global args
 
     args.listen = listen

@@ -1325,6 +1327,7 @@ def update_args( listen, share, check_for_updates, models_from_local_only, low_v
     args.force_cpu_for_conditioning_latents = force_cpu_for_conditioning_latents
     args.defer_tts_load = defer_tts_load
     args.prune_nonfinal_outputs = prune_nonfinal_outputs
+    args.use_bigvgan_vocoder = use_bigvgan_vocoder
     args.device_override = device_override
     args.sample_batch_size = sample_batch_size
     args.embed_output_metadata = embed_output_metadata

@@ -1355,6 +1358,7 @@ def save_args_settings():
         'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
         'defer-tts-load': args.defer_tts_load,
         'prune-nonfinal-outputs': args.prune_nonfinal_outputs,
+        'use-bigvgan-vocoder': args.use_bigvgan_vocoder,
         'device-override': args.device_override,
         'sample-batch-size': args.sample_batch_size,
         'embed-output-metadata': args.embed_output_metadata,

@@ -1469,7 +1473,7 @@ def load_tts( restart=False, model=None ):
 
     tts_loading = True
     try:
-        tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=args.autoregressive_model)
+        tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=args.autoregressive_model, use_bigvgan=args.use_bigvgan_vocoder)
     except Exception as e:
         tts = TextToSpeech(minor_optimizations=not args.low_vram)
         load_autoregressive_model(args.autoregressive_model)

@@ -568,6 +568,7 @@ def setup_gradio():
         gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents),
         gr.Checkbox(label="Do Not Load TTS On Startup", value=args.defer_tts_load),
         gr.Checkbox(label="Delete Non-Final Output", value=args.prune_nonfinal_outputs),
+        gr.Checkbox(label="Use BigVGAN Vocoder", value=args.use_bigvgan_vocoder),
         gr.Textbox(label="Device Override", value=args.device_override),
     ]
     with gr.Column():

@@ -1 +1 @@
-Subproject commit a9de01623044b3a5b2d614f0d7992a962b16b32f
+Subproject commit aca32a71f798ebd8487c113d41d1b4e9ee15c315