From 6d8c2dd459f8945fdf8d77c09cbb3efea0de495c Mon Sep 17 00:00:00 2001
From: mrq
Date: Fri, 3 Mar 2023 21:13:48 +0000
Subject: [PATCH] auto-suggested voice chunk size is now based on the total
 duration of the voice files divided by 10 seconds; added a setting to adjust
 the auto-suggested division factor (a really oddly worded one), because I'm
 sure people will OOM blindly generating without adjusting this slider

---
 src/utils.py | 26 +++++++++++++++++++++++---
 src/webui.py | 28 ++++++++++++++--------------
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 3ad4eb1..f6f88a3 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -454,6 +454,22 @@ def hash_file(path, algo="md5", buffer_size=0):
 
     return "{0}".format(hash.hexdigest())
 
+def update_baseline_for_latents_chunks( voice ):
+    path = f'{get_voice_dir()}/{voice}/'
+    if not os.path.isdir(path):
+        return 1
+
+    files = os.listdir(path)
+    total_duration = 0
+    for file in files:
+        if file[-4:] != ".wav":
+            continue
+        metadata = torchaudio.info(f'{path}/{file}')
+        duration = metadata.num_frames / metadata.sample_rate
+        total_duration += duration
+
+    return int(total_duration / args.autocalculate_voice_chunk_duration_size) if total_duration > 0 else 1
+
 def compute_latents(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm=True)):
     global tts
     global args
@@ -1244,6 +1260,7 @@ def setup_args():
         'prune-nonfinal-outputs': True,
         'use-bigvgan-vocoder': True,
         'concurrency-count': 2,
+        'autocalculate-voice-chunk-duration-size': 10,
         'output-sample-rate': 44100,
         'output-volume': 1,
 
@@ -1282,6 +1299,7 @@ def setup_args():
     parser.add_argument("--device-override", default=default_arguments['device-override'], help="A device string to override pass through Torch")
     parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass")
     parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
+    parser.add_argument("--autocalculate-voice-chunk-duration-size", type=float, default=default_arguments['autocalculate-voice-chunk-duration-size'], help="Seconds of audio per suggested voice chunk (for example, 100 seconds of audio at 10 seconds per chunk will suggest 10 chunks)")
     parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
     parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
@@ -1321,7 +1339,7 @@ def setup_args():
 
     return args
 
-def update_args( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, defer_tts_load, prune_nonfinal_outputs, use_bigvgan_vocoder, device_override, sample_batch_size, concurrency_count, output_sample_rate, output_volume, autoregressive_model, whisper_model, whisper_cpp, training_default_halfp, training_default_bnb ):
+def update_args( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, defer_tts_load, prune_nonfinal_outputs, use_bigvgan_vocoder, device_override, sample_batch_size, concurrency_count, autocalculate_voice_chunk_duration_size, output_volume, autoregressive_model, whisper_model, whisper_cpp, training_default_halfp, training_default_bnb ):
     global args
 
     args.listen = listen
@@ -1340,7 +1358,8 @@ def update_args( listen, share, check_for_updates, models_from_local_only, low_v
     args.voice_fixer = voice_fixer
     args.voice_fixer_use_cuda = voice_fixer_use_cuda
     args.concurrency_count = concurrency_count
-    args.output_sample_rate = output_sample_rate
+    args.output_sample_rate = 44100
+    args.autocalculate_voice_chunk_duration_size = autocalculate_voice_chunk_duration_size
     args.output_volume = output_volume
 
     args.autoregressive_model = autoregressive_model
@@ -1372,6 +1391,7 @@ def save_args_settings():
         'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
         'concurrency-count': args.concurrency_count,
         'output-sample-rate': args.output_sample_rate,
+        'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size,
         'output-volume': args.output_volume,
 
         'autoregressive-model': args.autoregressive_model,
@@ -1481,7 +1501,7 @@ def load_tts( restart=False, model=None ):
         tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=args.autoregressive_model)
     except Exception as e:
         tts = TextToSpeech(minor_optimizations=not args.low_vram)
-        update_autoregressive_model(args.autoregressive_model)
+        load_autoregressive_model(args.autoregressive_model)
 
     if not hasattr(tts, 'autoregressive_model_hash'):
         tts.autoregressive_model_hash = hash_file(tts.autoregressive_model_path)
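For illustration, the new suggestion logic boils down to "total seconds of .wav audio, divided by seconds per chunk". Below is a minimal standalone sketch of that calculation, assuming only that torchaudio is installed; VOICE_DIR, SECONDS_PER_CHUNK, and suggest_chunk_count are hypothetical stand-ins for get_voice_dir()/<voice>, args.autocalculate_voice_chunk_duration_size, and the patched update_baseline_for_latents_chunks, not part of the patch itself:

    import os
    import torchaudio

    VOICE_DIR = "./voices/example"   # hypothetical voice folder
    SECONDS_PER_CHUNK = 10.0         # mirrors the setting's default of 10

    def suggest_chunk_count(path=VOICE_DIR, seconds_per_chunk=SECONDS_PER_CHUNK):
        # A missing (or empty) voice folder falls back to a single chunk.
        if not os.path.isdir(path):
            return 1
        total_duration = 0.0
        for file in os.listdir(path):
            if not file.endswith(".wav"):
                continue
            metadata = torchaudio.info(os.path.join(path, file))
            # Duration in seconds is frames divided by sample rate.
            total_duration += metadata.num_frames / metadata.sample_rate
        # e.g. 100 seconds of audio at 10 seconds per chunk suggests 10 chunks.
        return int(total_duration / seconds_per_chunk) if total_duration > 0 else 1

With 100 seconds of audio and the default of 10 seconds per chunk, this returns 10, matching the example in the new flag's help text; the flag exists so the divisor can be raised or lowered before VRAM runs out.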
diff --git a/src/webui.py b/src/webui.py
index 12eb476..396bde1 100755
--- a/src/webui.py
+++ b/src/webui.py
@@ -385,19 +385,6 @@ def setup_gradio():
                 refresh_voices = gr.Button(value="Refresh Voice List")
                 recompute_voice_latents = gr.Button(value="(Re)Compute Voice Latents")
 
-            def update_baseline_for_latents_chunks( voice ):
-                path = f'{get_voice_dir()}/{voice}/'
-                if not os.path.isdir(path):
-                    return 1
-
-                files = os.listdir(path)
-                count = 0
-                for file in files:
-                    if file[-4:] == ".wav":
-                        count += 1
-
-                return count if count > 0 else 1
-
             voice.change(
                 fn=update_baseline_for_latents_chunks,
                 inputs=voice,
@@ -575,7 +562,7 @@ def setup_gradio():
             exec_inputs = exec_inputs + [
                 gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size),
                 gr.Number(label="Gradio Concurrency Count", precision=0, value=args.concurrency_count),
-                gr.Number(label="Output Sample Rate", precision=0, value=args.output_sample_rate),
+                gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size),
                 gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume),
             ]
 
@@ -594,6 +581,7 @@ def setup_gradio():
             inputs=autoregressive_model_dropdown,
             outputs=None
         )
+        # kill_button = gr.Button(value="Close UI")
 
         def update_model_list_proxy( val ):
             autoregressive_models = get_autoregressive_models()
@@ -814,6 +802,18 @@ def setup_gradio():
         outputs=save_yaml_output #console_output
     )
 
+    """
+    def kill_process():
+        ui.close()
+        exit()
+
+    kill_button.click(
+        kill_process,
+        inputs=None,
+        outputs=None
+    )
+    """
+
     if os.path.isfile('./config/generate.json'):
         ui.load(import_generate_settings, inputs=None, outputs=input_settings)
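For illustration, here is a minimal sketch of the voice.change wiring the webui hunk above relies on, assuming only that gradio is installed; the component names and choices are hypothetical, and suggest_chunk_count is stubbed so the demo runs standalone rather than reproducing the webui's actual layout:

    import gradio as gr

    # Stub for the duration-based helper sketched after the utils.py diff
    # above; the real webui calls update_baseline_for_latents_chunks instead.
    def suggest_chunk_count(path):
        return 1

    def on_voice_change(voice_name):
        # Recompute the suggestion whenever a different voice is picked.
        return suggest_chunk_count(f"./voices/{voice_name}")

    with gr.Blocks() as demo:
        voice = gr.Dropdown(choices=["example"], label="Voice")
        voice_latents_chunks = gr.Number(label="Voice Chunks", precision=0, value=1)
        # The Number is the event's output, so the suggested chunk count
        # appears as soon as a voice is selected.
        voice.change(
            fn=on_voice_change,
            inputs=voice,
            outputs=voice_latents_chunks,
        )

    # demo.launch()

Moving update_baseline_for_latents_chunks out of webui.py and into utils.py keeps the event wiring unchanged while letting the suggestion read args.autocalculate_voice_chunk_duration_size alongside the other settings.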