From c5337a6b512de845cbac0780073aa73d1e93e537 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sat, 11 Feb 2023 15:02:11 +0000
Subject: [PATCH] Added integration for "voicefixer", fixed an issue where
 candidates>1 and lines>1 only output the last combined candidate, numbered
 each generation step in the progress output, and added the time taken per
 generation step

---
 README.md                   |  1 +
 requirements.txt            |  3 +-
 tortoise/api.py             |  2 +-
 tortoise/utils/diffusion.py |  2 +-
 webui.py                    | 86 ++++++++++++++++++++++++------------
 5 files changed, 60 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index 1150582..362de82 100755
--- a/README.md
+++ b/README.md
@@ -217,6 +217,7 @@ Below are settings that override the default launch arguments. Some of these req
 * `Low VRAM`: disables optimizations in TorToiSe that increases VRAM consumption. Suggested if your GPU has under 6GiB.
 * `Embed Output Metadata`: enables embedding the settings and latents used to generate that audio clip inside that audio clip. Metadata is stored as a JSON string in the `lyrics` tag.
 * `Slimmer Computed Latents`: falls back to the original, 12.9KiB way of storing latents (without the extra bits required for using the CVVP model).
+* `Voice Fixer`: runs each generated audio clip through `voicefixer`, if the module is installed.
 * `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
 * `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave to 0 to calculate a "best" fit.
 * `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that updates other settings while generating audio clips.
diff --git a/requirements.txt b/requirements.txt
index e90842c..56b5906 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,5 @@ numpy
 numba
 gradio
 music-tag
-k-diffusion
\ No newline at end of file
+k-diffusion
+voicefixer
\ No newline at end of file
diff --git a/tortoise/api.py b/tortoise/api.py
index b7c799a..24cd30d 100755
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -61,7 +61,7 @@ def tqdm_override(arr, verbose=False, progress=None, desc=None):
 	if progress is None:
 		return tqdm(arr, disable=not verbose)
 
-	return progress.tqdm(arr, desc=desc, track_tqdm=True)
+	return progress.tqdm(arr, desc=f'{progress.msg_prefix} {desc}' if hasattr(progress, 'msg_prefix') else desc, track_tqdm=True)
 
 def download_models(specific_models=None):
 	"""
diff --git a/tortoise/utils/diffusion.py b/tortoise/utils/diffusion.py
index 3f20689..c706416 100755
--- a/tortoise/utils/diffusion.py
+++ b/tortoise/utils/diffusion.py
@@ -21,7 +21,7 @@ def tqdm_override(arr, verbose=False, progress=None, desc=None):
 	if progress is None:
 		return tqdm(arr, disable=not verbose)
 
-	return progress.tqdm(arr, desc=desc, track_tqdm=True)
+	return progress.tqdm(arr, desc=f'{progress.msg_prefix} {desc}' if hasattr(progress, 'msg_prefix') else desc, track_tqdm=True)
 
 def normal_kl(mean1, logvar1, mean2, logvar2):
 	"""
diff --git a/webui.py b/webui.py
index 73390b7..ebf3b43 100755
--- a/webui.py
+++ b/webui.py
@@ -20,6 +20,8 @@ from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir
 from tortoise.utils.text import split_and_recombine_text
 
+voicefixer = None
+
 def generate(
 	text,
 	delimiter,
@@ -51,7 +53,6 @@ def generate(
 	except NameError:
 		raise gr.Error("TTS is still initializing...")
 
-
 	if voice != "microphone":
 		voices = [voice]
 	else:
@@ -128,14 +129,17 @@ def generate(
 
 	audio_cache = {}
 
-	resampler = torchaudio.transforms.Resample(
-		tts.output_sample_rate,
-		args.output_sample_rate,
-		lowpass_filter_width=16,
-		rolloff=0.85,
-		resampling_method="kaiser_window",
-		beta=8.555504641634386,
-	) if tts.output_sample_rate != args.output_sample_rate else None
+	resampler = None
+	# not a ternary, in case this ever needs to fall back to librosa's upsampling interpolator instead of torchaudio's
+	if tts.output_sample_rate != args.output_sample_rate:
+		resampler = torchaudio.transforms.Resample(
+			tts.output_sample_rate,
+			args.output_sample_rate,
+			lowpass_filter_width=16,
+			rolloff=0.85,
+			resampling_method="kaiser_window",
+			beta=8.555504641634386,
+		)
 
 	volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None
 
@@ -147,11 +151,10 @@ def generate(
 	idx = idx + 1
 
 	def get_name(line=0, candidate=0, combined=False):
-		if combined:
-			return f"{idx}_combined"
-
 		name = f"{idx}"
-		if len(texts) > 1:
+		if combined:
+			name = f"{name}_combined"
+		elif len(texts) > 1:
 			name = f"{name}_{line}"
 		if candidates > 1:
 			name = f"{name}_{candidate}"
@@ -164,12 +167,14 @@ def generate(
 		else:
 			cut_text = f"[I am really {emotion.lower()},] {cut_text}"
 
-		print(f"[{str(line+1)}/{str(len(texts))}] Generating line: {cut_text}")
+		progress.msg_prefix = f'[{str(line+1)}/{str(len(texts))}]'
+		print(f"{progress.msg_prefix} Generating line: {cut_text}")
 
 		start_time = time.time()
 		gen, additionals = tts.tts(cut_text, **settings )
 		seed = additionals[0]
 		run_time = time.time()-start_time
+		print(f"Generating line took {run_time} seconds")
 
 		if isinstance(gen, list):
 			for j, g in enumerate(gen):
@@ -203,15 +208,11 @@ def generate(
 
 		for candidate in range(candidates):
 			audio_clips = []
 			for line in range(len(texts)):
-				if isinstance(gen, list):
-					name = get_name(line=line, candidate=candidate)
-					audio = audio_cache[name]['audio']
-				else:
-					name = get_name(line=line)
-					audio = audio_cache[name]['audio']
+				name = get_name(line=line, candidate=candidate)
+				audio = audio_cache[name]['audio']
 				audio_clips.append(audio)
 
-			name = get_name(combined=True)
+			name = get_name(candidate=candidate, combined=True)
 			audio = torch.cat(audio_clips, dim=-1)
 			torchaudio.save(f'{outdir}/{voice}_{name}.wav', audio, args.output_sample_rate)
@@ -225,16 +226,10 @@ def generate(
 			output_voices.append(f'{outdir}/{voice}_{name}.wav')
 			if output_voice is None:
 				output_voice = f'{outdir}/{voice}_{name}.wav'
-				# output_voice = audio
 	else:
-		if candidates > 1:
-			for candidate in range(candidates):
-				name = get_name(candidate=candidate)
-				output_voices.append(f'{outdir}/{voice}_{name}.wav')
-		else:
-			name = get_name()
+		for candidate in range(candidates):
+			name = get_name(candidate=candidate)
 			output_voices.append(f'{outdir}/{voice}_{name}.wav')
-		#output_voice = f'{outdir}/{voice}_{name}.wav'
 
 	info = {
 		'text': text,
@@ -267,8 +262,21 @@ def generate(
 
 		with open(f'{get_voice_dir()}/{voice}/cond_latents.pth', 'rb') as f:
 			info['latents'] = base64.b64encode(f.read()).decode("ascii")
 
+	if voicefixer:
+		# we could do this on the pieces before they get stitched up anyway, to save some compute
+		# but the stitching would need to read back from disk, defeating the point of caching the waveform
+		for path in progress.tqdm(audio_cache, desc="Running voicefix..."):
+			print("VoiceFix starting")
+			voicefixer.restore(
+				input=f'{outdir}/{voice}_{path}.wav',
+				output=f'{outdir}/{voice}_{path}.wav',
+				#cuda=False,
+				#mode=mode,
+			)
+			print("VoiceFix finished")
+
 	if args.embed_output_metadata:
-		for path in audio_cache:
+		for path in progress.tqdm(audio_cache, desc="Embedding metadata..."):
 			info['text'] = audio_cache[path]['text']
 			info['time'] = audio_cache[path]['time']
@@ -438,7 +446,7 @@ def cancel_generate():
 def update_voices():
 	return gr.Dropdown.update(choices=sorted(os.listdir(get_voice_dir())) + ["microphone"])
 
-def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
+def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
 	args.share = share
 	args.listen = listen
 	args.low_vram = low_vram
@@ -448,6 +456,7 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
 	args.sample_batch_size = sample_batch_size
 	args.embed_output_metadata = embed_output_metadata
 	args.latents_lean_and_mean = latents_lean_and_mean
+	args.voice_fixer = voice_fixer
 	args.concurrency_count = concurrency_count
 	args.output_sample_rate = output_sample_rate
 	args.output_volume = output_volume
@@ -462,6 +471,7 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
 		'sample-batch-size': args.sample_batch_size,
 		'embed-output-metadata': args.embed_output_metadata,
 		'latents-lean-and-mean': args.latents_lean_and_mean,
+		'voice-fixer': args.voice_fixer,
 		'concurrency-count': args.concurrency_count,
 		'output-sample-rate': args.output_sample_rate,
 		'output-volume': args.output_volume,
@@ -480,6 +490,7 @@ def setup_args():
 		'sample-batch-size': None,
 		'embed-output-metadata': True,
 		'latents-lean-and-mean': True,
+		'voice-fixer': True,
 		'cond-latent-max-chunk-size': 1000000,
 		'concurrency-count': 2,
 		'output-sample-rate': 44100,
@@ -500,6 +511,7 @@ def setup_args():
 	parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage")
 	parser.add_argument("--no-embed-output-metadata", action='store_false', default=not default_arguments['embed-output-metadata'], help="Disables embedding output metadata into resulting WAV files for easily fetching its settings used with the web UI (data is stored in the lyrics metadata tag)")
 	parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.")
+	parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses the Python module 'voicefixer' to improve audio quality, if available.")
 	parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
 	parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
 	parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
@@ -526,6 +538,17 @@ def setup_args():
 
 def setup_tortoise():
 	global args
+	global voicefixer
+
+	if args.voice_fixer:
+		try:
+			from voicefixer import VoiceFixer
+			print("Initializing voice-fixer")
+			voicefixer = VoiceFixer()
+			print("Initialized voice-fixer")
+		except Exception as e:
+			print(f"Failed to initialize voice-fixer: {e}")
+
 	print("Initializating TorToiSe...")
 	tts = TextToSpeech(minor_optimizations=not args.low_vram)
 	print("TorToiSe initialized, ready for generation.")
@@ -736,6 +759,7 @@ def setup_gradio():
 					gr.Checkbox(label="Low VRAM", value=args.low_vram),
 					gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata),
 					gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean),
+					gr.Checkbox(label="Voice Fixer", value=args.voice_fixer),
 				]
 				gr.Button(value="Check for Updates").click(check_for_updates)
 				gr.Button(value="Reload TTS").click(reload_tts)
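
For reference, a minimal standalone sketch of the post-processing pass this patch wires into `generate()`. The WAV path below is a placeholder; the `VoiceFixer()` constructor and the `restore()` keyword arguments follow the usage shown in the diff above (assumes `pip install voicefixer`):

```python
# Sketch of the voicefixer pass added to webui.py, under the assumptions above.
from voicefixer import VoiceFixer

# Load the restoration models once, the way setup_tortoise() does at startup.
voicefixer = VoiceFixer()

# restore() reads a WAV, runs speech restoration over it, and writes the result;
# writing back to the input path mirrors the in-place fix in generate().
voicefixer.restore(
	input='./results/voice_1.wav',   # placeholder for an audio_cache entry
	output='./results/voice_1.wav',
	cuda=False,  # set True to run restoration on the GPU
	mode=0,      # the patch leaves `mode` commented out; 0 is the module's default
)
```

Keeping a single module-level `VoiceFixer` instance means the restoration models load once at startup instead of once per generated clip, which is why `setup_tortoise()` initializes it behind `args.voice_fixer`.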