forked from mrq/tortoise-tts
added button to recalculate voice latents, added experimental switch for computing voice latents
parent 88529fda43 · commit 4d01bbd429
@@ -158,6 +158,8 @@ However, keep in mind how you combine/separate your clips; depending on the mode
+* you might suffer from reduced throughput, as the smallest voice file will be used as the size of best fit
+* a voice might get split mid-word, affecting how the latents are computed, as each batch is averaged together
 
 For safety, try to keep your clips around the same length, or increase your `Voice Latents Max Chunk Size` if the console output warns that the best fit size exceeds it.
 
 If you're looking to trim your clips, in my opinion, ~~Audacity~~ Tenacity works well enough, as you can easily output your clips in the proper format (22050 Hz sampling rate).
 
 Power users with FFMPEG already installed can simply use the provided conversion script in `.\convert\`.
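For reference, the conversion the script automates can also be done by hand. Below is a minimal Python sketch using `torchaudio`; it is an illustrative alternative, and the actual script's flags and filenames may differ:

```python
import torchaudio

# Illustrative stand-in for the provided conversion script: resample a
# clip to the 22050 Hz sampling rate the voices folder expects.
# Assumes a clip.wav exists next to the script.
waveform, sample_rate = torchaudio.load("clip.wav")
resampled = torchaudio.functional.resample(waveform, sample_rate, 22050)
torchaudio.save("clip_22050.wav", resampled, 22050)
```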
@@ -182,6 +184,11 @@ You'll be presented with a bunch of options in the default `Generate` tab, but d
 * `Custom Emotion + Prompt`: a non-preset "emotion" used for the delivery. This is a shortcut to utilizing "prompt engineering" by starting your prompt with `[<emotion>]`.
 * `Voice`: the voice you want to clone. You can select `microphone` if you want to use input from your microphone.
 * `Microphone Source`: use your own voice from a line-in source.
 * `Reload Voice List`: refreshes the voice list. ***Click this*** after adding or removing a voice.
+* `(Re)Compute Voice Latents`: regenerates a voice's cached latents.
+* `Experimental Compute Latents Mode`: this mode will combine all voice samples into one file, then split it evenly (if it is under the maximum allowed chunk size set in `Settings`)
 
 Below is a list of generation settings:
 * `Candidates`: the number of outputs to generate, starting from the best candidate. Depending on your iteration steps, generating the final sound files can be cheap, but the extra candidates only offer alternatives pulled from the same generated samples (in other words, later candidates perform worse), so don't feel compelled to generate a ton of them.
 * `Seed`: initializes the PRNG to this value. Use this if you want to reproduce a generated voice.
 * `Preset`: shortcut values for sample count and iteration steps. Clicking a preset will update its corresponding values. Higher presets result in better quality at the cost of computation time. (See the sketch after this hunk.)
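To make the `Preset` bullet concrete: a preset is just a named bundle of sampler values. Only the `Ultra Fast` row below is taken from the `update_presets` hunk further down this page; `apply_preset` is a hypothetical helper sketching the shape of the mapping, not the repo's actual function body:

```python
# Sketch: a preset is a named bundle of generation settings.
PRESETS = {
    'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
}

def apply_preset(name):
    # Hypothetical helper: clicking a preset rewrites the corresponding
    # sliders with these values.
    preset = PRESETS[name]
    return preset['num_autoregressive_samples'], preset['diffusion_iterations']

print(apply_preset('Ultra Fast'))  # (16, 30)
```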
@@ -294,7 +294,7 @@ class TextToSpeech:
         if self.preloaded_tensors:
             self.cvvp = self.cvvp.to(self.device)
 
-    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, chunk_size=None, max_chunk_size=None, chunk_tensors=True, calculation_mode=0):
+    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, chunk_size=None, max_chunk_size=None, chunk_tensors=True, calculation_mode=1):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
         These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@@ -339,7 +339,6 @@ class TextToSpeech:
         diffusion_conds = []
         chunks = []
 
 
         # new behavior: combine all samples, and divide accordingly
-        # doesn't work, need to fix
         if calculation_mode == 1:
@@ -349,9 +348,9 @@ class TextToSpeech:
 
             if max_chunk_size is not None and chunk_size > max_chunk_size:
                 while chunk_size > max_chunk_size:
-                    chunk_size = chunk_size / 2
+                    chunk_size = int(chunk_size / 2)
 
-            print(f"Size of best fit: {chunk_size}")
+            print(f"Using method 1: size of best fit: {chunk_size}")
             chunks = torch.chunk(concat, int(concat.shape[-1] / chunk_size) + 1, dim=1)
         # default new behavior: use the smallest voice sample as a common chunk size
         else:
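To make the mode-1 arithmetic concrete, here is a self-contained sketch, assuming the best fit starts at the full concatenated length (the sizes are made up). The even split is also why a voice can get cut mid-word, as the README hunk above warns:

```python
import torch

max_chunk_size = 100_000          # stands in for `Voice Latents Max Chunk Size`
concat = torch.randn(1, 560_000)  # all voice samples concatenated along dim 1

# Halve the best-fit size until it drops under the cap, as the hunk above does.
chunk_size = concat.shape[-1]
while max_chunk_size is not None and chunk_size > max_chunk_size:
    chunk_size = int(chunk_size / 2)

# 560000 -> 280000 -> 140000 -> 70000, giving int(560000 / 70000) + 1 = 9 chunks.
chunks = torch.chunk(concat, int(concat.shape[-1] / chunk_size) + 1, dim=1)
print(chunk_size, len(chunks))    # 70000 9
```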
@@ -362,7 +361,7 @@ class TextToSpeech:
             else:
                 chunk_size = sample.shape[-1] if chunk_size is None else max( chunk_size, sample.shape[-1] )
 
-            print(f"Size of best fit: {chunk_size}")
+            print(f"Using method 0: size of best fit: {chunk_size}")
             if max_chunk_size is not None and chunk_size > max_chunk_size:
                 chunk_size = max_chunk_size
                 print(f"Chunk size exceeded, clamping to: {max_chunk_size}")
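Mode 0 instead derives the best fit from the individual sample lengths, then clamps it. Note that the expression takes `max(...)`, so as written the largest sample sets the best fit, despite the neighbouring comment (and the README bullet) about the smallest sample; whether that is intended isn't clear from this hunk alone. A sketch with made-up lengths:

```python
import torch

samples = [torch.randn(1, n) for n in (120_000, 80_000, 150_000)]
max_chunk_size = 100_000

# Mirror the hunk above: take each sample's length, keeping the running max.
chunk_size = None
for sample in samples:
    if chunk_size is None:
        chunk_size = sample.shape[-1]
    else:
        chunk_size = max(chunk_size, sample.shape[-1])

print(f"Using method 0: size of best fit: {chunk_size}")  # 150000
if max_chunk_size is not None and chunk_size > max_chunk_size:
    chunk_size = max_chunk_size
    print(f"Chunk size exceeded, clamping to: {max_chunk_size}")
```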
webui.py (30 changed lines)
@@ -305,6 +305,27 @@ def generate(
         stats,
     )
 
+def compute_latents(voice, mode, progress=gr.Progress(track_tqdm=True)):
+    global tts
+    try:
+        tts
+    except NameError:
+        raise gr.Error("TTS is still initializing...")
+
+    voice_samples, conditioning_latents = load_voice(voice, load_latents=False)
+
+    if voice_samples is None:
+        return
+
+    conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size, calculation_mode=1 if mode else 0)
+
+    if len(conditioning_latents) == 4:
+        conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
+
+    torch.save(conditioning_latents, f'{get_voice_dir()}/{voice}/cond_latents.pth')
+
+    return voice
+
 def update_presets(value):
     PRESETS = {
         'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
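As a usage note: `compute_latents` persists the (possibly truncated) tuple to `cond_latents.pth`, so later loads can skip recomputation. A hypothetical round-trip, with a made-up voices path; per the docstring hunk above, the first two elements are the autoregressive and diffusion conditioning latents:

```python
import torch

# Hypothetical path; compute_latents() writes to
# f'{get_voice_dir()}/{voice}/cond_latents.pth'.
latents = torch.load('./voices/myvoice/cond_latents.pth')
autoregressive_latent, diffusion_latent = latents[0], latents[1]
print(autoregressive_latent.shape, diffusion_latent.shape)
```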
@@ -467,7 +488,7 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
 
     settings = {
         'share': args.share,
-        'listen': args.listen,
+        'listen': None if args.listen else args.listen,
         'low-vram':args.low_vram,
         'check-for-updates':args.check_for_updates,
         'models-from-local-only':args.models_from_local_only,
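An aside on the new `'listen'` line: `None if args.listen else args.listen` can never yield a truthy value, so the exported setting is always falsy:

```python
# Every possible value collapses to something falsy:
for listen in ("0.0.0.0:8000", "", None):
    print(repr(None if listen else listen))  # None, '', None
```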
@@ -613,6 +634,13 @@ def setup_gradio():
         inputs=None,
         outputs=voice
     )
+    gr.Button(value="(Re)Compute Voice Latents").click(compute_latents,
+        inputs=[
+            voice,
+            gr.Checkbox(label="Experimental Compute Voice Latents Mode", value=True)
+        ],
+        outputs=voice,
+    )
 
     prompt.change(fn=lambda value: gr.update(value="Custom"),
         inputs=prompt,
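The wiring pattern above, in isolation, as a minimal runnable Gradio sketch (the component types are illustrative; in the webui, `voice` is a selector rather than a textbox):

```python
import gradio as gr

def recompute(voice, experimental):
    # Stand-in for compute_latents(): return the voice so the output
    # component refreshes, mirroring the hunk above.
    print(f"recomputing latents for {voice} (experimental={experimental})")
    return voice

with gr.Blocks() as demo:
    voice = gr.Textbox(label="Voice", value="myvoice")
    experimental = gr.Checkbox(label="Experimental Compute Voice Latents Mode", value=True)
    gr.Button(value="(Re)Compute Voice Latents").click(recompute,
        inputs=[voice, experimental],
        outputs=voice,
    )

demo.launch()
```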