Modified how conditional latents are computed: previously, only the first 102400/24000 ≈ 4.27 seconds of each audio input were read; now the entire input is split into chunks and all of them are used to compute the latents.

2023-02-05 23:25:41 +00:00 · 2023-02-05 23:25:41 +00:00 · c2c9b1b683
commit c2c9b1b683
parent 4ea997106e
6 changed files with 82 additions and 55 deletions
--- a/README.md
+++ b/README.md
@ -115,6 +115,8 @@ To save you from headaches, I strongly recommend playing around with shorter sen
 As a quick optimization, I modified the script so that the `conditional_latents` are saved after loading voice samples, and subsequent uses will load that file directly (at the cost of not returning the `Sample voice` to the web UI). If there are voice samples with a modification time newer than this cached file, it'll skip loading it and load the normal WAVs instead.
 **!**NOTE**!**: cached `latents.pth` files generated before 2023.02.05 will be ignored, due to a change in computing the conditional latents. This *should* help bump up voice cloning quality. Apologies for the inconvenience.
 ## Example(s)
 Below are some outputs I deem substantial enough to share. As I continue delving into TorToiSe, I'll supply more examples and the values I use.
--- a/app.py
+++ b/app.py
@ -10,7 +10,9 @@ from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio, load_voice, load_voices
 from tortoise.utils.text import split_and_recombine_text
-def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, progress=gr.Progress()):
+def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, experimentals, progress=gr.Progress()):
    print(experimentals)
    if voice != "microphone":
        voices = [voice]
    else:
@ -27,7 +29,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
    if voice_samples is not None:
        sample_voice = voice_samples[0]
-        conditioning_latents = tts.get_conditioning_latents(voice_samples)
+        conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress)
        torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'latents.pth'))
        voice_samples = None
    else:
@ -54,6 +56,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
        'diffusion_sampler': diffusion_sampler,
        'breathing_room': breathing_room,
        'progress': progress,
        'half_p': "Half Precision" in experimentals,
        'cond_free': "Conditioning-Free" in experimentals,
    }
    if delimiter == "\\n":
@ -216,6 +220,8 @@ def main():
                    type="value",
                )
                experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=[False, True], label="Experimental Flags")
                preset.change(fn=update_presets,
                    inputs=preset,
                    outputs=[
@ -246,7 +252,8 @@ def main():
                        diffusion_iterations,
                        temperature,
                        diffusion_sampler,
-                        breathing_room
+                        breathing_room,
                        experimentals,
                    ],
                    outputs=[selected_voice, output_audio, usedSeed],
                )
--- a/start.bat
+++ b/start.bat
@ -1,3 +1,4 @@
 call .\tortoise-venv\Scripts\activate.bat
-py .\app.py
+python .\app.py
 deactivate
 pause
--- a/tortoise/api.py
+++ b/tortoise/api.py
@ -284,7 +284,7 @@ class TextToSpeech:
        if self.minor_optimizations:
            self.cvvp = self.cvvp.to(self.device)
-    def get_conditioning_latents(self, voice_samples, return_mels=False):
+    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, enforced_length=102400):
        """
        Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
        These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@ -303,14 +303,18 @@ class TextToSpeech:
            auto_conds = torch.stack(auto_conds, dim=1)
            diffusion_conds = []
-            for sample in voice_samples:
+
            for sample in tqdm_override(voice_samples, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
                # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
                sample = torchaudio.functional.resample(sample, 22050, 24000)
-                sample = pad_or_truncate(sample, 102400)
+                chunks = torch.chunk(sample, int(sample.shape[-1] / enforced_length) + 1, dim=1)
                cond_mel = wav_to_univnet_mel(sample.to(self.device), do_normalization=False, device=self.device)
                diffusion_conds.append(cond_mel)
            diffusion_conds = torch.stack(diffusion_conds, dim=1)
                for chunk in chunks:
                    chunk = pad_or_truncate(chunk, enforced_length)
                    cond_mel = wav_to_univnet_mel(chunk.to(self.device), do_normalization=False, device=self.device)
                    diffusion_conds.append(cond_mel)
            diffusion_conds = torch.stack(diffusion_conds, dim=1)
            if self.minor_optimizations:
                auto_latent = self.autoregressive.get_conditioning(auto_conds)
@ -372,6 +376,7 @@ class TextToSpeech:
            diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
            diffusion_sampler="P",
            breathing_room=8,
            half_p=False,
            progress=None,
            **hf_generate_kwargs):
        """
@ -446,6 +451,7 @@ class TextToSpeech:
            if not self.minor_optimizations:
                self.autoregressive = self.autoregressive.to(self.device)
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
                for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
                    codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                                 do_sample=True,
@ -462,6 +468,7 @@ class TextToSpeech:
            clip_results = []
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
                if not self.minor_optimizations:
                    self.autoregressive = self.autoregressive.cpu()
                    self.clvp = self.clvp.to(self.device)
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@ -108,8 +108,11 @@ def load_voice(voice, extra_voice_dirs=[], load_latents=True):
    voices = []
    latent = None
    for file in paths:
-        if file[-4:] == ".pth":
+        if file == "cond_latents.pth":
            latent = file
        elif file[-4:] == ".pth":
            {}
            # noop
        else:
            voices.append(file)
            mtime = max(mtime, os.path.getmtime(file))
--- a/update.bat
+++ b/update.bat
@ -0,0 +1,7 @@
 git pull
 python -m venv tortoise-venv
 call .\tortoise-venv\Scripts\activate.bat
 python -m pip install --upgrade pip
 python -m pip install -r ./requirements.txt
 deactivate
 pause