From 4274cce218abb0b462e7d10cd1c873c6ddd7d926 Mon Sep 17 00:00:00 2001
From: mrq <barry.quiggles@protonmail.com>
Date: Sat, 4 Feb 2023 01:50:57 +0000
Subject: [PATCH] Added small optimization with caching latents, dropped
 Anaconda for just a py3.9 + pip + venv setup, added helper install scripts
 for such, cleaned up app.py, added flag '--low-vram' to disable minor
 optimizations

---
 README.md               |  18 ++++---
 app.py                  | 116 +++++++++++++++++++---------------------
 requirements.txt        |   2 +
 start.bat               |   3 ++
 tortoise/api.py         |  77 ++++++++++++++++++--------
 tortoise/utils/audio.py |  32 +++++++----
 6 files changed, 152 insertions(+), 96 deletions(-)
 create mode 100755 start.bat
 mode change 100644 => 100755 tortoise/utils/audio.py
diff --git a/README.md b/README.md
index 1dffedf..3873e58 100755
--- a/README.md
+++ b/README.md
@@ -26,21 +26,25 @@ Lots of available RAM seems to be a requirement, as I see Python eating up 8GiB
 
 ### Pre-Requirements
 
-Anaconda: https://www.anaconda.com/products/distribution
+Python 3.9: https://www.python.org/downloads/release/python-3913/
 
 Git (optional): https://git-scm.com/download/win
 
 ### Setup
 
-Download Anaconda and run the installer.
+Download Python and run the installer.
 
-After installing `conda`, open the Start Menu and search for `Anaconda Powershell Prompt`. Type `cd `, then drag and drop the folder you want to work in (experienced users can just `cd <path>` directly).
+After installing Python, open the Start Menu and search for `Command Prompt`. Type `cd `, then drag and drop the folder you want to work in (experienced users can just `cd <path>` directly).
 
 Paste `git clone https://git.ecker.tech/mrq/tortoise-tts` to download TorToiSe and additional scripts. Inexperienced users can just download the repo as a ZIP, and extract.
 
 Then move into that folder with `cd tortoise-tts`. Afterwards, enter `setup.bat` to automatically enter all the remaining commands.
 
-If you've done everything right with installing Anaconda, you shouldn't have any errors.
+If you've done everything right, you shouldn't have any errors.
+
+### Updating
+
+To check for and download updates to the Web UI, simply enter `git pull` in the command prompt while the TorToiSe workspace is the current working directory.
 
 ## Preparing Voice Samples
 
@@ -64,7 +68,7 @@ After preparing your clips as WAV files at a sample rate of 22050 Hz, open up th
 
 ## Using the Software
 
-Now you're ready to generate clips. With the `conda` prompt still open, simply run the web UI with `python app.py`, and wait for it to print out a URL to open in your browser, something like `http://127.0.0.1:7861`.
+Now you're ready to generate clips. With the command prompt still open, simply enter `start.bat`, and wait for it to print out a URL to open in your browser, something like `http://127.0.0.1:7861`.
 
 If you're looking to access your copy of TorToiSe from outside your local network, pass `--share` into the command (for example, `python app.py --share`). You'll get a temporary gradio link to use.
 
@@ -72,7 +76,7 @@ You'll be presented with a bunch of options, but do not be overwhelmed, as most
 * `Text`: text you want to be read. You wrap text in `[brackets]` for "prompt engineering", where it'll affect the output, but those words won't actually be read.
 * `Emotion`: the "emotion" used for the delivery. This is a shortcut to starting with `[I am really ${emotion}],` in your text box. I assume the emotion is deduced during the CLVP pass.
 * `Voice`: the voice you want to clone. You can select `custom` if you want to use input from your microphone.
-* `Record voice`: Not required, unless you use `custom`.
+* `Microphone Source`: Not required, unless you use `custom`.
 * `Preset`: shortcut values for sample count and iteration steps. Use `none` if you want to provide your own values. Better presets rresult in better quality at the cost of computation time.
 * `Seed`: initializes the PRNG initially to this value, use this if you want to reproduce a generated voice. Currently, I don't have a way to expose the seed used.
 * `Candidates`: number of outputs to generate, starting from the best candidate. Depending on your iteration steps, generating the final sound files could be cheap, but they only offer alternatives to the samples generated to pull from (in other words, the later candidates perform worse), so don't be compelled to generate a ton of candidates.
@@ -86,6 +90,8 @@ All outputs are saved under `./result/[voice name]/[timestamp]/` as `result.wav`
 
 To save you from headaches, I strongly recommend playing around with shorter sentences first to find the right values for the voice you're using before generating longer sentences.
 
+As a quick optimization, I modified the script so that the `conditioning_latents` are saved after loading voice samples. If any voice sample has a modification time newer than this cached file, the cached latents are skipped and the normal WAVs are loaded instead.
+
 ## Example(s)
 
 Below are some outputs I deem substantial enough to share. As I continue delving into TorToiSe, I'll supply more examples and the values I use.
diff --git a/app.py b/app.py
index b94df9f..b249f64 100755
--- a/app.py
+++ b/app.py
@@ -1,8 +1,10 @@
 import os
 import argparse
 import gradio as gr
+import torch
 import torchaudio
 import time
+
 from datetime import datetime
 from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio, load_voice, load_voices
@@ -18,61 +20,49 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
     elif emotion != "None":
         text = f"[I am really {emotion.lower()},] {text}"
 
-    c = None
     if voice == "microphone":
         if mic_audio is None:
             raise gr.Error("Please provide audio from mic when choosing `microphone` as a voice input")
-        c = load_audio(mic_audio, 22050)
-
-
-    if len(voices) == 1 or len(voices) == 0:
-        if voice == "microphone":
-            voice_samples, conditioning_latents = [c], None
-        else:
-            voice_samples, conditioning_latents = load_voice(voice)
+        mic = load_audio(mic_audio, 22050)
+        voice_samples, conditioning_latents = [mic], None
     else:
-        voice_samples, conditioning_latents = load_voices(voices)
-        if voice == "microphone":
-            voice_samples.extend([c])
-
-    sample_voice = voice_samples[0] if len(voice_samples) else None
+        voice_samples, conditioning_latents = load_voice(voice)
+    
+    if voice_samples is not None:
+        sample_voice = voice_samples[0]
+        conditioning_latents = tts.get_conditioning_latents(voice_samples)
+        torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'latents.pth'))
+        voice_samples = None
+    else:
+        sample_voice = None
 
     if seed == 0:
         seed = None
 
     start_time = time.time()
 
-    # >b-buh why not set samples and iterations to nullllll
-    # shut up
+    presets = {
+        'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
+        'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
+        'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
+        'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
+        'none': {'num_autoregressive_samples': num_autoregressive_samples, 'diffusion_iterations': diffusion_iterations},
+    }
+    settings = {
+        'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
+        'top_p': .8,
+        'cond_free_k': 2.0, 'diffusion_temperature': 1.0,
 
-    if preset == "none":
-        gen, additionals = tts.tts_with_preset(
-            text,
-            voice_samples=voice_samples,
-            conditioning_latents=conditioning_latents,
-            preset="standard",
-            use_deterministic_seed=seed,
-            return_deterministic_state=True,
-            k=candidates,
-            num_autoregressive_samples=num_autoregressive_samples,
-            diffusion_iterations=diffusion_iterations,
-            temperature=temperature,
-            progress=progress
-        )
-        seed = additionals[0]
-    else:
-        gen, additionals = tts.tts_with_preset(
-            text,
-            voice_samples=voice_samples,
-            conditioning_latents=conditioning_latents,
-            preset=preset,
-            use_deterministic_seed=seed,
-            return_deterministic_state=True,
-            k=candidates,
-            temperature=temperature,
-            progress=progress
-        )
-        seed = additionals[0]
+        'voice_samples': voice_samples,
+        'conditioning_latents': conditioning_latents,
+        'use_deterministic_seed': seed,
+        'return_deterministic_state': True,
+        'k': candidates,
+        'progress': progress,
+    }
+    settings.update(presets[preset])
+    gen, additionals = tts.tts( text, **settings )
+    seed = additionals[0]
 
     info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
     with open("results.log", "a") as f:
@@ -89,24 +79,24 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
     if isinstance(gen, list):
         for j, g in enumerate(gen):
             torchaudio.save(os.path.join(outdir, f'result_{j}.wav'), g.squeeze(0).cpu(), 24000)
-        return (
-            (22050, sample_voice.squeeze().cpu().numpy()),
-            (24000, gen[0].squeeze().cpu().numpy()),
-            seed
-        )
+        
+        output_voice = gen[0]
     else:
         torchaudio.save(os.path.join(outdir, f'result.wav'), gen.squeeze(0).cpu(), 24000)
-        return (
-            (22050, sample_voice.squeeze().cpu().numpy()),
-            (24000, gen.squeeze().cpu().numpy()),
-            seed
-        )
+        output_voice = gen
+
+    output_voice = (24000, output_voice.squeeze().cpu().numpy())
+
+    if sample_voice is not None:
+        sample_voice = (22050, sample_voice.squeeze().cpu().numpy())
+
+    return (
+        sample_voice,
+        output_voice,
+        seed
+    )
 
 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--share", action='store_true', help="Lets Gradio return a public URL to use anywhere")
-    args = parser.parse_args()
-
     text = gr.Textbox(lines=4, label="Prompt")
     emotion = gr.Radio(
         ["None", "Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom"],
@@ -158,11 +148,17 @@ def main():
             temperature
         ],
         outputs=[selected_voice, output_audio, usedSeed],
-        allow_flagging=False
+        allow_flagging='never'
     )
     interface.queue().launch(share=args.share)
 
 
 if __name__ == "__main__":
-    tts = TextToSpeech()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--share", action='store_true', help="Lets Gradio return a public URL to use anywhere")
+    parser.add_argument("--low-vram", action='store_true', help="Disables some optimizations that increases VRAM usage")
+    args = parser.parse_args()
+
+    tts = TextToSpeech(minor_optimizations=not args.low_vram)
+
     main()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index a6070e9..917fd72 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,6 @@ librosa
 torchaudio
 threadpoolctl
 appdirs
+numpy
+numba
 gradio
\ No newline at end of file
diff --git a/start.bat b/start.bat
new file mode 100755
index 0000000..f784640
--- /dev/null
+++ b/start.bat
@@ -0,0 +1,3 @@
+call .\tortoise-venv\Scripts\activate.bat
+py .\app.py
+deactivate
\ No newline at end of file
diff --git a/tortoise/api.py b/tortoise/api.py
index dbd22cf..3aa5220 100755
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -206,7 +206,7 @@ class TextToSpeech:
     Main entry point into Tortoise.
     """
 
-    def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None):
+    def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None, minor_optimizations=True):
         """
         Constructor
         :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@@ -218,6 +218,7 @@ class TextToSpeech:
                                  Default is true.
         :param device: Device to use when running the model. If omitted, the device will be automatically chosen.
         """
+        self.minor_optimizations = minor_optimizations
         self.models_dir = models_dir
         self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None else autoregressive_batch_size
         self.enable_redaction = enable_redaction
@@ -243,6 +244,7 @@ class TextToSpeech:
                                           layer_drop=0, unconditioned_percentage=0).cpu().eval()
             self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
 
+
         self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
                          text_seq_len=350, text_heads=12,
                          num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430,
@@ -258,11 +260,20 @@ class TextToSpeech:
         self.rlg_auto = None
         self.rlg_diffusion = None
 
+        if self.minor_optimizations:
+            self.autoregressive = self.autoregressive.to(self.device)
+            self.diffusion = self.diffusion.to(self.device)
+            self.clvp = self.clvp.to(self.device)
+            self.vocoder = self.vocoder.to(self.device)
+
     def load_cvvp(self):
         """Load CVVP model."""
         self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
         self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
+        
+        if self.minor_optimizations:
+            self.cvvp = self.cvvp.to(self.device)
 
     def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
@@ -279,11 +290,9 @@ class TextToSpeech:
                 voice_samples = [voice_samples]
             for vs in voice_samples:
                 auto_conds.append(format_conditioning(vs, device=self.device))
-            auto_conds = torch.stack(auto_conds, dim=1)
-            self.autoregressive = self.autoregressive.to(self.device)
-            auto_latent = self.autoregressive.get_conditioning(auto_conds)
-            self.autoregressive = self.autoregressive.cpu()
 
+            auto_conds = torch.stack(auto_conds, dim=1)
+            
             diffusion_conds = []
             for sample in voice_samples:
                 # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
@@ -293,9 +302,18 @@ class TextToSpeech:
                 diffusion_conds.append(cond_mel)
             diffusion_conds = torch.stack(diffusion_conds, dim=1)
 
-            self.diffusion = self.diffusion.to(self.device)
-            diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
-            self.diffusion = self.diffusion.cpu()
+
+            if self.minor_optimizations:
+                auto_latent = self.autoregressive.get_conditioning(auto_conds)
+                diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
+            else:
+                self.autoregressive = self.autoregressive.to(self.device)
+                auto_latent = self.autoregressive.get_conditioning(auto_conds)
+                self.autoregressive = self.autoregressive.cpu()
+
+                self.diffusion = self.diffusion.to(self.device)
+                diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
+                self.diffusion = self.diffusion.cpu()
 
         if return_mels:
             return auto_latent, diffusion_latent, auto_conds, diffusion_conds
@@ -413,7 +431,9 @@ class TextToSpeech:
             num_batches = num_autoregressive_samples // self.autoregressive_batch_size
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
-            self.autoregressive = self.autoregressive.to(self.device)
+            
+            if not self.minor_optimizations:
+                self.autoregressive = self.autoregressive.to(self.device)
             
             for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
                 codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
@@ -428,14 +448,18 @@ class TextToSpeech:
                 padding_needed = max_mel_tokens - codes.shape[1]
                 codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
                 samples.append(codes)
-            self.autoregressive = self.autoregressive.cpu()
 
             clip_results = []
-            self.clvp = self.clvp.to(self.device)
+
+            if not self.minor_optimizations:
+                self.autoregressive = self.autoregressive.cpu()
+                self.clvp = self.clvp.to(self.device)
+
             if cvvp_amount > 0:
                 if self.cvvp is None:
                     self.load_cvvp()
-                self.cvvp = self.cvvp.to(self.device)
+                if not self.minor_optimizations:
+                    self.cvvp = self.cvvp.to(self.device)
             
             desc="Computing best candidates"
             if verbose:
@@ -463,25 +487,34 @@ class TextToSpeech:
             clip_results = torch.cat(clip_results, dim=0)
             samples = torch.cat(samples, dim=0)
             best_results = samples[torch.topk(clip_results, k=k).indices]
-            self.clvp = self.clvp.cpu()
-            if self.cvvp is not None:
-                self.cvvp = self.cvvp.cpu()
+            
+
+            if not self.minor_optimizations:
+                self.clvp = self.clvp.cpu()
+                if self.cvvp is not None:
+                    self.cvvp = self.cvvp.cpu()
+
             del samples
 
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            self.autoregressive = self.autoregressive.to(self.device)
+            if not self.minor_optimizations:
+                self.autoregressive = self.autoregressive.to(self.device)
+
             best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                                                torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
                                                torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
                                                return_latent=True, clip_inputs=False)
-            self.autoregressive = self.autoregressive.cpu()
+            
+            if not self.minor_optimizations:
+                self.autoregressive = self.autoregressive.cpu()
+                self.diffusion = self.diffusion.to(self.device)
+                self.vocoder = self.vocoder.to(self.device)
+            
             del auto_conditioning
 
             wav_candidates = []
-            self.diffusion = self.diffusion.to(self.device)
-            self.vocoder = self.vocoder.to(self.device)
             for b in range(best_results.shape[0]):
                 codes = best_results[b].unsqueeze(0)
                 latents = best_latents[b].unsqueeze(0)
@@ -501,8 +534,10 @@ class TextToSpeech:
                                                temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..")
                 wav = self.vocoder.inference(mel)
                 wav_candidates.append(wav.cpu())
-            self.diffusion = self.diffusion.cpu()
-            self.vocoder = self.vocoder.cpu()
+            
+            if not self.minor_optimizations:
+                self.diffusion = self.diffusion.cpu()
+                self.vocoder = self.vocoder.cpu()
 
             def potentially_redact(clip, text):
                 if self.enable_redaction:
diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py
old mode 100644
new mode 100755
index 91237dd..6a2f77d
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@@ -97,20 +97,34 @@ def get_voices(extra_voice_dirs=[]):
     return voices
 
 
-def load_voice(voice, extra_voice_dirs=[]):
+def load_voice(voice, extra_voice_dirs=[], load_latents=True):
     if voice == 'random':
         return None, None
 
     voices = get_voices(extra_voice_dirs)
     paths = voices[voice]
-    if len(paths) == 1 and paths[0].endswith('.pth'):
-        return None, torch.load(paths[0])
-    else:
-        conds = []
-        for cond_path in paths:
-            c = load_audio(cond_path, 22050)
-            conds.append(c)
-        return conds, None
+
+    mtime = 0
+    voices = []
+    latent = None
+    for file in paths:
+        if file[-4:] == ".pth":
+            latent = file
+        else:
+            voices.append(file)
+            mtime = max(mtime, os.path.getmtime(file))
+
+    if load_latents and latent is not None:
+        if os.path.getmtime(latent) > mtime:
+            print(f"Reading from latent: {latent}")
+            return None, torch.load(latent)
+        print(f"Latent file out of date: {latent}")
+    
+    conds = []
+    for cond_path in voices:
+        c = load_audio(cond_path, 22050)
+        conds.append(c)
+    return conds, None
 
 
 def load_voices(voices, extra_voice_dirs=[]):