diff --git a/app.py b/app.py
index 2d44d78..fa53470 100755
--- a/app.py
+++ b/app.py
@@ -7,14 +7,7 @@ from datetime import datetime
 from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio, load_voice, load_voices
 
-VOICE_OPTIONS = [
-    "random", # special option for random voice
-    "microphone", # special option for custom voice
-    "disabled", # special option for disabled voice
-]
-
-
-def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature):
+def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, progress=gr.Progress()):
     if voice != "microphone":
         voices = [voice]
     else:
@@ -48,6 +41,10 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
         seed = None
 
     start_time = time.time()
+
+    # >b-buh why not set samples and iterations to nullllll
+    # shut up
+
     if preset == "none":
         gen, additionals = tts.tts_with_preset(
             text,
@@ -60,6 +57,7 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
             num_autoregressive_samples=num_autoregressive_samples,
             diffusion_iterations=diffusion_iterations,
             temperature=temperature,
+            progress=progress
         )
         seed = additionals[0]
     else:
@@ -72,13 +70,13 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
             return_deterministic_state=True,
             k=candidates,
             temperature=temperature,
+            progress=progress
         )
         seed = additionals[0]
 
+    info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
     with open("results.log", "a") as f:
-        f.write(
-            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
-        )
+        f.write(info)
 
     timestamp = int(time.time())
     outdir = f"./results/{voice}/{timestamp}/"
@@ -86,7 +84,7 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
     os.makedirs(outdir, exist_ok=True)
 
     with open(os.path.join(outdir, f'input.txt'), 'w') as f:
-        f.write(f"{text}\n\nSeed: {seed}")
+        f.write(f"{text}\n\n{info}")
 
     if isinstance(gen, list):
         for j, g in enumerate(gen):
@@ -129,7 +127,7 @@ def main():
             temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
 
             voice = gr.Dropdown(
-                os.listdir(os.path.join("tortoise", "voices")) + VOICE_OPTIONS,
+                os.listdir(os.path.join("tortoise", "voices")) + ["random", "microphone", "disabled"],
                 label="Voice",
                 type="value",
             )
diff --git a/tortoise/api.py b/tortoise/api.py
old mode 100644
new mode 100755
index 296ef14..6f88e5d
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -39,6 +39,13 @@ MODELS = {
     'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth',
 }
 
+def tqdm_override(arr, verbose=False, progress=None, desc=None):
+    if progress is None:
+        if verbose and desc is not None:
+            print(desc)
+        return tqdm(arr, disable=not verbose)
+    return progress.tqdm(arr, desc=desc)
+
 def download_models(specific_models=None):
     """
     Call to download all the models that Tortoise uses.
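A note on the plumbing introduced above: the app.py handler now declares `progress=gr.Progress()`, which Gradio replaces with a live tracker whenever the function is invoked from the UI, and the new `tqdm_override` helper either forwards an iterable to that tracker's `tqdm` wrapper or degrades to a plain console `tqdm` when no tracker is passed. A minimal self-contained sketch of the same pattern, assuming only the `gr.Progress` API already used by this diff (the `demo` app and `count_up` handler are hypothetical, not part of the change):

    import gradio as gr

    def count_up(n, progress=gr.Progress()):
        # When called from the UI, Gradio swaps the gr.Progress() default
        # for a live tracker; progress.tqdm() then mirrors the loop to the
        # page in the same place tqdm() would print a console bar.
        total = 0
        for i in progress.tqdm(range(int(n)), desc="Counting"):
            total += i
        return total

    demo = gr.Interface(fn=count_up, inputs=gr.Number(value=100), outputs="number")

    if __name__ == "__main__":
        demo.launch()

This is why `tqdm_override` only needs a `None` check: a real tracker is always injected by Gradio on UI calls, while library callers that pass nothing keep the original console behavior.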
@@ -234,17 +241,21 @@ class TextToSpeech:
                                       in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
                                       layer_drop=0, unconditioned_percentage=0).cpu().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
+        self.autoregressive = self.autoregressive.to(self.device)
+        self.diffusion = self.diffusion.to(self.device)
 
         self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
                          text_seq_len=350, text_heads=12,
                          num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430,
                          use_xformers=True).cpu().eval()
         self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir)))
+        self.clvp = self.clvp.to(self.device)
 
         self.cvvp = None # CVVP model is only loaded if used.
         self.vocoder = UnivNetGenerator().cpu()
         self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
         self.vocoder.eval(inference=True)
+        self.vocoder = self.vocoder.to(self.device)
 
         # Random latent generators (RLGs) are loaded lazily.
         self.rlg_auto = None
@@ -255,6 +266,7 @@ class TextToSpeech:
         self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
         self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
+        self.cvvp = self.cvvp.to(self.device)
 
     def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
@@ -272,9 +284,7 @@ class TextToSpeech:
             for vs in voice_samples:
                 auto_conds.append(format_conditioning(vs, device=self.device))
             auto_conds = torch.stack(auto_conds, dim=1)
-            self.autoregressive = self.autoregressive.to(self.device)
             auto_latent = self.autoregressive.get_conditioning(auto_conds)
-            self.autoregressive = self.autoregressive.cpu()
 
             diffusion_conds = []
             for sample in voice_samples:
@@ -285,9 +295,7 @@ class TextToSpeech:
                 diffusion_conds.append(cond_mel)
 
             diffusion_conds = torch.stack(diffusion_conds, dim=1)
-            self.diffusion = self.diffusion.to(self.device)
             diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
-            self.diffusion = self.diffusion.cpu()
 
         if return_mels:
             return auto_latent, diffusion_latent, auto_conds, diffusion_conds
@@ -335,6 +343,7 @@ class TextToSpeech:
             cvvp_amount=.0,
             # diffusion generation parameters follow
             diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
+            progress=None,
             **hf_generate_kwargs):
         """
         Produces an audio clip of the given text being spoken with the given reference voice.
@@ -404,10 +413,8 @@ class TextToSpeech:
             num_batches = num_autoregressive_samples // self.autoregressive_batch_size
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
-            self.autoregressive = self.autoregressive.to(self.device)
-            if verbose:
-                print("Generating autoregressive samples..")
-            for b in tqdm(range(num_batches), disable=not verbose):
+
+            for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
                 codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                              do_sample=True,
                                                              top_p=top_p,
@@ -420,20 +427,20 @@ class TextToSpeech:
                     padding_needed = max_mel_tokens - codes.shape[1]
                     codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
                 samples.append(codes)
-            self.autoregressive = self.autoregressive.cpu()
 
             clip_results = []
-            self.clvp = self.clvp.to(self.device)
             if cvvp_amount > 0:
                 if self.cvvp is None:
                     self.load_cvvp()
-                self.cvvp = self.cvvp.to(self.device)
+
+            desc = "Computing best candidates"
             if verbose:
                 if self.cvvp is None:
-                    print("Computing best candidates using CLVP")
+                    desc = "Computing best candidates using CLVP"
                 else:
-                    print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-            for batch in tqdm(samples, disable=not verbose):
+                    desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
+
+            for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
                 for i in range(batch.shape[0]):
                     batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
                 if cvvp_amount != 1:
@@ -452,28 +459,19 @@ class TextToSpeech:
             clip_results = torch.cat(clip_results, dim=0)
             samples = torch.cat(samples, dim=0)
             best_results = samples[torch.topk(clip_results, k=k).indices]
-            self.clvp = self.clvp.cpu()
-            if self.cvvp is not None:
-                self.cvvp = self.cvvp.cpu()
             del samples
 
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            self.autoregressive = self.autoregressive.to(self.device)
             best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                                                torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
                                                torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
                                                return_latent=True, clip_inputs=False)
-            self.autoregressive = self.autoregressive.cpu()
             del auto_conditioning
 
-            if verbose:
-                print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            self.diffusion = self.diffusion.to(self.device)
-            self.vocoder = self.vocoder.to(self.device)
-            for b in range(best_results.shape[0]):
+            for b in tqdm_override(range(best_results.shape[0]), verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio.."):
                 codes = best_results[b].unsqueeze(0)
                 latents = best_latents[b].unsqueeze(0)
 
@@ -492,8 +490,6 @@ class TextToSpeech:
                                                temperature=diffusion_temperature, verbose=verbose)
                 wav = self.vocoder.inference(mel)
                 wav_candidates.append(wav.cpu())
-            self.diffusion = self.diffusion.cpu()
-            self.vocoder = self.vocoder.cpu()
 
             def potentially_redact(clip, text):
                 if self.enable_redaction:
@@ -522,4 +518,4 @@ class TextToSpeech:
 
         # Can't currently set this because of CUBLAS. TODO: potentially enable it if necessary.
         # torch.use_deterministic_algorithms(True)
-        return seed
+        return seed
\ No newline at end of file
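Beyond the progress plumbing, the api.py side of the diff ends the per-call device shuttling: every network is moved to `self.device` once at construction and stays resident, and the `.to(self.device)`/`.cpu()` hops around each use are deleted. That trades steady-state VRAM for lower per-call latency. A minimal sketch of the two patterns, using a hypothetical `Linear` stand-in rather than the real Tortoise models:

    import torch

    class ResidentNet:
        # Pattern adopted by this diff: load once, keep weights on the device.
        def __init__(self, device):
            self.device = device
            self.model = torch.nn.Linear(16, 16).eval().to(device)

        @torch.no_grad()
        def infer(self, x):
            # Only activations cross the bus; the weights never move.
            return self.model(x.to(self.device)).cpu()

    class ShuttledNet:
        # Pattern removed by this diff: park on CPU, hop to the GPU per call.
        def __init__(self, device):
            self.device = device
            self.model = torch.nn.Linear(16, 16).eval()

        @torch.no_grad()
        def infer(self, x):
            self.model = self.model.to(self.device)   # full weight upload
            out = self.model(x.to(self.device)).cpu()
            self.model = self.model.cpu()             # full weight download
            return out

The shuttled form is what upstream used so that only one model occupied VRAM at a time; dropping it, as this diff does, assumes the autoregressive, diffusion, CLVP/CVVP, and vocoder weights all fit on the device simultaneously.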