diff --git a/app.py b/app.py index b249f64..898ac0b 100755 --- a/app.py +++ b/app.py @@ -9,7 +9,7 @@ from datetime import datetime from tortoise.api import TextToSpeech from tortoise.utils.audio import load_audio, load_voice, load_voices -def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, progress=gr.Progress()): +def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, progress=gr.Progress()): if voice != "microphone": voices = [voice] else: @@ -42,11 +42,11 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, start_time = time.time() presets = { - 'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False}, - 'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80}, - 'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200}, - 'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400}, - 'none': {'num_autoregressive_samples': num_autoregressive_samples, 'diffusion_iterations': diffusion_iterations}, + 'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False}, + 'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80}, + 'Standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200}, + 'High Quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400}, + 'None': {'num_autoregressive_samples': num_autoregressive_samples, 'diffusion_iterations': diffusion_iterations}, } settings = { 'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0, @@ -58,13 +58,14 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, 'use_deterministic_seed': seed, 'return_deterministic_state': True, 'k': candidates, + 'diffusion_sampler': diffusion_sampler, 'progress': progress, } settings.update(presets[preset]) gen, additionals = tts.tts( text, **settings ) seed = additionals[0] - info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n" + info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Diffusion Sampler: {diffusion_sampler} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n" with open("results.log", "a") as f: f.write(info) @@ -74,7 +75,7 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, os.makedirs(outdir, exist_ok=True) with open(os.path.join(outdir, f'input.txt'), 'w') as f: - f.write(f"{text}\n\n{info}") + f.write(f"{info}") if isinstance(gen, list): for j, g in enumerate(gen): @@ -104,10 +105,10 @@ def main(): label="Emotion", type="value", ) - prompt = gr.Textbox(lines=1, label="Custom Emotion (if selected)") + prompt = gr.Textbox(lines=1, label="Custom Emotion + Prompt (if selected)") preset = gr.Radio( - ["ultra_fast", "fast", "standard", "high_quality", "none"], - value="none", + ["Ultra Fast", "Fast", "Standard", "High Quality", "None"], + value="None", label="Preset", type="value", ) @@ -115,6 +116,12 @@ def main(): num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples") diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations") temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature") + diffusion_sampler = gr.Radio( + ["P", "DDIM"], + value="P", + label="Diffusion Samplers", + type="value", + ) voice = gr.Dropdown( os.listdir(os.path.join("tortoise", "voices")) + ["random", "microphone", "disabled"], @@ -145,7 +152,8 @@ def main(): candidates, num_autoregressive_samples, diffusion_iterations, - temperature + temperature, + diffusion_sampler ], outputs=[selected_voice, output_audio, usedSeed], allow_flagging='never' diff --git a/tortoise/api.py b/tortoise/api.py index 3aa5220..121ca32 100755 --- a/tortoise/api.py +++ b/tortoise/api.py @@ -153,7 +153,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True): return codes -def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None): +def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None, sampler="P"): """ Uses the specified diffusion model to convert discrete codes into a spectrogram. """ @@ -163,9 +163,18 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False) noise = torch.randn(output_shape, device=latents.device) * temperature - mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise, + + mel = None + print(f"Sampler: {sampler}") + if sampler == "P": + mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise, model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, verbose=verbose, progress=progress, desc=desc) + elif sampler == "DDIM": + mel = diffuser.ddim_sample_loop(diffusion_model, output_shape, noise=noise, + model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, + verbose=verbose, progress=progress, desc=desc) + return denormalize_tacotron_mel(mel)[:,:,:output_seq_len] @@ -361,6 +370,7 @@ class TextToSpeech: cvvp_amount=.0, # diffusion generation parameters follow diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0, + diffusion_sampler="P", progress=None, **hf_generate_kwargs): """ @@ -531,7 +541,7 @@ class TextToSpeech: break mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning, - temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..") + temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler) wav = self.vocoder.inference(mel) wav_candidates.append(wav.cpu()) diff --git a/tortoise/utils/diffusion.py b/tortoise/utils/diffusion.py index 74b8086..93aae01 100755 --- a/tortoise/utils/diffusion.py +++ b/tortoise/utils/diffusion.py @@ -734,6 +734,7 @@ class GaussianDiffusion: verbose=False, eta=0.0, progress=None, + desc=None, ): """ Generate samples from the model using DDIM. @@ -753,6 +754,7 @@ class GaussianDiffusion: verbose=verbose, eta=eta, progress=progress, + desc=desc ): final = sample return final["sample"] @@ -770,6 +772,7 @@ class GaussianDiffusion: verbose=False, eta=0.0, progress=None, + desc=None, ): """ Use DDIM to sample from the model and yield intermediate samples from @@ -790,7 +793,7 @@ class GaussianDiffusion: # Lazy import so that we don't depend on tqdm. from tqdm.auto import tqdm - indices = tqdm_override(indices, verbose=verbose, desc="DDIM Sample Loop Progressive", progress=progress) + indices = tqdm_override(indices, verbose=verbose, desc=desc, progress=progress) for i in indices: t = th.tensor([i] * shape[0], device=device)