forked from mrq/tortoise-tts
Added an option to choose between diffusion samplers (P, DDIM)
This commit is contained in:
parent
4274cce218
commit
078dc0c6e2
32
app.py
32
app.py
|
@ -9,7 +9,7 @@ from datetime import datetime
|
||||||
from tortoise.api import TextToSpeech
|
from tortoise.api import TextToSpeech
|
||||||
from tortoise.utils.audio import load_audio, load_voice, load_voices
|
from tortoise.utils.audio import load_audio, load_voice, load_voices
|
||||||
|
|
||||||
def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, progress=gr.Progress()):
|
def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, progress=gr.Progress()):
|
||||||
if voice != "microphone":
|
if voice != "microphone":
|
||||||
voices = [voice]
|
voices = [voice]
|
||||||
else:
|
else:
|
||||||
|
@ -42,11 +42,11 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
presets = {
|
presets = {
|
||||||
'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
|
'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
|
||||||
'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
|
'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
|
||||||
'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
|
'Standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
|
||||||
'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
|
'High Quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
|
||||||
'none': {'num_autoregressive_samples': num_autoregressive_samples, 'diffusion_iterations': diffusion_iterations},
|
'None': {'num_autoregressive_samples': num_autoregressive_samples, 'diffusion_iterations': diffusion_iterations},
|
||||||
}
|
}
|
||||||
settings = {
|
settings = {
|
||||||
'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
|
'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
|
||||||
|
@ -58,13 +58,14 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
|
||||||
'use_deterministic_seed': seed,
|
'use_deterministic_seed': seed,
|
||||||
'return_deterministic_state': True,
|
'return_deterministic_state': True,
|
||||||
'k': candidates,
|
'k': candidates,
|
||||||
|
'diffusion_sampler': diffusion_sampler,
|
||||||
'progress': progress,
|
'progress': progress,
|
||||||
}
|
}
|
||||||
settings.update(presets[preset])
|
settings.update(presets[preset])
|
||||||
gen, additionals = tts.tts( text, **settings )
|
gen, additionals = tts.tts( text, **settings )
|
||||||
seed = additionals[0]
|
seed = additionals[0]
|
||||||
|
|
||||||
info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
|
info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Diffusion Sampler: {diffusion_sampler} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
|
||||||
with open("results.log", "a") as f:
|
with open("results.log", "a") as f:
|
||||||
f.write(info)
|
f.write(info)
|
||||||
|
|
||||||
|
@ -74,7 +75,7 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
|
||||||
os.makedirs(outdir, exist_ok=True)
|
os.makedirs(outdir, exist_ok=True)
|
||||||
|
|
||||||
with open(os.path.join(outdir, f'input.txt'), 'w') as f:
|
with open(os.path.join(outdir, f'input.txt'), 'w') as f:
|
||||||
f.write(f"{text}\n\n{info}")
|
f.write(f"{info}")
|
||||||
|
|
||||||
if isinstance(gen, list):
|
if isinstance(gen, list):
|
||||||
for j, g in enumerate(gen):
|
for j, g in enumerate(gen):
|
||||||
|
@ -104,10 +105,10 @@ def main():
|
||||||
label="Emotion",
|
label="Emotion",
|
||||||
type="value",
|
type="value",
|
||||||
)
|
)
|
||||||
prompt = gr.Textbox(lines=1, label="Custom Emotion (if selected)")
|
prompt = gr.Textbox(lines=1, label="Custom Emotion + Prompt (if selected)")
|
||||||
preset = gr.Radio(
|
preset = gr.Radio(
|
||||||
["ultra_fast", "fast", "standard", "high_quality", "none"],
|
["Ultra Fast", "Fast", "Standard", "High Quality", "None"],
|
||||||
value="none",
|
value="None",
|
||||||
label="Preset",
|
label="Preset",
|
||||||
type="value",
|
type="value",
|
||||||
)
|
)
|
||||||
|
@ -115,6 +116,12 @@ def main():
|
||||||
num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
|
num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
|
||||||
diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")
|
diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")
|
||||||
temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
|
temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
|
||||||
|
diffusion_sampler = gr.Radio(
|
||||||
|
["P", "DDIM"],
|
||||||
|
value="P",
|
||||||
|
label="Diffusion Samplers",
|
||||||
|
type="value",
|
||||||
|
)
|
||||||
|
|
||||||
voice = gr.Dropdown(
|
voice = gr.Dropdown(
|
||||||
os.listdir(os.path.join("tortoise", "voices")) + ["random", "microphone", "disabled"],
|
os.listdir(os.path.join("tortoise", "voices")) + ["random", "microphone", "disabled"],
|
||||||
|
@ -145,7 +152,8 @@ def main():
|
||||||
candidates,
|
candidates,
|
||||||
num_autoregressive_samples,
|
num_autoregressive_samples,
|
||||||
diffusion_iterations,
|
diffusion_iterations,
|
||||||
temperature
|
temperature,
|
||||||
|
diffusion_sampler
|
||||||
],
|
],
|
||||||
outputs=[selected_voice, output_audio, usedSeed],
|
outputs=[selected_voice, output_audio, usedSeed],
|
||||||
allow_flagging='never'
|
allow_flagging='never'
|
||||||
|
|
|
@ -153,7 +153,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
|
||||||
return codes
|
return codes
|
||||||
|
|
||||||
|
|
||||||
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None):
|
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None, sampler="P"):
|
||||||
"""
|
"""
|
||||||
Uses the specified diffusion model to convert discrete codes into a spectrogram.
|
Uses the specified diffusion model to convert discrete codes into a spectrogram.
|
||||||
"""
|
"""
|
||||||
|
@ -163,9 +163,18 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la
|
||||||
precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False)
|
precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False)
|
||||||
|
|
||||||
noise = torch.randn(output_shape, device=latents.device) * temperature
|
noise = torch.randn(output_shape, device=latents.device) * temperature
|
||||||
|
|
||||||
|
mel = None
|
||||||
|
print(f"Sampler: {sampler}")
|
||||||
|
if sampler == "P":
|
||||||
mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
|
mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
|
||||||
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
|
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
|
||||||
verbose=verbose, progress=progress, desc=desc)
|
verbose=verbose, progress=progress, desc=desc)
|
||||||
|
elif sampler == "DDIM":
|
||||||
|
mel = diffuser.ddim_sample_loop(diffusion_model, output_shape, noise=noise,
|
||||||
|
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
|
||||||
|
verbose=verbose, progress=progress, desc=desc)
|
||||||
|
|
||||||
return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
|
return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
|
||||||
|
|
||||||
|
|
||||||
|
@ -361,6 +370,7 @@ class TextToSpeech:
|
||||||
cvvp_amount=.0,
|
cvvp_amount=.0,
|
||||||
# diffusion generation parameters follow
|
# diffusion generation parameters follow
|
||||||
diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
|
diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
|
||||||
|
diffusion_sampler="P",
|
||||||
progress=None,
|
progress=None,
|
||||||
**hf_generate_kwargs):
|
**hf_generate_kwargs):
|
||||||
"""
|
"""
|
||||||
|
@ -531,7 +541,7 @@ class TextToSpeech:
|
||||||
break
|
break
|
||||||
|
|
||||||
mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
|
mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
|
||||||
temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..")
|
temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler)
|
||||||
wav = self.vocoder.inference(mel)
|
wav = self.vocoder.inference(mel)
|
||||||
wav_candidates.append(wav.cpu())
|
wav_candidates.append(wav.cpu())
|
||||||
|
|
||||||
|
|
|
@ -734,6 +734,7 @@ class GaussianDiffusion:
|
||||||
verbose=False,
|
verbose=False,
|
||||||
eta=0.0,
|
eta=0.0,
|
||||||
progress=None,
|
progress=None,
|
||||||
|
desc=None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Generate samples from the model using DDIM.
|
Generate samples from the model using DDIM.
|
||||||
|
@ -753,6 +754,7 @@ class GaussianDiffusion:
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
eta=eta,
|
eta=eta,
|
||||||
progress=progress,
|
progress=progress,
|
||||||
|
desc=desc
|
||||||
):
|
):
|
||||||
final = sample
|
final = sample
|
||||||
return final["sample"]
|
return final["sample"]
|
||||||
|
@ -770,6 +772,7 @@ class GaussianDiffusion:
|
||||||
verbose=False,
|
verbose=False,
|
||||||
eta=0.0,
|
eta=0.0,
|
||||||
progress=None,
|
progress=None,
|
||||||
|
desc=None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Use DDIM to sample from the model and yield intermediate samples from
|
Use DDIM to sample from the model and yield intermediate samples from
|
||||||
|
@ -790,7 +793,7 @@ class GaussianDiffusion:
|
||||||
# Lazy import so that we don't depend on tqdm.
|
# Lazy import so that we don't depend on tqdm.
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
|
|
||||||
indices = tqdm_override(indices, verbose=verbose, desc="DDIM Sample Loop Progressive", progress=progress)
|
indices = tqdm_override(indices, verbose=verbose, desc=desc, progress=progress)
|
||||||
|
|
||||||
for i in indices:
|
for i in indices:
|
||||||
t = th.tensor([i] * shape[0], device=device)
|
t = th.tensor([i] * shape[0], device=device)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user