1
1
forked from mrq/tortoise-tts

Added a progress bar for the step that transforms autoregressive outputs into audio; changed the number inputs to sliders

This commit is contained in:
mrq 2023-02-03 04:56:30 +00:00
parent ef237c70d0
commit 4f359bffa4
3 changed files with 36 additions and 19 deletions

6
app.py
View File

@ -121,9 +121,9 @@ def main():
label="Preset",
type="value",
)
candidates = gr.Number(value=1, precision=0, label="Candidates")
num_autoregressive_samples = gr.Number(value=128, precision=0, label="Samples")
diffusion_iterations = gr.Number(value=128, precision=0, label="Iterations")
candidates = gr.Slider(value=1, minimum=1, maximum=6, label="Candidates")
num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")
temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
voice = gr.Dropdown(

View File

@ -40,11 +40,12 @@ MODELS = {
}
def tqdm_override(arr, verbose=False, progress=None, desc=None):
if verbose and desc is not None:
print(desc)
if progress is None:
if verbose and desc is not None:
print(desc)
return tqdm(arr, disable=not verbose)
return progress.tqdm(arr, desc=desc)
return progress.tqdm(arr, desc=desc, track_tqdm=True)
def download_models(specific_models=None):
"""
@ -152,7 +153,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
return codes
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True):
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None):
"""
Uses the specified diffusion model to convert discrete codes into a spectrogram.
"""
@ -164,7 +165,7 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la
noise = torch.randn(output_shape, device=latents.device) * temperature
mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
progress=verbose)
verbose=verbose, progress=progress, desc=desc)
return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
@ -471,7 +472,7 @@ class TextToSpeech:
del auto_conditioning
wav_candidates = []
for b in tqdm_override(range(best_results.shape[0]), verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio.."):
for b in range(best_results.shape[0]):
codes = best_results[b].unsqueeze(0)
latents = best_latents[b].unsqueeze(0)
@ -487,7 +488,7 @@ class TextToSpeech:
break
mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
temperature=diffusion_temperature, verbose=verbose)
temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..")
wav = self.vocoder.inference(mel)
wav_candidates.append(wav.cpu())

34
tortoise/utils/diffusion.py Normal file → Executable file
View File

@ -15,6 +15,13 @@ import torch
import torch as th
from tqdm import tqdm
def tqdm_override(arr, verbose=False, progress=None, desc=None):
    """
    Wrap an iterable in a progress-reporting iterator.

    :param arr: the iterable to loop over.
    :param verbose: if True, print `desc` (when given) and enable the console
                    tqdm bar used when no `progress` object is supplied.
    :param progress: optional object exposing `tqdm(iterable, desc=...)`;
                     presumably a gradio `Progress` instance - confirm against
                     callers. If None, the plain `tqdm` bar is used instead.
    :param desc: optional description; printed when `verbose` is set and
                 forwarded to `progress.tqdm` as the bar label.
    :return: an iterable that yields the items of `arr`.
    """
    if verbose and desc is not None:
        print(desc)

    if progress is None:
        # No external tracker: fall back to a console bar, hidden unless verbose.
        return tqdm(arr, disable=not verbose)

    # `track_tqdm` is deliberately not forwarded here: in gradio it is an
    # option of the `Progress` constructor, not of its `tqdm()` method.
    return progress.tqdm(arr, desc=desc)
def normal_kl(mean1, logvar1, mean2, logvar2):
"""
@ -540,7 +547,9 @@ class GaussianDiffusion:
cond_fn=None,
model_kwargs=None,
device=None,
progress=False,
verbose=False,
progress=None,
desc=None
):
"""
Generate samples from the model.
@ -558,7 +567,7 @@ class GaussianDiffusion:
pass to the model. This can be used for conditioning.
:param device: if specified, the device to create the samples on.
If not specified, use a model parameter's device.
:param progress: if True, show a tqdm progress bar.
:param verbose: if True, show a tqdm progress bar.
:return: a non-differentiable batch of samples.
"""
final = None
@ -571,7 +580,9 @@ class GaussianDiffusion:
cond_fn=cond_fn,
model_kwargs=model_kwargs,
device=device,
verbose=verbose,
progress=progress,
desc=desc
):
final = sample
return final["sample"]
@ -586,7 +597,9 @@ class GaussianDiffusion:
cond_fn=None,
model_kwargs=None,
device=None,
progress=False,
verbose=False,
progress=None,
desc=None
):
"""
Generate samples from the model and yield intermediate samples from
@ -605,7 +618,7 @@ class GaussianDiffusion:
img = th.randn(*shape, device=device)
indices = list(range(self.num_timesteps))[::-1]
for i in tqdm(indices, disable=not progress):
for i in tqdm_override(indices, verbose=verbose, desc=desc, progress=progress):
t = th.tensor([i] * shape[0], device=device)
with th.no_grad():
out = self.p_sample(
@ -718,8 +731,9 @@ class GaussianDiffusion:
cond_fn=None,
model_kwargs=None,
device=None,
progress=False,
verbose=False,
eta=0.0,
progress=None,
):
"""
Generate samples from the model using DDIM.
@ -736,8 +750,9 @@ class GaussianDiffusion:
cond_fn=cond_fn,
model_kwargs=model_kwargs,
device=device,
progress=progress,
verbose=verbose,
eta=eta,
progress=progress,
):
final = sample
return final["sample"]
@ -752,8 +767,9 @@ class GaussianDiffusion:
cond_fn=None,
model_kwargs=None,
device=None,
progress=False,
verbose=False,
eta=0.0,
progress=None,
):
"""
Use DDIM to sample from the model and yield intermediate samples from
@ -770,11 +786,11 @@ class GaussianDiffusion:
img = th.randn(*shape, device=device)
indices = list(range(self.num_timesteps))[::-1]
if progress:
if verbose:
# Lazy import so that we don't depend on tqdm.
from tqdm.auto import tqdm
indices = tqdm(indices, disable=not progress)
indices = tqdm_override(indices, verbose=verbose, desc="DDIM Sample Loop Progressive", progress=progress)
for i in indices:
t = th.tensor([i] * shape[0], device=device)