Added progress reporting for transforming to audio; changed number inputs to sliders

This commit is contained in:
mrq 2023-02-03 04:56:30 +00:00
parent ef237c70d0
commit 4f359bffa4
3 changed files with 36 additions and 19 deletions

6
app.py
View File

@ -121,9 +121,9 @@ def main():
label="Preset", label="Preset",
type="value", type="value",
) )
candidates = gr.Number(value=1, precision=0, label="Candidates") candidates = gr.Slider(value=1, minimum=1, maximum=6, label="Candidates")
num_autoregressive_samples = gr.Number(value=128, precision=0, label="Samples") num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
diffusion_iterations = gr.Number(value=128, precision=0, label="Iterations") diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")
temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature") temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
voice = gr.Dropdown( voice = gr.Dropdown(

View File

@ -40,11 +40,12 @@ MODELS = {
} }
def tqdm_override(arr, verbose=False, progress=None, desc=None): def tqdm_override(arr, verbose=False, progress=None, desc=None):
if progress is None:
if verbose and desc is not None: if verbose and desc is not None:
print(desc) print(desc)
if progress is None:
return tqdm(arr, disable=not verbose) return tqdm(arr, disable=not verbose)
return progress.tqdm(arr, desc=desc) return progress.tqdm(arr, desc=desc, track_tqdm=True)
def download_models(specific_models=None): def download_models(specific_models=None):
""" """
@ -152,7 +153,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
return codes return codes
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True): def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None):
""" """
Uses the specified diffusion model to convert discrete codes into a spectrogram. Uses the specified diffusion model to convert discrete codes into a spectrogram.
""" """
@ -164,7 +165,7 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la
noise = torch.randn(output_shape, device=latents.device) * temperature noise = torch.randn(output_shape, device=latents.device) * temperature
mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise, mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
progress=verbose) verbose=verbose, progress=progress, desc=desc)
return denormalize_tacotron_mel(mel)[:,:,:output_seq_len] return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
@ -471,7 +472,7 @@ class TextToSpeech:
del auto_conditioning del auto_conditioning
wav_candidates = [] wav_candidates = []
for b in tqdm_override(range(best_results.shape[0]), verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio.."): for b in range(best_results.shape[0]):
codes = best_results[b].unsqueeze(0) codes = best_results[b].unsqueeze(0)
latents = best_latents[b].unsqueeze(0) latents = best_latents[b].unsqueeze(0)
@ -487,7 +488,7 @@ class TextToSpeech:
break break
mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning, mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
temperature=diffusion_temperature, verbose=verbose) temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..")
wav = self.vocoder.inference(mel) wav = self.vocoder.inference(mel)
wav_candidates.append(wav.cpu()) wav_candidates.append(wav.cpu())

34
tortoise/utils/diffusion.py Normal file → Executable file
View File

@ -15,6 +15,13 @@ import torch
import torch as th import torch as th
from tqdm import tqdm from tqdm import tqdm
def tqdm_override(arr, verbose=False, progress=None, desc=None):
    """
    Wrap an iterable in a progress-reporting iterator.

    :param arr: the iterable to wrap.
    :param verbose: if True, print `desc` (when one is given); when no
                    `progress` object is supplied it also enables the console
                    tqdm bar.
    :param progress: optional object exposing a `tqdm(iterable, ...)` method
                     (presumably a Gradio progress tracker - confirm against
                     the caller). If None, plain `tqdm` is used instead.
    :param desc: optional description; printed when `verbose` is set and
                 forwarded to `progress.tqdm`.
    :return: whatever `tqdm(...)` or `progress.tqdm(...)` returns for `arr`.
    """
    # The description is echoed regardless of which progress backend is used.
    if verbose and desc is not None:
        print(desc)

    if progress is None:
        # No external tracker: use the console bar, silenced unless verbose.
        return tqdm(arr, disable=not verbose)

    # NOTE(review): `track_tqdm` looks like an option of the tracker's
    # constructor rather than of its `tqdm()` method - confirm that the
    # pinned version of the progress library accepts it here.
    return progress.tqdm(arr, desc=desc, track_tqdm=True)
def normal_kl(mean1, logvar1, mean2, logvar2): def normal_kl(mean1, logvar1, mean2, logvar2):
""" """
@ -540,7 +547,9 @@ class GaussianDiffusion:
cond_fn=None, cond_fn=None,
model_kwargs=None, model_kwargs=None,
device=None, device=None,
progress=False, verbose=False,
progress=None,
desc=None
): ):
""" """
Generate samples from the model. Generate samples from the model.
@ -558,7 +567,7 @@ class GaussianDiffusion:
pass to the model. This can be used for conditioning. pass to the model. This can be used for conditioning.
:param device: if specified, the device to create the samples on. :param device: if specified, the device to create the samples on.
If not specified, use a model parameter's device. If not specified, use a model parameter's device.
:param progress: if True, show a tqdm progress bar. :param verbose: if True, show a tqdm progress bar.
:return: a non-differentiable batch of samples. :return: a non-differentiable batch of samples.
""" """
final = None final = None
@ -571,7 +580,9 @@ class GaussianDiffusion:
cond_fn=cond_fn, cond_fn=cond_fn,
model_kwargs=model_kwargs, model_kwargs=model_kwargs,
device=device, device=device,
verbose=verbose,
progress=progress, progress=progress,
desc=desc
): ):
final = sample final = sample
return final["sample"] return final["sample"]
@ -586,7 +597,9 @@ class GaussianDiffusion:
cond_fn=None, cond_fn=None,
model_kwargs=None, model_kwargs=None,
device=None, device=None,
progress=False, verbose=False,
progress=None,
desc=None
): ):
""" """
Generate samples from the model and yield intermediate samples from Generate samples from the model and yield intermediate samples from
@ -605,7 +618,7 @@ class GaussianDiffusion:
img = th.randn(*shape, device=device) img = th.randn(*shape, device=device)
indices = list(range(self.num_timesteps))[::-1] indices = list(range(self.num_timesteps))[::-1]
for i in tqdm(indices, disable=not progress): for i in tqdm_override(indices, verbose=verbose, desc=desc, progress=progress):
t = th.tensor([i] * shape[0], device=device) t = th.tensor([i] * shape[0], device=device)
with th.no_grad(): with th.no_grad():
out = self.p_sample( out = self.p_sample(
@ -718,8 +731,9 @@ class GaussianDiffusion:
cond_fn=None, cond_fn=None,
model_kwargs=None, model_kwargs=None,
device=None, device=None,
progress=False, verbose=False,
eta=0.0, eta=0.0,
progress=None,
): ):
""" """
Generate samples from the model using DDIM. Generate samples from the model using DDIM.
@ -736,8 +750,9 @@ class GaussianDiffusion:
cond_fn=cond_fn, cond_fn=cond_fn,
model_kwargs=model_kwargs, model_kwargs=model_kwargs,
device=device, device=device,
progress=progress, verbose=verbose,
eta=eta, eta=eta,
progress=progress,
): ):
final = sample final = sample
return final["sample"] return final["sample"]
@ -752,8 +767,9 @@ class GaussianDiffusion:
cond_fn=None, cond_fn=None,
model_kwargs=None, model_kwargs=None,
device=None, device=None,
progress=False, verbose=False,
eta=0.0, eta=0.0,
progress=None,
): ):
""" """
Use DDIM to sample from the model and yield intermediate samples from Use DDIM to sample from the model and yield intermediate samples from
@ -770,11 +786,11 @@ class GaussianDiffusion:
img = th.randn(*shape, device=device) img = th.randn(*shape, device=device)
indices = list(range(self.num_timesteps))[::-1] indices = list(range(self.num_timesteps))[::-1]
if progress: if verbose:
# Lazy import so that we don't depend on tqdm. # Lazy import so that we don't depend on tqdm.
from tqdm.auto import tqdm from tqdm.auto import tqdm
indices = tqdm(indices, disable=not progress) indices = tqdm_override(indices, verbose=verbose, desc="DDIM Sample Loop Progressive", progress=progress)
for i in indices: for i in indices:
t = th.tensor([i] * shape[0], device=device) t = th.tensor([i] * shape[0], device=device)