forked from mrq/tortoise-tts
Added progress for transforming to audio, changed number inputs to sliders instead
This commit is contained in:
parent
ea751d7b6c
commit
e8d4a4f89c
6
app.py
6
app.py
|
@ -121,9 +121,9 @@ def main():
|
|||
label="Preset",
|
||||
type="value",
|
||||
)
|
||||
candidates = gr.Number(value=1, precision=0, label="Candidates")
|
||||
num_autoregressive_samples = gr.Number(value=128, precision=0, label="Samples")
|
||||
diffusion_iterations = gr.Number(value=128, precision=0, label="Iterations")
|
||||
candidates = gr.Slider(value=1, minimum=1, maximum=6, label="Candidates")
|
||||
num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
|
||||
diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")
|
||||
temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
|
||||
|
||||
voice = gr.Dropdown(
|
||||
|
|
|
@ -40,11 +40,12 @@ MODELS = {
|
|||
}
|
||||
|
||||
def tqdm_override(arr, verbose=False, progress=None, desc=None):
|
||||
if verbose and desc is not None:
|
||||
print(desc)
|
||||
|
||||
if progress is None:
|
||||
if verbose and desc is not None:
|
||||
print(desc)
|
||||
return tqdm(arr, disable=not verbose)
|
||||
return progress.tqdm(arr, desc=desc)
|
||||
return progress.tqdm(arr, desc=desc, track_tqdm=True)
|
||||
|
||||
def download_models(specific_models=None):
|
||||
"""
|
||||
|
@ -152,7 +153,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
|
|||
return codes
|
||||
|
||||
|
||||
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True):
|
||||
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None):
|
||||
"""
|
||||
Uses the specified diffusion model to convert discrete codes into a spectrogram.
|
||||
"""
|
||||
|
@ -164,7 +165,7 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la
|
|||
noise = torch.randn(output_shape, device=latents.device) * temperature
|
||||
mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
|
||||
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
|
||||
progress=verbose)
|
||||
verbose=verbose, progress=progress, desc=desc)
|
||||
return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
|
||||
|
||||
|
||||
|
@ -471,7 +472,7 @@ class TextToSpeech:
|
|||
del auto_conditioning
|
||||
|
||||
wav_candidates = []
|
||||
for b in tqdm_override(range(best_results.shape[0]), verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio.."):
|
||||
for b in range(best_results.shape[0]):
|
||||
codes = best_results[b].unsqueeze(0)
|
||||
latents = best_latents[b].unsqueeze(0)
|
||||
|
||||
|
@ -487,7 +488,7 @@ class TextToSpeech:
|
|||
break
|
||||
|
||||
mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
|
||||
temperature=diffusion_temperature, verbose=verbose)
|
||||
temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..")
|
||||
wav = self.vocoder.inference(mel)
|
||||
wav_candidates.append(wav.cpu())
|
||||
|
||||
|
|
34
tortoise/utils/diffusion.py
Normal file → Executable file
34
tortoise/utils/diffusion.py
Normal file → Executable file
|
@ -15,6 +15,13 @@ import torch
|
|||
import torch as th
|
||||
from tqdm import tqdm
|
||||
|
||||
def tqdm_override(arr, verbose=False, progress=None, desc=None):
    """
    Wrap an iterable in a progress reporter.

    :param arr: the iterable to loop over.
    :param verbose: if True, print ``desc`` (when one is given) and enable the
                    console tqdm bar used when no ``progress`` object is supplied.
    :param progress: optional object exposing a ``tqdm(iterable, desc=...)``
                     method (presumably a ``gradio.Progress`` instance -- confirm
                     against callers). When None, a plain console ``tqdm`` bar is
                     used instead.
    :param desc: optional description; printed when ``verbose`` is set and
                 forwarded to ``progress.tqdm``.
    :return: whatever ``tqdm(...)`` or ``progress.tqdm(...)`` returns for ``arr``.
    """
    if verbose and desc is not None:
        print(desc)

    if progress is None:
        # No UI progress object: fall back to a console bar, silenced unless verbose.
        return tqdm(arr, disable=not verbose)

    # ``track_tqdm`` is an option of the ``gradio.Progress`` constructor, not of
    # its ``tqdm()`` method, so it is intentionally not forwarded here; passing it
    # breaks any progress object whose ``tqdm`` has a strict signature.
    return progress.tqdm(arr, desc=desc)
|
||||
|
||||
def normal_kl(mean1, logvar1, mean2, logvar2):
|
||||
"""
|
||||
|
@ -540,7 +547,9 @@ class GaussianDiffusion:
|
|||
cond_fn=None,
|
||||
model_kwargs=None,
|
||||
device=None,
|
||||
progress=False,
|
||||
verbose=False,
|
||||
progress=None,
|
||||
desc=None
|
||||
):
|
||||
"""
|
||||
Generate samples from the model.
|
||||
|
@ -558,7 +567,7 @@ class GaussianDiffusion:
|
|||
pass to the model. This can be used for conditioning.
|
||||
:param device: if specified, the device to create the samples on.
|
||||
If not specified, use a model parameter's device.
|
||||
:param progress: if True, show a tqdm progress bar.
|
||||
:param verbose: if True, show a tqdm progress bar.
|
||||
:return: a non-differentiable batch of samples.
|
||||
"""
|
||||
final = None
|
||||
|
@ -571,7 +580,9 @@ class GaussianDiffusion:
|
|||
cond_fn=cond_fn,
|
||||
model_kwargs=model_kwargs,
|
||||
device=device,
|
||||
verbose=verbose,
|
||||
progress=progress,
|
||||
desc=desc
|
||||
):
|
||||
final = sample
|
||||
return final["sample"]
|
||||
|
@ -586,7 +597,9 @@ class GaussianDiffusion:
|
|||
cond_fn=None,
|
||||
model_kwargs=None,
|
||||
device=None,
|
||||
progress=False,
|
||||
verbose=False,
|
||||
progress=None,
|
||||
desc=None
|
||||
):
|
||||
"""
|
||||
Generate samples from the model and yield intermediate samples from
|
||||
|
@ -605,7 +618,7 @@ class GaussianDiffusion:
|
|||
img = th.randn(*shape, device=device)
|
||||
indices = list(range(self.num_timesteps))[::-1]
|
||||
|
||||
for i in tqdm(indices, disable=not progress):
|
||||
for i in tqdm_override(indices, verbose=verbose, desc=desc, progress=progress):
|
||||
t = th.tensor([i] * shape[0], device=device)
|
||||
with th.no_grad():
|
||||
out = self.p_sample(
|
||||
|
@ -718,8 +731,9 @@ class GaussianDiffusion:
|
|||
cond_fn=None,
|
||||
model_kwargs=None,
|
||||
device=None,
|
||||
progress=False,
|
||||
verbose=False,
|
||||
eta=0.0,
|
||||
progress=None,
|
||||
):
|
||||
"""
|
||||
Generate samples from the model using DDIM.
|
||||
|
@ -736,8 +750,9 @@ class GaussianDiffusion:
|
|||
cond_fn=cond_fn,
|
||||
model_kwargs=model_kwargs,
|
||||
device=device,
|
||||
progress=progress,
|
||||
verbose=verbose,
|
||||
eta=eta,
|
||||
progress=progress,
|
||||
):
|
||||
final = sample
|
||||
return final["sample"]
|
||||
|
@ -752,8 +767,9 @@ class GaussianDiffusion:
|
|||
cond_fn=None,
|
||||
model_kwargs=None,
|
||||
device=None,
|
||||
progress=False,
|
||||
verbose=False,
|
||||
eta=0.0,
|
||||
progress=None,
|
||||
):
|
||||
"""
|
||||
Use DDIM to sample from the model and yield intermediate samples from
|
||||
|
@ -770,11 +786,11 @@ class GaussianDiffusion:
|
|||
img = th.randn(*shape, device=device)
|
||||
indices = list(range(self.num_timesteps))[::-1]
|
||||
|
||||
if progress:
|
||||
if verbose:
|
||||
# Lazy import so that we don't depend on tqdm.
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
indices = tqdm(indices, disable=not progress)
|
||||
indices = tqdm_override(indices, verbose=verbose, desc="DDIM Sample Loop Progressive", progress=progress)
|
||||
|
||||
for i in indices:
|
||||
t = th.tensor([i] * shape[0], device=device)
|
||||
|
|
Loading…
Reference in New Issue
Block a user