forked from mrq/tortoise-tts

commit 4f359bffa4 (parent ef237c70d0)
Added progress for transforming to audio, changed number inputs to sliders instead
app.py (6 changed lines)

@@ -121,9 +121,9 @@ def main():
                 label="Preset",
                 type="value",
             )
-            candidates = gr.Number(value=1, precision=0, label="Candidates")
-            num_autoregressive_samples = gr.Number(value=128, precision=0, label="Samples")
-            diffusion_iterations = gr.Number(value=128, precision=0, label="Iterations")
+            candidates = gr.Slider(value=1, minimum=1, maximum=6, label="Candidates")
+            num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
+            diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")
             temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
 
             voice = gr.Dropdown(
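For context, a minimal, self-contained sketch (not part of this commit and not the real app.py) of how gr.Slider values and an injected gr.Progress object reach a Gradio event handler. The function fake_generate and the widget wiring are hypothetical; the gr.Slider, gr.Progress and progress.tqdm calls follow Gradio's documented API.

import time
import gradio as gr

def fake_generate(text, candidates, samples, progress=gr.Progress()):
    # progress.tqdm wraps an iterable and drives the bar shown in the web UI;
    # this is the backend tqdm_override delegates to when a Progress object exists.
    for _ in progress.tqdm(range(int(samples)), desc="Transforming autoregressive outputs into audio.."):
        time.sleep(0.01)  # stand-in for real work per sample
    return f"{int(candidates)} candidate(s) from {int(samples)} samples for {text!r}"

with gr.Blocks() as demo:
    text = gr.Textbox(label="Text")
    candidates = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates")
    samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
    result = gr.Textbox(label="Result")
    gr.Button("Generate").click(fake_generate, inputs=[text, candidates, samples], outputs=result)

demo.queue().launch()

Progress tracking in Gradio requires the request queue, hence demo.queue() before launch().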
tortoise/api.py

@@ -40,11 +40,12 @@ MODELS = {
 }
 
 def tqdm_override(arr, verbose=False, progress=None, desc=None):
-    if progress is None:
-        if verbose and desc is not None:
-            print(desc)
-        return tqdm(arr, disable=not verbose)
-    return progress.tqdm(arr, desc=desc)
+    if verbose and desc is not None:
+        print(desc)
+
+    if progress is None:
+        return tqdm(arr, disable=not verbose)
+    return progress.tqdm(arr, desc=desc, track_tqdm=True)
 
 def download_models(specific_models=None):
     """
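A hedged usage sketch of the helper above: with no Gradio Progress object it falls back to a plain console tqdm bar (hidden entirely when verbose is False); inside a Gradio handler the injected gr.Progress object would be passed instead, so the bar renders in the browser. The import path assumes the tortoise package layout shown in this commit.

from tortoise.api import tqdm_override

# Console / CLI path: no Gradio Progress object, so a regular tqdm bar is used
# and the desc string is printed once before the loop starts.
for _ in tqdm_override(range(100), verbose=True, desc="Console example"):
    pass

# Web-UI path (inside a Gradio event handler) would instead look like:
#   tqdm_override(range(100), progress=progress, desc="Web example")
# where `progress` is the gr.Progress object Gradio injects into the handler.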
@@ -152,7 +153,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
     return codes
 
 
-def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True):
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
     """

@@ -164,7 +165,7 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la
         noise = torch.randn(output_shape, device=latents.device) * temperature
         mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
                                      model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
-                                     progress=verbose)
+                                     verbose=verbose, progress=progress, desc=desc)
         return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
 
 

@@ -471,7 +472,7 @@ class TextToSpeech:
             del auto_conditioning
 
             wav_candidates = []
-            for b in tqdm_override(range(best_results.shape[0]), verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio.."):
+            for b in range(best_results.shape[0]):
                 codes = best_results[b].unsqueeze(0)
                 latents = best_latents[b].unsqueeze(0)
 

@@ -487,7 +488,7 @@ class TextToSpeech:
                        break
 
                mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
-                                              temperature=diffusion_temperature, verbose=verbose)
+                                              temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..")
                wav = self.vocoder.inference(mel)
                wav_candidates.append(wav.cpu())
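Taken together, the api.py hunks thread a single Gradio progress object from the UI down to the diffusion loop: tts() receives it, do_spectrogram_diffusion() forwards it together with a desc label, and diffuser.p_sample_loop() finally drives the bar. A rough sketch of a caller, with the tts() keyword names treated as assumptions about this fork rather than a documented interface:

import gradio as gr
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

def generate(text, progress=gr.Progress()):
    tts = TextToSpeech()  # in a real app this would be created once, not per call
    voice_samples, conditioning_latents = load_voice("random")
    # `progress` is assumed to be forwarded internally to
    # do_spectrogram_diffusion(..., progress=progress, desc="Transforming autoregressive outputs into audio..")
    # and from there into diffuser.p_sample_loop().
    return tts.tts(text,
                   voice_samples=voice_samples,
                   conditioning_latents=conditioning_latents,
                   progress=progress)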
tortoise/utils/diffusion.py (34 changed lines; mode changed from normal file to executable file)

@@ -15,6 +15,13 @@ import torch
 import torch as th
 from tqdm import tqdm
 
+def tqdm_override(arr, verbose=False, progress=None, desc=None):
+    if verbose and desc is not None:
+        print(desc)
+
+    if progress is None:
+        return tqdm(arr, disable=not verbose)
+    return progress.tqdm(arr, desc=desc, track_tqdm=True)
 
 def normal_kl(mean1, logvar1, mean2, logvar2):
     """

@@ -540,7 +547,9 @@ class GaussianDiffusion:
         cond_fn=None,
         model_kwargs=None,
         device=None,
-        progress=False,
+        verbose=False,
+        progress=None,
+        desc=None
     ):
         """
         Generate samples from the model.

@@ -558,7 +567,7 @@ class GaussianDiffusion:
                              pass to the model. This can be used for conditioning.
         :param device: if specified, the device to create the samples on.
                        If not specified, use a model parameter's device.
-        :param progress: if True, show a tqdm progress bar.
+        :param verbose: if True, show a tqdm progress bar.
         :return: a non-differentiable batch of samples.
         """
         final = None

@@ -571,7 +580,9 @@ class GaussianDiffusion:
             cond_fn=cond_fn,
             model_kwargs=model_kwargs,
             device=device,
+            verbose=verbose,
             progress=progress,
+            desc=desc
         ):
             final = sample
         return final["sample"]

@@ -586,7 +597,9 @@ class GaussianDiffusion:
         cond_fn=None,
         model_kwargs=None,
         device=None,
-        progress=False,
+        verbose=False,
+        progress=None,
+        desc=None
     ):
         """
         Generate samples from the model and yield intermediate samples from

@@ -605,7 +618,7 @@ class GaussianDiffusion:
         img = th.randn(*shape, device=device)
         indices = list(range(self.num_timesteps))[::-1]
 
-        for i in tqdm(indices, disable=not progress):
+        for i in tqdm_override(indices, verbose=verbose, desc=desc, progress=progress):
             t = th.tensor([i] * shape[0], device=device)
             with th.no_grad():
                 out = self.p_sample(

@@ -718,8 +731,9 @@ class GaussianDiffusion:
         cond_fn=None,
         model_kwargs=None,
         device=None,
-        progress=False,
+        verbose=False,
         eta=0.0,
+        progress=None,
     ):
         """
         Generate samples from the model using DDIM.

@@ -736,8 +750,9 @@ class GaussianDiffusion:
             cond_fn=cond_fn,
             model_kwargs=model_kwargs,
             device=device,
-            progress=progress,
+            verbose=verbose,
             eta=eta,
+            progress=progress,
         ):
             final = sample
         return final["sample"]

@@ -752,8 +767,9 @@ class GaussianDiffusion:
         cond_fn=None,
         model_kwargs=None,
         device=None,
-        progress=False,
+        verbose=False,
         eta=0.0,
+        progress=None,
     ):
         """
         Use DDIM to sample from the model and yield intermediate samples from

@@ -770,11 +786,11 @@ class GaussianDiffusion:
         img = th.randn(*shape, device=device)
         indices = list(range(self.num_timesteps))[::-1]
 
-        if progress:
+        if verbose:
             # Lazy import so that we don't depend on tqdm.
             from tqdm.auto import tqdm
 
-            indices = tqdm(indices, disable=not progress)
+            indices = tqdm_override(indices, verbose=verbose, desc="DDIM Sample Loop Progressive", progress=progress)
 
         for i in indices:
             t = th.tensor([i] * shape[0], device=device)
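The diffusion.py changes follow one convention throughout: the old boolean progress flag becomes verbose, progress now carries an optional Gradio Progress object, and desc labels the bar. A self-contained toy illustration of that convention (ToySampler is hypothetical, and the helper below omits the track_tqdm keyword used in the commit):

from tqdm import tqdm

def tqdm_override(arr, verbose=False, progress=None, desc=None):
    # Same dispatch idea as in the commit: console tqdm by default,
    # Gradio's progress.tqdm when a Progress object is supplied.
    if verbose and desc is not None:
        print(desc)
    if progress is None:
        return tqdm(arr, disable=not verbose)
    return progress.tqdm(arr, desc=desc)

class ToySampler:
    def __init__(self, num_timesteps=50):
        self.num_timesteps = num_timesteps

    def p_sample_loop(self, verbose=False, progress=None, desc=None):
        state = 0
        for _ in tqdm_override(range(self.num_timesteps)[::-1],
                               verbose=verbose, progress=progress, desc=desc):
            state += 1  # stand-in for one reverse-diffusion step
        return state

print(ToySampler().p_sample_loop(verbose=True, desc="Toy sampling loop"))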