forked from mrq/tortoise-tts
forgot to copy the alleged slight performance improvement patch; added detailed progress information by passing gr.Progress; save a little more info with the output
This commit is contained in:
parent 43f45274dd
commit ea751d7b6c

app.py (24 lines changed)
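All of the app.py changes below serve one pattern: Gradio injects a live progress tracker when an event handler declares a gr.Progress() default, and the handler threads that tracker through to the long-running call. A minimal runnable sketch of the pattern, where slow_task is a hypothetical stand-in for tts.tts_with_preset / tts.tts:

import time
import gradio as gr

def slow_task(steps, progress=None):
    # Hypothetical stand-in for the TTS call; reports per-step progress
    # through the tracker when one is supplied, otherwise runs silently.
    iterator = progress.tqdm(range(steps), desc="Working") if progress is not None else range(steps)
    for _ in iterator:
        time.sleep(0.1)
    return "done"

def handler(steps, progress=gr.Progress()):
    # Gradio sees the gr.Progress() default and injects a live tracker here;
    # the handler just forwards it downstream, as inference() does in app.py.
    return slow_task(int(steps), progress=progress)

demo = gr.Interface(fn=handler, inputs=gr.Number(value=20), outputs="text")

if __name__ == "__main__":
    demo.launch()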
@@ -7,14 +7,7 @@ from datetime import datetime
 from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio, load_voice, load_voices
 
-VOICE_OPTIONS = [
-    "random", # special option for random voice
-    "microphone", # special option for custom voice
-    "disabled", # special option for disabled voice
-]
-
-
-def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature):
+def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, progress=gr.Progress()):
     if voice != "microphone":
         voices = [voice]
     else:
@@ -48,6 +41,10 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
         seed = None
 
     start_time = time.time()
+
+    # >b-buh why not set samples and iterations to nullllll
+    # shut up
+
     if preset == "none":
         gen, additionals = tts.tts_with_preset(
             text,
@@ -60,6 +57,7 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
             num_autoregressive_samples=num_autoregressive_samples,
             diffusion_iterations=diffusion_iterations,
             temperature=temperature,
+            progress=progress
         )
         seed = additionals[0]
     else:
@@ -72,13 +70,13 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
             return_deterministic_state=True,
             k=candidates,
             temperature=temperature,
+            progress=progress
         )
         seed = additionals[0]
 
+    info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
     with open("results.log", "a") as f:
-        f.write(
-            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
-        )
+        f.write(info)
 
     timestamp = int(time.time())
     outdir = f"./results/{voice}/{timestamp}/"
@@ -86,7 +84,7 @@ def inference(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
     os.makedirs(outdir, exist_ok=True)
 
     with open(os.path.join(outdir, f'input.txt'), 'w') as f:
-        f.write(f"{text}\n\nSeed: {seed}")
+        f.write(f"{text}\n\n{info}")
 
     if isinstance(gen, list):
         for j, g in enumerate(gen):
@@ -129,7 +127,7 @@ def main():
             temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
 
             voice = gr.Dropdown(
-                os.listdir(os.path.join("tortoise", "voices")) + VOICE_OPTIONS,
+                os.listdir(os.path.join("tortoise", "voices")) + ["random", "microphone", "disabled"],
                 label="Voice",
                 type="value",
             )
tortoise/api.py (50 lines changed) · Normal file → Executable file
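Several of the api.py hunks below appear to be the "alleged slight performance improvement patch" the commit message mentions: each sub-model is moved to self.device once in the constructor, and the per-call .to(device)/.cpu() shuffling inside get_conditioning_latents and tts is removed. A minimal sketch of the trade-off, using a hypothetical nn.Linear stand-in for the tortoise sub-models:

import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(16, 16).eval()  # hypothetical stand-in for a tortoise sub-model
x = torch.randn(1, 16)

# Old pattern: shuttle the model onto the GPU around every call, then back
# to the CPU so the VRAM is free for the next stage.
model = model.to(device)
with torch.no_grad():
    y = model(x.to(device))
model = model.cpu()

# New pattern (this commit): keep the model resident on the device for the
# object's lifetime, so there is no per-call host-to-device transfer.
model = model.to(device)
with torch.no_grad():
    y = model(x.to(device))

The old pattern minimizes peak VRAM at the cost of a host-device copy per stage; keeping the models resident removes that copy but holds all weights in VRAM for as long as the TextToSpeech object lives.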
@@ -39,6 +39,13 @@ MODELS = {
     'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth',
 }
 
+def tqdm_override(arr, verbose=False, progress=None, desc=None):
+    if progress is None:
+        if verbose and desc is not None:
+            print(desc)
+        return tqdm(arr, disable=not verbose)
+    return progress.tqdm(arr, desc=desc)
+
 def download_models(specific_models=None):
     """
     Call to download all the models that Tortoise uses.
@@ -234,17 +241,21 @@ class TextToSpeech:
                                           in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
                                           layer_drop=0, unconditioned_percentage=0).cpu().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
+        self.autoregressive = self.autoregressive.to(self.device)
+        self.diffusion = self.diffusion.to(self.device)
 
         self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
                          text_seq_len=350, text_heads=12,
                          num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430,
                          use_xformers=True).cpu().eval()
         self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir)))
+        self.clvp = self.clvp.to(self.device)
         self.cvvp = None # CVVP model is only loaded if used.
 
         self.vocoder = UnivNetGenerator().cpu()
         self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
         self.vocoder.eval(inference=True)
+        self.vocoder = self.vocoder.to(self.device)
 
         # Random latent generators (RLGs) are loaded lazily.
         self.rlg_auto = None
@@ -255,6 +266,7 @@ class TextToSpeech:
         self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
         self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
+        self.cvvp = self.cvvp.to(self.device)
 
     def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
@@ -272,9 +284,7 @@ class TextToSpeech:
             for vs in voice_samples:
                 auto_conds.append(format_conditioning(vs, device=self.device))
             auto_conds = torch.stack(auto_conds, dim=1)
-            self.autoregressive = self.autoregressive.to(self.device)
             auto_latent = self.autoregressive.get_conditioning(auto_conds)
-            self.autoregressive = self.autoregressive.cpu()
 
             diffusion_conds = []
             for sample in voice_samples:
@@ -285,9 +295,7 @@ class TextToSpeech:
                 diffusion_conds.append(cond_mel)
             diffusion_conds = torch.stack(diffusion_conds, dim=1)
 
-            self.diffusion = self.diffusion.to(self.device)
             diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
-            self.diffusion = self.diffusion.cpu()
 
         if return_mels:
             return auto_latent, diffusion_latent, auto_conds, diffusion_conds
@@ -335,6 +343,7 @@ class TextToSpeech:
             cvvp_amount=.0,
             # diffusion generation parameters follow
             diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
+            progress=None,
             **hf_generate_kwargs):
         """
         Produces an audio clip of the given text being spoken with the given reference voice.
@@ -404,10 +413,8 @@ class TextToSpeech:
             num_batches = num_autoregressive_samples // self.autoregressive_batch_size
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
-            self.autoregressive = self.autoregressive.to(self.device)
-            if verbose:
-                print("Generating autoregressive samples..")
-            for b in tqdm(range(num_batches), disable=not verbose):
+            for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
                 codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                              do_sample=True,
                                                              top_p=top_p,
@@ -420,20 +427,20 @@ class TextToSpeech:
                 padding_needed = max_mel_tokens - codes.shape[1]
                 codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
                 samples.append(codes)
-            self.autoregressive = self.autoregressive.cpu()
 
             clip_results = []
-            self.clvp = self.clvp.to(self.device)
             if cvvp_amount > 0:
                 if self.cvvp is None:
                     self.load_cvvp()
-                self.cvvp = self.cvvp.to(self.device)
+            desc="Computing best candidates"
             if verbose:
                 if self.cvvp is None:
-                    print("Computing best candidates using CLVP")
+                    desc = "Computing best candidates using CLVP"
                 else:
-                    print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-            for batch in tqdm(samples, disable=not verbose):
+                    desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
+
+            for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
                 for i in range(batch.shape[0]):
                     batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
                 if cvvp_amount != 1:
@@ -452,28 +459,19 @@ class TextToSpeech:
             clip_results = torch.cat(clip_results, dim=0)
             samples = torch.cat(samples, dim=0)
             best_results = samples[torch.topk(clip_results, k=k).indices]
-            self.clvp = self.clvp.cpu()
-            if self.cvvp is not None:
-                self.cvvp = self.cvvp.cpu()
             del samples
 
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            self.autoregressive = self.autoregressive.to(self.device)
             best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                                                torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
                                                torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
                                                return_latent=True, clip_inputs=False)
-            self.autoregressive = self.autoregressive.cpu()
             del auto_conditioning
 
-            if verbose:
-                print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            self.diffusion = self.diffusion.to(self.device)
-            self.vocoder = self.vocoder.to(self.device)
-            for b in range(best_results.shape[0]):
+            for b in tqdm_override(range(best_results.shape[0]), verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio.."):
                 codes = best_results[b].unsqueeze(0)
                 latents = best_latents[b].unsqueeze(0)
 
@@ -492,8 +490,6 @@ class TextToSpeech:
                                           temperature=diffusion_temperature, verbose=verbose)
                 wav = self.vocoder.inference(mel)
                 wav_candidates.append(wav.cpu())
-            self.diffusion = self.diffusion.cpu()
-            self.vocoder = self.vocoder.cpu()
 
         def potentially_redact(clip, text):
             if self.enable_redaction:
@@ -522,4 +518,4 @@ class TextToSpeech:
         # Can't currently set this because of CUBLAS. TODO: potentially enable it if necessary.
         # torch.use_deterministic_algorithms(True)
 
         return seed
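For reference, the new tqdm_override helper at the top of tortoise/api.py is what unifies the two reporting paths used throughout the sampling loops above. The same function again, with descriptive comments and a usage sketch (the Gradio-handler variant is shown as a comment, since it only runs inside a handler that received an injected tracker):

from tqdm import tqdm

def tqdm_override(arr, verbose=False, progress=None, desc=None):
    # CLI path: fall back to plain tqdm, printing the stage description once
    # when verbose output is requested (tqdm itself has no desc printed here).
    if progress is None:
        if verbose and desc is not None:
            print(desc)
        return tqdm(arr, disable=not verbose)
    # Web-UI path: delegate to Gradio's tracker, which renders desc and a bar.
    return progress.tqdm(arr, desc=desc)

# From a terminal session this behaves like tqdm:
for _ in tqdm_override(range(100), verbose=True, desc="Generating autoregressive samples"):
    pass

# From inside a Gradio handler, the same call reports to the web UI instead:
#   for _ in tqdm_override(range(100), progress=progress, desc="Generating autoregressive samples"):
#       ...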