modified how conditional latents are computed (before, it only bothered reading the first 102400/24000 = 4.26 seconds of each audio input; now it chunks the whole input to compute latents)

mrq 2023-02-05 23:25:41 +00:00
parent 4ea997106e
commit c2c9b1b683
6 changed files with 82 additions and 55 deletions
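For context: the diffusion conditioning encoder consumes audio at 24000 Hz, and it previously saw exactly one fixed window of 102400 samples per clip, which is where the ~4.26 seconds above comes from; anything past that window was silently dropped. A quick sanity check of the arithmetic (standalone Python, not tortoise code):

sample_rate = 24000   # the diffuser's sample rate
window = 102400       # samples per conditioning window ("enforced_length" below)

print(window / sample_rate)       # 4.266... seconds: all the old code ever read
print(int(240000 / window) + 1)   # a 10-second clip now yields 3 chunks instead of 1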

README.md

@@ -115,6 +115,8 @@ To save you from headaches, I strongly recommend playing around with shorter sen
 As a quick optimization, I modified the script to where the `conditional_latents` are saved after loading voice samples, and subsequent uses will load that file directly (at the cost of not returning the `Sample voice` to the web UI). If there's voice samples that have a modification time newer than this cached file, it'll skip loading it and load the normal WAVs instead.
 
+**!**NOTE**!**: cached `latents.pth` files generated before 2023.02.05 will be ignored, due to a change in computing the conditioning latents. This *should* help bump up voice cloning quality. Apologies for the inconvenience.
+
 ## Example(s)
 
 Below are some outputs I deem substantial enough to share. As I continue delving into TorToiSe, I'll supply more examples and the values I use.
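A minimal sketch of the caching rule the README paragraph describes, assuming a voice folder of WAVs plus a cached `latents.pth` (the helper name here is illustrative, not the actual script's):

import os
import glob

def should_use_cached_latents(voice_dir):
    # Use the cached latents only if no voice sample is newer than the cache.
    cache = os.path.join(voice_dir, 'latents.pth')
    if not os.path.exists(cache):
        return False
    cache_mtime = os.path.getmtime(cache)
    wavs = glob.glob(os.path.join(voice_dir, '*.wav'))
    return all(os.path.getmtime(w) <= cache_mtime for w in wavs)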

app.py

@@ -10,7 +10,9 @@ from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio, load_voice, load_voices
 from tortoise.utils.text import split_and_recombine_text
 
-def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, progress=gr.Progress()):
+def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, experimentals, progress=gr.Progress()):
+	print(experimentals)
 	if voice != "microphone":
 		voices = [voice]
 	else:
@@ -27,7 +29,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
 	if voice_samples is not None:
 		sample_voice = voice_samples[0]
-		conditioning_latents = tts.get_conditioning_latents(voice_samples)
+		conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress)
 		torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'latents.pth'))
 		voice_samples = None
 	else:
@@ -54,6 +56,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
 		'diffusion_sampler': diffusion_sampler,
 		'breathing_room': breathing_room,
 		'progress': progress,
+		'half_p': "Half Precision" in experimentals,
+		'cond_free': "Conditioning-Free" in experimentals,
 	}
 
 	if delimiter == "\\n":
@@ -216,6 +220,8 @@ def main():
 					type="value",
 				)
 
+				experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=[False, True], label="Experimental Flags")
+
 				preset.change(fn=update_presets,
 					inputs=preset,
 					outputs=[
@@ -246,7 +252,8 @@ def main():
 					diffusion_iterations,
 					temperature,
 					diffusion_sampler,
-					breathing_room
+					breathing_room,
+					experimentals,
 				],
 				outputs=[selected_voice, output_audio, usedSeed],
 			)
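For clarity, the `experimentals` value Gradio passes to `generate()` is the list of currently checked labels, so the membership tests in the diff above collapse the checkbox group into two booleans. A trivial illustration:

# What the CheckboxGroup hands to generate(): the labels that are ticked.
experimentals = ["Conditioning-Free"]

half_p = "Half Precision" in experimentals        # False
cond_free = "Conditioning-Free" in experimentals  # True

(As an aside, `gr.CheckboxGroup` expects `value=` to be a list of choice labels, so `value=[False, True]` likely pre-checks nothing; `value=["Conditioning-Free"]` would match the intended default.)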

start.bat

@@ -1,3 +1,4 @@
 call .\tortoise-venv\Scripts\activate.bat
-py .\app.py
+python .\app.py
 deactivate
+pause

tortoise/api.py

@@ -284,7 +284,7 @@ class TextToSpeech:
 		if self.minor_optimizations:
 			self.cvvp = self.cvvp.to(self.device)
 
-	def get_conditioning_latents(self, voice_samples, return_mels=False):
+	def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, enforced_length=102400):
 		"""
 		Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
 		These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@@ -303,14 +303,18 @@
 			auto_conds = torch.stack(auto_conds, dim=1)
 
 		diffusion_conds = []
-		for sample in voice_samples:
+		for sample in tqdm_override(voice_samples, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
 			# The diffuser operates at a sample rate of 24000 (except for the latent inputs)
 			sample = torchaudio.functional.resample(sample, 22050, 24000)
-			sample = pad_or_truncate(sample, 102400)
-			cond_mel = wav_to_univnet_mel(sample.to(self.device), do_normalization=False, device=self.device)
-			diffusion_conds.append(cond_mel)
-		diffusion_conds = torch.stack(diffusion_conds, dim=1)
+			chunks = torch.chunk(sample, int(sample.shape[-1] / enforced_length) + 1, dim=1)
+
+			for chunk in chunks:
+				chunk = pad_or_truncate(chunk, enforced_length)
+				cond_mel = wav_to_univnet_mel(chunk.to(self.device), do_normalization=False, device=self.device)
+				diffusion_conds.append(cond_mel)
+
+		diffusion_conds = torch.stack(diffusion_conds, dim=1)
 
 		if self.minor_optimizations:
 			auto_latent = self.autoregressive.get_conditioning(auto_conds)
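To make the new loop concrete, here is a rough standalone trace of what it does to a 10-second clip, with `pad_or_truncate` stubbed out (the real helper lives in tortoise; this stand-in is an assumption):

import torch
import torch.nn.functional as F

def pad_or_truncate(t, length):
    # stand-in for tortoise's helper: right-pad with zeros or cut to `length`
    if t.shape[-1] < length:
        return F.pad(t, (0, length - t.shape[-1]))
    return t[..., :length]

enforced_length = 102400
sample = torch.randn(1, 240000)  # 10 s at 24000 Hz, already resampled

chunks = torch.chunk(sample, int(sample.shape[-1] / enforced_length) + 1, dim=1)
chunks = [pad_or_truncate(c, enforced_length) for c in chunks]

print(len(chunks), chunks[0].shape)  # 3 torch.Size([1, 102400])

Each padded chunk then becomes one `cond_mel`, so this clip contributes three entries to `diffusion_conds` instead of one.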
@@ -372,6 +376,7 @@
 			diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
 			diffusion_sampler="P",
 			breathing_room=8,
+			half_p=False,
 			progress=None,
 			**hf_generate_kwargs):
 		"""
@@ -446,55 +451,57 @@
 			if not self.minor_optimizations:
 				self.autoregressive = self.autoregressive.to(self.device)
 
-			for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
-				codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
-					do_sample=True,
-					top_p=top_p,
-					temperature=temperature,
-					num_return_sequences=self.autoregressive_batch_size,
-					length_penalty=length_penalty,
-					repetition_penalty=repetition_penalty,
-					max_generate_length=max_mel_tokens,
-					**hf_generate_kwargs)
-				padding_needed = max_mel_tokens - codes.shape[1]
-				codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-				samples.append(codes)
+			with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
+				for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
+					codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
+						do_sample=True,
+						top_p=top_p,
+						temperature=temperature,
+						num_return_sequences=self.autoregressive_batch_size,
+						length_penalty=length_penalty,
+						repetition_penalty=repetition_penalty,
+						max_generate_length=max_mel_tokens,
+						**hf_generate_kwargs)
+					padding_needed = max_mel_tokens - codes.shape[1]
+					codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
+					samples.append(codes)
 
 			clip_results = []
-			if not self.minor_optimizations:
-				self.autoregressive = self.autoregressive.cpu()
-				self.clvp = self.clvp.to(self.device)
-
-			if cvvp_amount > 0:
-				if self.cvvp is None:
-					self.load_cvvp()
-				if not self.minor_optimizations:
-					self.cvvp = self.cvvp.to(self.device)
-
-			desc="Computing best candidates"
-			if verbose:
-				if self.cvvp is None:
-					desc = "Computing best candidates using CLVP"
-				else:
-					desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
-
-			for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
-				for i in range(batch.shape[0]):
-					batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-				if cvvp_amount != 1:
-					clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-				if auto_conds is not None and cvvp_amount > 0:
-					cvvp_accumulator = 0
-					for cl in range(auto_conds.shape[1]):
-						cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-					cvvp = cvvp_accumulator / auto_conds.shape[1]
-					if cvvp_amount == 1:
-						clip_results.append(cvvp)
-					else:
-						clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount))
-				else:
-					clip_results.append(clvp)
+			with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
+				if not self.minor_optimizations:
+					self.autoregressive = self.autoregressive.cpu()
+					self.clvp = self.clvp.to(self.device)
+
+				if cvvp_amount > 0:
+					if self.cvvp is None:
+						self.load_cvvp()
+					if not self.minor_optimizations:
+						self.cvvp = self.cvvp.to(self.device)
+
+				desc="Computing best candidates"
+				if verbose:
+					if self.cvvp is None:
+						desc = "Computing best candidates using CLVP"
+					else:
+						desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
+
+				for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
+					for i in range(batch.shape[0]):
+						batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
+					if cvvp_amount != 1:
+						clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
+					if auto_conds is not None and cvvp_amount > 0:
+						cvvp_accumulator = 0
+						for cl in range(auto_conds.shape[1]):
+							cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
+						cvvp = cvvp_accumulator / auto_conds.shape[1]
+						if cvvp_amount == 1:
+							clip_results.append(cvvp)
+						else:
+							clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount))
+					else:
+						clip_results.append(clvp)
 
 			clip_results = torch.cat(clip_results, dim=0)
 			samples = torch.cat(samples, dim=0)
 			best_results = samples[torch.topk(clip_results, k=k).indices]
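`torch.autocast` here is the stock PyTorch context manager: with `enabled=False` it is a no-op and everything runs in full precision, while `enabled=True` runs eligible CUDA ops in float16. A minimal sketch of the pattern, independent of tortoise (requires a CUDA device):

import torch

half_p = True  # would come from the "Half Precision" experimental flag

model = torch.nn.Linear(64, 64).cuda()
x = torch.randn(8, 64, device='cuda')

with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
    y = model(x)

print(y.dtype)  # torch.float16 when enabled, torch.float32 otherwise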

tortoise/utils/audio.py

@@ -108,8 +108,11 @@ def load_voice(voice, extra_voice_dirs=[], load_latents=True):
 	voices = []
 	latent = None
 	for file in paths:
-		if file[-4:] == ".pth":
+		if file == "cond_latents.pth":
 			latent = file
+		elif file[-4:] == ".pth":
+			{}
+			# noop
 		else:
 			voices.append(file)
 			mtime = max(mtime, os.path.getmtime(file))
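The effect of the new branch above is that only a file named exactly `cond_latents.pth` is treated as the cached latent, any other `.pth` file is skipped outright (the `{}` line is a bare no-op expression), and everything else counts as a voice sample whose mtime feeds the cache-invalidation check. A rough trace over a hypothetical file list:

paths = ["cond_latents.pth", "old_latents.pth", "sample1.wav", "sample2.wav"]

latent = None
voices = []
for file in paths:
    if file == "cond_latents.pth":
        latent = file
    elif file[-4:] == ".pth":
        pass  # other .pth files are ignored, as in the diff's no-op branch
    else:
        voices.append(file)

print(latent, voices)  # cond_latents.pth ['sample1.wav', 'sample2.wav']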

update.bat (new executable file)

@@ -0,0 +1,7 @@
+git pull
+python -m venv tortoise-venv
+call .\tortoise-venv\Scripts\activate.bat
+python -m pip install --upgrade pip
+python -m pip install -r ./requirements.txt
+deactivate
+pause