modified how conditional latents are computed (before, it only ever read the first 102400 samples of each audio input — 102400/24000 ≈ 4.27 seconds — now it chunks the whole clip to compute latents)

mrq 2023-02-05 23:25:41 +00:00
parent f66754b557
commit 5bf21fdbe1
7 changed files with 88 additions and 60 deletions
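In short: `get_conditioning_latents` used to pad-or-truncate each voice sample to a fixed 102400 samples (~4.27 s at the diffuser's 24 kHz), so anything past that point was ignored; now each clip is split into 102400-sample chunks and every chunk contributes a conditioning mel. A minimal sketch of the before/after, using the `pad_or_truncate` and `wav_to_univnet_mel` helpers that appear in the api.py diff below:

import torch

# Before (truncate): only the first ~4.27 s of each sample was read.
sample = pad_or_truncate(sample, 102400)
diffusion_conds.append(wav_to_univnet_mel(sample, do_normalization=False))

# After (chunk): the entire clip feeds the latent computation.
chunks = torch.chunk(sample, int(sample.shape[-1] / 102400) + 1, dim=1)
for chunk in chunks:
    chunk = pad_or_truncate(chunk, 102400)
    diffusion_conds.append(wav_to_univnet_mel(chunk, do_normalization=False))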


@@ -115,6 +115,8 @@ To save you from headaches, I strongly recommend playing around with shorter sentences
 As a quick optimization, I modified the script so that the `conditional_latents` are saved after loading voice samples, and subsequent uses will load that file directly (at the cost of not returning the `Sample voice` to the web UI). If there are voice samples with a modification time newer than this cached file, it'll skip the cache and load the normal WAVs instead.
+
+**!**NOTE**!**: cached `latents.pth` files generated before 2023.02.05 will be ignored, due to a change in computing the conditional latents. This *should* help bump up voice cloning quality. Apologies for the inconvenience.

 ## Example(s)
 Below are some outputs I deem substantial enough to share. As I continue delving into TorToiSe, I'll supply more examples and the values I use.
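The caching rule described above boils down to a modification-time check; a hedged sketch of the logic (the function and variable names here are illustrative, not the script's exact code):

import os
import torch

def load_cached_latents(voice_dir, wav_paths):
    cache = os.path.join(voice_dir, 'latents.pth')
    if not os.path.exists(cache):
        return None  # no cache yet; compute latents from the WAVs
    # Any voice sample newer than the cache invalidates it.
    if max(os.path.getmtime(p) for p in wav_paths) > os.path.getmtime(cache):
        return None
    return torch.load(cache)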

app.py

@@ -10,7 +10,9 @@ from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio, load_voice, load_voices
 from tortoise.utils.text import split_and_recombine_text

-def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, progress=gr.Progress()):
+def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, experimentals, progress=gr.Progress()):
+    print(experimentals)
     if voice != "microphone":
         voices = [voice]
     else:
@@ -27,7 +29,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
     if voice_samples is not None:
         sample_voice = voice_samples[0]
-        conditioning_latents = tts.get_conditioning_latents(voice_samples)
+        conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress)
         torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'latents.pth'))
         voice_samples = None
     else:
@@ -54,6 +56,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
         'diffusion_sampler': diffusion_sampler,
         'breathing_room': breathing_room,
         'progress': progress,
+        'half_p': "Half Precision" in experimentals,
+        'cond_free': "Conditioning-Free" in experimentals,
     }

     if delimiter == "\\n":
@@ -216,6 +220,8 @@ def main():
             type="value",
         )

+        experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags")
+
         preset.change(fn=update_presets,
             inputs=preset,
             outputs=[
@@ -246,7 +252,8 @@ def main():
                 diffusion_iterations,
                 temperature,
                 diffusion_sampler,
-                breathing_room
+                breathing_room,
+                experimentals,
             ],
             outputs=[selected_voice, output_audio, usedSeed],
         )


@@ -1,7 +1,8 @@
 python -m venv tortoise-venv
 call .\tortoise-venv\Scripts\activate.bat
-py -m pip install --upgrade pip
-py -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
-py -m pip install -r ./requirements.txt
-py setup.py install
+python -m pip install --upgrade pip
+python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
+python -m pip install -r ./requirements.txt
+python setup.py install
 deactivate
+pause


@@ -1,3 +1,4 @@
 call .\tortoise-venv\Scripts\activate.bat
-py .\app.py
+python .\app.py
 deactivate
+pause


@@ -284,7 +284,7 @@ class TextToSpeech:
         if self.minor_optimizations:
             self.cvvp = self.cvvp.to(self.device)

-    def get_conditioning_latents(self, voice_samples, return_mels=False):
+    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, enforced_length=102400):
        """
        Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
        These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@@ -303,14 +303,18 @@ class TextToSpeech:
         auto_conds = torch.stack(auto_conds, dim=1)

         diffusion_conds = []
-        for sample in voice_samples:
+        for sample in tqdm_override(voice_samples, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
             # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
             sample = torchaudio.functional.resample(sample, 22050, 24000)
-            sample = pad_or_truncate(sample, 102400)
-            cond_mel = wav_to_univnet_mel(sample.to(self.device), do_normalization=False, device=self.device)
-            diffusion_conds.append(cond_mel)
-        diffusion_conds = torch.stack(diffusion_conds, dim=1)
+            chunks = torch.chunk(sample, int(sample.shape[-1] / enforced_length) + 1, dim=1)
+
+            for chunk in chunks:
+                chunk = pad_or_truncate(chunk, enforced_length)
+                cond_mel = wav_to_univnet_mel(chunk.to(self.device), do_normalization=False, device=self.device)
+                diffusion_conds.append(cond_mel)
+
+        diffusion_conds = torch.stack(diffusion_conds, dim=1)

         if self.minor_optimizations:
             auto_latent = self.autoregressive.get_conditioning(auto_conds)
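As a worked example of the chunk arithmetic above: a 10-second clip at the diffuser's 24 kHz is 240000 samples, so `torch.chunk` is asked for int(240000 / 102400) + 1 = 3 pieces, and `pad_or_truncate` then evens each piece out to `enforced_length`:

enforced_length = 102400
n_samples = 10 * 24000                            # 240000 samples
n_chunks = int(n_samples / enforced_length) + 1   # 3 chunks
# torch.chunk yields 3 pieces of 80000 samples each; every piece is
# then padded up to 102400 before being converted to a mel and stacked.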
@@ -372,6 +376,7 @@ class TextToSpeech:
                     diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
                     diffusion_sampler="P",
                     breathing_room=8,
+                    half_p=False,
                     progress=None,
                     **hf_generate_kwargs):
        """
""" """
@@ -446,6 +451,7 @@ class TextToSpeech:
         if not self.minor_optimizations:
             self.autoregressive = self.autoregressive.to(self.device)

+        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
             for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
                 codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                     do_sample=True,
@@ -462,6 +468,7 @@ class TextToSpeech:
         clip_results = []

+        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
             if not self.minor_optimizations:
                 self.autoregressive = self.autoregressive.cpu()
                 self.clvp = self.clvp.to(self.device)
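The new `half_p` flag only flips the `enabled` argument of these `torch.autocast` contexts, so leaving it unchecked makes them no-ops; a standalone illustration (nothing here is the repo's code):

import torch

half_p = False  # with enabled=False the context does nothing and float32 is kept
with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
    pass  # autocast-eligible ops would run in float16 here when half_p is True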


@@ -108,8 +108,11 @@ def load_voice(voice, extra_voice_dirs=[], load_latents=True):
    voices = []
    latent = None
    for file in paths:
-        if file[-4:] == ".pth":
+        if file == "cond_latents.pth":
            latent = file
+        elif file[-4:] == ".pth":
+            pass
+            # noop: skip any other .pth files
        else:
            voices.append(file)
            mtime = max(mtime, os.path.getmtime(file))

update.bat (new executable file)

@@ -0,0 +1,7 @@
+git pull
+python -m venv tortoise-venv
+call .\tortoise-venv\Scripts\activate.bat
+python -m pip install --upgrade pip
+python -m pip install -r ./requirements.txt
+deactivate
+pause