forked from mrq/tortoise-tts
modified how conditional latents are computed (previously, only the first 102400/24000 ≈ 4.27 seconds of each audio input were read; now the entire input is split into chunks to compute the latents)
This commit is contained in:
parent
4ea997106e
commit
c2c9b1b683
|
@ -115,6 +115,8 @@ To save you from headaches, I strongly recommend playing around with shorter sen
|
||||||
|
|
||||||
As a quick optimization, I modified the script so that the `conditional_latents` are saved after loading voice samples, and subsequent uses will load that file directly (at the cost of not returning the `Sample voice` to the web UI). If there are voice samples with a modification time newer than this cached file, it'll skip loading the cached file and load the normal WAVs instead.
|
As a quick optimization, I modified the script so that the `conditional_latents` are saved after loading voice samples, and subsequent uses will load that file directly (at the cost of not returning the `Sample voice` to the web UI). If there are voice samples with a modification time newer than this cached file, it'll skip loading the cached file and load the normal WAVs instead.
|
||||||
|
|
||||||
|
**!**NOTE**!**: cached `latents.pth` files generated before 2023.02.05 will be ignored, due to a change in computing the conditional latents. This *should* help bump up voice cloning quality. Apologies for the inconvenience.
|
||||||
|
|
||||||
## Example(s)
|
## Example(s)
|
||||||
|
|
||||||
Below are some outputs I deem substantial enough to share. As I continue delving into TorToiSe, I'll supply more examples and the values I use.
|
Below are some outputs I deem substantial enough to share. As I continue delving into TorToiSe, I'll supply more examples and the values I use.
|
||||||
|
|
13
app.py
13
app.py
|
@ -10,7 +10,9 @@ from tortoise.api import TextToSpeech
|
||||||
from tortoise.utils.audio import load_audio, load_voice, load_voices
|
from tortoise.utils.audio import load_audio, load_voice, load_voices
|
||||||
from tortoise.utils.text import split_and_recombine_text
|
from tortoise.utils.text import split_and_recombine_text
|
||||||
|
|
||||||
def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, progress=gr.Progress()):
|
def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, experimentals, progress=gr.Progress()):
|
||||||
|
print(experimentals)
|
||||||
|
|
||||||
if voice != "microphone":
|
if voice != "microphone":
|
||||||
voices = [voice]
|
voices = [voice]
|
||||||
else:
|
else:
|
||||||
|
@ -27,7 +29,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
|
||||||
|
|
||||||
if voice_samples is not None:
|
if voice_samples is not None:
|
||||||
sample_voice = voice_samples[0]
|
sample_voice = voice_samples[0]
|
||||||
conditioning_latents = tts.get_conditioning_latents(voice_samples)
|
conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress)
|
||||||
torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'latents.pth'))
|
torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'latents.pth'))
|
||||||
voice_samples = None
|
voice_samples = None
|
||||||
else:
|
else:
|
||||||
|
@ -54,6 +56,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
|
||||||
'diffusion_sampler': diffusion_sampler,
|
'diffusion_sampler': diffusion_sampler,
|
||||||
'breathing_room': breathing_room,
|
'breathing_room': breathing_room,
|
||||||
'progress': progress,
|
'progress': progress,
|
||||||
|
'half_p': "Half Precision" in experimentals,
|
||||||
|
'cond_free': "Conditioning-Free" in experimentals,
|
||||||
}
|
}
|
||||||
|
|
||||||
if delimiter == "\\n":
|
if delimiter == "\\n":
|
||||||
|
@ -216,6 +220,8 @@ def main():
|
||||||
type="value",
|
type="value",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=[False, True], label="Experimental Flags")
|
||||||
|
|
||||||
preset.change(fn=update_presets,
|
preset.change(fn=update_presets,
|
||||||
inputs=preset,
|
inputs=preset,
|
||||||
outputs=[
|
outputs=[
|
||||||
|
@ -246,7 +252,8 @@ def main():
|
||||||
diffusion_iterations,
|
diffusion_iterations,
|
||||||
temperature,
|
temperature,
|
||||||
diffusion_sampler,
|
diffusion_sampler,
|
||||||
breathing_room
|
breathing_room,
|
||||||
|
experimentals,
|
||||||
],
|
],
|
||||||
outputs=[selected_voice, output_audio, usedSeed],
|
outputs=[selected_voice, output_audio, usedSeed],
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
call .\tortoise-venv\Scripts\activate.bat
|
call .\tortoise-venv\Scripts\activate.bat
|
||||||
py .\app.py
|
python .\app.py
|
||||||
deactivate
|
deactivate
|
||||||
|
pause
|
105
tortoise/api.py
105
tortoise/api.py
|
@ -284,7 +284,7 @@ class TextToSpeech:
|
||||||
if self.minor_optimizations:
|
if self.minor_optimizations:
|
||||||
self.cvvp = self.cvvp.to(self.device)
|
self.cvvp = self.cvvp.to(self.device)
|
||||||
|
|
||||||
def get_conditioning_latents(self, voice_samples, return_mels=False):
|
def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, enforced_length=102400):
|
||||||
"""
|
"""
|
||||||
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
|
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
|
||||||
These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
|
These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
|
||||||
|
@ -303,14 +303,18 @@ class TextToSpeech:
|
||||||
auto_conds = torch.stack(auto_conds, dim=1)
|
auto_conds = torch.stack(auto_conds, dim=1)
|
||||||
|
|
||||||
diffusion_conds = []
|
diffusion_conds = []
|
||||||
for sample in voice_samples:
|
|
||||||
|
for sample in tqdm_override(voice_samples, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
|
||||||
# The diffuser operates at a sample rate of 24000 (except for the latent inputs)
|
# The diffuser operates at a sample rate of 24000 (except for the latent inputs)
|
||||||
sample = torchaudio.functional.resample(sample, 22050, 24000)
|
sample = torchaudio.functional.resample(sample, 22050, 24000)
|
||||||
sample = pad_or_truncate(sample, 102400)
|
chunks = torch.chunk(sample, int(sample.shape[-1] / enforced_length) + 1, dim=1)
|
||||||
cond_mel = wav_to_univnet_mel(sample.to(self.device), do_normalization=False, device=self.device)
|
|
||||||
diffusion_conds.append(cond_mel)
|
for chunk in chunks:
|
||||||
diffusion_conds = torch.stack(diffusion_conds, dim=1)
|
chunk = pad_or_truncate(chunk, enforced_length)
|
||||||
|
cond_mel = wav_to_univnet_mel(chunk.to(self.device), do_normalization=False, device=self.device)
|
||||||
|
diffusion_conds.append(cond_mel)
|
||||||
|
|
||||||
|
diffusion_conds = torch.stack(diffusion_conds, dim=1)
|
||||||
|
|
||||||
if self.minor_optimizations:
|
if self.minor_optimizations:
|
||||||
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
||||||
|
@ -372,6 +376,7 @@ class TextToSpeech:
|
||||||
diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
|
diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
|
||||||
diffusion_sampler="P",
|
diffusion_sampler="P",
|
||||||
breathing_room=8,
|
breathing_room=8,
|
||||||
|
half_p=False,
|
||||||
progress=None,
|
progress=None,
|
||||||
**hf_generate_kwargs):
|
**hf_generate_kwargs):
|
||||||
"""
|
"""
|
||||||
|
@ -446,55 +451,57 @@ class TextToSpeech:
|
||||||
if not self.minor_optimizations:
|
if not self.minor_optimizations:
|
||||||
self.autoregressive = self.autoregressive.to(self.device)
|
self.autoregressive = self.autoregressive.to(self.device)
|
||||||
|
|
||||||
for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
|
with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
|
||||||
codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
|
for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
|
||||||
do_sample=True,
|
codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
|
||||||
top_p=top_p,
|
do_sample=True,
|
||||||
temperature=temperature,
|
top_p=top_p,
|
||||||
num_return_sequences=self.autoregressive_batch_size,
|
temperature=temperature,
|
||||||
length_penalty=length_penalty,
|
num_return_sequences=self.autoregressive_batch_size,
|
||||||
repetition_penalty=repetition_penalty,
|
length_penalty=length_penalty,
|
||||||
max_generate_length=max_mel_tokens,
|
repetition_penalty=repetition_penalty,
|
||||||
**hf_generate_kwargs)
|
max_generate_length=max_mel_tokens,
|
||||||
padding_needed = max_mel_tokens - codes.shape[1]
|
**hf_generate_kwargs)
|
||||||
codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
|
padding_needed = max_mel_tokens - codes.shape[1]
|
||||||
samples.append(codes)
|
codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
|
||||||
|
samples.append(codes)
|
||||||
|
|
||||||
clip_results = []
|
clip_results = []
|
||||||
|
|
||||||
if not self.minor_optimizations:
|
with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
|
||||||
self.autoregressive = self.autoregressive.cpu()
|
|
||||||
self.clvp = self.clvp.to(self.device)
|
|
||||||
|
|
||||||
if cvvp_amount > 0:
|
|
||||||
if self.cvvp is None:
|
|
||||||
self.load_cvvp()
|
|
||||||
if not self.minor_optimizations:
|
if not self.minor_optimizations:
|
||||||
self.cvvp = self.cvvp.to(self.device)
|
self.autoregressive = self.autoregressive.cpu()
|
||||||
|
self.clvp = self.clvp.to(self.device)
|
||||||
desc="Computing best candidates"
|
|
||||||
if verbose:
|
|
||||||
if self.cvvp is None:
|
|
||||||
desc = "Computing best candidates using CLVP"
|
|
||||||
else:
|
|
||||||
desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
|
|
||||||
|
|
||||||
for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
|
if cvvp_amount > 0:
|
||||||
for i in range(batch.shape[0]):
|
if self.cvvp is None:
|
||||||
batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
|
self.load_cvvp()
|
||||||
if cvvp_amount != 1:
|
if not self.minor_optimizations:
|
||||||
clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
|
self.cvvp = self.cvvp.to(self.device)
|
||||||
if auto_conds is not None and cvvp_amount > 0:
|
|
||||||
cvvp_accumulator = 0
|
desc="Computing best candidates"
|
||||||
for cl in range(auto_conds.shape[1]):
|
if verbose:
|
||||||
cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
|
if self.cvvp is None:
|
||||||
cvvp = cvvp_accumulator / auto_conds.shape[1]
|
desc = "Computing best candidates using CLVP"
|
||||||
if cvvp_amount == 1:
|
|
||||||
clip_results.append(cvvp)
|
|
||||||
else:
|
else:
|
||||||
clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount))
|
desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
|
||||||
else:
|
|
||||||
clip_results.append(clvp)
|
for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
|
||||||
|
for i in range(batch.shape[0]):
|
||||||
|
batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
|
||||||
|
if cvvp_amount != 1:
|
||||||
|
clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
|
||||||
|
if auto_conds is not None and cvvp_amount > 0:
|
||||||
|
cvvp_accumulator = 0
|
||||||
|
for cl in range(auto_conds.shape[1]):
|
||||||
|
cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
|
||||||
|
cvvp = cvvp_accumulator / auto_conds.shape[1]
|
||||||
|
if cvvp_amount == 1:
|
||||||
|
clip_results.append(cvvp)
|
||||||
|
else:
|
||||||
|
clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount))
|
||||||
|
else:
|
||||||
|
clip_results.append(clvp)
|
||||||
clip_results = torch.cat(clip_results, dim=0)
|
clip_results = torch.cat(clip_results, dim=0)
|
||||||
samples = torch.cat(samples, dim=0)
|
samples = torch.cat(samples, dim=0)
|
||||||
best_results = samples[torch.topk(clip_results, k=k).indices]
|
best_results = samples[torch.topk(clip_results, k=k).indices]
|
||||||
|
|
|
@ -108,8 +108,11 @@ def load_voice(voice, extra_voice_dirs=[], load_latents=True):
|
||||||
voices = []
|
voices = []
|
||||||
latent = None
|
latent = None
|
||||||
for file in paths:
|
for file in paths:
|
||||||
if file[-4:] == ".pth":
|
if file == "cond_latents.pth":
|
||||||
latent = file
|
latent = file
|
||||||
|
elif file[-4:] == ".pth":
|
||||||
|
{}
|
||||||
|
# noop
|
||||||
else:
|
else:
|
||||||
voices.append(file)
|
voices.append(file)
|
||||||
mtime = max(mtime, os.path.getmtime(file))
|
mtime = max(mtime, os.path.getmtime(file))
|
||||||
|
|
7
update.bat
Executable file
7
update.bat
Executable file
|
@ -0,0 +1,7 @@
|
||||||
|
git pull
|
||||||
|
python -m venv tortoise-venv
|
||||||
|
call .\tortoise-venv\Scripts\activate.bat
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
python -m pip install -r ./requirements.txt
|
||||||
|
deactivate
|
||||||
|
pause
|
Loading…
Reference in New Issue
Block a user