(finally) added the CVVP model weight slider; latents now export more data too, for weighing against CVVP

This commit is contained in:
mrq 2023-02-07 20:55:56 +00:00
parent f7274112c3
commit e45e4431d1
3 changed files with 35 additions and 18 deletions

View File

@ -168,6 +168,9 @@ Below are settings that override the default launch arguments. Some of these req
Below is an explanation of the experimental flags. Messing with these might impact performance, as these are exposed only if you know what you are doing. Below is an explanation of the experimental flags. Messing with these might impact performance, as these are exposed only if you know what you are doing.
* `Half-Precision`: (attempts to) hint to PyTorch to auto-cast to float16 (half precision) for compute. Disabled by default, due to it making computations slower. * `Half-Precision`: (attempts to) hint to PyTorch to auto-cast to float16 (half precision) for compute. Disabled by default, due to it making computations slower.
* `Conditional Free`: a quality boosting improvement at the cost of some performance. Enabled by default, as I think the penalty is negligible in the end. * `Conditional Free`: a quality boosting improvement at the cost of some performance. Enabled by default, as I think the penalty is negligible in the end.
* `CVVP Weight`: governs how much weight the CVVP model is given when ranking candidates. The original documentation mentions this is deprecated as it does not really influence things, but you're still free to play around with it.
Currently, setting this requires regenerating your voice latents, as I forgot to have it return some extra data that weighing against the CVVP model uses. Oops.
Setting this to 1 leads to bad behavior.
## Example(s) ## Example(s)

41
app.py
View File

@ -18,7 +18,7 @@ from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text from tortoise.utils.text import split_and_recombine_text
def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, experimentals, progress=gr.Progress(track_tqdm=True)): def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, cvvp_weight, experimentals, progress=gr.Progress(track_tqdm=True)):
if voice != "microphone": if voice != "microphone":
voices = [voice] voices = [voice]
else: else:
@ -35,7 +35,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
if voice_samples is not None: if voice_samples is not None:
sample_voice = voice_samples[0] sample_voice = voice_samples[0]
conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size) conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=True, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
if voice != "microphone": if voice != "microphone":
torch.save(conditioning_latents, f'./tortoise/voices/{voice}/cond_latents.pth') torch.save(conditioning_latents, f'./tortoise/voices/{voice}/cond_latents.pth')
voice_samples = None voice_samples = None
@ -45,6 +45,10 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
if seed == 0: if seed == 0:
seed = None seed = None
if conditioning_latents is not None and len(conditioning_latents) == 2 and cvvp_weight > 0:
print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. Please regenerate your voice latents.")
cvvp_weight = 0
start_time = time.time() start_time = time.time()
settings = { settings = {
@ -66,6 +70,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
'progress': progress, 'progress': progress,
'half_p': "Half Precision" in experimentals, 'half_p': "Half Precision" in experimentals,
'cond_free': "Conditioning-Free" in experimentals, 'cond_free': "Conditioning-Free" in experimentals,
'cvvp_amount': cvvp_weight,
} }
if delimiter == "\\n": if delimiter == "\\n":
@ -159,6 +164,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
'temperature': temperature, 'temperature': temperature,
'diffusion_sampler': diffusion_sampler, 'diffusion_sampler': diffusion_sampler,
'breathing_room': breathing_room, 'breathing_room': breathing_room,
'cvvp_weight': cvvp_weight,
'experimentals': experimentals, 'experimentals': experimentals,
'time': time.time()-start_time, 'time': time.time()-start_time,
} }
@ -244,20 +250,21 @@ def import_generate_settings(file="./config/generate.json"):
return None return None
return ( return (
settings['text'], None if 'text' not in settings else settings['text'],
settings['delimiter'], None if 'delimiter' not in settings else settings['delimiter'],
settings['emotion'], None if 'emotion' not in settings else settings['emotion'],
settings['prompt'], None if 'prompt' not in settings else settings['prompt'],
settings['voice'], None if 'voice' not in settings else settings['voice'],
settings['mic_audio'], None if 'mic_audio' not in settings else settings['mic_audio'],
settings['seed'], None if 'seed' not in settings else settings['seed'],
settings['candidates'], None if 'candidates' not in settings else settings['candidates'],
settings['num_autoregressive_samples'], None if 'num_autoregressive_samples' not in settings else settings['num_autoregressive_samples'],
settings['diffusion_iterations'], None if 'diffusion_iterations' not in settings else settings['diffusion_iterations'],
settings['temperature'], None if 'temperature' not in settings else settings['temperature'],
settings['diffusion_sampler'], None if 'diffusion_sampler' not in settings else settings['diffusion_sampler'],
settings['breathing_room'], None if 'breathing_room' not in settings else settings['breathing_room'],
settings['experimentals'], None if 'cvvp_weight' not in settings else settings['cvvp_weight'],
None if 'experimentals' not in settings else settings['experimentals'],
) )
def curl(url): def curl(url):
@ -436,6 +443,7 @@ def main():
experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags") experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags")
cvvp_weight = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
check_updates_now = gr.Button(value="Check for Updates") check_updates_now = gr.Button(value="Check for Updates")
@ -463,6 +471,7 @@ def main():
temperature, temperature,
diffusion_sampler, diffusion_sampler,
breathing_room, breathing_room,
cvvp_weight,
experimentals, experimentals,
] ]

View File

@ -124,7 +124,7 @@ def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=2
elif gap > 0: elif gap > 0:
rand_start = random.randint(0, gap) rand_start = random.randint(0, gap)
clip = clip[:, rand_start:rand_start + cond_length] clip = clip[:, rand_start:rand_start + cond_length]
mel_clip = TorchMelSpectrogram(sampling_rate=sample_rate)(clip.unsqueeze(0)).squeeze(0) mel_clip = TorchMelSpectrogram(sampling_rate=sampling_rate)(clip.unsqueeze(0)).squeeze(0)
return mel_clip.unsqueeze(0).to(device) return mel_clip.unsqueeze(0).to(device)
@ -469,7 +469,11 @@ class TextToSpeech:
if voice_samples is not None: if voice_samples is not None:
auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True, verbose=True) auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True, verbose=True)
elif conditioning_latents is not None: elif conditioning_latents is not None:
latent_tuple = conditioning_latents
if len(latent_tuple) == 2:
auto_conditioning, diffusion_conditioning = conditioning_latents auto_conditioning, diffusion_conditioning = conditioning_latents
else:
auto_conditioning, diffusion_conditioning, auto_conds, _ = conditioning_latents
else: else:
auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents() auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
auto_conditioning = auto_conditioning.to(self.device) auto_conditioning = auto_conditioning.to(self.device)
@ -539,6 +543,7 @@ class TextToSpeech:
clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount)) clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount))
else: else:
clip_results.append(clvp) clip_results.append(clvp)
clip_results = torch.cat(clip_results, dim=0) clip_results = torch.cat(clip_results, dim=0)
samples = torch.cat(samples, dim=0) samples = torch.cat(samples, dim=0)
best_results = samples[torch.topk(clip_results, k=k).indices] best_results = samples[torch.topk(clip_results, k=k).indices]