done away with kludgy shit code, just have the user decide how many chunks to slice concat'd samples into (since it actually does improve voice replicability)

This commit is contained in:
mrq 2023-02-15 04:39:31 +00:00
parent 314feaeea1
commit 2e777e8a67
3 changed files with 30 additions and 57 deletions

View File

@ -189,9 +189,14 @@ You'll be presented with a bunch of options in the default `Generate` tab, but d
* `Voice`: the voice you want to clone. You can select `microphone` if you want to use input from your microphone. * `Voice`: the voice you want to clone. You can select `microphone` if you want to use input from your microphone.
* `Microphone Source`: Use your own voice from a line-in source. * `Microphone Source`: Use your own voice from a line-in source.
* `Reload Voice List`: refreshes the voice list and updates. ***Click this*** after adding or removing a new voice. * `Reload Voice List`: refreshes the voice list and updates. ***Click this*** after adding or removing a new voice.
* `Voice Chunks`: how many pieces to break up your input voice samples into
- this originally was because of VRAM constraints, as large voice samples will run into OOM problems on destitute enough cards
- however, after some fiddling, it seems to help with improving replicability
- this is a very tricky setting to suggest, as there's not necessarily a go-to solution
+ some samples seem to work best if it's just one whole chunk
+ other voices seem to work better if I split it up more
- the best advice is to just play around with it a bit; pick the lowest chunk size you can make, and if a voice doesn't quite replicate right, increase the chunk count.
* `(Re)Compute Voice Latents`: regenerates a voice's cached latents. * `(Re)Compute Voice Latents`: regenerates a voice's cached latents.
* `Experimental Compute Latents Mode`: this mode will adjust the behavior for computing voice latents. Leave this checked if you're unsure, as this helps boost replicating a voice.
- if you're curious, feel free to play around with it by regenerating latents with and without it.
Below are a list of generation settings: Below are a list of generation settings:
* `Candidates`: number of outputs to generate, starting from the best candidate. Depending on your iteration steps, generating the final sound files could be cheap, but they only offer alternatives to the samples generated to pull from (in other words, the later candidates perform worse), so don't be compelled to generate a ton of candidates. * `Candidates`: number of outputs to generate, starting from the best candidate. Depending on your iteration steps, generating the final sound files could be cheap, but they only offer alternatives to the samples generated to pull from (in other words, the later candidates perform worse), so don't be compelled to generate a ton of candidates.

View File

@ -294,7 +294,7 @@ class TextToSpeech:
if self.preloaded_tensors: if self.preloaded_tensors:
self.cvvp = self.cvvp.to(self.device) self.cvvp = self.cvvp.to(self.device)
def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, chunk_size=None, max_chunk_size=None, chunk_tensors=True, calculation_mode=1): def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None):
""" """
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent). Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@ -339,57 +339,19 @@ class TextToSpeech:
diffusion_conds = [] diffusion_conds = []
chunks = [] chunks = []
# below are two behaviors while i try and figure out how I should gauge the "best" method concat = torch.cat(samples, dim=-1)
# there's too many little variables to consider, like: chunk_size = concat.shape[-1]
# does it matter if there's a lot of silence (from expanding to largest size)
# how detrimental is it to slice a waveform mid-sentence/word/phoneme
# is it "more accurate" to use one large file to compute the latents across
# is it "more accurate" to compute latents across each individual sample (or sentence) and then average them
# averaging latents is how tortoise can voice mix, so it most likely will just average a speaker's range
# do any of these considerations even matter? they don't really seem to
# new behavior: if slices == 0:
# combine all samples slices = 1
# divide until each chunk fits under the requested max chunk size
if calculation_mode == 1:
concat = torch.cat(samples, dim=-1)
if chunk_size is None:
chunk_size = concat.shape[-1]
if max_chunk_size is not None and chunk_size > max_chunk_size:
divisions = 1
while int(chunk_size / divisions) > max_chunk_size:
divisions = divisions + 1
chunk_size = int(chunk_size / divisions)
print(f"Using method 1: size of best fit: {chunk_size}")
chunks = torch.chunk(concat, int(concat.shape[-1] / chunk_size), dim=1)
chunk_size = chunks[0].shape[-1]
# old new behavior:
# if chunkning tensors: use the smallest voice sample as a common size of best fit
# if not chunking tensors: use the largest voice sample as a common size of best fit
else: else:
if chunk_size is None:
for sample in tqdm_override(samples, verbose=verbose and len(samples) > 1, progress=progress if len(samples) > 1 else None, desc="Calculating size of best fit..."):
if chunk_tensors:
chunk_size = sample.shape[-1] if chunk_size is None else min( chunk_size, sample.shape[-1] )
else:
chunk_size = sample.shape[-1] if chunk_size is None else max( chunk_size, sample.shape[-1] )
print(f"Using method 0: size of best fit: {chunk_size}")
if max_chunk_size is not None and chunk_size > max_chunk_size: if max_chunk_size is not None and chunk_size > max_chunk_size:
chunk_size = max_chunk_size slices = 1
print(f"Chunk size exceeded, clamping to: {max_chunk_size}") while int(chunk_size / slices) > max_chunk_size:
slices = slices + 1
if chunk_tensors: chunks = torch.chunk(concat, slices, dim=1)
for sample in tqdm_override(samples, verbose=verbose, progress=progress, desc="Slicing samples into chunks..."): chunk_size = chunks[0].shape[-1]
sliced = torch.chunk(sample, int(sample.shape[-1] / chunk_size) + 1, dim=1)
for s in sliced:
chunks.append(s)
else:
chunks = samples
chunk_size = chunks[0].shape[-1]
# expand / truncate samples to match the common size # expand / truncate samples to match the common size
# required, as tensors need to be of the same length # required, as tensors need to be of the same length

View File

@ -30,6 +30,7 @@ def generate(
prompt, prompt,
voice, voice,
mic_audio, mic_audio,
voice_latents_chunks,
seed, seed,
candidates, candidates,
num_autoregressive_samples, num_autoregressive_samples,
@ -73,7 +74,7 @@ def generate(
if voice_samples is not None: if voice_samples is not None:
sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu() sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu()
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size) conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks)
if len(conditioning_latents) == 4: if len(conditioning_latents) == 4:
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None) conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
@ -244,7 +245,6 @@ def generate(
'emotion': emotion, 'emotion': emotion,
'prompt': prompt, 'prompt': prompt,
'voice': voice, 'voice': voice,
'mic_audio': mic_audio,
'seed': seed, 'seed': seed,
'candidates': candidates, 'candidates': candidates,
'num_autoregressive_samples': num_autoregressive_samples, 'num_autoregressive_samples': num_autoregressive_samples,
@ -319,7 +319,7 @@ def generate(
stats, stats,
) )
def compute_latents(voice, mode, progress=gr.Progress(track_tqdm=True)): def compute_latents(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm=True)):
global tts global tts
try: try:
tts tts
@ -331,7 +331,7 @@ def compute_latents(voice, mode, progress=gr.Progress(track_tqdm=True)):
if voice_samples is None: if voice_samples is None:
return return
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size, calculation_mode=1 if mode else 0) conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks)
if len(conditioning_latents) == 4: if len(conditioning_latents) == 4:
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None) conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
@ -453,7 +453,8 @@ def import_generate_settings(file="./config/generate.json"):
None if 'emotion' not in settings else settings['emotion'], None if 'emotion' not in settings else settings['emotion'],
None if 'prompt' not in settings else settings['prompt'], None if 'prompt' not in settings else settings['prompt'],
None if 'voice' not in settings else settings['voice'], None if 'voice' not in settings else settings['voice'],
None if 'mic_audio' not in settings else settings['mic_audio'], None,
None,
None if 'seed' not in settings else settings['seed'], None if 'seed' not in settings else settings['seed'],
None if 'candidates' not in settings else settings['candidates'], None if 'candidates' not in settings else settings['candidates'],
None if 'num_autoregressive_samples' not in settings else settings['num_autoregressive_samples'], None if 'num_autoregressive_samples' not in settings else settings['num_autoregressive_samples'],
@ -698,10 +699,12 @@ def setup_gradio():
inputs=None, inputs=None,
outputs=voice outputs=voice
) )
gr.Button(value="(Re)Compute Voice Latents").click(compute_latents, voice_latents_chunks = gr.Slider(label="Voice Chunks", minimum=1, maximum=16, value=1, step=1)
recompute_voice_latents = gr.Button(value="(Re)Compute Voice Latents")
recompute_voice_latents.click(compute_latents,
inputs=[ inputs=[
voice, voice,
gr.Checkbox(label="Experimental Compute Voice Latents Mode", value=True) voice_latents_chunks,
], ],
outputs=voice, outputs=voice,
) )
@ -933,6 +936,7 @@ def setup_gradio():
prompt, prompt,
voice, voice,
mic_audio, mic_audio,
voice_latents_chunks,
seed, seed,
candidates, candidates,
num_autoregressive_samples, num_autoregressive_samples,
@ -957,6 +961,7 @@ def setup_gradio():
prompt, prompt,
voice, voice,
mic_audio, mic_audio,
voice_latents_chunks,
seed, seed,
candidates, candidates,
num_autoregressive_samples, num_autoregressive_samples,
@ -981,6 +986,7 @@ def setup_gradio():
prompt, prompt,
voice, voice,
mic_audio, mic_audio,
voice_latents_chunks,
seed, seed,
candidates, candidates,
num_autoregressive_samples, num_autoregressive_samples,