forked from mrq/tortoise-tts

added button to recalculate voice latents, added experimental switch for computing voice latents

This commit is contained in:
mrq 2023-02-12 18:11:40 +00:00
parent 88529fda43
commit 4d01bbd429
3 changed files with 40 additions and 6 deletions

View File

@ -158,6 +158,8 @@ However, keep in mind how you combine/separate your clips; depending on the mode
* you might suffer from reduced throughput, as the smallest voice file will be used as the size of best fit
* a voice might get split mid-word, affecting how the latents are computed, as each batch is averaged together
For safety, try to keep your clips around the same length, or increase your `Voice Latents Max Chunk Size` if the console output warns that the best-fit size exceeds it.
If you're looking to trim your clips, in my opinion, ~~Audacity~~ Tenacity works well enough, as you can easily export your clips in the proper format (22050 Hz sampling rate).
Power users with FFMPEG already installed can simply use the provided conversion script in `.\convert\`.
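If you'd rather do the conversion from Python instead of the bundled script, something along these lines should work. This is only a minimal sketch, assuming torchaudio is installed and using hypothetical `clip.wav` / `clip_22050.wav` filenames:

```python
import torchaudio

# Minimal sketch (hypothetical filenames): resample a clip to the 22050 Hz
# sampling rate mentioned above and write it back out as 16-bit PCM WAV.
waveform, sample_rate = torchaudio.load("clip.wav")
if sample_rate != 22050:
    waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=22050)
torchaudio.save("clip_22050.wav", waveform, 22050, encoding="PCM_S", bits_per_sample=16)
```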
@ -182,6 +184,11 @@ You'll be presented with a bunch of options in the default `Generate` tab, but d
* `Custom Emotion + Prompt`: a non-preset "emotion" used for the delivery. This is a shortcut to utilizing "prompt engineering" by starting with `[<emotion>]` in your prompt.
* `Voice`: the voice you want to clone. You can select `microphone` if you want to use input from your microphone.
* `Microphone Source`: Use your own voice from a line-in source.
* `Reload Voice List`: refreshes the voice list and updates the `Voice` dropdown. ***Click this*** after adding or removing a voice.
* `(Re)Compute Voice Latents`: regenerates a voice's cached latents.
* `Experimental Compute Latents Mode`: combines all voice samples into one waveform, then splits it into even chunks, halving the chunk size until it fits under the maximum allowed chunk size set under `Settings` (see the sketch just below)
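A minimal sketch of that idea, assuming each voice sample is a `[channels, samples]` tensor (the project's actual implementation lives in `get_conditioning_latents`, shown in the diff further down):

```python
import torch

def split_combined(voice_samples, max_chunk_size):
    # Combine every clip into one long waveform along the sample axis.
    concat = torch.cat(voice_samples, dim=-1)
    # Start from the full length and halve until the chunk fits the configured cap.
    chunk_size = concat.shape[-1]
    while chunk_size > max_chunk_size:
        chunk_size = int(chunk_size / 2)
    # Split the combined waveform into roughly even chunks of that size.
    num_chunks = int(concat.shape[-1] / chunk_size) + 1
    return torch.chunk(concat, num_chunks, dim=-1)
```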
Below is a list of generation settings:
* `Candidates`: number of outputs to generate, starting from the best candidate. Depending on your iteration steps, generating the final sound files can be cheap, but they only offer alternatives drawn from the samples already generated (in other words, later candidates tend to perform worse), so don't feel compelled to generate a ton of candidates.
* `Seed`: initializes the PRNG to this value. Use this if you want to reproduce a generated voice.
* `Preset`: shortcut values for sample count and iteration steps. Clicking a preset will update its corresponding values. Higher presets result in better quality at the cost of computation time.

View File

@ -294,7 +294,7 @@ class TextToSpeech:
if self.preloaded_tensors:
self.cvvp = self.cvvp.to(self.device)
-def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, chunk_size=None, max_chunk_size=None, chunk_tensors=True, calculation_mode=0):
+def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, chunk_size=None, max_chunk_size=None, chunk_tensors=True, calculation_mode=1):
"""
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@ -339,7 +339,6 @@ class TextToSpeech:
diffusion_conds = []
chunks = []
# new behavior: combine all samples, and divide accordingly
-# doesn't work, need to fix
if calculation_mode == 1:
@ -349,9 +348,9 @@ class TextToSpeech:
if max_chunk_size is not None and chunk_size > max_chunk_size:
while chunk_size > max_chunk_size:
-chunk_size = chunk_size / 2
+chunk_size = int(chunk_size / 2)
-print(f"Size of best fit: {chunk_size}")
+print(f"Using method 1: size of best fit: {chunk_size}")
chunks = torch.chunk(concat, int(concat.shape[-1] / chunk_size) + 1, dim=1)
# default new behavior: use the smallest voice sample as a common chunk size
else:
@ -362,7 +361,7 @@ class TextToSpeech:
else:
chunk_size = sample.shape[-1] if chunk_size is None else max( chunk_size, sample.shape[-1] )
print(f"Size of best fit: {chunk_size}")
print(f"Using method 0: size of best fit: {chunk_size}")
if max_chunk_size is not None and chunk_size > max_chunk_size:
chunk_size = max_chunk_size
print(f"Chunk size exceeded, clamping to: {max_chunk_size}")

View File

@ -305,6 +305,27 @@ def generate(
stats,
)
def compute_latents(voice, mode, progress=gr.Progress(track_tqdm=True)):
global tts
try:
tts
except NameError:
raise gr.Error("TTS is still initializing...")
voice_samples, conditioning_latents = load_voice(voice, load_latents=False)
if voice_samples is None:
return
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size, calculation_mode=1 if mode else 0)
if len(conditioning_latents) == 4:
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
torch.save(conditioning_latents, f'{get_voice_dir()}/{voice}/cond_latents.pth')
return voice
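As a quick sanity check after recomputing, the cached file can be loaded back with `torch.load`. A minimal sketch, assuming the default voices directory layout and a hypothetical voice named `my_voice` (at runtime, `get_voice_dir()` resolves the real location):

```python
import torch

# Hypothetical path: adjust to wherever your voices directory actually lives.
latents = torch.load("voices/my_voice/cond_latents.pth", map_location="cpu")
print(type(latents), len(latents))  # a tuple of conditioning tensors (plus mels unless lean-and-mean)
```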
def update_presets(value):
PRESETS = {
'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
@ -467,7 +488,7 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
settings = {
'share': args.share,
-'listen': args.listen,
+'listen': None if args.listen else args.listen,
'low-vram':args.low_vram,
'check-for-updates':args.check_for_updates,
'models-from-local-only':args.models_from_local_only,
@ -613,6 +634,13 @@ def setup_gradio():
inputs=None,
outputs=voice
)
gr.Button(value="(Re)Compute Voice Latents").click(compute_latents,
inputs=[
voice,
gr.Checkbox(label="Experimental Compute Voice Latents Mode", value=True)
],
outputs=voice,
)
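For readers unfamiliar with the pattern above: a `gr.Checkbox` listed in `inputs` hands its boolean value to the callback, which is how the experimental switch selects between the two calculation modes. A minimal, self-contained sketch of that pattern (assumed layout and names, not the project's actual UI):

```python
import gradio as gr

def recompute(voice_name, experimental):
    # The checkbox arrives as a plain bool; map it to the calculation mode.
    mode = 1 if experimental else 0
    return f"Recomputing latents for '{voice_name}' with calculation_mode={mode}"

with gr.Blocks() as demo:
    voice = gr.Textbox(label="Voice", value="my_voice")
    experimental = gr.Checkbox(label="Experimental Compute Voice Latents Mode", value=True)
    status = gr.Textbox(label="Status")
    gr.Button("(Re)Compute Voice Latents").click(recompute, inputs=[voice, experimental], outputs=status)

# demo.launch()  # uncomment to run locally
```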
prompt.change(fn=lambda value: gr.update(value="Custom"),
inputs=prompt,