forked from mrq/tortoise-tts

added button to recalculate voice latents; added experimental switch for computing voice latents

parent 88529fda43 · commit 4d01bbd429
README.md
@@ -158,6 +158,8 @@ However, keep in mind how you combine/separate your clips; depending on the mode
 * you might suffer from reduced throughput, as the smallest voice file will be used as the size of best fit
 * a voice might get split mid-word, affecting how the latents are computed, as each batch is averaged together
 
+For safety, try to keep your clips to similar lengths, or increase your `Voice Latents Max Chunk Size` if the console output warns that the best-fit size exceeds it.
+
 If you're looking to trim your clips, in my opinion, ~~Audacity~~ Tenacity works well enough, as you can easily export your clips in the proper format (22050 Hz sampling rate).
 
 Power users with FFMPEG already installed can simply use the provided conversion script in `.\convert\`.
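For context on why clip boundaries matter: the conditioning latents are computed per chunk and then averaged, so a chunk cut mid-word contributes a full share to the mean. Below is a minimal sketch of that averaging; `encode_chunk` is a hypothetical stand-in, not the repo's actual model call.

```python
import torch

def average_chunk_latents(samples: list[torch.Tensor], chunk_size: int) -> torch.Tensor:
    def encode_chunk(chunk: torch.Tensor) -> torch.Tensor:
        # Hypothetical stand-in for the model's conditioning encoder.
        return chunk.mean(dim=-1, keepdim=True)

    latents = []
    for sample in samples:
        # Fixed-size chunks; a boundary that lands mid-word still gets
        # encoded and weighted like any other chunk.
        latents.extend(encode_chunk(c) for c in torch.split(sample, chunk_size, dim=-1))
    # Every chunk latent is averaged together, so one degenerate chunk
    # (or a poorly chosen best-fit size) skews the final voice latent.
    return torch.stack(latents).mean(dim=0)
```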
@@ -182,6 +184,11 @@ You'll be presented with a bunch of options in the default `Generate` tab, but d
 * `Custom Emotion + Prompt`: a non-preset "emotion" used for the delivery. This is a shortcut to utilizing "prompt engineering" by starting your prompt with `[<emotion>]`.
 * `Voice`: the voice you want to clone. You can select `microphone` if you want to use input from your microphone.
 * `Microphone Source`: use your own voice from a line-in source.
+* `Reload Voice List`: refreshes the voice list. ***Click this*** after adding or removing a voice.
+* `(Re)Compute Voice Latents`: regenerates a voice's cached latents.
+* `Experimental Compute Latents Mode`: combines all voice samples into one clip, then splits it into evenly sized chunks (capped at the maximum allowed chunk size under `Settings`).
+
+Below is a list of generation settings:
 * `Candidates`: number of outputs to generate, starting from the best candidate. Depending on your iteration steps, generating the final sound files could be cheap, but they only offer alternatives to the samples generated to pull from (in other words, the later candidates perform worse), so don't feel compelled to generate a ton of candidates.
 * `Seed`: initializes the PRNG to this value. Use this if you want to reproduce a generated voice.
 * `Preset`: shortcut values for sample count and iteration steps. Clicking a preset will update its corresponding values. Higher presets result in better quality at the cost of computation time.
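To make the two latent-computation paths concrete, here is a simplified sketch of both under my reading of this commit; `pick_chunks` is a hypothetical name, and the real logic lives in `get_conditioning_latents` in the diff below. One wrinkle worth flagging: the README bullet speaks of the smallest voice file setting the best fit, while the code in this commit takes the largest via `max()`; the sketch mirrors the code.

```python
import torch

def pick_chunks(samples: list[torch.Tensor], max_chunk_size: int, experimental: bool) -> list[torch.Tensor]:
    if experimental:
        # Experimental mode: concatenate all samples, then halve the chunk
        # size until it fits under the configured maximum, and split evenly.
        concat = torch.cat(samples, dim=-1)
        chunk_size = concat.shape[-1]
        while chunk_size > max_chunk_size:
            chunk_size = int(chunk_size / 2)
        return list(torch.chunk(concat, int(concat.shape[-1] / chunk_size) + 1, dim=-1))
    # Default mode: derive a common "best fit" size from the samples,
    # clamp it to the maximum, and split each sample with it.
    chunk_size = min(max(s.shape[-1] for s in samples), max_chunk_size)
    chunks: list[torch.Tensor] = []
    for s in samples:
        chunks.extend(torch.split(s, chunk_size, dim=-1))
    return chunks
```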
tortoise/api.py
@@ -294,7 +294,7 @@ class TextToSpeech:
         if self.preloaded_tensors:
             self.cvvp = self.cvvp.to(self.device)
 
-    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, chunk_size=None, max_chunk_size=None, chunk_tensors=True, calculation_mode=0):
+    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, chunk_size=None, max_chunk_size=None, chunk_tensors=True, calculation_mode=1):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
         These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@@ -339,7 +339,6 @@ class TextToSpeech:
         diffusion_conds = []
         chunks = []
 
-
         # new behavior: combine all samples, and divide accordingly
         # doesn't work, need to fix
         if calculation_mode == 1:
@@ -349,9 +348,9 @@ class TextToSpeech:
 
             if max_chunk_size is not None and chunk_size > max_chunk_size:
                 while chunk_size > max_chunk_size:
-                    chunk_size = chunk_size / 2
+                    chunk_size = int(chunk_size / 2)
 
-            print(f"Size of best fit: {chunk_size}")
+            print(f"Using method 1: size of best fit: {chunk_size}")
             chunks = torch.chunk(concat, int(concat.shape[-1] / chunk_size) + 1, dim=1)
         # default new behavior: use the smallest voice sample as a common chunk size
         else:
@@ -362,7 +361,7 @@ class TextToSpeech:
                 else:
                     chunk_size = sample.shape[-1] if chunk_size is None else max( chunk_size, sample.shape[-1] )
 
-            print(f"Size of best fit: {chunk_size}")
+            print(f"Using method 0: size of best fit: {chunk_size}")
             if max_chunk_size is not None and chunk_size > max_chunk_size:
                 chunk_size = max_chunk_size
                 print(f"Chunk size exceeded, clamping to: {max_chunk_size}")
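As a quick sanity check on the `torch.chunk` call above: it takes a number of chunks, not a chunk size, so the code derives the count from the total length. A standalone example with made-up sizes:

```python
import torch

concat = torch.zeros(1, 10_000)  # stand-in for the concatenated voice audio
chunk_size = 3_000               # stand-in for the halved "best fit" size

# int(10000 / 3000) + 1 == 4 requested chunks; torch.chunk then splits
# the last dimension as evenly as it can.
chunks = torch.chunk(concat, int(concat.shape[-1] / chunk_size) + 1, dim=1)
print([c.shape[-1] for c in chunks])  # [2500, 2500, 2500, 2500]
```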
webui.py (30 changed lines)
@@ -305,6 +305,27 @@ def generate(
         stats,
     )
 
+def compute_latents(voice, mode, progress=gr.Progress(track_tqdm=True)):
+    global tts
+    try:
+        tts
+    except NameError:
+        raise gr.Error("TTS is still initializing...")
+
+    voice_samples, conditioning_latents = load_voice(voice, load_latents=False)
+
+    if voice_samples is None:
+        return
+
+    conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size, calculation_mode=1 if mode else 0)
+
+    if len(conditioning_latents) == 4:
+        conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
+
+    torch.save(conditioning_latents, f'{get_voice_dir()}/{voice}/cond_latents.pth')
+
+    return voice
+
 def update_presets(value):
     PRESETS = {
         'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
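Since `compute_latents` persists the tuple with a plain `torch.save`, the cache should be inspectable the same way. A minimal sketch, with the path hard-coded where the code above uses `get_voice_dir()`, and `my_voice` a hypothetical voice folder:

```python
import torch

voice = "my_voice"                            # hypothetical voice folder name
path = f"./voices/{voice}/cond_latents.pth"   # stand-in for get_voice_dir()

latents = torch.load(path)
# Expect a tuple of conditioning tensors; the fourth slot is None when the
# mel spectrograms were stripped (see the len == 4 branch above).
print(len(latents), [None if l is None else tuple(l.shape) for l in latents])
```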
@@ -467,7 +488,7 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
 
     settings = {
         'share': args.share,
-        'listen': args.listen,
+        'listen': None if args.listen else args.listen,
         'low-vram':args.low_vram,
         'check-for-updates':args.check_for_updates,
         'models-from-local-only':args.models_from_local_only,
@@ -613,6 +634,13 @@ def setup_gradio():
             inputs=None,
             outputs=voice
         )
+        gr.Button(value="(Re)Compute Voice Latents").click(compute_latents,
+            inputs=[
+                voice,
+                gr.Checkbox(label="Experimental Compute Voice Latents Mode", value=True)
+            ],
+            outputs=voice,
+        )
 
         prompt.change(fn=lambda value: gr.update(value="Custom"),
             inputs=prompt,
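Worth noting about the wiring above: the `gr.Checkbox` is constructed inline inside the `inputs` list, which appears to work because this runs inside the `setup_gradio()` Blocks context; the checkbox is rendered wherever it is created in the layout. A minimal standalone sketch of the same pattern, with all component names illustrative:

```python
import gradio as gr

def compute(voice: str, experimental: bool) -> str:
    # Stand-in for compute_latents(); just echoes its inputs.
    return f"computed latents for {voice} (experimental={experimental})"

with gr.Blocks() as demo:
    voice = gr.Textbox(label="Voice", value="my_voice")
    experimental = gr.Checkbox(label="Experimental Compute Voice Latents Mode", value=True)
    result = gr.Textbox(label="Result")
    gr.Button("(Re)Compute Voice Latents").click(compute, inputs=[voice, experimental], outputs=result)

demo.launch()
```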