Added two flags/settings: embed output settings, slimmer computed voice latents

This commit is contained in:
mrq 2023-02-08 14:14:28 +00:00
parent 94eab20529
commit 81e4d261b7
2 changed files with 32 additions and 10 deletions

View File

@ -161,6 +161,8 @@ Below are settings that override the default launch arguments. Some of these req
* `Public Share Gradio`: overrides `--share`. Tells Gradio to generate a public URL for the web UI * `Public Share Gradio`: overrides `--share`. Tells Gradio to generate a public URL for the web UI
* `Check for Updates`: checks for updates on page load and notifies in console. Only works if you pulled this repo from a gitea instance. * `Check for Updates`: checks for updates on page load and notifies in console. Only works if you pulled this repo from a gitea instance.
* `Low VRAM`: disables optimizations in TorToiSe that increases VRAM consumption. Suggested if your GPU has under 6GiB. * `Low VRAM`: disables optimizations in TorToiSe that increases VRAM consumption. Suggested if your GPU has under 6GiB.
* `Embed Output Metadata`: embeds the settings and latents used to generate an audio clip inside that clip's file. The metadata is stored as a JSON string in the `lyrics` tag.
* `Slimmer Computed Latents`: falls back to the original, 12.9KiB format for storing latents (omitting the extra data required for using the CVVP model).
* `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors. * `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
* `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave to 0 to calculate a "best" fit. * `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave to 0 to calculate a "best" fit.
* `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that updates other settings while generating audio clips. * `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that updates other settings while generating audio clips.
@ -208,6 +210,11 @@ I'm providing this even with its nasty warts to highlight the quirks: the weird
I think this also highlights how just combining your entire source sample gung-ho isn't a good idea, as he's not as high of a pitch in his delivery compared to how he usually is throughout most of the game (a sort of average between his two ranges). I can't gauge how well it did in reproducing it, since my ears are pretty much burnt out from listening to so many clips, but I believe he's pretty believable as a James Sunderland. I think this also highlights how just combining your entire source sample gung-ho isn't a good idea, as he's not as high of a pitch in his delivery compared to how he usually is throughout most of the game (a sort of average between his two ranges). I can't gauge how well it did in reproducing it, since my ears are pretty much burnt out from listening to so many clips, but I believe he's pretty believable as a James Sunderland.
Output (`Is that really you, Mary?`, Ultra Fast preset, settings and latents embedded)
* https://files.catbox.moe/gy1jvz.wav
This was just a quick test of an adjustable setting, but it happened to turn out really nicely. It's not the original delivery, and it definitely still sounds robotic, but that's to be expected on the Ultra Fast preset.
## Caveats (and Upsides) ## Caveats (and Upsides)
To me, I find a few problems with TorToiSe over 11.AI: To me, I find a few problems with TorToiSe over 11.AI:

35
app.py
View File

@ -36,7 +36,10 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
if voice_samples is not None: if voice_samples is not None:
sample_voice = voice_samples[0] sample_voice = voice_samples[0]
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=True, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size) conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
if len(conditioning_latents) == 4:
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
if voice != "microphone": if voice != "microphone":
torch.save(conditioning_latents, f'./tortoise/voices/{voice}/cond_latents.pth') torch.save(conditioning_latents, f'./tortoise/voices/{voice}/cond_latents.pth')
voice_samples = None voice_samples = None
@ -177,13 +180,13 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
with open(f'./tortoise/voices/{voice}/cond_latents.pth', 'rb') as f: with open(f'./tortoise/voices/{voice}/cond_latents.pth', 'rb') as f:
info['latents'] = base64.b64encode(f.read()).decode("ascii") info['latents'] = base64.b64encode(f.read()).decode("ascii")
if args.embed_output_metadata:
for path in audio_cache:
info['text'] = audio_cache[path]['text']
for path in audio_cache: metadata = music_tag.load_file(f"{outdir}/{path}")
info['text'] = audio_cache[path]['text'] metadata['lyrics'] = json.dumps(info)
metadata.save()
metadata = music_tag.load_file(f"{outdir}/{path}")
metadata['lyrics'] = json.dumps(info)
metadata.save()
if sample_voice is not None: if sample_voice is not None:
sample_voice = (tts.input_sample_rate, sample_voice.squeeze().cpu().numpy()) sample_voice = (tts.input_sample_rate, sample_voice.squeeze().cpu().numpy())
@ -318,12 +321,14 @@ def check_for_updates():
def update_voices(): def update_voices():
return gr.Dropdown.update(choices=sorted(os.listdir("./tortoise/voices")) + ["microphone"]) return gr.Dropdown.update(choices=sorted(os.listdir("./tortoise/voices")) + ["microphone"])
def export_exec_settings( share, check_for_updates, low_vram, cond_latent_max_chunk_size, sample_batch_size, concurrency_count ): def export_exec_settings( share, check_for_updates, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count ):
args.share = share args.share = share
args.low_vram = low_vram args.low_vram = low_vram
args.check_for_updates = check_for_updates args.check_for_updates = check_for_updates
args.cond_latent_max_chunk_size = cond_latent_max_chunk_size args.cond_latent_max_chunk_size = cond_latent_max_chunk_size
args.sample_batch_size = sample_batch_size args.sample_batch_size = sample_batch_size
args.embed_output_metadata = embed_output_metadata
args.latents_lean_and_mean = latents_lean_and_mean
args.concurrency_count = concurrency_count args.concurrency_count = concurrency_count
settings = { settings = {
@ -332,6 +337,8 @@ def export_exec_settings( share, check_for_updates, low_vram, cond_latent_max_ch
'check-for-updates':args.check_for_updates, 'check-for-updates':args.check_for_updates,
'cond-latent-max-chunk-size': args.cond_latent_max_chunk_size, 'cond-latent-max-chunk-size': args.cond_latent_max_chunk_size,
'sample-batch-size': args.sample_batch_size, 'sample-batch-size': args.sample_batch_size,
'embed-output-metadata': args.embed_output_metadata,
'latents-lean-and-mean': args.latents_lean_and_mean,
'concurrency-count': args.concurrency_count, 'concurrency-count': args.concurrency_count,
} }
@ -438,6 +445,8 @@ def main():
exec_arg_share = gr.Checkbox(label="Public Share Gradio", value=args.share) exec_arg_share = gr.Checkbox(label="Public Share Gradio", value=args.share)
exec_check_for_updates = gr.Checkbox(label="Check For Updates", value=args.check_for_updates) exec_check_for_updates = gr.Checkbox(label="Check For Updates", value=args.check_for_updates)
exec_arg_low_vram = gr.Checkbox(label="Low VRAM", value=args.low_vram) exec_arg_low_vram = gr.Checkbox(label="Low VRAM", value=args.low_vram)
exec_arg_embed_output_metadata = gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata)
exec_arg_latents_lean_and_mean = gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean)
exec_arg_cond_latent_max_chunk_size = gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size) exec_arg_cond_latent_max_chunk_size = gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size)
exec_arg_sample_batch_size = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size) exec_arg_sample_batch_size = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size)
exec_arg_concurrency_count = gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count) exec_arg_concurrency_count = gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count)
@ -448,7 +457,7 @@ def main():
check_updates_now = gr.Button(value="Check for Updates") check_updates_now = gr.Button(value="Check for Updates")
exec_inputs = [exec_arg_share, exec_check_for_updates, exec_arg_low_vram, exec_arg_cond_latent_max_chunk_size, exec_arg_sample_batch_size, exec_arg_concurrency_count] exec_inputs = [exec_arg_share, exec_check_for_updates, exec_arg_low_vram, exec_arg_embed_output_metadata, exec_arg_latents_lean_and_mean, exec_arg_cond_latent_max_chunk_size, exec_arg_sample_batch_size, exec_arg_concurrency_count]
for i in exec_inputs: for i in exec_inputs:
i.change( i.change(
@ -502,8 +511,10 @@ if __name__ == "__main__":
'share': False, 'share': False,
'check-for-updates': False, 'check-for-updates': False,
'low-vram': False, 'low-vram': False,
'cond-latent-max-chunk-size': 1000000,
'sample-batch-size': None, 'sample-batch-size': None,
'embed-output-metadata': True,
'latents-lean-and-mean': True,
'cond-latent-max-chunk-size': 1000000,
'concurrency-count': 3, 'concurrency-count': 3,
} }
@ -517,11 +528,15 @@ if __name__ == "__main__":
parser.add_argument("--share", action='store_true', default=default_arguments['share'], help="Lets Gradio return a public URL to use anywhere") parser.add_argument("--share", action='store_true', default=default_arguments['share'], help="Lets Gradio return a public URL to use anywhere")
parser.add_argument("--check-for-updates", action='store_true', default=default_arguments['check-for-updates'], help="Checks for update on startup") parser.add_argument("--check-for-updates", action='store_true', default=default_arguments['check-for-updates'], help="Checks for update on startup")
parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage") parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage")
# `store_true`: passing the flag sets `no_embed_output_metadata` to True, which is inverted into `args.embed_output_metadata` right after parsing. With `store_false` the flag could only ever store False, so it could never disable embedding (and re-enabled it when the saved default was off).
parser.add_argument("--no-embed-output-metadata", action='store_true', default=not default_arguments['embed-output-metadata'], help="Disables embedding output metadata into resulting WAV files for easily fetching its settings used with the web UI (data is stored in the lyrics metadata tag)")
parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.")
parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents") parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents") parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once") parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
args = parser.parse_args() args = parser.parse_args()
args.embed_output_metadata = not args.no_embed_output_metadata
if not args.share: if not args.share:
def noop(function, return_value=None): def noop(function, return_value=None):
def wrapped(*args, **kwargs): def wrapped(*args, **kwargs):