Added encoding and ripping latents used to generate the voice

2023-02-06 16:32:09 +00:00 · 2023-02-06 16:32:09 +00:00 · d1172ead36
commit d1172ead36
parent e25ec325fe
1 changed files with 34 additions and 6 deletions
--- a/app.py
+++ b/app.py
@ -5,6 +5,7 @@ import torch
 import torchaudio
 import time
 import json
+import base64

 from datetime import datetime
 from tortoise.api import TextToSpeech
@ -31,6 +32,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
    if voice_samples is not None:
        sample_voice = voice_samples[0]
        conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
+        if voice != "microphone":
            torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth'))
        voice_samples = None
    else:
@ -39,6 +41,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
    if seed == 0:
        seed = None

+    print(conditioning_latents)
+
    start_time = time.time()

    settings = {
@ -150,6 +154,9 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
    with open(os.path.join(outdir, f'input.txt'), 'w', encoding="utf-8") as f:
        f.write(json.dumps(info, indent='\t') )

+    if voice is not None and conditioning_latents is not None:
+        with open(os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth'), 'rb') as f:
+            info['latents'] = base64.b64encode(f.read()).decode("ascii")

    print(f"Saved to '{outdir}'")

@ -185,17 +192,34 @@ def update_presets(value):
    else:
        return (gr.update(), gr.update())

-def read_metadata(file):
+def read_metadata(file, save_latents=True):
    j = None
+    latents = None
+
    if file is not None:
        metadata = music_tag.load_file(file.name)
        if 'lyrics' in metadata:
            j = json.loads(str(metadata['lyrics']))
-            print(j)
-    return j
+
+            if 'latents' in j and save_latents:
+                latents = base64.b64decode(j['latents'])
+                del j['latents']
+
+    if latents and save_latents:
+        outdir='/voices/.temp/'
+        os.makedirs(os.path.join(outdir), exist_ok=True)
+        with open(os.path.join(outdir, 'cond_latents.pth'), 'wb') as f:
+            f.write(latents)
+        latents = os.path.join(outdir, 'cond_latents.pth')
+
+    return (
+        j,
+        latents
+    )

 def copy_settings(file):
-    metadata = read_metadata(file)
+    metadata, latents = read_metadata(file, save_latents=False)
+    
    if metadata is None:
        return None

@ -330,11 +354,15 @@ def main():
                    copy_button = gr.Button(value="Copy Settings")
                with gr.Column():
                    metadata_out = gr.JSON(label="Audio Metadata")
+                    latents_out = gr.File(type="binary", label="Voice Latents")

                    audio_in.upload(
                        fn=read_metadata,
                        inputs=audio_in,
-                        outputs=metadata_out,
+                        outputs=[
+                            metadata_out,
+                            latents_out
+                        ]
                    )

                    copy_button.click(copy_settings,