1
1
forked from mrq/tortoise-tts

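Added base64 encoding of the latents used to generate the voice into the generation metadata, and ripping (extracting) those latents back out of an uploaded file's metadata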

This commit is contained in:
mrq 2023-02-06 16:32:09 +00:00
parent e25ec325fe
commit d1172ead36

40
app.py
View File

@ -5,6 +5,7 @@ import torch
import torchaudio import torchaudio
import time import time
import json import json
import base64
from datetime import datetime from datetime import datetime
from tortoise.api import TextToSpeech from tortoise.api import TextToSpeech
@ -31,7 +32,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
if voice_samples is not None: if voice_samples is not None:
sample_voice = voice_samples[0] sample_voice = voice_samples[0]
conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size) conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth')) if voice != "microphone":
torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth'))
voice_samples = None voice_samples = None
else: else:
sample_voice = None sample_voice = None
@ -39,6 +41,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
if seed == 0: if seed == 0:
seed = None seed = None
print(conditioning_latents)
start_time = time.time() start_time = time.time()
settings = { settings = {
@ -150,6 +154,9 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
with open(os.path.join(outdir, f'input.txt'), 'w', encoding="utf-8") as f: with open(os.path.join(outdir, f'input.txt'), 'w', encoding="utf-8") as f:
f.write(json.dumps(info, indent='\t') ) f.write(json.dumps(info, indent='\t') )
if voice is not None and conditioning_latents is not None:
with open(os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth'), 'rb') as f:
info['latents'] = base64.b64encode(f.read()).decode("ascii")
print(f"Saved to '{outdir}'") print(f"Saved to '{outdir}'")
@ -185,17 +192,34 @@ def update_presets(value):
else: else:
return (gr.update(), gr.update()) return (gr.update(), gr.update())
def read_metadata(file): def read_metadata(file, save_latents=True):
j = None j = None
latents = None
if file is not None: if file is not None:
metadata = music_tag.load_file(file.name) metadata = music_tag.load_file(file.name)
if 'lyrics' in metadata: if 'lyrics' in metadata:
j = json.loads(str(metadata['lyrics'])) j = json.loads(str(metadata['lyrics']))
print(j)
return j if 'latents' in j and save_latents:
latents = base64.b64decode(j['latents'])
del j['latents']
if latents and save_latents:
outdir='/voices/.temp/'
os.makedirs(os.path.join(outdir), exist_ok=True)
with open(os.path.join(outdir, 'cond_latents.pth'), 'wb') as f:
f.write(latents)
latents = os.path.join(outdir, 'cond_latents.pth')
return (
j,
latents
)
def copy_settings(file): def copy_settings(file):
metadata = read_metadata(file) metadata, latents = read_metadata(file, save_latents=False)
if metadata is None: if metadata is None:
return None return None
@ -330,11 +354,15 @@ def main():
copy_button = gr.Button(value="Copy Settings") copy_button = gr.Button(value="Copy Settings")
with gr.Column(): with gr.Column():
metadata_out = gr.JSON(label="Audio Metadata") metadata_out = gr.JSON(label="Audio Metadata")
latents_out = gr.File(type="binary", label="Voice Latents")
audio_in.upload( audio_in.upload(
fn=read_metadata, fn=read_metadata,
inputs=audio_in, inputs=audio_in,
outputs=metadata_out, outputs=[
metadata_out,
latents_out
]
) )
copy_button.click(copy_settings, copy_button.click(copy_settings,