forked from mrq/tortoise-tts
Added encoding of the voice-conditioning latents into the output metadata, and ripping (extracting) them back out of a generated file
This commit is contained in:
parent
e25ec325fe
commit
d1172ead36
38
app.py
38
app.py
|
@ -5,6 +5,7 @@ import torch
|
|||
import torchaudio
|
||||
import time
|
||||
import json
|
||||
import base64
|
||||
|
||||
from datetime import datetime
|
||||
from tortoise.api import TextToSpeech
|
||||
|
@ -31,6 +32,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
|
|||
if voice_samples is not None:
|
||||
sample_voice = voice_samples[0]
|
||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
|
||||
if voice != "microphone":
|
||||
torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth'))
|
||||
voice_samples = None
|
||||
else:
|
||||
|
@ -39,6 +41,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
|
|||
if seed == 0:
|
||||
seed = None
|
||||
|
||||
print(conditioning_latents)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
settings = {
|
||||
|
@ -150,6 +154,9 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
|
|||
with open(os.path.join(outdir, f'input.txt'), 'w', encoding="utf-8") as f:
|
||||
f.write(json.dumps(info, indent='\t') )
|
||||
|
||||
if voice is not None and conditioning_latents is not None:
|
||||
with open(os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth'), 'rb') as f:
|
||||
info['latents'] = base64.b64encode(f.read()).decode("ascii")
|
||||
|
||||
print(f"Saved to '{outdir}'")
|
||||
|
||||
|
@ -185,17 +192,34 @@ def update_presets(value):
|
|||
else:
|
||||
return (gr.update(), gr.update())
|
||||
|
||||
def read_metadata(file):
|
||||
def read_metadata(file, save_latents=True):
    """Read the JSON settings embedded in an audio file's ``lyrics`` tag.

    Args:
        file: An uploaded file object exposing a ``.name`` path, or ``None``.
        save_latents: When true and the metadata holds a base64 ``latents``
            entry, decode it, write it to ``cond_latents.pth`` in the
            temporary voice directory, and remove the entry from the
            returned metadata. When false the entry is left untouched.

    Returns:
        A ``(metadata, latents)`` tuple. ``metadata`` is the parsed JSON, or
        ``None`` when no file was given or it carries no ``lyrics`` tag.
        ``latents`` is the path of the written latents file, or ``None``
        when nothing was written.
    """
    j = None
    latents = None

    if file is not None:
        metadata = music_tag.load_file(file.name)
        if 'lyrics' in metadata:
            j = json.loads(str(metadata['lyrics']))
            print(j)

    # Guard against missing metadata: the membership test on ``None`` used
    # to raise TypeError whenever no file or no 'lyrics' tag was present.
    if save_latents and isinstance(j, dict) and 'latents' in j:
        latents = base64.b64decode(j['latents'])
        del j['latents']

    if latents:
        # NOTE(review): this path is rooted at the filesystem root, unlike
        # the './tortoise/voices/...' paths used elsewhere -- confirm intent.
        outdir = '/voices/.temp/'
        os.makedirs(outdir, exist_ok=True)
        latents_path = os.path.join(outdir, 'cond_latents.pth')
        with open(latents_path, 'wb') as f:
            f.write(latents)
        latents = latents_path

    return (
        j,
        latents
    )
|
||||
|
||||
def copy_settings(file):
|
||||
metadata = read_metadata(file)
|
||||
metadata, latents = read_metadata(file, save_latents=False)
|
||||
|
||||
if metadata is None:
|
||||
return None
|
||||
|
||||
|
@ -330,11 +354,15 @@ def main():
|
|||
copy_button = gr.Button(value="Copy Settings")
|
||||
with gr.Column():
|
||||
metadata_out = gr.JSON(label="Audio Metadata")
|
||||
latents_out = gr.File(type="binary", label="Voice Latents")
|
||||
|
||||
audio_in.upload(
|
||||
fn=read_metadata,
|
||||
inputs=audio_in,
|
||||
outputs=metadata_out,
|
||||
outputs=[
|
||||
metadata_out,
|
||||
latents_out
|
||||
]
|
||||
)
|
||||
|
||||
copy_button.click(copy_settings,
|
||||
|
|
Loading…
Reference in New Issue
Block a user