forked from mrq/tortoise-tts
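Added base64 encoding of the conditioning latents used to generate the voice into the generation metadata, and ripping (extracting) those latents back out of an uploaded file's metadata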
This commit is contained in:
parent
e25ec325fe
commit
d1172ead36
40
app.py
40
app.py
|
@ -5,6 +5,7 @@ import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
|
import base64
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from tortoise.api import TextToSpeech
|
from tortoise.api import TextToSpeech
|
||||||
|
@ -31,7 +32,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
|
||||||
if voice_samples is not None:
|
if voice_samples is not None:
|
||||||
sample_voice = voice_samples[0]
|
sample_voice = voice_samples[0]
|
||||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
|
conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
|
||||||
torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth'))
|
if voice != "microphone":
|
||||||
|
torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth'))
|
||||||
voice_samples = None
|
voice_samples = None
|
||||||
else:
|
else:
|
||||||
sample_voice = None
|
sample_voice = None
|
||||||
|
@ -39,6 +41,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
|
||||||
if seed == 0:
|
if seed == 0:
|
||||||
seed = None
|
seed = None
|
||||||
|
|
||||||
|
print(conditioning_latents)
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
settings = {
|
settings = {
|
||||||
|
@ -150,6 +154,9 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
|
||||||
with open(os.path.join(outdir, f'input.txt'), 'w', encoding="utf-8") as f:
|
with open(os.path.join(outdir, f'input.txt'), 'w', encoding="utf-8") as f:
|
||||||
f.write(json.dumps(info, indent='\t') )
|
f.write(json.dumps(info, indent='\t') )
|
||||||
|
|
||||||
|
if voice is not None and conditioning_latents is not None:
|
||||||
|
with open(os.path.join(f'./tortoise/voices/{voice}/', f'cond_latents.pth'), 'rb') as f:
|
||||||
|
info['latents'] = base64.b64encode(f.read()).decode("ascii")
|
||||||
|
|
||||||
print(f"Saved to '{outdir}'")
|
print(f"Saved to '{outdir}'")
|
||||||
|
|
||||||
|
@ -185,17 +192,34 @@ def update_presets(value):
|
||||||
else:
|
else:
|
||||||
return (gr.update(), gr.update())
|
return (gr.update(), gr.update())
|
||||||
|
|
||||||
def read_metadata(file):
|
def read_metadata(file, save_latents=True):
|
||||||
j = None
|
j = None
|
||||||
|
latents = None
|
||||||
|
|
||||||
if file is not None:
|
if file is not None:
|
||||||
metadata = music_tag.load_file(file.name)
|
metadata = music_tag.load_file(file.name)
|
||||||
if 'lyrics' in metadata:
|
if 'lyrics' in metadata:
|
||||||
j = json.loads(str(metadata['lyrics']))
|
j = json.loads(str(metadata['lyrics']))
|
||||||
print(j)
|
|
||||||
return j
|
if 'latents' in j and save_latents:
|
||||||
|
latents = base64.b64decode(j['latents'])
|
||||||
|
del j['latents']
|
||||||
|
|
||||||
|
if latents and save_latents:
|
||||||
|
outdir='/voices/.temp/'
|
||||||
|
os.makedirs(os.path.join(outdir), exist_ok=True)
|
||||||
|
with open(os.path.join(outdir, 'cond_latents.pth'), 'wb') as f:
|
||||||
|
f.write(latents)
|
||||||
|
latents = os.path.join(outdir, 'cond_latents.pth')
|
||||||
|
|
||||||
|
return (
|
||||||
|
j,
|
||||||
|
latents
|
||||||
|
)
|
||||||
|
|
||||||
def copy_settings(file):
|
def copy_settings(file):
|
||||||
metadata = read_metadata(file)
|
metadata, latents = read_metadata(file, save_latents=False)
|
||||||
|
|
||||||
if metadata is None:
|
if metadata is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -330,11 +354,15 @@ def main():
|
||||||
copy_button = gr.Button(value="Copy Settings")
|
copy_button = gr.Button(value="Copy Settings")
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
metadata_out = gr.JSON(label="Audio Metadata")
|
metadata_out = gr.JSON(label="Audio Metadata")
|
||||||
|
latents_out = gr.File(type="binary", label="Voice Latents")
|
||||||
|
|
||||||
audio_in.upload(
|
audio_in.upload(
|
||||||
fn=read_metadata,
|
fn=read_metadata,
|
||||||
inputs=audio_in,
|
inputs=audio_in,
|
||||||
outputs=metadata_out,
|
outputs=[
|
||||||
|
metadata_out,
|
||||||
|
latents_out
|
||||||
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
copy_button.click(copy_settings,
|
copy_button.click(copy_settings,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user