forked from mrq/tortoise-tts
Added multi-line parsing
This commit is contained in:
parent
3e3634f36a
commit
f38c479e9b
60
app.py
60
app.py
|
@ -8,8 +8,9 @@ import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from tortoise.api import TextToSpeech
|
from tortoise.api import TextToSpeech
|
||||||
from tortoise.utils.audio import load_audio, load_voice, load_voices
|
from tortoise.utils.audio import load_audio, load_voice, load_voices
|
||||||
|
from tortoise.utils.text import split_and_recombine_text
|
||||||
|
|
||||||
def generate(text, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, progress=gr.Progress()):
|
def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, progress=gr.Progress()):
|
||||||
if voice != "microphone":
|
if voice != "microphone":
|
||||||
voices = [voice]
|
voices = [voice]
|
||||||
else:
|
else:
|
||||||
|
@ -58,35 +59,64 @@ def generate(text, emotion, prompt, voice, mic_audio, preset, seed, candidates,
|
||||||
'progress': progress,
|
'progress': progress,
|
||||||
}
|
}
|
||||||
|
|
||||||
gen, additionals = tts.tts( text, **settings )
|
if delimiter == "\\n":
|
||||||
seed = additionals[0]
|
delimiter = "\n"
|
||||||
|
|
||||||
|
if delimiter != "" and delimiter in text:
|
||||||
|
texts = text.split(delimiter)
|
||||||
|
else:
|
||||||
|
texts = split_and_recombine_text(text)
|
||||||
|
|
||||||
info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Diffusion Sampler: {diffusion_sampler} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n".encode('utf8')
|
|
||||||
with open("results.log", "w") as f:
|
|
||||||
f.write(info)
|
|
||||||
|
|
||||||
timestamp = int(time.time())
|
timestamp = int(time.time())
|
||||||
outdir = f"./results/{voice}/{timestamp}/"
|
outdir = f"./results/{voice}/{timestamp}/"
|
||||||
|
|
||||||
os.makedirs(outdir, exist_ok=True)
|
os.makedirs(outdir, exist_ok=True)
|
||||||
|
|
||||||
with open(os.path.join(outdir, f'input.txt'), 'w') as f:
|
# to-do: store audio to array to avoid having to re-read from disk when combining
|
||||||
f.write(f"{info}")
|
# to-do: do not rejoin when not splitting lines
|
||||||
|
|
||||||
|
for line, cut_text in enumerate(texts):
|
||||||
|
print(f"[{str(line+1)}/{str(len(texts))}] Generating line: {cut_text}")
|
||||||
|
|
||||||
|
gen, additionals = tts.tts(cut_text, **settings )
|
||||||
|
seed = additionals[0]
|
||||||
|
|
||||||
if isinstance(gen, list):
|
if isinstance(gen, list):
|
||||||
for j, g in enumerate(gen):
|
for j, g in enumerate(gen):
|
||||||
torchaudio.save(os.path.join(outdir, f'result_{j}.wav'), g.squeeze(0).cpu(), 24000)
|
os.makedirs(os.path.join(outdir, f'candidate_{j}'), exist_ok=True)
|
||||||
|
torchaudio.save(os.path.join(outdir, f'candidate_{j}/result_{line}.wav'), g.squeeze(0).cpu(), 24000)
|
||||||
output_voice = gen[0]
|
|
||||||
else:
|
else:
|
||||||
torchaudio.save(os.path.join(outdir, f'result.wav'), gen.squeeze(0).cpu(), 24000)
|
torchaudio.save(os.path.join(outdir, f'result_{line}.wav'), gen.squeeze(0).cpu(), 24000)
|
||||||
output_voice = gen
|
|
||||||
|
|
||||||
output_voice = (24000, output_voice.squeeze().cpu().numpy())
|
for candidate in range(candidates):
|
||||||
|
audio_clips = []
|
||||||
|
for line in range(len(texts)):
|
||||||
|
if isinstance(gen, list):
|
||||||
|
wav_file = os.path.join(outdir, f'candidate_{candidate}/result_{line}.wav')
|
||||||
|
else:
|
||||||
|
wav_file = os.path.join(outdir, f'result_{line}.wav')
|
||||||
|
|
||||||
|
audio_clips.append(load_audio(wav_file, 24000))
|
||||||
|
audio_clips = torch.cat(audio_clips, dim=-1)
|
||||||
|
torchaudio.save(os.path.join(outdir, f'combined_{candidate}.wav'), audio_clips, 24000)
|
||||||
|
|
||||||
|
info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
|
||||||
|
|
||||||
|
with open(os.path.join(outdir, f'input.txt'), 'w', encoding="utf-8") as f:
|
||||||
|
f.write(info)
|
||||||
|
|
||||||
|
with open("results.log", "w", encoding="utf-8") as f:
|
||||||
|
f.write(info)
|
||||||
|
|
||||||
|
print(f"Saved to '{outdir}'")
|
||||||
|
|
||||||
|
output_voice = (24000, audio_clips.squeeze().cpu().numpy())
|
||||||
|
|
||||||
if sample_voice is not None:
|
if sample_voice is not None:
|
||||||
sample_voice = (22050, sample_voice.squeeze().cpu().numpy())
|
sample_voice = (22050, sample_voice.squeeze().cpu().numpy())
|
||||||
|
|
||||||
|
audio_clips = []
|
||||||
return (
|
return (
|
||||||
sample_voice,
|
sample_voice,
|
||||||
output_voice,
|
output_voice,
|
||||||
|
@ -112,6 +142,7 @@ def main():
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
text = gr.Textbox(lines=4, label="Prompt")
|
text = gr.Textbox(lines=4, label="Prompt")
|
||||||
|
delimiter = gr.Textbox(lines=1, label="Multi-Line Delimiter", placeholder="\\n")
|
||||||
|
|
||||||
emotion = gr.Radio(
|
emotion = gr.Radio(
|
||||||
["None", "Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom"],
|
["None", "Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom"],
|
||||||
|
@ -179,6 +210,7 @@ def main():
|
||||||
submit_event = submit.click(generate,
|
submit_event = submit.click(generate,
|
||||||
inputs=[
|
inputs=[
|
||||||
text,
|
text,
|
||||||
|
delimiter,
|
||||||
emotion,
|
emotion,
|
||||||
prompt,
|
prompt,
|
||||||
voice,
|
voice,
|
||||||
|
|
|
@ -170,7 +170,6 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la
|
||||||
noise = torch.randn(output_shape, device=latents.device) * temperature
|
noise = torch.randn(output_shape, device=latents.device) * temperature
|
||||||
|
|
||||||
mel = None
|
mel = None
|
||||||
print(f"Sampler: {sampler}")
|
|
||||||
if sampler == "P":
|
if sampler == "P":
|
||||||
mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
|
mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
|
||||||
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
|
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
|
||||||
|
|
Loading…
Reference in New Issue
Block a user