forked from mrq/tortoise-tts
Skip combining if not splitting; also avoid reading back the audio files to combine them, by keeping them in memory
parent f38c479e9b
commit 98dbf56d44
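In short, generate() now keeps every rendered clip in an in-memory dict (audio_cache) and stitches from those tensors, instead of writing result_{line}.wav files and immediately load_audio()-ing them back. A minimal sketch of that pattern, assuming torch and torchaudio; save_clip and stitch are illustrative names, not the repo's API:

import os
import torch
import torchaudio

# filename -> waveform tensor of shape (channels, samples)
audio_cache = {}

def save_clip(name, waveform, outdir="results", sample_rate=24000):
    # keep the tensor around for later stitching, and still persist it to disk
    audio_cache[name] = waveform
    os.makedirs(outdir, exist_ok=True)
    torchaudio.save(os.path.join(outdir, name), waveform, sample_rate)

def stitch(names, outdir="results", sample_rate=24000):
    # concatenate along the time axis; no round-trip through the .wav files
    combined = torch.cat([audio_cache[n] for n in names], dim=-1)
    torchaudio.save(os.path.join(outdir, "combined.wav"), combined, sample_rate)
    return combined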
README.md

@@ -86,6 +86,8 @@ If you're looking to access your copy of TorToiSe from outside your local network
 You'll be presented with a bunch of options, but do not be overwhelmed, as most of the defaults are sane, but below are a rough explanation on which input does what:
 * `Prompt`: text you want to be read. You wrap text in `[brackets]` for "prompt engineering", where it'll affect the output, but those words won't actually be read.
+* `Line Delimiter`: String to split the prompt into pieces. The stitched clip will be stored as `combined.wav`
+    - Setting this to `\n` will generate each line as one clip before stitching it.
 * `Emotion`: the "emotion" used for the delivery. This is a shortcut to utilizing "prompt engineering" by starting with `[I am really <emotion>,]` in your prompt. This is not a guarantee, however.
 * `Custom Emotion + Prompt`: a non-preset "emotion" used for the delivery. This is a shortcut to utilizing "prompt engineering" by starting with `[<emotion>]` in your prompt.
 * `Voice`: the voice you want to clone. You can select `microphone` if you want to use input from your microphone.
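To make the new `Line Delimiter` option concrete, a small hypothetical illustration of the split; the real logic lives inside generate() in app.py:

# hypothetical illustration, not code from the repo
prompt = "Hello there.\nHow are you today?"
delimiter = "\n"

texts = prompt.split(delimiter) if delimiter else [prompt]
# -> ["Hello there.", "How are you today?"]
# each piece is generated as result_{line}.wav, then stitched into combined.wav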
app.py (37 changed lines)
@@ -73,9 +73,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
     os.makedirs(outdir, exist_ok=True)
 
-    # to-do: store audio to array to avoid having to re-read from disk when combining
-    # to-do: do not rejoin when not splitting lines
-
+    audio_cache = {}
     for line, cut_text in enumerate(texts):
         print(f"[{str(line+1)}/{str(len(texts))}] Generating line: {cut_text}")
@@ -84,23 +83,38 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
         if isinstance(gen, list):
             for j, g in enumerate(gen):
-                os.makedirs(os.path.join(outdir, f'candidate_{j}'), exist_ok=True)
-                torchaudio.save(os.path.join(outdir, f'candidate_{j}/result_{line}.wav'), g.squeeze(0).cpu(), 24000)
+                audio = g.squeeze(0).cpu()
+                audio_cache[f"candidate_{j}/result_{line}.wav"] = audio
+
+                os.makedirs(os.path.join(outdir, f'candidate_{j}'), exist_ok=True)
+                torchaudio.save(os.path.join(outdir, f'candidate_{j}/result_{line}.wav'), audio, 24000)
         else:
-            torchaudio.save(os.path.join(outdir, f'result_{line}.wav'), gen.squeeze(0).cpu(), 24000)
+            audio = gen.squeeze(0).cpu()
+            audio_cache[f"result_{line}.wav"] = audio
+            torchaudio.save(os.path.join(outdir, f'result_{line}.wav'), audio, 24000)
 
-    for candidate in range(candidates):
-        audio_clips = []
-        for line in range(len(texts)):
-            if isinstance(gen, list):
-                wav_file = os.path.join(outdir, f'candidate_{candidate}/result_{line}.wav')
-            else:
-                wav_file = os.path.join(outdir, f'result_{line}.wav')
-            audio_clips.append(load_audio(wav_file, 24000))
-        audio_clips = torch.cat(audio_clips, dim=-1)
-        torchaudio.save(os.path.join(outdir, f'combined_{candidate}.wav'), audio_clips, 24000)
+    output_voice = None
+    if len(texts) > 1:
+        for candidate in range(candidates):
+            audio_clips = []
+            for line in range(len(texts)):
+                if isinstance(gen, list):
+                    piece = audio_cache[f'candidate_{candidate}/result_{line}.wav']
+                else:
+                    piece = audio_cache[f'result_{line}.wav']
+                audio_clips.append(piece)
+            audio_clips = torch.cat(audio_clips, dim=-1)
+            torchaudio.save(os.path.join(outdir, f'combined_{candidate}.wav'), audio_clips, 24000)
+
+            if output_voice is None:
+                output_voice = (24000, audio_clips.squeeze().cpu().numpy())
+    else:
+        if isinstance(gen, list):
+            output_voice = gen[0]
+        else:
+            output_voice = gen
+        output_voice = (24000, output_voice.squeeze().cpu().numpy())
 
     info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
 
     with open(os.path.join(outdir, f'input.txt'), 'w', encoding="utf-8") as f:
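The net effect of the hunk above, restated as a condensed, hypothetical sketch (pick_output is an illustrative name, not a function in app.py; the (sample_rate, ndarray) tuple is the format Gradio's audio output expects):

def pick_output(gen, texts, combined=None, sample_rate=24000):
    # lines were split: return the stitched clip built from audio_cache
    if len(texts) > 1:
        return (sample_rate, combined.squeeze().cpu().numpy())
    # single line: skip combining entirely and return the generation directly
    first = gen[0] if isinstance(gen, list) else gen
    return (sample_rate, first.squeeze().cpu().numpy())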
@@ -111,7 +125,6 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
     print(f"Saved to '{outdir}'")
 
-    output_voice = (24000, audio_clips.squeeze().cpu().numpy())
 
     if sample_voice is not None:
         sample_voice = (22050, sample_voice.squeeze().cpu().numpy())
@@ -142,7 +155,7 @@ def main():
         with gr.Row():
             with gr.Column():
                 text = gr.Textbox(lines=4, label="Prompt")
-                delimiter = gr.Textbox(lines=1, label="Multi-Line Delimiter", placeholder="\\n")
+                delimiter = gr.Textbox(lines=1, label="Line Delimiter", placeholder="\\n")
 
                 emotion = gr.Radio(
                     ["None", "Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom"],
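For context, a minimal, hypothetical Gradio snippet showing the renamed control in isolation; only the label text changes in this commit:

import gradio as gr

with gr.Blocks() as demo:
    text = gr.Textbox(lines=4, label="Prompt")
    # placeholder="\\n" renders the literal two characters \n, as in app.py
    delimiter = gr.Textbox(lines=1, label="Line Delimiter", placeholder="\\n")

demo.launch()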