From bc567d72632c39cd4047bdb77bab83e17381d75f Mon Sep 17 00:00:00 2001 From: mrq Date: Sun, 5 Feb 2023 06:35:32 +0000 Subject: [PATCH] Skip combining if not splitting, also avoids reading back the audio files to combine them by keeping them in memory --- README.md | 2 ++ app.py | 53 +++++++++++++++++++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 8d69647..625106b 100755 --- a/README.md +++ b/README.md @@ -86,6 +86,8 @@ If you're looking to access your copy of TorToiSe from outside your local networ You'll be presented with a bunch of options, but do not be overwhelmed, as most of the defaults are sane, but below is a rough explanation of which input does what: * `Prompt`: text you want to be read. You wrap text in `[brackets]` for "prompt engineering", where it'll affect the output, but those words won't actually be read. +* `Line Delimiter`: String to split the prompt into pieces. The stitched clip will be stored as `combined_<candidate>.wav` (one file per candidate). + - Setting this to `\n` will generate each line as one clip before stitching it. * `Emotion`: the "emotion" used for the delivery. This is a shortcut to utilizing "prompt engineering" by starting with `[I am really <emotion>,]` in your prompt. This is not a guarantee, however. * `Custom Emotion + Prompt`: a non-preset "emotion" used for the delivery. This is a shortcut to utilizing "prompt engineering" by starting with `[<emotion>]` in your prompt. * `Voice`: the voice you want to clone. You can select `microphone` if you want to use input from your microphone. 
diff --git a/app.py b/app.py index 2fb4bf2..73300ad 100755 --- a/app.py +++ b/app.py @@ -73,9 +73,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c os.makedirs(outdir, exist_ok=True) - # to-do: store audio to array to avoid having to re-read from disk when combining - # to-do: do not rejoin when not splitting lines - + + audio_cache = {} for line, cut_text in enumerate(texts): print(f"[{str(line+1)}/{str(len(texts))}] Generating line: {cut_text}") @@ -84,22 +83,37 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c if isinstance(gen, list): for j, g in enumerate(gen): - os.makedirs(os.path.join(outdir, f'candidate_{j}'), exist_ok=True) - torchaudio.save(os.path.join(outdir, f'candidate_{j}/result_{line}.wav'), g.squeeze(0).cpu(), 24000) - else: - torchaudio.save(os.path.join(outdir, f'result_{line}.wav'), gen.squeeze(0).cpu(), 24000) - - for candidate in range(candidates): - audio_clips = [] - for line in range(len(texts)): - if isinstance(gen, list): - wav_file = os.path.join(outdir, f'candidate_{candidate}/result_{line}.wav') - else: - wav_file = os.path.join(outdir, f'result_{line}.wav') + audio = g.squeeze(0).cpu() + audio_cache[f"candidate_{j}/result_{line}.wav"] = audio - audio_clips.append(load_audio(wav_file, 24000)) - audio_clips = torch.cat(audio_clips, dim=-1) - torchaudio.save(os.path.join(outdir, f'combined_{candidate}.wav'), audio_clips, 24000) + os.makedirs(os.path.join(outdir, f'candidate_{j}'), exist_ok=True) + torchaudio.save(os.path.join(outdir, f'candidate_{j}/result_{line}.wav'), audio, 24000) + else: + audio = gen.squeeze(0).cpu() + audio_cache[f"result_{line}.wav"] = audio + torchaudio.save(os.path.join(outdir, f'result_{line}.wav'), audio, 24000) + + output_voice = None + if len(texts) > 1: + for candidate in range(candidates): + audio_clips = [] + for line in range(len(texts)): + if isinstance(gen, list): + piece = audio_cache[f'candidate_{candidate}/result_{line}.wav'] + 
else: + piece = audio_cache[f'result_{line}.wav'] + audio_clips.append(piece) + audio_clips = torch.cat(audio_clips, dim=-1) + torchaudio.save(os.path.join(outdir, f'combined_{candidate}.wav'), audio_clips, 24000) + + if output_voice is None: + output_voice = (24000, audio_clips.squeeze().cpu().numpy()) + else: + if isinstance(gen, list): + output_voice = gen[0] + else: + output_voice = gen + output_voice = (24000, output_voice.squeeze().cpu().numpy()) info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n" @@ -111,7 +125,6 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c print(f"Saved to '{outdir}'") - output_voice = (24000, audio_clips.squeeze().cpu().numpy()) if sample_voice is not None: sample_voice = (22050, sample_voice.squeeze().cpu().numpy()) @@ -142,7 +155,7 @@ def main(): with gr.Row(): with gr.Column(): text = gr.Textbox(lines=4, label="Prompt") - delimiter = gr.Textbox(lines=1, label="Multi-Line Delimiter", placeholder="\\n") + delimiter = gr.Textbox(lines=1, label="Line Delimiter", placeholder="\\n") emotion = gr.Radio( ["None", "Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom"],