forked from mrq/tortoise-tts
New tunable: pause size/breathing room (governs pause at the end of clips)
parent ec31d1763a
commit 7b767e1442

README.md
@@ -87,7 +87,7 @@ If you're looking to access your copy of TorToiSe from outside your local network
 You'll be presented with a bunch of options, but do not be overwhelmed, as most of the defaults are sane; below is a rough explanation of what each input does:
 * `Prompt`: text you want to be read. You wrap text in `[brackets]` for "prompt engineering", where it'll affect the output, but those words won't actually be read.
 * `Line Delimiter`: String to split the prompt into pieces. The stitched clip will be stored as `combined.wav`
-	- Setting this to `\n` will generate each line as one clip before stitching it.
+	- Setting this to `\n` will generate each line as one clip before stitching it. Leave blank to disable this.
 * `Emotion`: the "emotion" used for the delivery. This is a shortcut to utilizing "prompt engineering" by starting with `[I am really <emotion>,]` in your prompt. This is not a guarantee, however.
 * `Custom Emotion + Prompt`: a non-preset "emotion" used for the delivery. This is a shortcut to utilizing "prompt engineering" by starting with `[<emotion>]` in your prompt.
 * `Voice`: the voice you want to clone. You can select `microphone` if you want to use input from your microphone.
@@ -97,7 +97,8 @@ You'll be presented with a bunch of options, but do not be overwhelmed, as most
 * `Preset`: shortcut values for sample count and iteration steps. Clicking a preset will update its corresponding values. Higher presets result in better quality at the cost of computation time.
 * `Samples`: analogous to samples in image generation. More samples = better resemblance / clone quality, at the cost of performance. This strictly affects clone quality.
 * `Iterations`: influences audio sound quality in the final output. More iterations = higher quality sound. This step is relatively cheap, so do not be discouraged from increasing this. This strictly affects quality in the actual sound.
-* `Temperature`: how much randomness to introduce to the generated samples. Lower values = better resemblance to the source samples, but some temperature is still required for great output. This value is very inconsistent and entirely depends on the input voice.
+* `Temperature`: how much randomness to introduce to the generated samples. Lower values = better resemblance to the source samples, but some temperature is still required for great output. This value is very inconsistent and entirely depends on the input voice. In other words, some voices will be receptive to playing with this value, while others won't make much of a difference.
+* `Pause Size`: governs how large pauses are at the end of a clip (in token size, not seconds). Increase this if your output gets cut off at the end.
 * `Diffusion Sampler`: sampler method during the diffusion pass. Currently, only `P` and `DDIM` are added, but neither seems to offer any substantial difference in my short tests.

 After you fill everything out, click `Run`, and wait for your output in the output window. The sampled voice is also returned, but if you're using multiple files, it'll return the first file, rather than a combined file.
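
For anyone scripting against the API directly rather than the web UI, here is a rough sketch of what these options translate to as a call to `TextToSpeech.tts`. The keyword names are taken from this commit's diff; the voice name `myvoice` and the constructor defaults are assumptions for illustration:

```python
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()

# Load reference clips for the voice to clone (assumes a 'myvoice' folder
# exists under the voices directory).
voice_samples, conditioning_latents = load_voice('myvoice')

# "Samples", "Iterations", "Temperature", and "Pause Size" from the UI
# map onto these keyword arguments.
gen = tts.tts(
    "[I am really happy,] This is the text that will actually be read.",
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    num_autoregressive_samples=128,  # Samples
    diffusion_iterations=128,        # Iterations
    temperature=0.2,                 # Temperature
    diffusion_sampler="P",           # Diffusion Sampler
    breathing_room=12,               # Pause Size (in tokens, not seconds)
)
```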

app.py (29 changed lines)
@@ -10,7 +10,7 @@ from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio, load_voice, load_voices
 from tortoise.utils.text import split_and_recombine_text

-def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, progress=gr.Progress()):
+def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, progress=gr.Progress()):
 	if voice != "microphone":
 		voices = [voice]
 	else:
@@ -51,6 +51,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
 		'return_deterministic_state': True,
 		'k': candidates,
 		'diffusion_sampler': diffusion_sampler,
+		'breathing_room': breathing_room,
 		'progress': progress,
 	}

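The dict is presumably unpacked into the synthesis call later in `generate()`; the call site isn't part of this hunk, so the following one-liner is an assumption about how the new key reaches the API:

```python
# Assumed call site (not shown in this hunk): every key in `settings`,
# including the new 'breathing_room', must match a keyword parameter
# of TextToSpeech.tts(), or the call raises TypeError.
gen, additionals = tts.tts(text, **settings)
```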
@@ -176,6 +177,15 @@ def main():
 					type="filepath",
 				)
+
+				prompt.change(fn=lambda value: gr.update(value="Custom"),
+					inputs=prompt,
+					outputs=emotion
+				)
+				mic_audio.change(fn=lambda value: gr.update(value="microphone"),
+					inputs=mic_audio,
+					outputs=voice
+				)
 			with gr.Column():
 				candidates = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates")
 				seed = gr.Number(value=0, precision=0, label="Seed")

@@ -185,10 +195,11 @@ def main():
 					label="Preset",
 					type="value",
 				)
-				num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples", interactive=True)
-				diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations", interactive=True)
+				num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
+				diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")

 				temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
+				breathing_room = gr.Slider(value=12, minimum=1, maximum=32, step=1, label="Pause Size")
 				diffusion_sampler = gr.Radio(
 					["P", "DDIM"],
 					value="P",
@@ -196,15 +207,6 @@ def main():
 					type="value",
 				)
-
-				prompt.change(fn=lambda value: gr.update(value="Custom"),
-					inputs=prompt,
-					outputs=emotion
-				)
-				mic_audio.change(fn=lambda value: gr.update(value="microphone"),
-					inputs=mic_audio,
-					outputs=voice
-				)

 		preset.change(fn=update_presets,
 			inputs=preset,
 			outputs=[
@@ -234,7 +236,8 @@ def main():
 				num_autoregressive_samples,
 				diffusion_iterations,
 				temperature,
-				diffusion_sampler
+				diffusion_sampler,
+				breathing_room
 			],
 			outputs=[selected_voice, output_audio, usedSeed],
 		)

tortoise/api.py
@@ -375,6 +375,7 @@ class TextToSpeech:
             # diffusion generation parameters follow
             diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
             diffusion_sampler="P",
+            breathing_room=8,
             progress=None,
             **hf_generate_kwargs):
         """
@@ -540,7 +541,7 @@ class TextToSpeech:
                     ctokens += 1
                 else:
                     ctokens = 0
-                if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
+                if ctokens > breathing_room: # `breathing_room` tokens give the diffusion model some "breathing room" to terminate speech.
                     latents = latents[:, :k]
                     break

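
The API default of `breathing_room=8` matches the previously hardcoded threshold, so callers that don't pass it get the old behavior, while the UI exposes a wider 1-32 range with a default of 12. Below is a self-contained sketch of the trimming logic; the calm-token id of 83 and the tensor shapes are assumptions for illustration:

```python
import torch

def trim_trailing_silence(codes: torch.Tensor, latents: torch.Tensor,
                          breathing_room: int = 8, calm_token: int = 83) -> torch.Tensor:
    """Cut the latent sequence once more than `breathing_room` consecutive
    calm (silence) tokens have been produced, leaving the diffusion model a
    short pause to terminate speech naturally.

    `calm_token=83` is assumed here for illustration; the real id lives in
    tortoise's api.py.
    """
    ctokens = 0
    for k in range(codes.shape[-1]):
        if codes[0, k] == calm_token:
            ctokens += 1   # count consecutive silence tokens
        else:
            ctokens = 0    # any speech token resets the run
        if ctokens > breathing_room:
            return latents[:, :k]  # trim everything past this point
    return latents

# A larger breathing_room keeps more trailing silence, which helps when
# the end of a clip is being cut off.
codes = torch.tensor([[10, 11, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83]])
latents = torch.randn(1, codes.shape[-1], 1024)
trimmed = trim_trailing_silence(codes, latents, breathing_room=4)
print(trimmed.shape)  # torch.Size([1, 6, 1024])
```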