1
1
forked from mrq/tortoise-tts
This commit is contained in:
mrq 2023-02-10 19:55:33 +00:00
parent a09eff5d9c
commit 8b83c9083d
3 changed files with 52 additions and 8 deletions

View File

@ -187,7 +187,9 @@ In this tab, you can find some helper utilities that might be of assistance.
For now, an analog to the PNG info found in Voldy's Stable Diffusion Web UI resides here. With it, you can upload an audio file generated with this web UI to view the settings used to generate that output. Additionally, the voice latents used to generate the uploaded audio clip can be extracted.
If you want to reuse its generation settings, simply click "Copy Settings".
If you want to reuse its generation settings, simply click `Copy Settings`.
To import a voice, click `Import Voice`. Remember to click `Refresh Voice List` in the `Generate` panel afterwards, if it's a new voice.
### Settings
@ -215,6 +217,11 @@ Below is an explanation of experimental flags. Messing with these might impact
* `CVVP Weight`: governs how much the CVVP model should influence candidates. The original documentation mentions this is deprecated as it does not really influence things, but you're still free to play around with it.
Currently, setting this requires regenerating your voice latents, as I forgot to have it return some extra data that weighing against the CVVP model uses. Oops.
Setting this to 1 leads to bad behavior.
* `Top P`: P value used in nucleus sampling; lower values mean the decoder produces more "likely" (aka boring) outputs.
* `Diffusion Temperature`: the variance of the noise fed into the diffusion model; values at 0 are the "mean" prediction of the diffusion network and will sound bland and smeared.
* `Length Penalty`: a length penalty applied to the autoregressive decoder; higher settings cause the model to produce more terse outputs.
* `Repetition Penalty`: a penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce the incidence of long silences or "uhhhhhhs", etc.
* `Conditioning-Free K`: determines the balance between the conditioning-free signal and the conditioning-present signal.
## Example(s)

View File

@ -80,7 +80,7 @@
"\n",
"mrq.args = mrq.setup_args()\n",
"mrq.webui = mrq.setup_gradio()\n",
"mrq.webui.launch(share=True, prevent_thread_lock=True)\n",
"mrq.webui.launch(share=True, prevent_thread_lock=True, height=1000)\n",
"mrq.tts = mrq.setup_tortoise()\n",
"mrq.webui.block_thread()"
],
@ -90,6 +90,35 @@
"execution_count":null,
"outputs":[
]
},
{
"cell_type":"markdown",
"source":[
"## Exporting"
],
"metadata":{
"id":"2AnVQxEJx47p"
}
},
{
"cell_type":"code",
"source":[
"!apt install -y p7zip-full\n",
"from datetime import datetime\n",
"timestamp = datetime.now().strftime('%m-%d-%Y_%H:%M:%S')\n",
"!mkdir -p \"../{timestamp}\"\n",
"!mv ./results/* \"../{timestamp}/.\"\n",
"!7z a -t7z -m0=lzma2 -mx=9 -mfb=64 -md=32m -ms=on \"../{timestamp}.7z\" \"../{timestamp}/\"\n",
"!ls ~/\n",
"!echo \"Finished zipping, archive is available at {timestamp}.7z\""
],
"metadata":{
"id":"YOACiDCXx72G"
},
"execution_count":null,
"outputs":[
]
}
]

View File

@ -279,7 +279,7 @@ def update_presets(value):
else:
return (gr.update(), gr.update())
def read_generate_settings(file, save_latents=True):
def read_generate_settings(file, save_latents=True, save_as_temp=True):
j = None
latents = None
@ -297,7 +297,7 @@ def read_generate_settings(file, save_latents=True):
del j['latents']
if latents and save_latents:
outdir='./voices/.temp/'
outdir=f'./tortoise/voices/{".temp" if save_as_temp else j["voice"]}/'
os.makedirs(outdir, exist_ok=True)
with open(f'{outdir}/cond_latents.pth', 'wb') as f:
f.write(latents)
@ -307,6 +307,8 @@ def read_generate_settings(file, save_latents=True):
j,
latents
)
def save_latents(file):
    """Import the voice latents embedded in an uploaded audio file.

    Thin wrapper around ``read_generate_settings`` that is wired up as a
    button click handler. It requests that any latents found in the file
    be written to disk, and that they go to the directory named by the
    settings' ``voice`` entry instead of the ``.temp`` directory.

    Args:
        file: The uploaded file, forwarded unchanged to
            ``read_generate_settings``.

    Returns:
        None. The ``(settings, latents)`` pair produced by
        ``read_generate_settings`` is deliberately discarded.
    """
    # Inside read_generate_settings the keyword ``save_latents`` is a plain
    # boolean parameter; it merely shares its name with this function.
    read_generate_settings(file=file, save_latents=True, save_as_temp=False)
def import_generate_settings(file="./config/generate.json"):
settings, _ = read_generate_settings(file, save_latents=False)
@ -327,7 +329,7 @@ def import_generate_settings(file="./config/generate.json"):
None if 'diffusion_iterations' not in settings else settings['diffusion_iterations'],
0.8 if 'temperature' not in settings else settings['temperature'],
"DDIM" if 'diffusion_sampler' not in settings else settings['diffusion_sampler'],
8.0 if 'breathing_room' not in settings else settings['breathing_room'],
8 if 'breathing_room' not in settings else settings['breathing_room'],
0.0 if 'cvvp_weight' not in settings else settings['cvvp_weight'],
0.8 if 'top_p' not in settings else settings['top_p'],
1.0 if 'diffusion_temperature' not in settings else settings['diffusion_temperature'],
@ -578,6 +580,7 @@ def setup_gradio():
with gr.Column():
audio_in = gr.File(type="file", label="Audio Input", file_types=["audio"])
copy_button = gr.Button(value="Copy Settings")
import_voice = gr.Button(value="Import Voice")
with gr.Column():
metadata_out = gr.JSON(label="Audio Metadata")
latents_out = gr.File(type="binary", label="Voice Latents")
@ -590,6 +593,11 @@ def setup_gradio():
latents_out
]
)
import_voice.click(
fn=save_latents,
inputs=audio_in,
)
with gr.Tab("Settings"):
with gr.Row():
exec_inputs = []
@ -622,11 +630,11 @@ def setup_gradio():
with gr.Column():
experimental_checkboxes = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags")
cvvp_weight = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
top_p = gr.Slider(value=0.8, minimum=0, maximum=2, label="Top P")
diffusion_temperature = gr.Slider(value=1.0, minimum=0, maximum=2, label="Diffusion Temperature")
top_p = gr.Slider(value=0.8, minimum=0, maximum=1, label="Top P")
diffusion_temperature = gr.Slider(value=1.0, minimum=0, maximum=1, label="Diffusion Temperature")
length_penalty = gr.Slider(value=1.0, minimum=0, maximum=8, label="Length Penalty")
repetition_penalty = gr.Slider(value=2.0, minimum=0, maximum=8, label="Repetition Penalty")
cond_free_k = gr.Slider(value=2.0, minimum=0, maximum=8, label="Conditioning-Free K")
cond_free_k = gr.Slider(value=2.0, minimum=0, maximum=4, label="Conditioning-Free K")
input_settings = [