1
1
forked from mrq/tortoise-tts

Added new options: "Output Sample Rate", "Output Volume", and documentation

This commit is contained in:
mrq 2023-02-10 03:02:09 +00:00
parent 57af25c6c0
commit efa556b793
3 changed files with 91 additions and 50 deletions

View File

@ -197,6 +197,8 @@ Below are settings that override the default launch arguments. Some of these req
* `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
* `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave to 0 to calculate a "best" fit.
* `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that updates other settings while generating audio clips.
* `Output Sample Rate`: the sample rate to save the generated audio as. It provides a bit of slight bump in quality
* `Output Volume`: adjusts the volume through amplitude scaling
Below are an explanation of experimental flags. Messing with these might impact performance, as these are exposed only if you know what you are doing.
* `Half-Precision`: (attempts to) hint to PyTorch to auto-cast to float16 (half precision) for compute. Disabled by default, due to it making computations slower.

137
app.py
View File

@ -21,6 +21,12 @@ from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text
def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, cvvp_weight, experimentals, progress=gr.Progress(track_tqdm=True)):
try:
tts
except NameError:
raise gr.Error("TTS is still initializing...")
if voice != "microphone":
voices = [voice]
else:
@ -36,7 +42,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
voice_samples, conditioning_latents = load_voice(voice)
if voice_samples is not None:
sample_voice = voice_samples[0]
sample_voice = voice_samples[0].squeeze().cpu()
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
if len(conditioning_latents) == 4:
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
@ -54,7 +61,6 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. Please regenerate your voice latents.")
cvvp_weight = 0
start_time = time.time()
settings = {
'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
@ -86,14 +92,24 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
else:
texts = split_and_recombine_text(text)
start_time = time.time()
timestamp = int(time.time())
outdir = f"./results/{voice}/{timestamp}/"
outdir = f"./results/{voice}/{int(start_time)}/"
os.makedirs(outdir, exist_ok=True)
audio_cache = {}
resampler = torchaudio.transforms.Resample(
tts.output_sample_rate,
args.output_sample_rate,
lowpass_filter_width=16,
rolloff=0.85,
resampling_method="kaiser_window",
beta=8.555504641634386,
) if tts.output_sample_rate != args.output_sample_rate else None
volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None
for line, cut_text in enumerate(texts):
if emotion == "Custom":
if prompt.strip() != "":
@ -108,21 +124,27 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
if isinstance(gen, list):
for j, g in enumerate(gen):
audio = g.squeeze(0).cpu()
os.makedirs(f'{outdir}/candidate_{j}', exist_ok=True)
audio_cache[f"candidate_{j}/result_{line}.wav"] = {
'audio': audio,
'audio': g,
'text': cut_text,
}
os.makedirs(f'{outdir}/candidate_{j}', exist_ok=True)
torchaudio.save(f'{outdir}/candidate_{j}/result_{line}.wav', audio, tts.output_sample_rate)
else:
audio = gen.squeeze(0).cpu()
audio_cache[f"result_{line}.wav"] = {
'audio': audio,
'audio': gen,
'text': cut_text,
}
torchaudio.save(f'{outdir}/result_{line}.wav', audio, tts.output_sample_rate)
for k in audio_cache:
audio = audio_cache[k]['audio'].squeeze(0).cpu()
if resampler is not None:
audio = resampler(audio)
if volume_adjust is not None:
audio = volume_adjust(audio)
audio_cache[k]['audio'] = audio
torchaudio.save(f'{outdir}/{k}', audio, args.output_sample_rate)
output_voice = None
if len(texts) > 1:
@ -136,7 +158,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
audio_clips.append(audio)
audio = torch.cat(audio_clips, dim=-1)
torchaudio.save(f'{outdir}/combined_{candidate}.wav', audio, tts.output_sample_rate)
torchaudio.save(f'{outdir}/combined_{candidate}.wav', audio, args.output_sample_rate)
audio = audio.squeeze(0).cpu()
audio_cache[f'combined_{candidate}.wav'] = {
@ -145,15 +167,15 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
}
if output_voice is None:
output_voice = audio
output_voice = f'{outdir}/combined_{candidate}.wav'
# output_voice = audio
else:
if isinstance(gen, list):
output_voice = gen[0]
output_voice = f'{outdir}/candidate_0/result_0.wav'
#output_voice = gen[0]
else:
output_voice = gen
if output_voice is not None:
output_voice = (tts.output_sample_rate, output_voice.numpy())
output_voice = f'{outdir}/result_0.wav'
#output_voice = gen
info = {
'text': text,
@ -188,9 +210,12 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
metadata = music_tag.load_file(f"{outdir}/{path}")
metadata['lyrics'] = json.dumps(info)
metadata.save()
#if output_voice is not None:
# output_voice = (args.output_sample_rate, output_voice.numpy())
if sample_voice is not None:
sample_voice = (tts.input_sample_rate, sample_voice.squeeze().cpu().numpy())
sample_voice = (tts.input_sample_rate, sample_voice.numpy())
print(f"Generation took {info['time']} seconds, saved to '{outdir}'\n")
@ -319,10 +344,13 @@ def check_for_updates():
return False
def reload_tts():
tts = setup_tortoise()
def update_voices():
return gr.Dropdown.update(choices=sorted(os.listdir("./tortoise/voices")) + ["microphone"])
def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count ):
def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
args.share = share
args.listen = listen
args.low_vram = low_vram
@ -333,6 +361,8 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
args.embed_output_metadata = embed_output_metadata
args.latents_lean_and_mean = latents_lean_and_mean
args.concurrency_count = concurrency_count
args.output_sample_rate = output_sample_rate
args.output_volume = output_volume
settings = {
'share': args.share,
@ -345,6 +375,8 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
'embed-output-metadata': args.embed_output_metadata,
'latents-lean-and-mean': args.latents_lean_and_mean,
'concurrency-count': args.concurrency_count,
'output-sample-rate': args.output_sample_rate,
'output-volume': args.output_volume,
}
with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
@ -361,7 +393,9 @@ def setup_args():
'embed-output-metadata': True,
'latents-lean-and-mean': True,
'cond-latent-max-chunk-size': 1000000,
'concurrency-count': 3,
'concurrency-count': 2,
'output-sample-rate': 44100,
'output-volume': 1,
}
if os.path.isfile('./config/exec.json'):
@ -381,6 +415,8 @@ def setup_args():
parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
args = parser.parse_args()
args.embed_output_metadata = not args.no_embed_output_metadata
@ -392,7 +428,7 @@ def setup_args():
match = re.findall(r"^(?:(.+?):(\d+))?(\/.+?)?$", args.listen)[0]
args.listen_host = match[0] if match[0] != "" else "127.0.0.1"
args.listen_port = match[1] if match[1] != "" else 8000
args.listen_port = match[1] if match[1] != "" else None
args.listen_path = match[2] if match[2] != "" else "/"
if args.listen_port is not None:
@ -516,34 +552,37 @@ def setup_gradio():
)
with gr.Tab("Settings"):
with gr.Row():
exec_inputs = []
with gr.Column():
with gr.Box():
exec_arg_listen = gr.Textbox(label="Listen", value=args.listen, placeholder="127.0.0.1:7860/")
exec_arg_share = gr.Checkbox(label="Public Share Gradio", value=args.share)
exec_arg_check_for_updates = gr.Checkbox(label="Check For Updates", value=args.check_for_updates)
exec_arg_models_from_local_only = gr.Checkbox(label="Only Load Models Locally", value=args.models_from_local_only)
exec_arg_low_vram = gr.Checkbox(label="Low VRAM", value=args.low_vram)
exec_arg_embed_output_metadata = gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata)
exec_arg_latents_lean_and_mean = gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean)
exec_arg_cond_latent_max_chunk_size = gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size)
exec_arg_sample_batch_size = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size)
exec_arg_concurrency_count = gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count)
exec_inputs = exec_inputs + [
gr.Textbox(label="Listen", value=args.listen, placeholder="127.0.0.1:7860/"),
gr.Checkbox(label="Public Share Gradio", value=args.share),
gr.Checkbox(label="Check For Updates", value=args.check_for_updates),
gr.Checkbox(label="Only Load Models Locally", value=args.models_from_local_only),
gr.Checkbox(label="Low VRAM", value=args.low_vram),
gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata),
gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean),
]
gr.Button(value="Check for Updates").click(check_for_updates)
with gr.Column():
exec_inputs = exec_inputs + [
gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size),
gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size),
gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count),
gr.Number(label="Ouptut Sample Rate", precision=0, value=args.output_sample_rate),
gr.Slider(label="Ouptut Volume", minimum=0, maximum=2, value=args.output_volume),
]
for i in exec_inputs:
i.change(
fn=export_exec_settings,
inputs=exec_inputs
)
with gr.Column():
experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags")
cvvp_weight = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
check_updates_now = gr.Button(value="Check for Updates")
exec_inputs = [exec_arg_share, exec_arg_listen, exec_arg_check_for_updates, exec_arg_models_from_local_only, exec_arg_low_vram, exec_arg_embed_output_metadata, exec_arg_latents_lean_and_mean, exec_arg_cond_latent_max_chunk_size, exec_arg_sample_batch_size, exec_arg_concurrency_count]
for i in exec_inputs:
i.change(
fn=export_exec_settings,
inputs=exec_inputs
)
check_updates_now.click(check_for_updates)
gr.Button(value="Reload TTS").click(reload_tts)
input_settings = [
text,
@ -591,7 +630,7 @@ if __name__ == "__main__":
if args.listen_path is not None and args.listen_path != "/":
import uvicorn
uvicorn.run("app:app", host=args.listen_host, port=args.listen_port)
uvicorn.run("app:app", host=args.listen_host, port=args.listen_port if not None else 8000)
else:
webui = setup_gradio()
webui.launch(share=args.share, prevent_thread_lock=True, server_name=args.listen_host, server_port=args.listen_port)

View File

@ -562,7 +562,7 @@ class TextToSpeech:
# results, but will increase memory usage.
if not self.minor_optimizations:
self.autoregressive = self.autoregressive.to(self.device)
if get_device_name() == "dml":
text_tokens = text_tokens.cpu()
best_results = best_results.cpu()