Added new options: "Output Sample Rate", "Output Volume", and documentation

This commit is contained in:
mrq 2023-02-10 03:02:09 +00:00
parent 57af25c6c0
commit efa556b793
3 changed files with 91 additions and 50 deletions


@@ -197,6 +197,8 @@ Below are settings that override the default launch arguments. Some of these req
 * `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
 * `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave to 0 to calculate a "best" fit.
 * `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that updates other settings while generating audio clips.
+* `Output Sample Rate`: the sample rate to save the generated audio as. It can provide a slight bump in quality.
+* `Output Volume`: adjusts the volume through amplitude scaling.
 
 Below are an explanation of experimental flags. Messing with these might impact performance, as these are exposed only if you know what you are doing.
 * `Half-Precision`: (attempts to) hint to PyTorch to auto-cast to float16 (half precision) for compute. Disabled by default, due to it making computations slower.
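
For illustration, a minimal sketch of what these two new options do to a finished clip, assuming torchaudio is installed and tortoise's native 24kHz output rate (names here are illustrative, not the app's actual variables):

    import torch
    import torchaudio

    source_rate = 24000   # tortoise generates at 24kHz
    target_rate = 44100   # `Output Sample Rate`
    volume = 0.75         # `Output Volume`, an amplitude multiplier

    waveform = torch.randn(1, source_rate)  # stand-in for one second of generated audio

    if target_rate != source_rate:
        waveform = torchaudio.transforms.Resample(source_rate, target_rate)(waveform)
    if volume != 1:
        waveform = torchaudio.transforms.Vol(gain=volume, gain_type="amplitude")(waveform)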

app.py

@@ -21,6 +21,12 @@ from tortoise.utils.audio import load_audio, load_voice, load_voices
 from tortoise.utils.text import split_and_recombine_text
 
 def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, cvvp_weight, experimentals, progress=gr.Progress(track_tqdm=True)):
+    try:
+        tts
+    except NameError:
+        raise gr.Error("TTS is still initializing...")
+
     if voice != "microphone":
         voices = [voice]
     else:
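The guard added at the top of generate() relies on the fact that referencing a never-assigned module-level name raises NameError, and gr.Error surfaces the message to the user instead of crashing the event. A stripped-down sketch of the same pattern, with illustrative names:

    def do_generate():
        try:
            engine  # raises NameError if the module-level engine was never assigned
        except NameError:
            raise RuntimeError("still initializing...")  # the app raises gr.Error here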
@@ -36,7 +42,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
     voice_samples, conditioning_latents = load_voice(voice)
 
     if voice_samples is not None:
-        sample_voice = voice_samples[0]
+        sample_voice = voice_samples[0].squeeze().cpu()
+
         conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
         if len(conditioning_latents) == 4:
             conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
@@ -54,7 +61,6 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
             print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. Please regenerate your voice latents.")
             cvvp_weight = 0
 
-    start_time = time.time()
 
     settings = {
         'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
@@ -86,14 +92,24 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
     else:
         texts = split_and_recombine_text(text)
 
-    timestamp = int(time.time())
-    outdir = f"./results/{voice}/{timestamp}/"
+    start_time = time.time()
+    outdir = f"./results/{voice}/{int(start_time)}/"
     os.makedirs(outdir, exist_ok=True)
 
     audio_cache = {}
+
+    resampler = torchaudio.transforms.Resample(
+        tts.output_sample_rate,
+        args.output_sample_rate,
+        lowpass_filter_width=16,
+        rolloff=0.85,
+        resampling_method="kaiser_window",
+        beta=8.555504641634386,
+    ) if tts.output_sample_rate != args.output_sample_rate else None
+
+    volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None
+
     for line, cut_text in enumerate(texts):
         if emotion == "Custom":
             if prompt.strip() != "":
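The filter parameters above (lowpass_filter_width=16, rolloff=0.85, beta≈8.5555) appear to match what torchaudio's resampling tutorial documents as the "kaiser_fast" preset borrowed from resampy, and resampling_method="kaiser_window" is the pre-2.0 torchaudio spelling (later renamed "sinc_interp_kaiser"). A runnable sketch of the transform in isolation:

    import torch
    import torchaudio

    resampler = torchaudio.transforms.Resample(
        24000, 44100,
        lowpass_filter_width=16,
        rolloff=0.85,
        resampling_method="kaiser_window",  # "sinc_interp_kaiser" on torchaudio >= 2.0
        beta=8.555504641634386,
    )
    print(resampler(torch.zeros(1, 24000)).shape)  # torch.Size([1, 44100])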
@@ -108,21 +124,27 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
         if isinstance(gen, list):
             for j, g in enumerate(gen):
-                audio = g.squeeze(0).cpu()
+                os.makedirs(f'{outdir}/candidate_{j}', exist_ok=True)
                 audio_cache[f"candidate_{j}/result_{line}.wav"] = {
-                    'audio': audio,
+                    'audio': g,
                     'text': cut_text,
                 }
-                os.makedirs(f'{outdir}/candidate_{j}', exist_ok=True)
-                torchaudio.save(f'{outdir}/candidate_{j}/result_{line}.wav', audio, tts.output_sample_rate)
         else:
-            audio = gen.squeeze(0).cpu()
             audio_cache[f"result_{line}.wav"] = {
-                'audio': audio,
+                'audio': gen,
                 'text': cut_text,
             }
-            torchaudio.save(f'{outdir}/result_{line}.wav', audio, tts.output_sample_rate)
+
+    for k in audio_cache:
+        audio = audio_cache[k]['audio'].squeeze(0).cpu()
+        if resampler is not None:
+            audio = resampler(audio)
+        if volume_adjust is not None:
+            audio = volume_adjust(audio)
+
+        audio_cache[k]['audio'] = audio
+        torchaudio.save(f'{outdir}/{k}', audio, args.output_sample_rate)
 
     output_voice = None
     if len(texts) > 1:
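Note the restructuring: clips are now cached raw during generation, and resampling plus volume scaling happen once, in a single pass over audio_cache, before saving at args.output_sample_rate. One detail worth knowing about Vol: with gain_type="amplitude" it multiplies samples by the gain, and torchaudio clamps the result to [-1.0, 1.0], so gains above 1 can clip. A tiny demonstration:

    import torch
    import torchaudio

    vol = torchaudio.transforms.Vol(gain=2.0, gain_type="amplitude")
    print(vol(torch.tensor([[0.3, 0.6]])))  # tensor([[0.6000, 1.0000]]), 1.2 clamped to 1.0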
@@ -136,7 +158,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
                 audio_clips.append(audio)
             audio = torch.cat(audio_clips, dim=-1)
-            torchaudio.save(f'{outdir}/combined_{candidate}.wav', audio, tts.output_sample_rate)
+            torchaudio.save(f'{outdir}/combined_{candidate}.wav', audio, args.output_sample_rate)
 
             audio = audio.squeeze(0).cpu()
             audio_cache[f'combined_{candidate}.wav'] = {
@@ -145,15 +167,15 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
             }
 
             if output_voice is None:
-                output_voice = audio
+                output_voice = f'{outdir}/combined_{candidate}.wav'
+                # output_voice = audio
     else:
         if isinstance(gen, list):
-            output_voice = gen[0]
+            output_voice = f'{outdir}/candidate_0/result_0.wav'
+            #output_voice = gen[0]
         else:
-            output_voice = gen
+            output_voice = f'{outdir}/result_0.wav'
+            #output_voice = gen
 
-    if output_voice is not None:
-        output_voice = (tts.output_sample_rate, output_voice.numpy())
-
     info = {
         'text': text,
@@ -189,8 +211,11 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
             metadata['lyrics'] = json.dumps(info)
             metadata.save()
 
+    #if output_voice is not None:
+    #    output_voice = (args.output_sample_rate, output_voice.numpy())
+
     if sample_voice is not None:
-        sample_voice = (tts.input_sample_rate, sample_voice.squeeze().cpu().numpy())
+        sample_voice = (tts.input_sample_rate, sample_voice.numpy())
 
     print(f"Generation took {info['time']} seconds, saved to '{outdir}'\n")
@@ -319,10 +344,13 @@ def check_for_updates():
     return False
 
+def reload_tts():
+    global tts  # without this, the assignment below would only bind a local
+    tts = setup_tortoise()
+
 def update_voices():
     return gr.Dropdown.update(choices=sorted(os.listdir("./tortoise/voices")) + ["microphone"])
 
-def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count ):
+def export_exec_settings( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
     args.share = share
     args.listen = listen
     args.low_vram = low_vram
@@ -333,6 +361,8 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
     args.embed_output_metadata = embed_output_metadata
     args.latents_lean_and_mean = latents_lean_and_mean
     args.concurrency_count = concurrency_count
+    args.output_sample_rate = output_sample_rate
+    args.output_volume = output_volume
 
     settings = {
         'share': args.share,
@@ -345,6 +375,8 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
         'embed-output-metadata': args.embed_output_metadata,
         'latents-lean-and-mean': args.latents_lean_and_mean,
         'concurrency-count': args.concurrency_count,
+        'output-sample-rate': args.output_sample_rate,
+        'output-volume': args.output_volume,
     }
 
     with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
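Every change handler rewrites ./config/exec.json wholesale, and setup_args() reads it back at startup to override the defaults in the next hunk. A sketch of that round trip, with the merge logic assumed and the keys taken from this commit's defaults:

    import json, os

    defaults = {'concurrency-count': 2, 'output-sample-rate': 44100, 'output-volume': 1}
    if os.path.isfile('./config/exec.json'):
        with open('./config/exec.json', 'r', encoding="utf-8") as f:
            defaults.update(json.load(f))  # saved settings win over defaults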
@@ -361,7 +393,9 @@ def setup_args():
         'embed-output-metadata': True,
         'latents-lean-and-mean': True,
         'cond-latent-max-chunk-size': 1000000,
-        'concurrency-count': 3,
+        'concurrency-count': 2,
+        'output-sample-rate': 44100,
+        'output-volume': 1,
     }
 
     if os.path.isfile('./config/exec.json'):
@@ -381,6 +415,8 @@ def setup_args():
     parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
     parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
     parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
+    parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24kHz)")
+    parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
     args = parser.parse_args()
 
     args.embed_output_metadata = not args.no_embed_output_metadata
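For clarity, exercising just the two new flags with a throwaway parser using the same definitions as above:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--output-sample-rate", type=int, default=44100)
    parser.add_argument("--output-volume", type=float, default=1)
    args = parser.parse_args(["--output-sample-rate", "22050", "--output-volume", "0.8"])
    print(args.output_sample_rate, args.output_volume)  # 22050 0.8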
@@ -392,7 +428,7 @@ def setup_args():
     match = re.findall(r"^(?:(.+?):(\d+))?(\/.+?)?$", args.listen)[0]
 
     args.listen_host = match[0] if match[0] != "" else "127.0.0.1"
-    args.listen_port = match[1] if match[1] != "" else 8000
+    args.listen_port = match[1] if match[1] != "" else None
     args.listen_path = match[2] if match[2] != "" else "/"
 
     if args.listen_port is not None:
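With this change, a --listen value that omits the port (for example a path-only value) leaves listen_port as None rather than assuming 8000; the uvicorn invocation at the bottom of the file now supplies that fallback instead. How the pattern decomposes a few accepted inputs:

    import re

    pattern = r"^(?:(.+?):(\d+))?(\/.+?)?$"
    print(re.findall(pattern, "0.0.0.0:8000/ui")[0])  # ('0.0.0.0', '8000', '/ui')
    print(re.findall(pattern, "localhost:7860")[0])   # ('localhost', '7860', '')
    print(re.findall(pattern, "/subpath")[0])         # ('', '', '/subpath')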
@@ -516,34 +552,37 @@ def setup_gradio():
             )
 
         with gr.Tab("Settings"):
             with gr.Row():
+                exec_inputs = []
                 with gr.Column():
-                    with gr.Box():
-                        exec_arg_listen = gr.Textbox(label="Listen", value=args.listen, placeholder="127.0.0.1:7860/")
-                        exec_arg_share = gr.Checkbox(label="Public Share Gradio", value=args.share)
-                        exec_arg_check_for_updates = gr.Checkbox(label="Check For Updates", value=args.check_for_updates)
-                        exec_arg_models_from_local_only = gr.Checkbox(label="Only Load Models Locally", value=args.models_from_local_only)
-                        exec_arg_low_vram = gr.Checkbox(label="Low VRAM", value=args.low_vram)
-                        exec_arg_embed_output_metadata = gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata)
-                        exec_arg_latents_lean_and_mean = gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean)
-                        exec_arg_cond_latent_max_chunk_size = gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size)
-                        exec_arg_sample_batch_size = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size)
-                        exec_arg_concurrency_count = gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count)
+                    exec_inputs = exec_inputs + [
+                        gr.Textbox(label="Listen", value=args.listen, placeholder="127.0.0.1:7860/"),
+                        gr.Checkbox(label="Public Share Gradio", value=args.share),
+                        gr.Checkbox(label="Check For Updates", value=args.check_for_updates),
+                        gr.Checkbox(label="Only Load Models Locally", value=args.models_from_local_only),
+                        gr.Checkbox(label="Low VRAM", value=args.low_vram),
+                        gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata),
+                        gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean),
+                    ]
+                    gr.Button(value="Check for Updates").click(check_for_updates)
+                with gr.Column():
+                    exec_inputs = exec_inputs + [
+                        gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size),
+                        gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size),
+                        gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count),
+                        gr.Number(label="Output Sample Rate", precision=0, value=args.output_sample_rate),
+                        gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume),
+                    ]
+
+                    for i in exec_inputs:
+                        i.change(
+                            fn=export_exec_settings,
+                            inputs=exec_inputs
+                        )
+                with gr.Column():
                     experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags")
                     cvvp_weight = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
-                    check_updates_now = gr.Button(value="Check for Updates")
-
-                    exec_inputs = [exec_arg_share, exec_arg_listen, exec_arg_check_for_updates, exec_arg_models_from_local_only, exec_arg_low_vram, exec_arg_embed_output_metadata, exec_arg_latents_lean_and_mean, exec_arg_cond_latent_max_chunk_size, exec_arg_sample_batch_size, exec_arg_concurrency_count]
-                    for i in exec_inputs:
-                        i.change(
-                            fn=export_exec_settings,
-                            inputs=exec_inputs
-                        )
-
-                    check_updates_now.click(check_for_updates)
+                    gr.Button(value="Reload TTS").click(reload_tts)
 
     input_settings = [
         text,
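The rebuilt Settings tab drops the per-control exec_arg_* variables: controls are accumulated into exec_inputs as they are created, and each control's change event submits the entire group to export_exec_settings, so any tweak immediately persists every setting. Because Gradio passes inputs positionally, the order of exec_inputs must match the parameter order of export_exec_settings (the signature earlier in this commit is ordered accordingly). The wiring pattern in isolation, with an illustrative callback:

    import gradio as gr

    def save_all(listen, share):
        print("persisting:", listen, share)

    with gr.Blocks() as demo:
        controls = [gr.Textbox(label="Listen"), gr.Checkbox(label="Share")]
        for c in controls:
            c.change(fn=save_all, inputs=controls)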
@@ -591,7 +630,7 @@ if __name__ == "__main__":
     if args.listen_path is not None and args.listen_path != "/":
         import uvicorn
-        uvicorn.run("app:app", host=args.listen_host, port=args.listen_port)
+        uvicorn.run("app:app", host=args.listen_host, port=args.listen_port if args.listen_port is not None else 8000)
     else:
         webui = setup_gradio()
         webui.launch(share=args.share, prevent_thread_lock=True, server_name=args.listen_host, server_port=args.listen_port)