forked from mrq/tortoise-tts
Added new options: "Output Sample Rate", "Output Volume", and documentation
This commit is contained in:
parent
77b39e59ac
commit
39b81318f2
|
@ -197,6 +197,8 @@ Below are settings that override the default launch arguments. Some of these req
|
||||||
* `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
|
* `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
|
||||||
* `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave to 0 to calculate a "best" fit.
|
* `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave to 0 to calculate a "best" fit.
|
||||||
* `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that updates other settings while generating audio clips.
|
* `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that updates other settings while generating audio clips.
|
||||||
|
* `Output Sample Rate`: the sample rate at which the generated audio is saved. It provides a slight bump in quality.
|
||||||
|
* `Output Volume`: adjusts the volume through amplitude scaling
|
||||||
|
|
||||||
Below is an explanation of the experimental flags. Messing with these might impact performance, as these are exposed only if you know what you are doing.
|
Below is an explanation of the experimental flags. Messing with these might impact performance, as these are exposed only if you know what you are doing.
|
||||||
* `Half-Precision`: (attempts to) hint to PyTorch to auto-cast to float16 (half precision) for compute. Disabled by default, due to it making computations slower.
|
* `Half-Precision`: (attempts to) hint to PyTorch to auto-cast to float16 (half precision) for compute. Disabled by default, due to it making computations slower.
|
||||||
|
|
137
app.py
137
app.py
|
@ -21,6 +21,12 @@ from tortoise.utils.audio import load_audio, load_voice, load_voices
|
||||||
from tortoise.utils.text import split_and_recombine_text
|
from tortoise.utils.text import split_and_recombine_text
|
||||||
|
|
||||||
def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, cvvp_weight, experimentals, progress=gr.Progress(track_tqdm=True)):
|
def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, cvvp_weight, experimentals, progress=gr.Progress(track_tqdm=True)):
|
||||||
|
try:
|
||||||
|
tts
|
||||||
|
except NameError:
|
||||||
|
raise gr.Error("TTS is still initializing...")
|
||||||
|
|
||||||
|
|
||||||
if voice != "microphone":
|
if voice != "microphone":
|
||||||
voices = [voice]
|
voices = [voice]
|
||||||
else:
|
else:
|
||||||
|
@ -36,7 +42,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
|
||||||
voice_samples, conditioning_latents = load_voice(voice)
|
voice_samples, conditioning_latents = load_voice(voice)
|
||||||
|
|
||||||
if voice_samples is not None:
|
if voice_samples is not None:
|
||||||
sample_voice = voice_samples[0]
|
sample_voice = voice_samples[0].squeeze().cpu()
|
||||||
|
|
||||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
|
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
|
||||||
if len(conditioning_latents) == 4:
|
if len(conditioning_latents) == 4:
|
||||||
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
|
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
|
||||||
|
@ -54,7 +61,6 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
|
||||||
print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. Please regenerate your voice latents.")
|
print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. Please regenerate your voice latents.")
|
||||||
cvvp_weight = 0
|
cvvp_weight = 0
|
||||||
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
settings = {
|
settings = {
|
||||||
'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
|
'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
|
||||||
|
@ -86,14 +92,24 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
|
||||||
else:
|
else:
|
||||||
texts = split_and_recombine_text(text)
|
texts = split_and_recombine_text(text)
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
timestamp = int(time.time())
|
outdir = f"./results/{voice}/{int(start_time)}/"
|
||||||
outdir = f"./results/{voice}/{timestamp}/"
|
|
||||||
|
|
||||||
os.makedirs(outdir, exist_ok=True)
|
os.makedirs(outdir, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
audio_cache = {}
|
audio_cache = {}
|
||||||
|
|
||||||
|
resampler = torchaudio.transforms.Resample(
|
||||||
|
tts.output_sample_rate,
|
||||||
|
args.output_sample_rate,
|
||||||
|
lowpass_filter_width=16,
|
||||||
|
rolloff=0.85,
|
||||||
|
resampling_method="kaiser_window",
|
||||||
|
beta=8.555504641634386,
|
||||||
|
) if tts.output_sample_rate != args.output_sample_rate else None
|
||||||
|
|
||||||
|
volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None
|
||||||
|
|
||||||
for line, cut_text in enumerate(texts):
|
for line, cut_text in enumerate(texts):
|
||||||
if emotion == "Custom":
|
if emotion == "Custom":
|
||||||
if prompt.strip() != "":
|
if prompt.strip() != "":
|
||||||
|
@ -108,21 +124,27 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
|
||||||
|
|
||||||
if isinstance(gen, list):
|
if isinstance(gen, list):
|
||||||
for j, g in enumerate(gen):
|
for j, g in enumerate(gen):
|
||||||
audio = g.squeeze(0).cpu()
|
os.makedirs(f'{outdir}/candidate_{j}', exist_ok=True)
|
||||||
audio_cache[f"candidate_{j}/result_{line}.wav"] = {
|
audio_cache[f"candidate_{j}/result_{line}.wav"] = {
|
||||||
'audio': audio,
|
'audio': g,
|
||||||
|
'text': cut_text,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
audio_cache[f"result_{line}.wav"] = {
|
||||||
|
'audio': gen,
|
||||||
'text': cut_text,
|
'text': cut_text,
|
||||||
}
|
}
|
||||||
|
|
||||||
os.makedirs(f'{outdir}/candidate_{j}', exist_ok=True)
|
for k in audio_cache:
|
||||||
torchaudio.save(f'{outdir}/candidate_{j}/result_{line}.wav', audio, tts.output_sample_rate)
|
audio = audio_cache[k]['audio'].squeeze(0).cpu()
|
||||||
else:
|
if resampler is not None:
|
||||||
audio = gen.squeeze(0).cpu()
|
audio = resampler(audio)
|
||||||
audio_cache[f"result_{line}.wav"] = {
|
if volume_adjust is not None:
|
||||||
'audio': audio,
|
audio = volume_adjust(audio)
|
||||||
'text': cut_text,
|
|
||||||
}
|
audio_cache[k]['audio'] = audio
|
||||||
torchaudio.save(f'{outdir}/result_{line}.wav', audio, tts.output_sample_rate)
|
torchaudio.save(f'{outdir}/{k}', audio, args.output_sample_rate)
|
||||||
|
|
||||||
|
|
||||||
output_voice = None
|
output_voice = None
|
||||||
if len(texts) > 1:
|
if len(texts) > 1:
|
||||||
|
@ -136,7 +158,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
|
||||||
audio_clips.append(audio)
|
audio_clips.append(audio)
|
||||||
|
|
||||||
audio = torch.cat(audio_clips, dim=-1)
|
audio = torch.cat(audio_clips, dim=-1)
|
||||||
torchaudio.save(f'{outdir}/combined_{candidate}.wav', audio, tts.output_sample_rate)
|
torchaudio.save(f'{outdir}/combined_{candidate}.wav', audio, args.output_sample_rate)
|
||||||
|
|
||||||
audio = audio.squeeze(0).cpu()
|
audio = audio.squeeze(0).cpu()
|
||||||
audio_cache[f'combined_{candidate}.wav'] = {
|
audio_cache[f'combined_{candidate}.wav'] = {
|
||||||
|
@ -145,15 +167,15 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
|
||||||
}
|
}
|
||||||
|
|
||||||
if output_voice is None:
|
if output_voice is None:
|
||||||
output_voice = audio
|
output_voice = f'{outdir}/combined_{candidate}.wav'
|
||||||
|
# output_voice = audio
|
||||||
else:
|
else:
|
||||||
if isinstance(gen, list):
|
if isinstance(gen, list):
|
||||||
output_voice = gen[0]
|
output_voice = f'{outdir}/candidate_0/result_0.wav'
|
||||||
|
#output_voice = gen[0]
|
||||||
else:
|
else:
|
||||||
output_voice = gen
|
output_voice = f'{outdir}/result_0.wav'
|
||||||
|
#output_voice = gen
|
||||||
if output_voice is not None:
|
|
||||||
output_voice = (tts.output_sample_rate, output_voice.numpy())
|
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
'text': text,
|
'text': text,
|
||||||
|
@ -189,8 +211,11 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
|
||||||
metadata['lyrics'] = json.dumps(info)
|
metadata['lyrics'] = json.dumps(info)
|
||||||
metadata.save()
|
metadata.save()
|
||||||
|
|
||||||
|
#if output_voice is not None:
|
||||||
|
# output_voice = (args.output_sample_rate, output_voice.numpy())
|
||||||
|
|
||||||
if sample_voice is not None:
|
if sample_voice is not None:
|
||||||
sample_voice = (tts.input_sample_rate, sample_voice.squeeze().cpu().numpy())
|
sample_voice = (tts.input_sample_rate, sample_voice.numpy())
|
||||||
|
|
||||||
print(f"Generation took {info['time']} seconds, saved to '{outdir}'\n")
|
print(f"Generation took {info['time']} seconds, saved to '{outdir}'\n")
|
||||||
|
|
||||||
|
@ -319,10 +344,13 @@ def check_for_updates():
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def reload_tts():
    """Rebuild the TTS engine and rebind the module-level ``tts`` handle.

    Wired to the "Reload TTS" button in the settings tab. ``generate`` looks
    up ``tts`` as a module-level name, so the new engine must be stored
    there for the reload to take effect.
    """
    # Without this declaration the assignment below would only create a
    # function-local variable that is discarded on return, leaving the
    # engine used by generate() untouched.
    global tts
    tts = setup_tortoise()
|
||||||
|
|
||||||
def update_voices():
|
def update_voices():
|
||||||
return gr.Dropdown.update(choices=sorted(os.listdir("./tortoise/voices")) + ["microphone"])
|
return gr.Dropdown.update(choices=sorted(os.listdir("./tortoise/voices")) + ["microphone"])
|
||||||
|
|
||||||
def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count ):
|
def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
|
||||||
args.share = share
|
args.share = share
|
||||||
args.listen = listen
|
args.listen = listen
|
||||||
args.low_vram = low_vram
|
args.low_vram = low_vram
|
||||||
|
@ -333,6 +361,8 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
|
||||||
args.embed_output_metadata = embed_output_metadata
|
args.embed_output_metadata = embed_output_metadata
|
||||||
args.latents_lean_and_mean = latents_lean_and_mean
|
args.latents_lean_and_mean = latents_lean_and_mean
|
||||||
args.concurrency_count = concurrency_count
|
args.concurrency_count = concurrency_count
|
||||||
|
args.output_sample_rate = output_sample_rate
|
||||||
|
args.output_volume = output_volume
|
||||||
|
|
||||||
settings = {
|
settings = {
|
||||||
'share': args.share,
|
'share': args.share,
|
||||||
|
@ -345,6 +375,8 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
|
||||||
'embed-output-metadata': args.embed_output_metadata,
|
'embed-output-metadata': args.embed_output_metadata,
|
||||||
'latents-lean-and-mean': args.latents_lean_and_mean,
|
'latents-lean-and-mean': args.latents_lean_and_mean,
|
||||||
'concurrency-count': args.concurrency_count,
|
'concurrency-count': args.concurrency_count,
|
||||||
|
'output-sample-rate': args.output_sample_rate,
|
||||||
|
'output-volume': args.output_volume,
|
||||||
}
|
}
|
||||||
|
|
||||||
with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
|
with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
|
||||||
|
@ -361,7 +393,9 @@ def setup_args():
|
||||||
'embed-output-metadata': True,
|
'embed-output-metadata': True,
|
||||||
'latents-lean-and-mean': True,
|
'latents-lean-and-mean': True,
|
||||||
'cond-latent-max-chunk-size': 1000000,
|
'cond-latent-max-chunk-size': 1000000,
|
||||||
'concurrency-count': 3,
|
'concurrency-count': 2,
|
||||||
|
'output-sample-rate': 44100,
|
||||||
|
'output-volume': 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
if os.path.isfile('./config/exec.json'):
|
if os.path.isfile('./config/exec.json'):
|
||||||
|
@ -381,6 +415,8 @@ def setup_args():
|
||||||
parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
||||||
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
||||||
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
|
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
|
||||||
|
parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
|
||||||
|
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
args.embed_output_metadata = not args.no_embed_output_metadata
|
args.embed_output_metadata = not args.no_embed_output_metadata
|
||||||
|
@ -392,7 +428,7 @@ def setup_args():
|
||||||
match = re.findall(r"^(?:(.+?):(\d+))?(\/.+?)?$", args.listen)[0]
|
match = re.findall(r"^(?:(.+?):(\d+))?(\/.+?)?$", args.listen)[0]
|
||||||
|
|
||||||
args.listen_host = match[0] if match[0] != "" else "127.0.0.1"
|
args.listen_host = match[0] if match[0] != "" else "127.0.0.1"
|
||||||
args.listen_port = match[1] if match[1] != "" else 8000
|
args.listen_port = match[1] if match[1] != "" else None
|
||||||
args.listen_path = match[2] if match[2] != "" else "/"
|
args.listen_path = match[2] if match[2] != "" else "/"
|
||||||
|
|
||||||
if args.listen_port is not None:
|
if args.listen_port is not None:
|
||||||
|
@ -516,34 +552,37 @@ def setup_gradio():
|
||||||
)
|
)
|
||||||
with gr.Tab("Settings"):
|
with gr.Tab("Settings"):
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
|
exec_inputs = []
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
with gr.Box():
|
exec_inputs = exec_inputs + [
|
||||||
exec_arg_listen = gr.Textbox(label="Listen", value=args.listen, placeholder="127.0.0.1:7860/")
|
gr.Textbox(label="Listen", value=args.listen, placeholder="127.0.0.1:7860/"),
|
||||||
exec_arg_share = gr.Checkbox(label="Public Share Gradio", value=args.share)
|
gr.Checkbox(label="Public Share Gradio", value=args.share),
|
||||||
exec_arg_check_for_updates = gr.Checkbox(label="Check For Updates", value=args.check_for_updates)
|
gr.Checkbox(label="Check For Updates", value=args.check_for_updates),
|
||||||
exec_arg_models_from_local_only = gr.Checkbox(label="Only Load Models Locally", value=args.models_from_local_only)
|
gr.Checkbox(label="Only Load Models Locally", value=args.models_from_local_only),
|
||||||
exec_arg_low_vram = gr.Checkbox(label="Low VRAM", value=args.low_vram)
|
gr.Checkbox(label="Low VRAM", value=args.low_vram),
|
||||||
exec_arg_embed_output_metadata = gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata)
|
gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata),
|
||||||
exec_arg_latents_lean_and_mean = gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean)
|
gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean),
|
||||||
exec_arg_cond_latent_max_chunk_size = gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size)
|
]
|
||||||
exec_arg_sample_batch_size = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size)
|
gr.Button(value="Check for Updates").click(check_for_updates)
|
||||||
exec_arg_concurrency_count = gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count)
|
with gr.Column():
|
||||||
|
exec_inputs = exec_inputs + [
|
||||||
|
gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size),
|
||||||
experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags")
|
gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size),
|
||||||
cvvp_weight = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
|
gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count),
|
||||||
|
gr.Number(label="Ouptut Sample Rate", precision=0, value=args.output_sample_rate),
|
||||||
check_updates_now = gr.Button(value="Check for Updates")
|
gr.Slider(label="Ouptut Volume", minimum=0, maximum=2, value=args.output_volume),
|
||||||
|
]
|
||||||
exec_inputs = [exec_arg_share, exec_arg_listen, exec_arg_check_for_updates, exec_arg_models_from_local_only, exec_arg_low_vram, exec_arg_embed_output_metadata, exec_arg_latents_lean_and_mean, exec_arg_cond_latent_max_chunk_size, exec_arg_sample_batch_size, exec_arg_concurrency_count]
|
|
||||||
|
|
||||||
for i in exec_inputs:
|
for i in exec_inputs:
|
||||||
i.change(
|
i.change(
|
||||||
fn=export_exec_settings,
|
fn=export_exec_settings,
|
||||||
inputs=exec_inputs
|
inputs=exec_inputs
|
||||||
)
|
)
|
||||||
|
with gr.Column():
|
||||||
|
experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags")
|
||||||
|
cvvp_weight = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
|
||||||
|
|
||||||
check_updates_now.click(check_for_updates)
|
gr.Button(value="Reload TTS").click(reload_tts)
|
||||||
|
|
||||||
input_settings = [
|
input_settings = [
|
||||||
text,
|
text,
|
||||||
|
@ -591,7 +630,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
if args.listen_path is not None and args.listen_path != "/":
|
if args.listen_path is not None and args.listen_path != "/":
|
||||||
import uvicorn
|
import uvicorn
|
||||||
uvicorn.run("app:app", host=args.listen_host, port=args.listen_port)
|
# Fall back to port 8000 only when no port was parsed from --listen.
# The previous condition tested the constant `not None` (always true),
# so the fallback could never be selected.
uvicorn.run("app:app", host=args.listen_host, port=args.listen_port if args.listen_port is not None else 8000)
|
||||||
else:
|
else:
|
||||||
webui = setup_gradio()
|
webui = setup_gradio()
|
||||||
webui.launch(share=args.share, prevent_thread_lock=True, server_name=args.listen_host, server_port=args.listen_port)
|
webui.launch(share=args.share, prevent_thread_lock=True, server_name=args.listen_host, server_port=args.listen_port)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user