forked from mrq/tortoise-tts
Added integration for "voicefixer"; fixed an issue where candidates>1 and lines>1 only output the last combined candidate; added a numbered step for each generation to the progress display; output the time taken per generation step
This commit is contained in:
parent
fa743e2e9b
commit
c5337a6b51
|
@ -217,6 +217,7 @@ Below are settings that override the default launch arguments. Some of these req
|
||||||
* `Low VRAM`: disables optimizations in TorToiSe that increase VRAM consumption. Suggested if your GPU has under 6GiB.
|
* `Low VRAM`: disables optimizations in TorToiSe that increase VRAM consumption. Suggested if your GPU has under 6GiB.
|
||||||
* `Embed Output Metadata`: enables embedding the settings and latents used to generate that audio clip inside that audio clip. Metadata is stored as a JSON string in the `lyrics` tag.
|
* `Embed Output Metadata`: enables embedding the settings and latents used to generate that audio clip inside that audio clip. Metadata is stored as a JSON string in the `lyrics` tag.
|
||||||
* `Slimmer Computed Latents`: falls back to the original, 12.9KiB way of storing latents (without the extra bits required for using the CVVP model).
|
* `Slimmer Computed Latents`: falls back to the original, 12.9KiB way of storing latents (without the extra bits required for using the CVVP model).
|
||||||
|
* `Voice Fixer`: runs each generated audio clip through `voicefixer`, if available and installed.
|
||||||
* `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
|
* `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
|
||||||
* `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave at 0 to calculate a "best" fit.
|
* `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave at 0 to calculate a "best" fit.
|
||||||
* `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that update other settings while generating audio clips.
|
* `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that update other settings while generating audio clips.
|
||||||
|
|
|
@ -15,4 +15,5 @@ numpy
|
||||||
numba
|
numba
|
||||||
gradio
|
gradio
|
||||||
music-tag
|
music-tag
|
||||||
k-diffusion
|
k-diffusion
|
||||||
|
voicefixer
|
|
@ -61,7 +61,7 @@ def tqdm_override(arr, verbose=False, progress=None, desc=None):
|
||||||
|
|
||||||
if progress is None:
|
if progress is None:
|
||||||
return tqdm(arr, disable=not verbose)
|
return tqdm(arr, disable=not verbose)
|
||||||
return progress.tqdm(arr, desc=desc, track_tqdm=True)
|
return progress.tqdm(arr, desc=f'{progress.msg_prefix} {desc}' if hasattr(progress, 'msg_prefix') else desc, track_tqdm=True)
|
||||||
|
|
||||||
def download_models(specific_models=None):
|
def download_models(specific_models=None):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -21,7 +21,7 @@ def tqdm_override(arr, verbose=False, progress=None, desc=None):
|
||||||
|
|
||||||
if progress is None:
|
if progress is None:
|
||||||
return tqdm(arr, disable=not verbose)
|
return tqdm(arr, disable=not verbose)
|
||||||
return progress.tqdm(arr, desc=desc, track_tqdm=True)
|
return progress.tqdm(arr, desc=f'{progress.msg_prefix} {desc}' if hasattr(progress, 'msg_prefix') else desc, track_tqdm=True)
|
||||||
|
|
||||||
def normal_kl(mean1, logvar1, mean2, logvar2):
|
def normal_kl(mean1, logvar1, mean2, logvar2):
|
||||||
"""
|
"""
|
||||||
|
|
86
webui.py
86
webui.py
|
@ -20,6 +20,8 @@ from tortoise.api import TextToSpeech
|
||||||
from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir
|
from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir
|
||||||
from tortoise.utils.text import split_and_recombine_text
|
from tortoise.utils.text import split_and_recombine_text
|
||||||
|
|
||||||
|
voicefixer = None
|
||||||
|
|
||||||
def generate(
|
def generate(
|
||||||
text,
|
text,
|
||||||
delimiter,
|
delimiter,
|
||||||
|
@ -51,7 +53,6 @@ def generate(
|
||||||
except NameError:
|
except NameError:
|
||||||
raise gr.Error("TTS is still initializing...")
|
raise gr.Error("TTS is still initializing...")
|
||||||
|
|
||||||
|
|
||||||
if voice != "microphone":
|
if voice != "microphone":
|
||||||
voices = [voice]
|
voices = [voice]
|
||||||
else:
|
else:
|
||||||
|
@ -128,14 +129,17 @@ def generate(
|
||||||
|
|
||||||
audio_cache = {}
|
audio_cache = {}
|
||||||
|
|
||||||
resampler = torchaudio.transforms.Resample(
|
resample = None
|
||||||
tts.output_sample_rate,
|
# not a ternary, in the event that I want to rely on librosa's upsampling interpolator rather than torchaudio's for some reason
|
||||||
args.output_sample_rate,
|
if tts.output_sample_rate != args.output_sample_rate:
|
||||||
lowpass_filter_width=16,
|
resampler = torchaudio.transforms.Resample(
|
||||||
rolloff=0.85,
|
tts.output_sample_rate,
|
||||||
resampling_method="kaiser_window",
|
args.output_sample_rate,
|
||||||
beta=8.555504641634386,
|
lowpass_filter_width=16,
|
||||||
) if tts.output_sample_rate != args.output_sample_rate else None
|
rolloff=0.85,
|
||||||
|
resampling_method="kaiser_window",
|
||||||
|
beta=8.555504641634386,
|
||||||
|
)
|
||||||
|
|
||||||
volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None
|
volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None
|
||||||
|
|
||||||
|
@ -147,11 +151,10 @@ def generate(
|
||||||
idx = idx + 1
|
idx = idx + 1
|
||||||
|
|
||||||
def get_name(line=0, candidate=0, combined=False):
|
def get_name(line=0, candidate=0, combined=False):
|
||||||
if combined:
|
|
||||||
return f"{idx}_combined"
|
|
||||||
|
|
||||||
name = f"{idx}"
|
name = f"{idx}"
|
||||||
if len(texts) > 1:
|
if combined:
|
||||||
|
name = f"{name}_combined"
|
||||||
|
elif len(texts) > 1:
|
||||||
name = f"{name}_{line}"
|
name = f"{name}_{line}"
|
||||||
if candidates > 1:
|
if candidates > 1:
|
||||||
name = f"{name}_{candidate}"
|
name = f"{name}_{candidate}"
|
||||||
|
@ -164,12 +167,14 @@ def generate(
|
||||||
else:
|
else:
|
||||||
cut_text = f"[I am really {emotion.lower()},] {cut_text}"
|
cut_text = f"[I am really {emotion.lower()},] {cut_text}"
|
||||||
|
|
||||||
print(f"[{str(line+1)}/{str(len(texts))}] Generating line: {cut_text}")
|
progress.msg_prefix = f'[{str(line+1)}/{str(len(texts))}]'
|
||||||
|
print(f"{progress.msg_prefix} Generating line: {cut_text}")
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
gen, additionals = tts.tts(cut_text, **settings )
|
gen, additionals = tts.tts(cut_text, **settings )
|
||||||
seed = additionals[0]
|
seed = additionals[0]
|
||||||
run_time = time.time()-start_time
|
run_time = time.time()-start_time
|
||||||
|
print(f"Generating line took {run_time} seconds")
|
||||||
|
|
||||||
if isinstance(gen, list):
|
if isinstance(gen, list):
|
||||||
for j, g in enumerate(gen):
|
for j, g in enumerate(gen):
|
||||||
|
@ -203,15 +208,11 @@ def generate(
|
||||||
for candidate in range(candidates):
|
for candidate in range(candidates):
|
||||||
audio_clips = []
|
audio_clips = []
|
||||||
for line in range(len(texts)):
|
for line in range(len(texts)):
|
||||||
if isinstance(gen, list):
|
name = get_name(line=line, candidate=candidate)
|
||||||
name = get_name(line=line, candidate=candidate)
|
audio = audio_cache[name]['audio']
|
||||||
audio = audio_cache[name]['audio']
|
|
||||||
else:
|
|
||||||
name = get_name(line=line)
|
|
||||||
audio = audio_cache[name]['audio']
|
|
||||||
audio_clips.append(audio)
|
audio_clips.append(audio)
|
||||||
|
|
||||||
name = get_name(combined=True)
|
name = get_name(candidate=candidate, combined=True)
|
||||||
audio = torch.cat(audio_clips, dim=-1)
|
audio = torch.cat(audio_clips, dim=-1)
|
||||||
torchaudio.save(f'{outdir}/{voice}_{name}.wav', audio, args.output_sample_rate)
|
torchaudio.save(f'{outdir}/{voice}_{name}.wav', audio, args.output_sample_rate)
|
||||||
|
|
||||||
|
@ -225,16 +226,10 @@ def generate(
|
||||||
output_voices.append(f'{outdir}/{voice}_{name}.wav')
|
output_voices.append(f'{outdir}/{voice}_{name}.wav')
|
||||||
if output_voice is None:
|
if output_voice is None:
|
||||||
output_voice = f'{outdir}/{voice}_{name}.wav'
|
output_voice = f'{outdir}/{voice}_{name}.wav'
|
||||||
# output_voice = audio
|
|
||||||
else:
|
else:
|
||||||
if candidates > 1:
|
for candidate in range(candidates):
|
||||||
for candidate in range(candidates):
|
name = get_name(candidate=candidate)
|
||||||
name = get_name(candidate=candidate)
|
|
||||||
output_voices.append(f'{outdir}/{voice}_{name}.wav')
|
|
||||||
else:
|
|
||||||
name = get_name()
|
|
||||||
output_voices.append(f'{outdir}/{voice}_{name}.wav')
|
output_voices.append(f'{outdir}/{voice}_{name}.wav')
|
||||||
#output_voice = f'{outdir}/{voice}_{name}.wav'
|
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
'text': text,
|
'text': text,
|
||||||
|
@ -267,8 +262,21 @@ def generate(
|
||||||
with open(f'{get_voice_dir()}/{voice}/cond_latents.pth', 'rb') as f:
|
with open(f'{get_voice_dir()}/{voice}/cond_latents.pth', 'rb') as f:
|
||||||
info['latents'] = base64.b64encode(f.read()).decode("ascii")
|
info['latents'] = base64.b64encode(f.read()).decode("ascii")
|
||||||
|
|
||||||
|
if voicefixer:
|
||||||
|
# we could do this on the pieces before they get stitched up anyways to save some compute
|
||||||
|
# but the stitching would need to read back from disk, defeating the point of caching the waveform
|
||||||
|
for path in progress.tqdm(audio_cache, desc="Running voicefix..."):
|
||||||
|
print("VoiceFix starting")
|
||||||
|
voicefixer.restore(
|
||||||
|
input=f'{outdir}/{voice}_{k}.wav',
|
||||||
|
output=f'{outdir}/{voice}_{k}.wav',
|
||||||
|
#cuda=False,
|
||||||
|
#mode=mode,
|
||||||
|
)
|
||||||
|
print("VoiceFix finished")
|
||||||
|
|
||||||
if args.embed_output_metadata:
|
if args.embed_output_metadata:
|
||||||
for path in audio_cache:
|
for path in progress.tqdm(audio_cache, desc="Embedding metadata..."):
|
||||||
info['text'] = audio_cache[path]['text']
|
info['text'] = audio_cache[path]['text']
|
||||||
info['time'] = audio_cache[path]['time']
|
info['time'] = audio_cache[path]['time']
|
||||||
|
|
||||||
|
@ -438,7 +446,7 @@ def cancel_generate():
|
||||||
def update_voices():
|
def update_voices():
|
||||||
return gr.Dropdown.update(choices=sorted(os.listdir(get_voice_dir())) + ["microphone"])
|
return gr.Dropdown.update(choices=sorted(os.listdir(get_voice_dir())) + ["microphone"])
|
||||||
|
|
||||||
def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
|
def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
|
||||||
args.share = share
|
args.share = share
|
||||||
args.listen = listen
|
args.listen = listen
|
||||||
args.low_vram = low_vram
|
args.low_vram = low_vram
|
||||||
|
@ -448,6 +456,7 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
|
||||||
args.sample_batch_size = sample_batch_size
|
args.sample_batch_size = sample_batch_size
|
||||||
args.embed_output_metadata = embed_output_metadata
|
args.embed_output_metadata = embed_output_metadata
|
||||||
args.latents_lean_and_mean = latents_lean_and_mean
|
args.latents_lean_and_mean = latents_lean_and_mean
|
||||||
|
args.voice_fixer = voice_fixer
|
||||||
args.concurrency_count = concurrency_count
|
args.concurrency_count = concurrency_count
|
||||||
args.output_sample_rate = output_sample_rate
|
args.output_sample_rate = output_sample_rate
|
||||||
args.output_volume = output_volume
|
args.output_volume = output_volume
|
||||||
|
@ -462,6 +471,7 @@ def export_exec_settings( share, listen, check_for_updates, models_from_local_on
|
||||||
'sample-batch-size': args.sample_batch_size,
|
'sample-batch-size': args.sample_batch_size,
|
||||||
'embed-output-metadata': args.embed_output_metadata,
|
'embed-output-metadata': args.embed_output_metadata,
|
||||||
'latents-lean-and-mean': args.latents_lean_and_mean,
|
'latents-lean-and-mean': args.latents_lean_and_mean,
|
||||||
|
'voice-fixer': args.voice_fixer,
|
||||||
'concurrency-count': args.concurrency_count,
|
'concurrency-count': args.concurrency_count,
|
||||||
'output-sample-rate': args.output_sample_rate,
|
'output-sample-rate': args.output_sample_rate,
|
||||||
'output-volume': args.output_volume,
|
'output-volume': args.output_volume,
|
||||||
|
@ -480,6 +490,7 @@ def setup_args():
|
||||||
'sample-batch-size': None,
|
'sample-batch-size': None,
|
||||||
'embed-output-metadata': True,
|
'embed-output-metadata': True,
|
||||||
'latents-lean-and-mean': True,
|
'latents-lean-and-mean': True,
|
||||||
|
'voice-fixer': True,
|
||||||
'cond-latent-max-chunk-size': 1000000,
|
'cond-latent-max-chunk-size': 1000000,
|
||||||
'concurrency-count': 2,
|
'concurrency-count': 2,
|
||||||
'output-sample-rate': 44100,
|
'output-sample-rate': 44100,
|
||||||
|
@ -500,6 +511,7 @@ def setup_args():
|
||||||
parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage")
|
parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage")
|
||||||
parser.add_argument("--no-embed-output-metadata", action='store_false', default=not default_arguments['embed-output-metadata'], help="Disables embedding output metadata into resulting WAV files for easily fetching its settings used with the web UI (data is stored in the lyrics metadata tag)")
|
parser.add_argument("--no-embed-output-metadata", action='store_false', default=not default_arguments['embed-output-metadata'], help="Disables embedding output metadata into resulting WAV files for easily fetching its settings used with the web UI (data is stored in the lyrics metadata tag)")
|
||||||
parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.")
|
parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.")
|
||||||
|
parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses python module 'voicefixer' to improve audio quality, if available.")
|
||||||
parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
||||||
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
||||||
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
|
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
|
||||||
|
@ -526,6 +538,17 @@ def setup_args():
|
||||||
|
|
||||||
def setup_tortoise():
|
def setup_tortoise():
|
||||||
global args
|
global args
|
||||||
|
global voicefixer
|
||||||
|
|
||||||
|
if args.voice_fixer:
|
||||||
|
try:
|
||||||
|
from voicefixer import VoiceFixer
|
||||||
|
print("Initializating voice-fixer")
|
||||||
|
voicefixer = VoiceFixer()
|
||||||
|
print("initialized voice-fixer")
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
print("Initializating TorToiSe...")
|
print("Initializating TorToiSe...")
|
||||||
tts = TextToSpeech(minor_optimizations=not args.low_vram)
|
tts = TextToSpeech(minor_optimizations=not args.low_vram)
|
||||||
print("TorToiSe initialized, ready for generation.")
|
print("TorToiSe initialized, ready for generation.")
|
||||||
|
@ -736,6 +759,7 @@ def setup_gradio():
|
||||||
gr.Checkbox(label="Low VRAM", value=args.low_vram),
|
gr.Checkbox(label="Low VRAM", value=args.low_vram),
|
||||||
gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata),
|
gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata),
|
||||||
gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean),
|
gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean),
|
||||||
|
gr.Checkbox(label="Voice Fixer", value=args.voice_fixer),
|
||||||
]
|
]
|
||||||
gr.Button(value="Check for Updates").click(check_for_updates)
|
gr.Button(value="Check for Updates").click(check_for_updates)
|
||||||
gr.Button(value="Reload TTS").click(reload_tts)
|
gr.Button(value="Reload TTS").click(reload_tts)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user