forked from mrq/tortoise-tts
added option: force cpu for conditioning latents, for when you want low chunk counts but your GPU keeps OOMing because fuck fragmentation
This commit is contained in:
parent
b721e395b5
commit
ea1bc770aa
|
@ -25,6 +25,7 @@ To try and keep the terminology used here (somewhat) consistent and coherent, be
|
|||
* `diffusion decoder` / `vocoder`: these passes are responsible for encoding the tokens into a MEL spectrogram, and then converting that spectrogram into a waveform.
|
||||
* `diffusion iterations`: how many passes to put into generating the output waveform. More iterations = better audio quality.
|
||||
* `diffusion sampler` / `sampler`: the sampling method used during the diffusion decoding pass, albeit a bit of a misnomer. Currently, only two samplers are implemented.
|
||||
* `OOM`: out of memory; happens due to cards boasting low VRAM, or terrible, god-awful fragmentation.
|
||||
|
||||
## Modifications
|
||||
|
||||
|
@ -274,7 +275,7 @@ Below are settings that override the default launch arguments. Some of these req
|
|||
* `Slimmer Computed Latents`: falls back to the original, 12.9KiB way of storing latents (without the extra bits required for using the CVVP model).
|
||||
* `Voice Fixer`: runs each generated audio clip through `voicefixer`, if available and installed.
|
||||
* `Use CUDA for Voice Fixer`: if available, hints to `voicefixer` to use hardware acceleration. This flag exists specifically because I'll OOM on my 2060, since the models for `voicefixer` do not leave the GPU and are heavily fragmented, I presume.
|
||||
* `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
|
||||
* `Force CPU for Conditioning Latents`: forces conditioning latents to be calculated on the CPU. Use this if you have really, really large voice samples, and you insist on using very low chunk counts (i.e. large chunks) that keep making your GPU OOM when calculating them.
|
||||
* `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave at 0 to calculate a "best" fit.
|
||||
* `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that update other settings while generating audio clips.
|
||||
* `Output Sample Rate`: the sample rate to save the generated audio as. It provides a slight bump in quality.
|
||||
|
|
|
@ -294,7 +294,7 @@ class TextToSpeech:
|
|||
if self.preloaded_tensors:
|
||||
self.cvvp = self.cvvp.to(self.device)
|
||||
|
||||
def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None):
|
||||
def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None, force_cpu=False):
|
||||
"""
|
||||
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
|
||||
These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
|
||||
|
@ -303,7 +303,9 @@ class TextToSpeech:
|
|||
"""
|
||||
with torch.no_grad():
|
||||
# computing conditional latents requires being done on the CPU if using DML because M$ still hasn't implemented some core functions
|
||||
device = torch.device('cpu') if get_device_name() == "dml" else self.device
|
||||
if get_device_name() == "dml":
|
||||
force_cpu = True
|
||||
device = torch.device('cpu') if force_cpu else self.device
|
||||
|
||||
if not isinstance(voice_samples, list):
|
||||
voice_samples = [voice_samples]
|
||||
|
@ -371,6 +373,8 @@ class TextToSpeech:
|
|||
else:
|
||||
self.diffusion = self.diffusion.cpu()
|
||||
|
||||
|
||||
|
||||
if return_mels:
|
||||
return auto_latent, diffusion_latent, auto_conds, diffusion_conds
|
||||
else:
|
||||
|
|
18
webui.py
18
webui.py
|
@ -74,7 +74,7 @@ def generate(
|
|||
if voice_samples is not None:
|
||||
sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu()
|
||||
|
||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks)
|
||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents)
|
||||
if len(conditioning_latents) == 4:
|
||||
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
|
||||
|
||||
|
@ -331,7 +331,7 @@ def compute_latents(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm
|
|||
if voice_samples is None:
|
||||
return
|
||||
|
||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks)
|
||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents)
|
||||
|
||||
if len(conditioning_latents) == 4:
|
||||
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
|
||||
|
@ -532,13 +532,13 @@ def get_voice_list(dir=get_voice_dir()):
|
|||
def update_voices():
|
||||
return gr.Dropdown.update(choices=get_voice_list())
|
||||
|
||||
def export_exec_settings( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
|
||||
def export_exec_settings( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
|
||||
args.listen = listen
|
||||
args.share = share
|
||||
args.check_for_updates = check_for_updates
|
||||
args.models_from_local_only = models_from_local_only
|
||||
args.low_vram = low_vram
|
||||
args.cond_latent_max_chunk_size = cond_latent_max_chunk_size
|
||||
args.force_cpu_for_conditioning_latents = force_cpu_for_conditioning_latents
|
||||
args.sample_batch_size = sample_batch_size
|
||||
args.embed_output_metadata = embed_output_metadata
|
||||
args.latents_lean_and_mean = latents_lean_and_mean
|
||||
|
@ -554,7 +554,7 @@ def export_exec_settings( listen, share, check_for_updates, models_from_local_on
|
|||
'low-vram':args.low_vram,
|
||||
'check-for-updates':args.check_for_updates,
|
||||
'models-from-local-only':args.models_from_local_only,
|
||||
'cond-latent-max-chunk-size': args.cond_latent_max_chunk_size,
|
||||
'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
|
||||
'sample-batch-size': args.sample_batch_size,
|
||||
'embed-output-metadata': args.embed_output_metadata,
|
||||
'latents-lean-and-mean': args.latents_lean_and_mean,
|
||||
|
@ -580,7 +580,7 @@ def setup_args():
|
|||
'latents-lean-and-mean': True,
|
||||
'voice-fixer': True,
|
||||
'voice-fixer-use-cuda': True,
|
||||
'cond-latent-max-chunk-size': 1000000,
|
||||
'force-cpu-for-conditioning-latents': False,
|
||||
'concurrency-count': 2,
|
||||
'output-sample-rate': 44100,
|
||||
'output-volume': 1,
|
||||
|
@ -602,8 +602,8 @@ def setup_args():
|
|||
parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.")
|
||||
parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses python module 'voicefixer' to improve audio quality, if available.")
|
||||
parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.")
|
||||
parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
||||
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
|
||||
parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)")
|
||||
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass")
|
||||
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
|
||||
parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
|
||||
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
|
||||
|
@ -916,7 +916,7 @@ def setup_gradio():
|
|||
gr.Button(value="Reload TTS").click(reload_tts)
|
||||
with gr.Column():
|
||||
exec_inputs = exec_inputs + [
|
||||
gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size),
|
||||
gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.force_cpu_for_conditioning_latents),
|
||||
gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size),
|
||||
gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count),
|
||||
gr.Number(label="Ouptut Sample Rate", precision=0, value=args.output_sample_rate),
|
||||
|
|
Loading…
Reference in New Issue
Block a user