diff --git a/README.md b/README.md index e5c9da2..daca80f 100755 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ To try and keep the terminology used here (somewhat) consistent and coherent, be * `diffusion decoder` / `vocoder`: these passes are responsible for encoding the tokens into a MEL spectrogram into a waveform. * `diffusion iterations`: how many passes to put into generating the output waveform. More iterations = better audio quality. * `diffusion sampler` / `sampler`: the sampling method used during the diffusion decoding pass, albeit a bit of a misnomer. Currently, only two samplers are implemented. +* `OOM`: out of memory, happens due to cards boasting low VRAM, or terrible, god-awful fragmentation. ## Modifications @@ -274,7 +275,7 @@ Below are settings that override the default launch arguments. Some of these req * `Slimmer Computed Latents`: falls back to the original, 12.9KiB way of storing latents (without the extra bits required for using the CVVP model). * `Voice Fixer`: runs each generated audio clip through `voicefixer`, if available and installed. * `Use CUDA for Voice Fixer`: if available, hints to `voicefixer` to use hardware acceleration. this flag is specifically because I'll OOM on my 2060, since the models for `voicefixer` do not leave the GPU and are heavily fragmented, I presume. -* `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors. +* `Force CPU for Conditioning Latents`: forces conditioning latents to be calculated on the CPU. Use this if you have really, really large voice samples and insist on using very low chunk sizes, and your GPU keeps OOMing when calculating them. * `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave to 0 to calculate a "best" fit. 
* `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that updates other settings while generating audio clips. * `Output Sample Rate`: the sample rate to save the generated audio as. It provides a bit of slight bump in quality diff --git a/tortoise/api.py b/tortoise/api.py index c31b0c7..7ca0a74 100755 --- a/tortoise/api.py +++ b/tortoise/api.py @@ -294,7 +294,7 @@ class TextToSpeech: if self.preloaded_tensors: self.cvvp = self.cvvp.to(self.device) - def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None): + def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None, force_cpu=False): """ Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent). These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic @@ -303,7 +303,9 @@ class TextToSpeech: """ with torch.no_grad(): # computing conditional latents requires being done on the CPU if using DML because M$ still hasn't implemented some core functions - device = torch.device('cpu') if get_device_name() == "dml" else self.device + if get_device_name() == "dml": + force_cpu = True + device = torch.device('cpu') if force_cpu else self.device if not isinstance(voice_samples, list): voice_samples = [voice_samples] @@ -371,6 +373,8 @@ class TextToSpeech: else: self.diffusion = self.diffusion.cpu() + + if return_mels: return auto_latent, diffusion_latent, auto_conds, diffusion_conds else: diff --git a/webui.py b/webui.py index f20d75c..f7e5130 100755 --- a/webui.py +++ b/webui.py @@ -74,7 +74,7 @@ def generate( if voice_samples is not None: sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu() - conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not 
args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks) + conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents) if len(conditioning_latents) == 4: conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None) @@ -331,7 +331,7 @@ def compute_latents(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm if voice_samples is None: return - conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks) + conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents) if len(conditioning_latents) == 4: conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None) @@ -532,13 +532,13 @@ def get_voice_list(dir=get_voice_dir()): def update_voices(): return gr.Dropdown.update(choices=get_voice_list()) -def export_exec_settings( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ): +def export_exec_settings( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, sample_batch_size, concurrency_count, output_sample_rate, output_volume ): args.listen = listen args.share = share args.check_for_updates = check_for_updates args.models_from_local_only = models_from_local_only args.low_vram = low_vram - args.cond_latent_max_chunk_size = 
cond_latent_max_chunk_size + args.force_cpu_for_conditioning_latents = force_cpu_for_conditioning_latents args.sample_batch_size = sample_batch_size args.embed_output_metadata = embed_output_metadata args.latents_lean_and_mean = latents_lean_and_mean @@ -554,7 +554,7 @@ def export_exec_settings( listen, share, check_for_updates, models_from_local_on 'low-vram':args.low_vram, 'check-for-updates':args.check_for_updates, 'models-from-local-only':args.models_from_local_only, - 'cond-latent-max-chunk-size': args.cond_latent_max_chunk_size, + 'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents, 'sample-batch-size': args.sample_batch_size, 'embed-output-metadata': args.embed_output_metadata, 'latents-lean-and-mean': args.latents_lean_and_mean, @@ -580,7 +580,7 @@ def setup_args(): 'latents-lean-and-mean': True, 'voice-fixer': True, 'voice-fixer-use-cuda': True, - 'cond-latent-max-chunk-size': 1000000, + 'force-cpu-for-conditioning-latents': False, 'concurrency-count': 2, 'output-sample-rate': 44100, 'output-volume': 1, @@ -602,8 +602,8 @@ def setup_args(): parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.") parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses python module 'voicefixer' to improve audio quality, if available.") parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.") - parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents") - parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning 
latents") + parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)") + parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass") parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once") parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)") parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output") @@ -916,7 +916,7 @@ def setup_gradio(): gr.Button(value="Reload TTS").click(reload_tts) with gr.Column(): exec_inputs = exec_inputs + [ - gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size), + gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.force_cpu_for_conditioning_latents), gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size), gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count), gr.Number(label="Ouptut Sample Rate", precision=0, value=args.output_sample_rate),