added option: force cpu for conditioning latents, for when you want low chunk counts but your GPU keeps OOMing because fuck fragmentation

2023-02-15 05:01:40 +00:00 · 2023-02-15 05:01:40 +00:00 · ea1bc770aa
commit ea1bc770aa
parent b721e395b5
3 changed files with 17 additions and 12 deletions
--- a/README.md
+++ b/README.md
@ -25,6 +25,7 @@ To try and keep the terminology used here (somewhat) consistent and coherent, be
 * `diffusion decoder` / `vocoder`: these passes are responsible for encoding the tokens into a MEL spectrogram, and then into a waveform.
 * `diffusion iterations`: how many passes to put into generating the output waveform. More iterations = better audio quality.
 * `diffusion sampler` / `sampler`: the sampling method used during the diffusion decoding pass, albeit a bit of a misnomer. Currently, only two samplers are implemented.
 * `OOM`: out of memory, happens due to cards boasting low VRAM, or terrible, god-awful fragmentation.
 ## Modifications
@ -274,7 +275,7 @@ Below are settings that override the default launch arguments. Some of these req
 * `Slimmer Computed Latents`: falls back to the original, 12.9KiB way of storing latents (without the extra bits required for using the CVVP model).
 * `Voice Fixer`: runs each generated audio clip through `voicefixer`, if available and installed.
 * `Use CUDA for Voice Fixer`: if available, hints to `voicefixer` to use hardware acceleration. this flag is specifically because I'll OOM on my 2060, since the models for `voicefixer` do not leave the GPU and are heavily fragmented, I presume.
-* `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
+* `Force CPU for Conditioning Latents`: forces conditioning latents to be calculated on the CPU. Use this if you have really, really large voice samples and you insist on using very low chunk counts, which keep causing your GPU to OOM when calculating them.
 * `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave to 0 to calculate a "best" fit.
 * `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that update other settings while generating audio clips.
 * `Output Sample Rate`: the sample rate to save the generated audio as. It provides a slight bump in quality.
--- a/tortoise/api.py
+++ b/tortoise/api.py
@ -294,7 +294,7 @@ class TextToSpeech:
        if self.preloaded_tensors:
            self.cvvp = self.cvvp.to(self.device)
-    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None):
+    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None, force_cpu=False):
        """
        Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
        These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@ -303,7 +303,9 @@ class TextToSpeech:
        """
        with torch.no_grad():
            # computing conditional latents requires being done on the CPU if using DML because M$ still hasn't implemented some core functions
-            device = torch.device('cpu') if get_device_name() == "dml" else self.device
+            if get_device_name() == "dml":
                force_cpu = True
            device = torch.device('cpu') if force_cpu else self.device
            if not isinstance(voice_samples, list):
                voice_samples = [voice_samples]
@ -371,6 +373,8 @@ class TextToSpeech:
            else:
                self.diffusion = self.diffusion.cpu()
        if return_mels:
            return auto_latent, diffusion_latent, auto_conds, diffusion_conds
        else:
--- a/webui.py
+++ b/webui.py
@ -74,7 +74,7 @@ def generate(
    if voice_samples is not None:
        sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu()
-        conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks)
+        conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents)
        if len(conditioning_latents) == 4:
            conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
@ -331,7 +331,7 @@ def compute_latents(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm
    if voice_samples is None:
        return
-    conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks)
+    conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents)
    if len(conditioning_latents) == 4:
        conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
@ -532,13 +532,13 @@ def get_voice_list(dir=get_voice_dir()):
 def update_voices():
    return gr.Dropdown.update(choices=get_voice_list())
-def export_exec_settings( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
+def export_exec_settings( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
    args.listen = listen
    args.share = share
    args.check_for_updates = check_for_updates
    args.models_from_local_only = models_from_local_only
    args.low_vram = low_vram
-    args.cond_latent_max_chunk_size = cond_latent_max_chunk_size
+    args.force_cpu_for_conditioning_latents = force_cpu_for_conditioning_latents
    args.sample_batch_size = sample_batch_size
    args.embed_output_metadata = embed_output_metadata
    args.latents_lean_and_mean = latents_lean_and_mean
@ -554,7 +554,7 @@ def export_exec_settings( listen, share, check_for_updates, models_from_local_on
        'low-vram':args.low_vram,
        'check-for-updates':args.check_for_updates,
        'models-from-local-only':args.models_from_local_only,
-        'cond-latent-max-chunk-size': args.cond_latent_max_chunk_size,
+        'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
        'sample-batch-size': args.sample_batch_size,
        'embed-output-metadata': args.embed_output_metadata,
        'latents-lean-and-mean': args.latents_lean_and_mean,
@ -580,7 +580,7 @@ def setup_args():
        'latents-lean-and-mean': True,
        'voice-fixer': True,
        'voice-fixer-use-cuda': True,
-        'cond-latent-max-chunk-size': 1000000,
+        'force-cpu-for-conditioning-latents': False,
        'concurrency-count': 2,
        'output-sample-rate': 44100,
        'output-volume': 1,
@ -602,8 +602,8 @@ def setup_args():
    parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.")
    parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses python module 'voicefixer' to improve audio quality, if available.")
    parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.")
-    parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
+    parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)")
-    parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
+    parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass")
    parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
    parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
    parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
@ -916,7 +916,7 @@ def setup_gradio():
                    gr.Button(value="Reload TTS").click(reload_tts)
                with gr.Column():
                    exec_inputs = exec_inputs + [
-                        gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size),
+                        gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.force_cpu_for_conditioning_latents),
                        gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size),
                        gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count),
                        gr.Number(label="Ouptut Sample Rate", precision=0, value=args.output_sample_rate),