Update 'tortoise/utils/device.py'

Noticed that the autoregressive batch size was being set based on VRAM size. Adjusted it to scale for the VRAM capacity of 90-series GPUs; in this case, 16 -> 32 batches. Using the standard preset with ChungusVGAN, I went from 16 steps to 8. Averaged over 3 runs, generation time dropped from 294 seconds with 16 batches to 234 seconds with 32. Can't complain about a 1.2x speed increase from functionally 2 lines of code. I restarted tortoise before each run and executed `torch.cuda.empty_cache()` just before loading the autoregressive model to clear the memory cache each time.
2023-03-07 14:05:27 +00:00
15 changed files with 272 additions and 488 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -7,9 +7,9 @@ progressbar
 einops
 unidecode
 scipy
-librosa==0.8.1
+librosa
 torchaudio
 threadpoolctl
 appdirs
-numpy<=1.23.5
+numpy
 numba
--- a/setup.py
+++ b/setup.py
@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 setuptools.setup(
    name="TorToiSe",
    packages=setuptools.find_packages(),
-    version="2.4.5",
+    version="2.4.4",
    author="James Betker",
    author_email="james@adamant.ai",
    description="A high quality multi-voice text-to-speech library",
--- a/tortoise/api.py
+++ b/tortoise/api.py
@ -29,7 +29,7 @@ from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named
 from tortoise.utils.tokenizer import VoiceBpeTokenizer
 from tortoise.utils.wav2vec_alignment import Wav2VecAlignment

-from tortoise.utils.device import get_device, get_device_name, get_device_batch_size, print_stats, do_gc
+from tortoise.utils.device import get_device, get_device_name, get_device_batch_size

 pbar = None
 STOP_SIGNAL = False
@ -43,12 +43,8 @@ MODELS = {
    'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
    'rlg_auto.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth',
    'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth',
-    
    'bigvgan_base_24khz_100band.pth': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_base_24khz_100band.pth',
-    'bigvgan_24khz_100band.pth': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.pth',
-
-    'bigvgan_base_24khz_100band.json': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_base_24khz_100band.json',
-    'bigvgan_24khz_100band.json': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.json',
+    #'bigvgan_24khz_100band.pth': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.pth',
 }

 def hash_file(path, algo="md5", buffer_size=0):
@ -83,6 +79,16 @@ def check_for_kill_signal():
        STOP_SIGNAL = False
        raise Exception("Kill signal detected")

+def tqdm_override(arr, verbose=False, progress=None, desc=None):
+    check_for_kill_signal()
+
+    if verbose and desc is not None:
+        print(desc)
+
+    if progress is None:
+        return tqdm(arr, disable=not verbose)
+    return progress.tqdm(arr, desc=f'{progress.msg_prefix} {desc}' if hasattr(progress, 'msg_prefix') else desc, track_tqdm=True)
+
 def download_models(specific_models=None):
    """
    Call to download all the models that Tortoise uses.
@ -150,7 +156,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
                           model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps),
                           conditioning_free=cond_free, conditioning_free_k=cond_free_k)

-@torch.inference_mode()
+
 def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
    """
    Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
@ -162,8 +168,8 @@ def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=2
        rand_start = random.randint(0, gap)
        clip = clip[:, rand_start:rand_start + cond_length]
    mel_clip = TorchMelSpectrogram(sampling_rate=sampling_rate)(clip.unsqueeze(0)).squeeze(0)
-    mel_clip = mel_clip.unsqueeze(0)
-    return migrate_to_device(mel_clip, device)
+    return mel_clip.unsqueeze(0).to(device)
+

 def fix_autoregressive_output(codes, stop_token, complain=True):
    """
@ -194,8 +200,8 @@ def fix_autoregressive_output(codes, stop_token, complain=True):

    return codes

-@torch.inference_mode()
-def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
+
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
    """
    Uses the specified diffusion model to convert discrete codes into a spectrogram.
    """
@ -208,7 +214,8 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la
        
        diffuser.sampler = sampler.lower()
        mel = diffuser.sample_loop(diffusion_model, output_shape, noise=noise,
-                                      model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, desc=desc)
+                                      model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
+                                     verbose=verbose, progress=progress, desc=desc)

        mel = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
        if get_device_name() == "dml":
@ -230,37 +237,12 @@ def classify_audio_clip(clip):
    results = F.softmax(classifier(clip), dim=-1)
    return results[0][0]

-def migrate_to_device( t, device ):
-    if t is None:
-        return t
-
-    if not hasattr(t, 'device'):
-        t.device = device
-        t.manually_track_device = True
-    elif t.device == device:
-        return t
-
-    if hasattr(t, 'manually_track_device') and t.manually_track_device:
-        t.device = device
-
-    t = t.to(device)
-    
-    do_gc()
-
-    return t
-
 class TextToSpeech:
    """
    Main entry point into Tortoise.
    """

-    def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None,
-        minor_optimizations=True,
-        unsqueeze_sample_batches=False,
-        input_sample_rate=22050, output_sample_rate=24000,
-        autoregressive_model_path=None, diffusion_model_path=None, vocoder_model=None, tokenizer_json=None,
-#    ):
-        use_deepspeed=False):  # Add use_deepspeed parameter
+    def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None, minor_optimizations=True, input_sample_rate=22050, output_sample_rate=24000, autoregressive_model_path=None, vocoder_model=None):
        """
        Constructor
        :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@ -280,9 +262,7 @@ class TextToSpeech:
        self.input_sample_rate = input_sample_rate
        self.output_sample_rate = output_sample_rate
        self.minor_optimizations = minor_optimizations
-        self.unsqueeze_sample_batches = unsqueeze_sample_batches
-        self.use_deepspeed = use_deepspeed  # Store use_deepspeed as an instance variable
-        print(f'use_deepspeed api_debug {use_deepspeed}')
+
        # for clarity, it's simpler to split these up and just predicate them on requesting VRAM-consuming optimizations
        self.preloaded_tensors = minor_optimizations
        self.use_kv_cache = minor_optimizations
@ -297,23 +277,23 @@ class TextToSpeech:
        if self.enable_redaction:
            self.aligner = Wav2VecAlignment(device='cpu' if get_device_name() == "dml" else self.device)

-        self.load_tokenizer_json(tokenizer_json)
+        self.tokenizer = VoiceBpeTokenizer()
+

        if os.path.exists(f'{models_dir}/autoregressive.ptt'):
+            # Assume this is a traced directory.
            self.autoregressive = torch.jit.load(f'{models_dir}/autoregressive.ptt')
+            self.diffusion = torch.jit.load(f'{models_dir}/diffusion_decoder.ptt')
        else:
            if not autoregressive_model_path or not os.path.exists(autoregressive_model_path):
                autoregressive_model_path = get_model_path('autoregressive.pth', models_dir)

            self.load_autoregressive_model(autoregressive_model_path)

-        if os.path.exists(f'{models_dir}/diffusion_decoder.ptt'):
-            self.diffusion = torch.jit.load(f'{models_dir}/diffusion_decoder.ptt')
-        else:
-            if not diffusion_model_path or not os.path.exists(diffusion_model_path):
-                diffusion_model_path = get_model_path('diffusion_decoder.pth', models_dir)
-
-            self.load_diffusion_model(diffusion_model_path)
+            self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
+                                          in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
+                                          layer_drop=0, unconditioned_percentage=0).cpu().eval()
+            self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))


        self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
@ -331,107 +311,39 @@ class TextToSpeech:
        self.rlg_diffusion = None

        if self.preloaded_tensors:
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
-            self.clvp = migrate_to_device( self.clvp, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
-
+            self.autoregressive = self.autoregressive.to(self.device)
+            self.diffusion = self.diffusion.to(self.device)
+            self.clvp = self.clvp.to(self.device)
+            self.vocoder = self.vocoder.to(self.device)
        self.loading = False

-    def load_autoregressive_model(self, autoregressive_model_path, is_xtts=False):
-        if hasattr(self,"autoregressive_model_path") and os.path.samefile(self.autoregressive_model_path, autoregressive_model_path):
+    def load_autoregressive_model(self, autoregressive_model_path):
+        if hasattr(self,"autoregressive_model_path") and self.autoregressive_model_path == autoregressive_model_path:
            return

-        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
-        new_hash = hash_file(self.autoregressive_model_path)
-
-        if hasattr(self,"autoregressive_model_hash") and self.autoregressive_model_hash == new_hash:
-            return
-
-        self.autoregressive_model_hash = new_hash
-
        self.loading = True
+
+        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
+        self.autoregressive_model_hash = hash_file(self.autoregressive_model_path)
        print(f"Loading autoregressive model: {self.autoregressive_model_path}")

        if hasattr(self, 'autoregressive'):
            del self.autoregressive

-        # XTTS requires a different "dimensionality" for its autoregressive model
-        if new_hash == "e4ce21eae0043f7691d6a6c8540b74b8" or is_xtts:
-            dimensionality = {
-                "max_mel_tokens": 605,
-                "max_text_tokens": 402,
-                "max_prompt_tokens": 70,
-                "max_conditioning_inputs": 1,
-                "layers": 30,
-                "model_dim": 1024,
-                "heads": 16,
-                "number_text_tokens": 5023, # -1
-                "start_text_token": 261,
-                "stop_text_token": 0,
-                "number_mel_codes": 8194,
-                "start_mel_token": 8192,
-                "stop_mel_token": 8193,
-            }
-        else:
-            dimensionality = {
-                "max_mel_tokens": 604,
-                "max_text_tokens": 402,
-                "max_conditioning_inputs": 2,
-                "layers": 30,
-                "model_dim": 1024,
-                "heads": 16,
-                "number_text_tokens": 255,
-                "start_text_token": 255,
-                "checkpointing": False,
-                "train_solo_embeddings": False
-            }
-
-        self.autoregressive = UnifiedVoice(**dimensionality).cpu().eval()
+        self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
+                                          model_dim=1024,
+                                          heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
+                                          train_solo_embeddings=False).cpu().eval()
        self.autoregressive.load_state_dict(torch.load(self.autoregressive_model_path))
-        self.autoregressive.post_init_gpt2_config(use_deepspeed=self.use_deepspeed, kv_cache=self.use_kv_cache)
+        self.autoregressive.post_init_gpt2_config(kv_cache=self.use_kv_cache)
        if self.preloaded_tensors:
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
+            self.autoregressive = self.autoregressive.to(self.device)

        self.loading = False
        print(f"Loaded autoregressive model")

-    def load_diffusion_model(self, diffusion_model_path):
-        if hasattr(self,"diffusion_model_path") and os.path.samefile(self.diffusion_model_path, diffusion_model_path):
-            return
-
-        self.loading = True
-
-        self.diffusion_model_path = diffusion_model_path if diffusion_model_path and os.path.exists(diffusion_model_path) else get_model_path('diffusion_decoder.pth', self.models_dir)
-        self.diffusion_model_hash = hash_file(self.diffusion_model_path)
-
-        if hasattr(self, 'diffusion'):
-            del self.diffusion
-
-        # XTTS does not require a different "dimensionality" for its diffusion model
-        dimensionality = {
-            "model_channels": 1024,
-            "num_layers": 10,
-            "in_channels": 100,
-            "out_channels": 200,
-            "in_latent_channels": 1024,
-            "in_tokens": 8193,
-            "dropout": 0,
-            "use_fp16": False,
-            "num_heads": 16,
-            "layer_drop": 0,
-            "unconditioned_percentage": 0
-        }
-        self.diffusion = DiffusionTts(**dimensionality)
-        self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', self.models_dir)))
-        if self.preloaded_tensors:
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
-
-        self.loading = False
-        print(f"Loaded diffusion model")
-
    def load_vocoder_model(self, vocoder_model):
-        if hasattr(self,"vocoder_model_path") and os.path.samefile(self.vocoder_model_path, vocoder_model):
+        if hasattr(self,"vocoder_model_path") and self.vocoder_model_path == vocoder_model:
            return

        self.loading = True
@ -439,7 +351,7 @@ class TextToSpeech:
        if hasattr(self, 'vocoder'):
            del self.vocoder

-        print("Loading vocoder model:", vocoder_model)
+        print(vocoder_model)
        if vocoder_model is None:
            vocoder_model = 'bigvgan_24khz_100band'

@ -449,12 +361,7 @@ class TextToSpeech:
            self.vocoder_model_path = 'bigvgan_24khz_100band.pth'
            if f'{vocoder_model}.pth' in MODELS:
                self.vocoder_model_path = f'{vocoder_model}.pth'
-            vocoder_config = 'bigvgan_24khz_100band.json'
-            if f'{vocoder_model}.json' in MODELS:
-                vocoder_config = f'{vocoder_model}.json'
-            vocoder_config = get_model_path(vocoder_config, self.models_dir)
-
-            self.vocoder = BigVGAN(config=vocoder_config).cpu()
+            self.vocoder = BigVGAN().cpu()
        #elif vocoder_model == "univnet":
        else:
            vocoder_key = 'model_g'
@ -466,26 +373,10 @@ class TextToSpeech:

        self.vocoder.eval(inference=True)
        if self.preloaded_tensors:
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
+            self.vocoder = self.vocoder.to(self.device)
        self.loading = False
        print(f"Loaded vocoder model")

-    def load_tokenizer_json(self, tokenizer_json):
-        if hasattr(self,"tokenizer_json") and os.path.samefile(self.tokenizer_json, tokenizer_json):
-            return
-        
-        self.loading = True
-        self.tokenizer_json = tokenizer_json if tokenizer_json else os.path.join(os.path.dirname(os.path.realpath(__file__)), '../tortoise/data/tokenizer.json')
-        print("Loading tokenizer JSON:", self.tokenizer_json)
-
-        if hasattr(self, 'tokenizer'):
-            del self.tokenizer
-
-        self.tokenizer = VoiceBpeTokenizer(vocab_file=self.tokenizer_json)
-
-        self.loading = False
-        print(f"Loaded tokenizer")
-
    def load_cvvp(self):
        """Load CVVP model."""
        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
@ -493,17 +384,15 @@ class TextToSpeech:
        self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
        
        if self.preloaded_tensors:
-            self.cvvp = migrate_to_device( self.cvvp, self.device )
+            self.cvvp = self.cvvp.to(self.device)

-    @torch.inference_mode()
-    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False):
+    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None, force_cpu=False):
        """
        Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
        These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
        properties.
        :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
        """
-
        with torch.no_grad():
            # computing conditional latents requires being done on the CPU if using DML because M$ still hasn't implemented some core functions
            if get_device_name() == "dml":
@ -513,75 +402,70 @@ class TextToSpeech:
            if not isinstance(voice_samples, list):
                voice_samples = [voice_samples]
            
-            resampler_22K = torchaudio.transforms.Resample(
+            voice_samples = [v.to(device) for v in voice_samples]
+
+            resampler = torchaudio.transforms.Resample(
                self.input_sample_rate,
-                22050,
+                self.output_sample_rate,
                lowpass_filter_width=16,
                rolloff=0.85,
                resampling_method="kaiser_window",
                beta=8.555504641634386,
-            ).to(device)
-
-            resampler_24K = torchaudio.transforms.Resample(
-                self.input_sample_rate,
-                24000,
-                lowpass_filter_width=16,
-                rolloff=0.85,
-                resampling_method="kaiser_window",
-                beta=8.555504641634386,
-            ).to(device)
-
-            voice_samples = [migrate_to_device(v, device)  for v in voice_samples]
+            )

+            samples = []
            auto_conds = []
-            diffusion_conds = []
-
-            if original_ar:
-                samples = [resampler_22K(sample) for sample in voice_samples]
-                for sample in tqdm(samples, desc="Computing AR conditioning latents..."):
-                    auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate, cond_length=132300))
-            else:
-                samples = [resampler_22K(sample) for sample in voice_samples]
-                concat = torch.cat(samples, dim=-1)
-                chunk_size = concat.shape[-1]
-
-                if slices == 0:
-                    slices = 1
-                elif max_chunk_size is not None and chunk_size > max_chunk_size:
-                    slices = 1
-                    while int(chunk_size / slices) > max_chunk_size:
-                        slices = slices + 1
-
-                chunks = torch.chunk(concat, slices, dim=1)
-                chunk_size = chunks[0].shape[-1]
-
-                for chunk in tqdm(chunks, desc="Computing AR conditioning latents..."):
-                    auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate, cond_length=chunk_size))
-                
-
-            if original_diffusion:
-                samples = [resampler_24K(sample) for sample in voice_samples]
-                for sample in tqdm(samples, desc="Computing diffusion conditioning latents..."):
-                    sample = pad_or_truncate(sample, 102400)
-                    cond_mel = wav_to_univnet_mel(migrate_to_device(sample, device), do_normalization=False, device=self.device)
-                    diffusion_conds.append(cond_mel)
-            else:
-                samples = [resampler_24K(sample) for sample in voice_samples]
-                for chunk in tqdm(chunks, desc="Computing diffusion conditioning latents..."):
-                    check_for_kill_signal()
-                    chunk = pad_or_truncate(chunk, chunk_size)
-                    cond_mel = wav_to_univnet_mel(migrate_to_device( chunk, device ), do_normalization=False, device=device)
-                    diffusion_conds.append(cond_mel)
+            for sample in voice_samples:
+                auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate))
+                samples.append(resampler(sample.cpu()).to(device)) # icky no good, easier to do the resampling on CPU than figure out how to do it on GPU

            auto_conds = torch.stack(auto_conds, dim=1)
-            self.autoregressive = migrate_to_device( self.autoregressive, device )
+
+
+            self.autoregressive = self.autoregressive.to(device)
            auto_latent = self.autoregressive.get_conditioning(auto_conds)
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device if self.preloaded_tensors else 'cpu' )
+            if self.preloaded_tensors:
+                self.autoregressive = self.autoregressive.to(self.device)
+            else:
+                self.autoregressive = self.autoregressive.cpu()
+
+            
+            diffusion_conds = []
+            chunks = []
+
+            concat = torch.cat(samples, dim=-1)
+            chunk_size = concat.shape[-1]
+
+            if slices == 0:
+                slices = 1
+            elif max_chunk_size is not None and chunk_size > max_chunk_size:
+                slices = 1
+                while int(chunk_size / slices) > max_chunk_size:
+                    slices = slices + 1
+
+            chunks = torch.chunk(concat, slices, dim=1)
+            chunk_size = chunks[0].shape[-1]
+            
+            # expand / truncate samples to match the common size
+            # required, as tensors need to be of the same length
+            for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
+                check_for_kill_signal()
+                chunk = pad_or_truncate(chunk, chunk_size)
+                cond_mel = wav_to_univnet_mel(chunk.to(device), do_normalization=False, device=device)
+                diffusion_conds.append(cond_mel)

            diffusion_conds = torch.stack(diffusion_conds, dim=1)
-            self.diffusion = migrate_to_device( self.diffusion, device )
+
+            self.diffusion = self.diffusion.to(device)
+            
            diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
-            self.diffusion = migrate_to_device( self.diffusion, self.device if self.preloaded_tensors else 'cpu' )
+
+            if self.preloaded_tensors:
+                self.diffusion = self.diffusion.to(self.device)
+            else:
+                self.diffusion = self.diffusion.cpu()
+
+

        if return_mels:
            return auto_latent, diffusion_latent, auto_conds, diffusion_conds
@ -621,15 +505,12 @@ class TextToSpeech:
        settings.update(kwargs) # allow overriding of preset settings with kwargs
        return self.tts(text, **settings)

-    @torch.inference_mode()
    def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
            return_deterministic_state=False,
            # autoregressive generation parameters follow
            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
            sample_batch_size=None,
            autoregressive_model=None,
-            diffusion_model=None,
-            tokenizer_json=None,
            # CVVP parameters follow
            cvvp_amount=.0,
            # diffusion generation parameters follow
@ -637,6 +518,7 @@ class TextToSpeech:
            diffusion_sampler="P",
            breathing_room=8,
            half_p=False,
+            progress=None,
            **hf_generate_kwargs):
        """
        Produces an audio clip of the given text being spoken with the given reference voice.
@ -696,19 +578,7 @@ class TextToSpeech:
        elif autoregressive_model != self.autoregressive_model_path:
            self.load_autoregressive_model(autoregressive_model)

-        if diffusion_model is None:
-            diffusion_model = self.diffusion_model_path
-        elif diffusion_model != self.diffusion_model_path:
-            self.load_diffusion_model(diffusion_model)
-
-        if tokenizer_json is None:
-            tokenizer_json = self.tokenizer_json
-        elif tokenizer_json != self.tokenizer_json:
-            self.load_tokenizer_json(tokenizer_json)
-
-        text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0)
-        text_tokens = migrate_to_device( text_tokens, self.device )
-
+        text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device)
        text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
        assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'

@ -736,12 +606,12 @@ class TextToSpeech:
            stop_mel_token = self.autoregressive.stop_mel_token
            calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"

-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            auto_conditioning = migrate_to_device( auto_conditioning, self.device )
-            text_tokens = migrate_to_device( text_tokens, self.device )
+            self.autoregressive = self.autoregressive.to(self.device)
+            auto_conditioning = auto_conditioning.to(self.device)
+            text_tokens = text_tokens.to(self.device)

            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
-                for b in tqdm(range(num_batches), desc="Generating autoregressive samples"):
+                for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
                    check_for_kill_signal()
                    codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                                 do_sample=True,
@ -757,30 +627,24 @@ class TextToSpeech:
                    samples.append(codes)

            if not self.preloaded_tensors:
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
-
-            if self.unsqueeze_sample_batches:
-                new_samples = []
-                for batch in samples:
-                     for i in range(batch.shape[0]):
-                        new_samples.append(batch[i].unsqueeze(0))
-                samples = new_samples
+                self.autoregressive = self.autoregressive.cpu()
+                auto_conditioning = auto_conditioning.cpu()

            clip_results = []
+
            if auto_conds is not None:
-                auto_conditioning = migrate_to_device( auto_conditioning, self.device )
+                auto_conds = auto_conds.to(self.device)

            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
-                if not self.preloaded_tensors:
-                    self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
-                    self.clvp = migrate_to_device( self.clvp, self.device )
+                if not self.minor_optimizations:
+                    self.autoregressive = self.autoregressive.cpu()
+                    self.clvp = self.clvp.to(self.device)

                if cvvp_amount > 0:
                    if self.cvvp is None:
                        self.load_cvvp()
-                    
-                    if not self.preloaded_tensors:
-                        self.cvvp = migrate_to_device( self.cvvp, self.device )
+                    if not self.minor_optimizations:
+                        self.cvvp = self.cvvp.to(self.device)
                
                desc="Computing best candidates"
                if verbose:
@ -789,8 +653,7 @@ class TextToSpeech:
                    else:
                        desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"

-                
-                for batch in tqdm(samples, desc=desc):
+                for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
                    check_for_kill_signal()
                    for i in range(batch.shape[0]):
                        batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
@ -811,31 +674,30 @@ class TextToSpeech:
                        clip_results.append(clvp)

            if not self.preloaded_tensors and auto_conds is not None:
-                auto_conds = migrate_to_device( auto_conds, 'cpu' )
+                auto_conds = auto_conds.cpu()

            clip_results = torch.cat(clip_results, dim=0)
            samples = torch.cat(samples, dim=0)
-            if k < num_autoregressive_samples:
-                best_results = samples[torch.topk(clip_results, k=k).indices]
-            else:
-                best_results = samples
+            best_results = samples[torch.topk(clip_results, k=k).indices]
            
            if not self.preloaded_tensors:
-                self.clvp = migrate_to_device( self.clvp, 'cpu' )
-                self.cvvp = migrate_to_device( self.cvvp, 'cpu' )
-            
-
-            if get_device_name() == "dml":
-                text_tokens = migrate_to_device( text_tokens, 'cpu' )
-                best_results = migrate_to_device( best_results, 'cpu' )
-                auto_conditioning = migrate_to_device( auto_conditioning, 'cpu' )
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
-            else:
-                auto_conditioning = auto_conditioning.to(self.device)
-                self.autoregressive = self.autoregressive.to(self.device)
+                self.clvp = self.clvp.cpu()
+                if self.cvvp is not None:
+                    self.cvvp = self.cvvp.cpu()

            del samples

+            if get_device_name() == "dml":
+                text_tokens = text_tokens.cpu()
+                best_results = best_results.cpu()
+                auto_conditioning = auto_conditioning.cpu()
+                self.autoregressive = self.autoregressive.cpu()
+            else:
+                #text_tokens = text_tokens.to(self.device)
+                #best_results = best_results.to(self.device)
+                auto_conditioning = auto_conditioning.to(self.device)
+                self.autoregressive = self.autoregressive.to(self.device)
+
            # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
            # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
            # results, but will increase memory usage.
@ -844,19 +706,21 @@ class TextToSpeech:
                                               torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
                                               return_latent=True, clip_inputs=False)
            
-            diffusion_conditioning = migrate_to_device( diffusion_conditioning, self.device )
+            diffusion_conditioning = diffusion_conditioning.to(self.device)

            if get_device_name() == "dml":
-                self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-                best_results = migrate_to_device( best_results, self.device )
-                best_latents = migrate_to_device( best_latents, self.device )
-                self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
+                self.autoregressive = self.autoregressive.to(self.device)
+                best_results = best_results.to(self.device)
+                best_latents = best_latents.to(self.device)
+
+                self.vocoder = self.vocoder.cpu()
            else:
                if not self.preloaded_tensors:
-                    self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
+                    self.autoregressive = self.autoregressive.cpu()
+
+                self.diffusion = self.diffusion.to(self.device)
+                self.vocoder = self.vocoder.to(self.device)

-                self.diffusion = migrate_to_device( self.diffusion, self.device )
-                self.vocoder = migrate_to_device( self.vocoder, self.device )
            
            del text_tokens
            del auto_conditioning
@ -878,21 +742,19 @@ class TextToSpeech:
                        break

                mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
-                                               temperature=diffusion_temperature, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
+                                               temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
                                               input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)

                wav = self.vocoder.inference(mel)
                wav_candidates.append(wav)
            
            if not self.preloaded_tensors:
-                self.diffusion = migrate_to_device( self.diffusion, 'cpu' )
-                self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
+                self.diffusion = self.diffusion.cpu()
+                self.vocoder = self.vocoder.cpu()

            def potentially_redact(clip, text):
                if self.enable_redaction:
-                    t = clip.squeeze(1)
-                    t = migrate_to_device( t, 'cpu' if get_device_name() == "dml" else self.device)
-                    return self.aligner.redact(t, text, self.output_sample_rate).unsqueeze(1)
+                    return self.aligner.redact(clip.squeeze(1).to('cpu' if get_device_name() == "dml" else self.device), text, self.output_sample_rate).unsqueeze(1)
                return clip
            wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]

@ -901,7 +763,7 @@ class TextToSpeech:
            else:
                res = wav_candidates[0]

-            do_gc()
+            gc.collect()

            if return_deterministic_state:
                return res, (deterministic_seed, text, voice_samples, conditioning_latents)
--- a/tortoise/do_tts.py
+++ b/tortoise/do_tts.py
@ -14,7 +14,6 @@ if __name__ == '__main__':
    parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
-    parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=True)
    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
    parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
                                                      'should only be specified if you have custom checkpoints.', default=MODELS_DIR)
@ -38,8 +37,8 @@ if __name__ == '__main__':


    os.makedirs(args.output_path, exist_ok=True)
-    #print(f'use_deepspeed do_tts_debug {use_deepspeed}')
-    tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed)
+
+    tts = TextToSpeech(models_dir=args.model_dir)

    selected_voices = args.voice.split(',')
    for k, selected_voice in enumerate(selected_voices):
--- a/tortoise/models/autoregressive.py
+++ b/tortoise/models/autoregressive.py
@ -283,9 +283,9 @@ class MelEncoder(nn.Module):


 class UnifiedVoice(nn.Module):
-    def __init__(self, layers=8, model_dim=512, heads=8, max_text_tokens=120, max_prompt_tokens=2, max_mel_tokens=250, max_conditioning_inputs=1,
+    def __init__(self, layers=8, model_dim=512, heads=8, max_text_tokens=120, max_mel_tokens=250, max_conditioning_inputs=1,
                 mel_length_compression=1024, number_text_tokens=256,
-                 start_text_token=None, stop_text_token=0, number_mel_codes=8194, start_mel_token=8192,
+                 start_text_token=None, number_mel_codes=8194, start_mel_token=8192,
                 stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True,
                 checkpointing=True, types=1):
        """
@ -295,7 +295,6 @@ class UnifiedVoice(nn.Module):
            heads: Number of transformer heads. Must be divisible by model_dim. Recommend model_dim//64
            max_text_tokens: Maximum number of text tokens that will be encountered by model.
            max_mel_tokens: Maximum number of MEL tokens that will be encountered by model.
-            max_prompt_tokens: compat set to 2, 70 for XTTS
            max_conditioning_inputs: Maximum number of conditioning inputs provided to the model. If (1), conditioning input can be of format (b,80,s), otherwise (b,n,80,s).
            mel_length_compression: The factor between <number_input_samples> and <mel_tokens>. Used to compute MEL code padding given wav input length.
            number_text_tokens:
@ -312,7 +311,7 @@ class UnifiedVoice(nn.Module):

        self.number_text_tokens = number_text_tokens
        self.start_text_token = number_text_tokens * types if start_text_token is None else start_text_token
-        self.stop_text_token = stop_text_token
+        self.stop_text_token = 0
        self.number_mel_codes = number_mel_codes
        self.start_mel_token = start_mel_token
        self.stop_mel_token = stop_mel_token
@ -320,7 +319,6 @@ class UnifiedVoice(nn.Module):
        self.heads = heads
        self.max_mel_tokens = max_mel_tokens
        self.max_text_tokens = max_text_tokens
-        self.max_prompt_tokens = max_prompt_tokens
        self.model_dim = model_dim
        self.max_conditioning_inputs = max_conditioning_inputs
        self.mel_length_compression = mel_length_compression
@ -354,8 +352,8 @@ class UnifiedVoice(nn.Module):
        for module in embeddings:
            module.weight.data.normal_(mean=0.0, std=.02)

-    def post_init_gpt2_config(self, use_deepspeed=False, kv_cache=False):
-        seq_length = self.max_mel_tokens + self.max_text_tokens + self.max_prompt_tokens
+    def post_init_gpt2_config(self, kv_cache=False):
+        seq_length = self.max_mel_tokens + self.max_text_tokens + 2
        gpt_config = GPT2Config(vocab_size=self.max_mel_tokens,
                                n_positions=seq_length,
                                n_ctx=seq_length,
@ -365,17 +363,6 @@ class UnifiedVoice(nn.Module):
                                gradient_checkpointing=False,
                                use_cache=True)
        self.inference_model = GPT2InferenceModel(gpt_config, self.gpt, self.mel_pos_embedding, self.mel_embedding, self.final_norm, self.mel_head, kv_cache=kv_cache)
-        #print(f'use_deepspeed autoregressive_debug {use_deepspeed}')
-        if use_deepspeed and torch.cuda.is_available():
-            import deepspeed
-            self.ds_engine = deepspeed.init_inference(model=self.inference_model,  
-                                                    mp_size=1,
-                                                    replace_with_kernel_inject=True,
-                                                    dtype=torch.float32)
-            self.inference_model = self.ds_engine.module.eval()
-        else:
-            self.inference_model = self.inference_model.eval()
-            
        self.gpt.wte = self.mel_embedding

    def build_aligned_inputs_and_targets(self, input, start_token, stop_token):
@ -496,9 +483,9 @@ class UnifiedVoice(nn.Module):

    def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
                         max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
-        seq_length = self.max_mel_tokens + self.max_text_tokens + self.max_prompt_tokens
+        seq_length = self.max_mel_tokens + self.max_text_tokens + 2
        if not hasattr(self, 'inference_model'):
-            self.post_init_gpt2_config(kv_cache=self.kv_cache)
+            self.post_init_gpt2_config(kv_cache=self.kv_cachepost_init_gpt2_config)
            

        text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token)
--- a/tortoise/models/bigvgan.py
+++ b/tortoise/models/bigvgan.py
@ -129,27 +129,14 @@ class AttrDict(dict):

 class BigVGAN(nn.Module):
    # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
-    def __init__(self, config=None, data=None):
+    def __init__(self):
        super(BigVGAN, self).__init__()

-        """
        with open(os.path.join(os.path.dirname(__file__), 'config.json'), 'r') as f:
            data = f.read()
-        """
-        if config and data is None:
-            with open(config, 'r') as f:
-                data = f.read()
-            jsonConfig = json.loads(data)
-        elif data is not None:
-            if isinstance(data, str):
-                jsonConfig = json.loads(data)
-            else:
-                jsonConfig = data
-        else:
-            raise Exception("no config specified")
-

        global h
+        jsonConfig = json.loads(data)
        h = AttrDict(jsonConfig)

        self.mel_channel = h.num_mels
--- a/tortoise/models/clvp.py
+++ b/tortoise/models/clvp.py
@ -9,8 +9,6 @@ from tortoise.models.xtransformers import Encoder

 import tortoise.utils.torch_intermediary as ml

-from tortoise.utils.device import print_stats, do_gc
-
 def exists(val):
    return val is not None

@ -126,13 +124,14 @@ class CLVP(nn.Module):
            text_emb += self.text_pos_emb(torch.arange(text.shape[1], device=device))
            speech_emb += self.speech_pos_emb(torch.arange(speech_emb.shape[1], device=device))

-        
-        text_latents = self.to_text_latent(masked_mean(self.text_transformer(text_emb, mask=text_mask), text_mask, dim=1))
+        enc_text = self.text_transformer(text_emb, mask=text_mask)
+        enc_speech = self.speech_transformer(speech_emb, mask=voice_mask)

-        # on ROCm at least, allocated VRAM spikes here
-        do_gc()
-        speech_latents = self.to_speech_latent(masked_mean(self.speech_transformer(speech_emb, mask=voice_mask), voice_mask, dim=1))
-        do_gc()
+        text_latents = masked_mean(enc_text, text_mask, dim=1)
+        speech_latents = masked_mean(enc_speech, voice_mask, dim=1)
+
+        text_latents = self.to_text_latent(text_latents)
+        speech_latents = self.to_speech_latent(speech_latents)

        text_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (text_latents, speech_latents))

--- a/tortoise/models/config.json
+++ b/tortoise/models/config.json
@ -0,0 +1,46 @@
+{
+    "resblock": "1",
+    "num_gpus": 0,
+    "batch_size": 32,
+    "learning_rate": 0.0001,
+    "adam_b1": 0.8,
+    "adam_b2": 0.99,
+    "lr_decay": 0.999,
+    "seed": 1234,
+
+    "upsample_rates": [8,8,2,2],
+    "upsample_kernel_sizes": [16,16,4,4],
+    "upsample_initial_channel": 512,
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+
+    "activation": "snakebeta",
+    "snake_logscale": true,
+
+    "discriminator": "mrd",
+    "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
+    "mpd_reshapes": [2, 3, 5, 7, 11],
+    "use_spectral_norm": false,
+    "discriminator_channel_mult": 1,
+
+    "segment_size": 8192,
+    "num_mels": 100,
+    "num_freq": 1025,
+    "n_fft": 1024,
+    "hop_size": 256,
+    "win_size": 1024,
+
+    "sampling_rate": 24000,
+
+    "fmin": 0,
+    "fmax": 12000,
+    "fmax_for_loss": null,
+
+    "num_workers": 4,
+
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321",
+        "world_size": 1
+    }
+}
--- a/tortoise/read.py
+++ b/tortoise/read.py
@ -17,7 +17,6 @@ if __name__ == '__main__':
                                                 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat')
    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
-    parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=True)
    parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None)
    parser.add_argument('--candidates', type=int, help='How many output candidates to produce per-voice. Only the first candidate is actually used in the final product, the others can be used manually.', default=1)
    parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
@ -26,7 +25,7 @@ if __name__ == '__main__':
    parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.', default=True)

    args = parser.parse_args()
-    tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed)
+    tts = TextToSpeech(models_dir=args.model_dir)

    outpath = args.output_path
    selected_voices = args.voice.split(',')
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@ -2,7 +2,6 @@ import os
 from glob import glob

 import librosa
-import soundfile as sf
 import torch
 import torchaudio
 import numpy as np
@ -25,9 +24,6 @@ def load_audio(audiopath, sampling_rate):
    elif audiopath[-4:] == '.mp3':
        audio, lsr = librosa.load(audiopath, sr=sampling_rate)
        audio = torch.FloatTensor(audio)
-    elif audiopath[-5:] == '.flac':
-        audio, lsr = sf.read(audiopath)
-        audio = torch.FloatTensor(audio)
    else:
        assert False, f"Unsupported audio format provided: {audiopath[-4:]}"

@ -89,77 +85,17 @@ def get_voices(extra_voice_dirs=[], load_latents=True):
        for sub in subs:
            subj = os.path.join(d, sub)
            if os.path.isdir(subj):
-                voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.flac'))
+                voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3'))
                if load_latents:
                    voices[sub] = voices[sub] + list(glob(f'{subj}/*.pth'))
    return voices

-def get_voice( name, dir=get_voice_dir(), load_latents=True, extensions=["wav", "mp3", "flac"] ):
-    subj = f'{dir}/{name}/'
-    if not os.path.isdir(subj):
-        return
-    files = os.listdir(subj)
-    
-    if load_latents:
-        extensions.append("pth")
-
-    voice = []
-    for file in files:
-        ext = os.path.splitext(file)[-1][1:]
-        if ext not in extensions:
-            continue
-
-        voice.append(f'{subj}/{file}')
-
-    return sorted( voice )
-
-def get_voice_list(dir=get_voice_dir(), append_defaults=False, load_latents=True, extensions=["wav", "mp3", "flac"]):
-    defaults = [ "random", "microphone" ]
-    os.makedirs(dir, exist_ok=True)
-    #res = sorted([d for d in os.listdir(dir) if d not in defaults and os.path.isdir(os.path.join(dir, d)) and len(os.listdir(os.path.join(dir, d))) > 0 ])
-
-    res = []
-    for name in os.listdir(dir):
-        if name in defaults:
-            continue
-        if not os.path.isdir(f'{dir}/{name}'):
-            continue
-        if len(os.listdir(os.path.join(dir, name))) == 0:
-            continue
-        files = get_voice( name, dir=dir, extensions=extensions, load_latents=load_latents )
-
-        if len(files) > 0:
-            res.append(name)
-        else:
-            for subdir in os.listdir(f'{dir}/{name}'):
-                if not os.path.isdir(f'{dir}/{name}/{subdir}'):
-                    continue
-                files = get_voice( f'{name}/{subdir}', dir=dir, extensions=extensions, load_latents=load_latents )
-                if len(files) == 0:
-                    continue
-                res.append(f'{name}/{subdir}')
-
-    res = sorted(res)
-    
-    if append_defaults:
-        res = res + defaults
-    
-    return res
-
-
-def _get_voices( dirs=[get_voice_dir()], load_latents=True ):
-    voices = {}
-    for dir in dirs:
-        voice_list = get_voice_list(dir=dir)
-        voices |= { name: get_voice(name=name, dir=dir, load_latents=load_latents) for name in voice_list }
-
-    return voices

 def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu', model_hash=None):
    if voice == 'random':
        return None, None

-    voices = _get_voices(dirs=[get_voice_dir()] + extra_voice_dirs, load_latents=load_latents)
+    voices = get_voices(extra_voice_dirs=extra_voice_dirs, load_latents=load_latents)

    paths = voices[voice]
    mtime = 0
--- a/tortoise/utils/device.py
+++ b/tortoise/utils/device.py
@ -3,30 +3,6 @@ import psutil
 import importlib

 DEVICE_OVERRIDE = None
-DEVICE_BATCH_SIZE_MAP = [(14, 16), (10,8), (7,4)]
-
-from inspect import currentframe, getframeinfo
-import gc
-
-def do_gc():
-    gc.collect()
-    try:
-        torch.cuda.empty_cache()
-    except Exception as e:
-        pass
-
-def print_stats(collect=False):
-    cf = currentframe().f_back
-    msg = f'{getframeinfo(cf).filename}:{cf.f_lineno}'
-
-    if collect:
-        do_gc()
-
-    tot = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
-    res = torch.cuda.memory_reserved(0) / (1024 ** 3)
-    alloc = torch.cuda.memory_allocated(0) / (1024 ** 3)
-    print("[{}] Total: {:.3f} | Reserved: {:.3f} | Allocated: {:.3f} | Free: {:.3f}".format( msg, tot, res, alloc, tot-res ))
-

 def has_dml():
    loader = importlib.find_loader('torch_directml')
@ -40,7 +16,7 @@ def set_device_name(name):
    global DEVICE_OVERRIDE
    DEVICE_OVERRIDE = name

-def get_device_name(attempt_gc=True):
+def get_device_name():
    global DEVICE_OVERRIDE
    if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
        return DEVICE_OVERRIDE
@ -49,8 +25,6 @@ def get_device_name(attempt_gc=True):

    if torch.cuda.is_available():
        name = 'cuda'
-        if attempt_gc:
-            torch.cuda.empty_cache() # may have performance implications
    elif has_dml():
        name = 'dml'

@ -71,30 +45,37 @@ def get_device(verbose=False):

    return torch.device(name)

-def get_device_vram( name=get_device_name() ):
+def get_device_batch_size():
    available = 1
-
-    if name == "cuda":
-        _, available = torch.cuda.mem_get_info()
+    name = get_device_name()
+    
+    if name == "dml":
+        # there's nothing publicly accessible in the DML API that exposes this
+        # there's a method to get currently used RAM statistics... as tiles
+        available = 1
+    elif name == "cuda":
+        _,available = torch.cuda.mem_get_info()
    elif name == "cpu":
        available = psutil.virtual_memory()[4]

-    return available / (1024 ** 3)
-
-def get_device_batch_size(name=get_device_name()):
-    vram = get_device_vram(name)
-
-    if vram > 14:
+    availableGb = available / (1024 ** 3)
+    
+    print(f"Total device memory available: {availableGb}")
+    if availableGb > 18:
+        print(f"Setting AutoRegressive Batch Size to: 32")
+        print(f"Damn. Nice GPU Dude.")
+        return 32
+    elif availableGb > 14:
+        print(f"Setting AutoRegressive Batch Size to: 16")
        return 16
-    elif vram > 10:
+    elif availableGb > 10:
+        print(f"Setting AutoRegressive Batch Size to: 8")
        return 8
-    elif vram > 7:
+    elif availableGb > 7:
+        print(f"Setting AutoRegressive Batch Size to: 4")
        return 4
-    """
-    for k, v in DEVICE_BATCH_SIZE_MAP:
-        if vram > k:
-            return v
-    """
+    print(f"Setting AutoRegressive Batch Size to: 1")
+    print(f"Don't cry about it if it doesn't work.")
    return 1

 def get_device_count(name=get_device_name()):
@ -107,8 +88,6 @@ def get_device_count(name=get_device_name()):
    return 1


-# if you're getting errors make sure you've updated your torch-directml, and if you're still getting errors then you can uncomment the below block
-"""
 if has_dml():
    _cumsum = torch.cumsum
    _repeat_interleave = torch.repeat_interleave
@ -126,5 +105,4 @@ if has_dml():
    torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
    torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
    torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
-    torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
-"""
+    torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
--- a/tortoise/utils/diffusion.py
+++ b/tortoise/utils/diffusion.py
@ -13,7 +13,15 @@ import math
 import numpy as np
 import torch
 import torch as th
-from tqdm.auto import tqdm
+from tqdm import tqdm
+
+def tqdm_override(arr, verbose=False, progress=None, desc=None):
+    """Wrap `arr` in a progress iterator: console tqdm when `progress` is None, otherwise `progress.tqdm`."""
+    if verbose and desc is not None:
+        print(desc)  # echo the label once on stdout before iteration starts
+    if progress is None:
+        return tqdm(arr, disable=not verbose)  # console bar; disabled (silent) unless verbose
+    return progress.tqdm(arr, desc=f'{progress.msg_prefix} {desc}' if hasattr(progress, 'msg_prefix') else desc, track_tqdm=True)  # label is prefixed with progress.msg_prefix when the tracker defines one

 def normal_kl(mean1, logvar1, mean2, logvar2):
    """
@ -548,6 +556,7 @@ class GaussianDiffusion:
        model_kwargs=None,
        device=None,
        verbose=False,
+        progress=None,
        desc=None
    ):
        """
@ -580,6 +589,7 @@ class GaussianDiffusion:
            model_kwargs=model_kwargs,
            device=device,
            verbose=verbose,
+            progress=progress,
            desc=desc
        ):
            final = sample
@ -596,6 +606,7 @@ class GaussianDiffusion:
        model_kwargs=None,
        device=None,
        verbose=False,
+        progress=None,
        desc=None
    ):
        """
@ -615,7 +626,7 @@ class GaussianDiffusion:
            img = th.randn(*shape, device=device)
        indices = list(range(self.num_timesteps))[::-1]

-        for i in tqdm(indices, desc=desc):
+        for i in tqdm_override(indices, verbose=verbose, desc=desc, progress=progress):
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                out = self.p_sample(
@ -730,6 +741,7 @@ class GaussianDiffusion:
        device=None,
        verbose=False,
        eta=0.0,
+        progress=None,
        desc=None,
    ):
        """
@ -749,6 +761,7 @@ class GaussianDiffusion:
            device=device,
            verbose=verbose,
            eta=eta,
+            progress=progress,
            desc=desc
        ):
            final = sample
@ -766,6 +779,7 @@ class GaussianDiffusion:
        device=None,
        verbose=False,
        eta=0.0,
+        progress=None,
        desc=None,
    ):
        """
@ -784,7 +798,10 @@ class GaussianDiffusion:
        indices = list(range(self.num_timesteps))[::-1]

        if verbose:
-            indices = tqdm(indices, desc=desc)
+            # Delegate to the module-level helper instead of importing tqdm
+            # locally: tqdm_override chooses between the console bar and an
+            # external `progress` tracker on its own.
+            indices = tqdm_override(indices, verbose=verbose, desc=desc, progress=progress)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
--- a/tortoise/utils/tokenizer.py
+++ b/tortoise/utils/tokenizer.py
@ -1,6 +1,5 @@
 import os
 import re
-import json

 import inflect
 import torch
@ -171,39 +170,16 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '


 class VoiceBpeTokenizer:
-    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None):
-        with open(vocab_file, 'r', encoding='utf-8') as f:
-          vocab = json.load(f)
-
-        self.language = vocab['model']['language'] if 'language' in vocab['model'] else None
-
-        if preprocess is None:
-          self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
-        else:
-            self.preprocess = preprocess
+    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)

    def preprocess_text(self, txt):
-        if self.language == 'ja':
-          import pykakasi
-
-          kks = pykakasi.kakasi()
-          results = kks.convert(txt)
-          words = []
-
-          for result in results:
-            words.append(result['kana'])
-
-          txt = " ".join(words)
-          txt = basic_cleaners(txt)
-        else:
-          txt = english_cleaners(txt)
+        txt = english_cleaners(txt)
        return txt

    def encode(self, txt):
-        if self.preprocess:
-          txt = self.preprocess_text(txt)
+        txt = self.preprocess_text(txt)
        txt = txt.replace(' ', '[SPACE]')
        return self.tokenizer.encode(txt).ids

--- a/tortoise/utils/torch_intermediary.py
+++ b/tortoise/utils/torch_intermediary.py
@ -22,19 +22,17 @@ import os

 USE_STABLE_EMBEDDING = False
 try:
+	import bitsandbytes as bnb
 	OVERRIDE_LINEAR = False
-	OVERRIDE_EMBEDDING = False
-	OVERRIDE_ADAM = False
-	OVERRIDE_ADAMW = False
+	OVERRIDE_EMBEDDING = True
+	OVERRIDE_ADAM = True
+	OVERRIDE_ADAMW = True

 	USE_STABLE_EMBEDDING = os.environ.get('BITSANDBYTES_USE_STABLE_EMBEDDING', '1' if USE_STABLE_EMBEDDING else '0') == '1'
 	OVERRIDE_LINEAR = os.environ.get('BITSANDBYTES_OVERRIDE_LINEAR', '1' if OVERRIDE_LINEAR else '0') == '1'
 	OVERRIDE_EMBEDDING = os.environ.get('BITSANDBYTES_OVERRIDE_EMBEDDING', '1' if OVERRIDE_EMBEDDING else '0') == '1'
 	OVERRIDE_ADAM = os.environ.get('BITSANDBYTES_OVERRIDE_ADAM', '1' if OVERRIDE_ADAM else '0') == '1'
 	OVERRIDE_ADAMW = os.environ.get('BITSANDBYTES_OVERRIDE_ADAMW', '1' if OVERRIDE_ADAMW else '0') == '1'
-	
-	if OVERRIDE_LINEAR or OVERRIDE_EMBEDDING or OVERRIDE_ADAM or OVERRIDE_ADAMW:
-		import bitsandbytes as bnb
 except Exception as e:
 	OVERRIDE_LINEAR = False
 	OVERRIDE_EMBEDDING = False
--- a/tortoise/utils/wav2vec_alignment.py
+++ b/tortoise/utils/wav2vec_alignment.py
@ -144,7 +144,7 @@ class Wav2VecAlignment:
        non_redacted_intervals = []
        last_point = 0
        for i in range(len(fully_split)):
-            if i % 2 == 0 and fully_split[i] != "": # Check for empty string fixes index error
+            if i % 2 == 0:
                end_interval = max(0, last_point + len(fully_split[i]) - 1)
                non_redacted_intervals.append((last_point, end_interval))
            last_point += len(fully_split[i])