forked from mrq/tortoise-tts
Compare commits
22 Commits
95f679f4ba
bf3b6c87aa
d7e6914fb8
b7c7fd1c5f
2478dc255e
18adfaf785
ac97c17bf7
b10c58436d
cbd3c95c42
9afa71542b
e2cd07d560
5ff00bf3bf
c90ee7c529
086aad5b49
04b7049811
b6a213bbbd
2f7d9ab932
f025470d60
eea4c68edc
815ae5d707
2cd7b72688
0bcdf81d04
@@ -11,5 +11,5 @@ librosa==0.8.1
 torchaudio
 threadpoolctl
 appdirs
-numpy
+numpy<=1.23.5
 numba
tortoise/api.py (211 changed lines)
@@ -83,16 +83,6 @@ def check_for_kill_signal():
         STOP_SIGNAL = False
         raise Exception("Kill signal detected")
 
-def tqdm_override(arr, verbose=False, progress=None, desc=None):
-    check_for_kill_signal()
-
-    if verbose and desc is not None:
-        print(desc)
-
-    if progress is None:
-        return tqdm(arr, disable=not verbose)
-    return progress.tqdm(arr, desc=f'{progress.msg_prefix} {desc}' if hasattr(progress, 'msg_prefix') else desc, track_tqdm=True)
-
 def download_models(specific_models=None):
     """
     Call to download all the models that Tortoise uses.
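Note: with the web-UI progress shim removed, progress reporting throughout falls back to plain tqdm. A minimal sketch of the replacement behaviour; the disable flag shown here could recover the old verbose gating, but the new code uses plain tqdm(...) directly:

    from tqdm.auto import tqdm

    verbose = True
    num_batches = 4
    # disable=not verbose mirrors what the removed tqdm_override did when no UI was attached
    for b in tqdm(range(num_batches), desc="Generating autoregressive samples", disable=not verbose):
        pass  # one autoregressive batch per iteration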
@@ -160,7 +150,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
         model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps),
         conditioning_free=cond_free, conditioning_free_k=cond_free_k)
 
-
+@torch.inference_mode()
 def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
     """
     Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
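Note: torch.inference_mode(), added here and to the other entry points below, is a stricter and faster variant of no_grad: tensors created inside it never record autograd state at all. A self-contained illustration:

    import torch

    @torch.inference_mode()
    def double(x):
        return x * 2

    out = double(torch.ones(3, requires_grad=True))
    print(out.requires_grad)        # False: results never track gradients
    print(torch.is_inference(out))  # True: stricter and cheaper than no_grad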
@@ -204,8 +194,8 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
 
     return codes
 
-
-def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
+@torch.inference_mode()
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
     """
@@ -218,8 +208,7 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la
 
     diffuser.sampler = sampler.lower()
     mel = diffuser.sample_loop(diffusion_model, output_shape, noise=noise,
-                               model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
-                               verbose=verbose, progress=progress, desc=desc)
+                               model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, desc=desc)
 
     mel = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
     if get_device_name() == "dml":
@@ -267,9 +256,11 @@ class TextToSpeech:
 
     def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None,
                  minor_optimizations=True,
                  unsqueeze_sample_batches=False,
                  input_sample_rate=22050, output_sample_rate=24000,
-                 autoregressive_model_path=None, diffusion_model_path=None, vocoder_model=None, tokenizer_json=None
-                 ):
+                 autoregressive_model_path=None, diffusion_model_path=None, vocoder_model=None, tokenizer_json=None,
+                 # ):
+                 use_deepspeed=False): # Add use_deepspeed parameter
         """
         Constructor
         :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@@ -289,7 +280,9 @@ class TextToSpeech:
         self.input_sample_rate = input_sample_rate
         self.output_sample_rate = output_sample_rate
         self.minor_optimizations = minor_optimizations
         self.unsqueeze_sample_batches = unsqueeze_sample_batches
+        self.use_deepspeed = use_deepspeed # Store use_deepspeed as an instance variable
+        print(f'use_deepspeed api_debug {use_deepspeed}')
         # for clarity, it's simpler to split these up and just predicate them on requesting VRAM-consuming optimizations
         self.preloaded_tensors = minor_optimizations
         self.use_kv_cache = minor_optimizations
@@ -345,25 +338,58 @@ class TextToSpeech:
 
         self.loading = False
 
-    def load_autoregressive_model(self, autoregressive_model_path):
-        if hasattr(self,"autoregressive_model_path") and self.autoregressive_model_path == autoregressive_model_path:
+    def load_autoregressive_model(self, autoregressive_model_path, is_xtts=False):
+        if hasattr(self,"autoregressive_model_path") and os.path.samefile(self.autoregressive_model_path, autoregressive_model_path):
             return
 
-        self.loading = True
-
         self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
-        self.autoregressive_model_hash = hash_file(self.autoregressive_model_path)
+        new_hash = hash_file(self.autoregressive_model_path)
+
+        if hasattr(self,"autoregressive_model_hash") and self.autoregressive_model_hash == new_hash:
+            return
+
+        self.autoregressive_model_hash = new_hash
+
+        self.loading = True
         print(f"Loading autoregressive model: {self.autoregressive_model_path}")
 
         if hasattr(self, 'autoregressive'):
             del self.autoregressive
 
-        self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
-                                          model_dim=1024,
-                                          heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                          train_solo_embeddings=False).cpu().eval()
+        # XTTS requires a different "dimensionality" for its autoregressive model
+        if new_hash == "e4ce21eae0043f7691d6a6c8540b74b8" or is_xtts:
+            dimensionality = {
+                "max_mel_tokens": 605,
+                "max_text_tokens": 402,
+                "max_prompt_tokens": 70,
+                "max_conditioning_inputs": 1,
+                "layers": 30,
+                "model_dim": 1024,
+                "heads": 16,
+                "number_text_tokens": 5023, # -1
+                "start_text_token": 261,
+                "stop_text_token": 0,
+                "number_mel_codes": 8194,
+                "start_mel_token": 8192,
+                "stop_mel_token": 8193,
+            }
+        else:
+            dimensionality = {
+                "max_mel_tokens": 604,
+                "max_text_tokens": 402,
+                "max_conditioning_inputs": 2,
+                "layers": 30,
+                "model_dim": 1024,
+                "heads": 16,
+                "number_text_tokens": 255,
+                "start_text_token": 255,
+                "checkpointing": False,
+                "train_solo_embeddings": False
+            }
+
+        self.autoregressive = UnifiedVoice(**dimensionality).cpu().eval()
         self.autoregressive.load_state_dict(torch.load(self.autoregressive_model_path))
-        self.autoregressive.post_init_gpt2_config(kv_cache=self.use_kv_cache)
+        self.autoregressive.post_init_gpt2_config(use_deepspeed=self.use_deepspeed, kv_cache=self.use_kv_cache)
         if self.preloaded_tensors:
             self.autoregressive = migrate_to_device( self.autoregressive, self.device )
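Note: the reload guard now keys on hash_file, the repo's hashing helper. The pinned XTTS value above is 32 hex characters, which suggests an MD5 digest, though that is an assumption. A minimal sketch of an equivalent content check (md5_of_file is a hypothetical stand-in):

    import hashlib

    def md5_of_file(path, chunk=1 << 20):  # hypothetical stand-in for hash_file
        h = hashlib.md5()
        with open(path, 'rb') as f:
            while True:
                block = f.read(chunk)
                if not block:
                    break
                h.update(block)
        return h.hexdigest()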
@@ -371,7 +397,7 @@ class TextToSpeech:
         print(f"Loaded autoregressive model")
 
     def load_diffusion_model(self, diffusion_model_path):
-        if hasattr(self,"diffusion_model_path") and self.diffusion_model_path == diffusion_model_path:
+        if hasattr(self,"diffusion_model_path") and os.path.samefile(self.diffusion_model_path, diffusion_model_path):
             return
 
         self.loading = True
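Note on os.path.samefile, used in the reload guards here and below: it compares the files behind the paths rather than the path strings, but unlike the == it replaces, it raises FileNotFoundError when either path does not exist:

    import os, tempfile

    root = tempfile.mkdtemp()
    path = os.path.join(root, "model.pth")
    open(path, "wb").close()

    # resolves path differences down to the underlying file
    print(os.path.samefile(path, os.path.join(root, ".", "model.pth")))  # True

    # but a missing path raises instead of comparing unequal
    try:
        os.path.samefile(path, os.path.join(root, "missing.pth"))
    except FileNotFoundError:
        print("missing paths raise")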
@@ -382,9 +408,21 @@ class TextToSpeech:
         if hasattr(self, 'diffusion'):
             del self.diffusion
 
-        self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
-                                      in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
-                                      layer_drop=0, unconditioned_percentage=0).cpu().eval()
+        # XTTS does not require a different "dimensionality" for its diffusion model
+        dimensionality = {
+            "model_channels": 1024,
+            "num_layers": 10,
+            "in_channels": 100,
+            "out_channels": 200,
+            "in_latent_channels": 1024,
+            "in_tokens": 8193,
+            "dropout": 0,
+            "use_fp16": False,
+            "num_heads": 16,
+            "layer_drop": 0,
+            "unconditioned_percentage": 0
+        }
+        self.diffusion = DiffusionTts(**dimensionality)
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', self.models_dir)))
         if self.preloaded_tensors:
             self.diffusion = migrate_to_device( self.diffusion, self.device )
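Note: unlike the autoregressive path above, the rewritten construction drops the trailing .cpu().eval() that the removed lines carried, so the diffusion model's initial device and eval state now depend on the load_state_dict and migrate_to_device calls that follow; worth confirming that this was intentional.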
@@ -393,7 +431,7 @@ class TextToSpeech:
         print(f"Loaded diffusion model")
 
     def load_vocoder_model(self, vocoder_model):
-        if hasattr(self,"vocoder_model_path") and self.vocoder_model_path == vocoder_model:
+        if hasattr(self,"vocoder_model_path") and os.path.samefile(self.vocoder_model_path, vocoder_model):
             return
 
         self.loading = True
@@ -433,7 +471,7 @@ class TextToSpeech:
         print(f"Loaded vocoder model")
 
     def load_tokenizer_json(self, tokenizer_json):
-        if hasattr(self,"tokenizer_json") and self.tokenizer_json == tokenizer_json:
+        if hasattr(self,"tokenizer_json") and os.path.samefile(self.tokenizer_json, tokenizer_json):
            return
 
         self.loading = True
@@ -457,13 +495,15 @@ class TextToSpeech:
             if self.preloaded_tensors:
                 self.cvvp = migrate_to_device( self.cvvp, self.device )
 
-    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None, force_cpu=False):
+    @torch.inference_mode()
+    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
         These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
         properties.
         :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
         """
 
         with torch.no_grad():
             # computing conditional latents requires being done on the CPU if using DML because M$ still hasn't implemented some core functions
             if get_device_name() == "dml":
@@ -473,50 +513,72 @@ class TextToSpeech:
             if not isinstance(voice_samples, list):
                 voice_samples = [voice_samples]
 
-            voice_samples = [migrate_to_device(v, device) for v in voice_samples]
-
-            resampler = torchaudio.transforms.Resample(
+            resampler_22K = torchaudio.transforms.Resample(
                 self.input_sample_rate,
-                self.output_sample_rate,
+                22050,
                 lowpass_filter_width=16,
                 rolloff=0.85,
                 resampling_method="kaiser_window",
                 beta=8.555504641634386,
             ).to(device)
 
-            samples = [resampler(sample) for sample in voice_samples]
-            chunks = []
+            resampler_24K = torchaudio.transforms.Resample(
+                self.input_sample_rate,
+                24000,
+                lowpass_filter_width=16,
+                rolloff=0.85,
+                resampling_method="kaiser_window",
+                beta=8.555504641634386,
+            ).to(device)
 
-            concat = torch.cat(samples, dim=-1)
-            chunk_size = concat.shape[-1]
-
-            if slices == 0:
-                slices = 1
-            elif max_chunk_size is not None and chunk_size > max_chunk_size:
-                slices = 1
-                while int(chunk_size / slices) > max_chunk_size:
-                    slices = slices + 1
-
-            chunks = torch.chunk(concat, slices, dim=1)
-            chunk_size = chunks[0].shape[-1]
+            voice_samples = [migrate_to_device(v, device) for v in voice_samples]
 
             auto_conds = []
-            for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing AR conditioning latents..."):
-                auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate, cond_length=chunk_size))
-            auto_conds = torch.stack(auto_conds, dim=1)
+            diffusion_conds = []
 
+            if original_ar:
+                samples = [resampler_22K(sample) for sample in voice_samples]
+                for sample in tqdm(samples, desc="Computing AR conditioning latents..."):
+                    auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate, cond_length=132300))
+            else:
+                samples = [resampler_22K(sample) for sample in voice_samples]
+                concat = torch.cat(samples, dim=-1)
+                chunk_size = concat.shape[-1]
+
+                if slices == 0:
+                    slices = 1
+                elif max_chunk_size is not None and chunk_size > max_chunk_size:
+                    slices = 1
+                    while int(chunk_size / slices) > max_chunk_size:
+                        slices = slices + 1
+
+                chunks = torch.chunk(concat, slices, dim=1)
+                chunk_size = chunks[0].shape[-1]
+
+                for chunk in tqdm(chunks, desc="Computing AR conditioning latents..."):
+                    auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate, cond_length=chunk_size))
+
+            if original_diffusion:
+                samples = [resampler_24K(sample) for sample in voice_samples]
+                for sample in tqdm(samples, desc="Computing diffusion conditioning latents..."):
+                    sample = pad_or_truncate(sample, 102400)
+                    cond_mel = wav_to_univnet_mel(migrate_to_device(sample, device), do_normalization=False, device=self.device)
+                    diffusion_conds.append(cond_mel)
+            else:
+                samples = [resampler_24K(sample) for sample in voice_samples]
+                for chunk in tqdm(chunks, desc="Computing diffusion conditioning latents..."):
+                    check_for_kill_signal()
+                    chunk = pad_or_truncate(chunk, chunk_size)
+                    cond_mel = wav_to_univnet_mel(migrate_to_device( chunk, device ), do_normalization=False, device=device)
+                    diffusion_conds.append(cond_mel)
+
+            auto_conds = torch.stack(auto_conds, dim=1)
             self.autoregressive = migrate_to_device( self.autoregressive, device )
             auto_latent = self.autoregressive.get_conditioning(auto_conds)
             self.autoregressive = migrate_to_device( self.autoregressive, self.device if self.preloaded_tensors else 'cpu' )
 
-            diffusion_conds = []
-            for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing diffusion conditioning latents..."):
-                check_for_kill_signal()
-                chunk = pad_or_truncate(chunk, chunk_size)
-                cond_mel = wav_to_univnet_mel(migrate_to_device( chunk, device ), do_normalization=False, device=device)
-                diffusion_conds.append(cond_mel)
-            diffusion_conds = torch.stack(diffusion_conds, dim=1)
-
+            diffusion_conds = torch.stack(diffusion_conds, dim=1)
             self.diffusion = migrate_to_device( self.diffusion, device )
             diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
             self.diffusion = migrate_to_device( self.diffusion, self.device if self.preloaded_tensors else 'cpu' )
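The slicing logic above decides how many pieces torch.chunk should cut the concatenated reference audio into so no chunk exceeds max_chunk_size. A worked example with hypothetical sizes (661500 samples is roughly 30 seconds at 22.05kHz; max_chunk_size=300000 is made up for illustration):

    import torch

    concat = torch.zeros(1, 661500)
    max_chunk_size = 300000

    chunk_size = concat.shape[-1]
    slices = 1
    while int(chunk_size / slices) > max_chunk_size:
        slices = slices + 1

    chunks = torch.chunk(concat, slices, dim=1)
    print(slices, chunks[0].shape[-1])  # 3 220500: three roughly equal chunks under the cap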
@@ -559,6 +621,7 @@ class TextToSpeech:
         settings.update(kwargs) # allow overriding of preset settings with kwargs
         return self.tts(text, **settings)
 
+    @torch.inference_mode()
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
             return_deterministic_state=False,
             # autoregressive generation parameters follow
@@ -574,7 +637,6 @@ class TextToSpeech:
             diffusion_sampler="P",
             breathing_room=8,
             half_p=False,
-            progress=None,
             **hf_generate_kwargs):
         """
         Produces an audio clip of the given text being spoken with the given reference voice.
@@ -679,7 +741,7 @@ class TextToSpeech:
                 text_tokens = migrate_to_device( text_tokens, self.device )
 
             with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
-                for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
+                for b in tqdm(range(num_batches), desc="Generating autoregressive samples"):
                     check_for_kill_signal()
                     codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                         do_sample=True,
@@ -697,8 +759,14 @@ class TextToSpeech:
             if not self.preloaded_tensors:
                 self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
 
-            clip_results = []
+            if self.unsqueeze_sample_batches:
+                new_samples = []
+                for batch in samples:
+                    for i in range(batch.shape[0]):
+                        new_samples.append(batch[i].unsqueeze(0))
+                samples = new_samples
 
+            clip_results = []
             if auto_conds is not None:
                 auto_conditioning = migrate_to_device( auto_conditioning, self.device )
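The unsqueeze step flattens per-batch candidate tensors into a list of single-sample batches. A self-contained illustration with made-up shapes:

    import torch

    samples = [torch.zeros(4, 500), torch.zeros(4, 500)]  # two batches of 4 candidates each

    new_samples = []
    for batch in samples:
        for i in range(batch.shape[0]):
            new_samples.append(batch[i].unsqueeze(0))

    print(len(new_samples), new_samples[0].shape)  # 8 torch.Size([1, 500])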
@@ -722,7 +790,7 @@ class TextToSpeech:
                 desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
 
-            for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
+            for batch in tqdm(samples, desc=desc):
                 check_for_kill_signal()
                 for i in range(batch.shape[0]):
                     batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
@@ -747,7 +815,10 @@ class TextToSpeech:
 
             clip_results = torch.cat(clip_results, dim=0)
             samples = torch.cat(samples, dim=0)
-            best_results = samples[torch.topk(clip_results, k=k).indices]
+            if k < num_autoregressive_samples:
+                best_results = samples[torch.topk(clip_results, k=k).indices]
+            else:
+                best_results = samples
 
             if not self.preloaded_tensors:
                 self.clvp = migrate_to_device( self.clvp, 'cpu' )
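The guard matters because torch.topk raises an error when k exceeds the number of scored samples, and merely reorders them when k equals it; in both cases keeping everything as-is is the sensible behaviour. A sketch with hypothetical values:

    import torch

    clip_results = torch.tensor([0.2, 0.9, 0.5])
    samples = torch.randn(3, 10)
    num_autoregressive_samples = clip_results.shape[0]
    k = 3

    # topk with k=4 here would raise "selected index k out of range"
    if k < num_autoregressive_samples:
        best_results = samples[torch.topk(clip_results, k=k).indices]
    else:
        best_results = samples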
@@ -807,7 +878,7 @@ class TextToSpeech:
                     break
 
                 mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
-                    temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
+                    temperature=diffusion_temperature, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
                     input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)
 
                 wav = self.vocoder.inference(mel)
@@ -14,6 +14,7 @@ if __name__ == '__main__':
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=True)
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
     parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
                                                       'should only be specified if you have custom checkpoints.', default=MODELS_DIR)
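A caveat with the new flag: argparse's type=bool calls bool() on the raw string, and any non-empty string is truthy, so --use_deepspeed False still enables it; only an empty string turns it off:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--use_deepspeed', type=bool, default=True)

    print(parser.parse_args(['--use_deepspeed', 'False']).use_deepspeed)  # True: bool('False') is truthy
    print(parser.parse_args(['--use_deepspeed', '']).use_deepspeed)       # False: only '' is falsy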
@@ -37,8 +38,8 @@ if __name__ == '__main__':
 
     os.makedirs(args.output_path, exist_ok=True)
 
-    tts = TextToSpeech(models_dir=args.model_dir)
+    #print(f'use_deepspeed do_tts_debug {use_deepspeed}')
+    tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed)
 
     selected_voices = args.voice.split(',')
     for k, selected_voice in enumerate(selected_voices):
@@ -283,9 +283,9 @@ class MelEncoder(nn.Module):
 
 
 class UnifiedVoice(nn.Module):
-    def __init__(self, layers=8, model_dim=512, heads=8, max_text_tokens=120, max_mel_tokens=250, max_conditioning_inputs=1,
+    def __init__(self, layers=8, model_dim=512, heads=8, max_text_tokens=120, max_prompt_tokens=2, max_mel_tokens=250, max_conditioning_inputs=1,
                  mel_length_compression=1024, number_text_tokens=256,
-                 start_text_token=None, number_mel_codes=8194, start_mel_token=8192,
+                 start_text_token=None, stop_text_token=0, number_mel_codes=8194, start_mel_token=8192,
                  stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True,
                  checkpointing=True, types=1):
         """
@@ -295,6 +295,7 @@ class UnifiedVoice(nn.Module):
             heads: Number of transformer heads. Must be divisible by model_dim. Recommend model_dim//64
             max_text_tokens: Maximum number of text tokens that will be encountered by model.
             max_mel_tokens: Maximum number of MEL tokens that will be encountered by model.
+            max_prompt_tokens: compat set to 2, 70 for XTTS
             max_conditioning_inputs: Maximum number of conditioning inputs provided to the model. If (1), conditioning input can be of format (b,80,s), otherwise (b,n,80,s).
             mel_length_compression: The factor between <number_input_samples> and <mel_tokens>. Used to compute MEL code padding given wav input length.
             number_text_tokens:
@@ -311,7 +312,7 @@ class UnifiedVoice(nn.Module):
 
         self.number_text_tokens = number_text_tokens
         self.start_text_token = number_text_tokens * types if start_text_token is None else start_text_token
-        self.stop_text_token = 0
+        self.stop_text_token = stop_text_token
         self.number_mel_codes = number_mel_codes
         self.start_mel_token = start_mel_token
         self.stop_mel_token = stop_mel_token
@@ -319,6 +320,7 @@ class UnifiedVoice(nn.Module):
         self.heads = heads
         self.max_mel_tokens = max_mel_tokens
         self.max_text_tokens = max_text_tokens
+        self.max_prompt_tokens = max_prompt_tokens
         self.model_dim = model_dim
         self.max_conditioning_inputs = max_conditioning_inputs
         self.mel_length_compression = mel_length_compression
@@ -352,8 +354,8 @@ class UnifiedVoice(nn.Module):
         for module in embeddings:
             module.weight.data.normal_(mean=0.0, std=.02)
 
-    def post_init_gpt2_config(self, kv_cache=False):
-        seq_length = self.max_mel_tokens + self.max_text_tokens + 2
+    def post_init_gpt2_config(self, use_deepspeed=False, kv_cache=False):
+        seq_length = self.max_mel_tokens + self.max_text_tokens + self.max_prompt_tokens
         gpt_config = GPT2Config(vocab_size=self.max_mel_tokens,
                                 n_positions=seq_length,
                                 n_ctx=seq_length,
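Arithmetic check: with the stock dimensionality, max_prompt_tokens keeps its default of 2, so seq_length is unchanged at 604 + 402 + 2 = 1008; with the XTTS dimensionality it becomes 605 + 402 + 70 = 1077.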
@@ -363,6 +365,17 @@ class UnifiedVoice(nn.Module):
                                 gradient_checkpointing=False,
                                 use_cache=True)
         self.inference_model = GPT2InferenceModel(gpt_config, self.gpt, self.mel_pos_embedding, self.mel_embedding, self.final_norm, self.mel_head, kv_cache=kv_cache)
+        #print(f'use_deepspeed autoregressive_debug {use_deepspeed}')
+        if use_deepspeed and torch.cuda.is_available():
+            import deepspeed
+            self.ds_engine = deepspeed.init_inference(model=self.inference_model,
+                                                      mp_size=1,
+                                                      replace_with_kernel_inject=True,
+                                                      dtype=torch.float32)
+            self.inference_model = self.ds_engine.module.eval()
+        else:
+            self.inference_model = self.inference_model.eval()
+
         self.gpt.wte = self.mel_embedding
 
     def build_aligned_inputs_and_targets(self, input, start_token, stop_token):
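The gating above only engages DeepSpeed kernel injection on CUDA and falls back to a plain eval() everywhere else. A condensed sketch of the same logic, reusing the exact init_inference arguments from the hunk (wrap_for_inference is a hypothetical helper name, not part of the codebase):

    import torch

    def wrap_for_inference(model, use_deepspeed=False):
        # kernel injection is CUDA-only; anything else gets plain eval()
        if use_deepspeed and torch.cuda.is_available():
            import deepspeed
            engine = deepspeed.init_inference(model=model,
                                              mp_size=1,
                                              replace_with_kernel_inject=True,
                                              dtype=torch.float32)
            return engine.module.eval()
        return model.eval()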
@@ -483,7 +496,7 @@ class UnifiedVoice(nn.Module):
 
     def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
                          max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
-        seq_length = self.max_mel_tokens + self.max_text_tokens + 2
+        seq_length = self.max_mel_tokens + self.max_text_tokens + self.max_prompt_tokens
         if not hasattr(self, 'inference_model'):
             self.post_init_gpt2_config(kv_cache=self.kv_cache)
@@ -17,6 +17,7 @@ if __name__ == '__main__':
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat')
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=True)
     parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None)
     parser.add_argument('--candidates', type=int, help='How many output candidates to produce per-voice. Only the first candidate is actually used in the final product, the others can be used manually.', default=1)
     parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
@@ -25,7 +26,7 @@ if __name__ == '__main__':
     parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.', default=True)
 
     args = parser.parse_args()
-    tts = TextToSpeech(models_dir=args.model_dir)
+    tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed)
 
     outpath = args.output_path
     selected_voices = args.voice.split(',')
@@ -2,6 +2,7 @@ import os
 from glob import glob
 
 import librosa
+import soundfile as sf
 import torch
 import torchaudio
 import numpy as np
@@ -24,6 +25,9 @@ def load_audio(audiopath, sampling_rate):
     elif audiopath[-4:] == '.mp3':
         audio, lsr = librosa.load(audiopath, sr=sampling_rate)
         audio = torch.FloatTensor(audio)
+    elif audiopath[-5:] == '.flac':
+        audio, lsr = sf.read(audiopath)
+        audio = torch.FloatTensor(audio)
     else:
         assert False, f"Unsupported audio format provided: {audiopath[-4:]}"
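For reference on the new branch: sf.read returns a float64 numpy array plus the file's native sample rate, so unlike the mp3 branch (which resamples via librosa.load(sr=...)) the FLAC path keeps the native rate in lsr and relies on whatever rate handling load_audio does afterwards, which is not shown in this hunk. A minimal sketch ('clip.flac' is a placeholder path):

    import soundfile as sf
    import torch

    # data has shape (frames,) for mono or (frames, channels) for multichannel;
    # lsr is the file's native sample rate, not the requested one
    data, lsr = sf.read('clip.flac')
    audio = torch.FloatTensor(data)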
@@ -85,17 +89,77 @@ def get_voices(extra_voice_dirs=[], load_latents=True):
         for sub in subs:
             subj = os.path.join(d, sub)
             if os.path.isdir(subj):
-                voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3'))
+                voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.flac'))
                 if load_latents:
                     voices[sub] = voices[sub] + list(glob(f'{subj}/*.pth'))
     return voices
 
+def get_voice( name, dir=get_voice_dir(), load_latents=True, extensions=["wav", "mp3", "flac"] ):
+    subj = f'{dir}/{name}/'
+    if not os.path.isdir(subj):
+        return
+    files = os.listdir(subj)
+
+    if load_latents:
+        extensions.append("pth")
+
+    voice = []
+    for file in files:
+        ext = os.path.splitext(file)[-1][1:]
+        if ext not in extensions:
+            continue
+
+        voice.append(f'{subj}/{file}')
+
+    return sorted( voice )
+
+def get_voice_list(dir=get_voice_dir(), append_defaults=False, load_latents=True, extensions=["wav", "mp3", "flac"]):
+    defaults = [ "random", "microphone" ]
+    os.makedirs(dir, exist_ok=True)
+    #res = sorted([d for d in os.listdir(dir) if d not in defaults and os.path.isdir(os.path.join(dir, d)) and len(os.listdir(os.path.join(dir, d))) > 0 ])
+
+    res = []
+    for name in os.listdir(dir):
+        if name in defaults:
+            continue
+        if not os.path.isdir(f'{dir}/{name}'):
+            continue
+        if len(os.listdir(os.path.join(dir, name))) == 0:
+            continue
+        files = get_voice( name, dir=dir, extensions=extensions, load_latents=load_latents )
+
+        if len(files) > 0:
+            res.append(name)
+        else:
+            for subdir in os.listdir(f'{dir}/{name}'):
+                if not os.path.isdir(f'{dir}/{name}/{subdir}'):
+                    continue
+                files = get_voice( f'{name}/{subdir}', dir=dir, extensions=extensions, load_latents=load_latents )
+                if len(files) == 0:
+                    continue
+                res.append(f'{name}/{subdir}')
+
+    res = sorted(res)
+
+    if append_defaults:
+        res = res + defaults
+
+    return res
+
+def _get_voices( dirs=[get_voice_dir()], load_latents=True ):
+    voices = {}
+    for dir in dirs:
+        voice_list = get_voice_list(dir=dir)
+        voices |= { name: get_voice(name=name, dir=dir, load_latents=load_latents) for name in voice_list }
+
+    return voices
+
 def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu', model_hash=None):
     if voice == 'random':
         return None, None
 
-    voices = get_voices(extra_voice_dirs=extra_voice_dirs, load_latents=load_latents)
+    voices = _get_voices(dirs=[get_voice_dir()] + extra_voice_dirs, load_latents=load_latents)
 
     paths = voices[voice]
     mtime = 0
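One caveat in the new get_voice: extensions has a mutable default, and extensions.append("pth") mutates that shared default, so "pth" accumulates across calls that rely on it. A stripped-down illustration (get_voice_sketch is a hypothetical reduction); passing a fresh list, copying inside the function, or using a None default avoids the accumulation:

    def get_voice_sketch(name, extensions=["wav", "mp3", "flac"], load_latents=True):
        if load_latents:
            extensions.append("pth")  # mutates the shared default list
        return list(extensions)

    print(get_voice_sketch("a"))  # ['wav', 'mp3', 'flac', 'pth']
    print(get_voice_sketch("b"))  # ['wav', 'mp3', 'flac', 'pth', 'pth']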
@@ -1,127 +1,130 @@
 import torch
 import psutil
 import importlib
 
 DEVICE_OVERRIDE = None
 DEVICE_BATCH_SIZE_MAP = [(14, 16), (10,8), (7,4)]
 
 from inspect import currentframe, getframeinfo
 import gc
 
 def do_gc():
     gc.collect()
     try:
         torch.cuda.empty_cache()
     except Exception as e:
         pass
 
 def print_stats(collect=False):
     cf = currentframe().f_back
     msg = f'{getframeinfo(cf).filename}:{cf.f_lineno}'
 
     if collect:
         do_gc()
 
     tot = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
     res = torch.cuda.memory_reserved(0) / (1024 ** 3)
     alloc = torch.cuda.memory_allocated(0) / (1024 ** 3)
     print("[{}] Total: {:.3f} | Reserved: {:.3f} | Allocated: {:.3f} | Free: {:.3f}".format( msg, tot, res, alloc, tot-res ))
 
 
 def has_dml():
     loader = importlib.find_loader('torch_directml')
     if loader is None:
         return False
 
     import torch_directml
     return torch_directml.is_available()
 
 def set_device_name(name):
     global DEVICE_OVERRIDE
     DEVICE_OVERRIDE = name
 
 def get_device_name(attempt_gc=True):
     global DEVICE_OVERRIDE
     if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
         return DEVICE_OVERRIDE
 
     name = 'cpu'
 
     if torch.cuda.is_available():
         name = 'cuda'
         if attempt_gc:
             torch.cuda.empty_cache() # may have performance implications
     elif has_dml():
         name = 'dml'
 
     return name
 
 def get_device(verbose=False):
     name = get_device_name()
 
     if verbose:
         if name == 'cpu':
             print("No hardware acceleration is available, falling back to CPU...")
         else:
             print(f"Hardware acceleration found: {name}")
 
     if name == "dml":
         import torch_directml
         return torch_directml.device()
 
     return torch.device(name)
 
 def get_device_vram( name=get_device_name() ):
     available = 1
 
     if name == "cuda":
         _, available = torch.cuda.mem_get_info()
     elif name == "cpu":
         available = psutil.virtual_memory()[4]
 
     return available / (1024 ** 3)
 
-def get_device_batch_size(name=None):
+def get_device_batch_size(name=get_device_name()):
     vram = get_device_vram(name)
 
     if vram > 14:
         return 16
     elif vram > 10:
         return 8
     elif vram > 7:
         return 4
     """
     for k, v in DEVICE_BATCH_SIZE_MAP:
         if vram > k:
             return v
     """
     return 1
 
 def get_device_count(name=get_device_name()):
     if name == "cuda":
         return torch.cuda.device_count()
     if name == "dml":
         import torch_directml
         return torch_directml.device_count()
 
     return 1
 
 
+# if you're getting errors make sure you've updated your torch-directml, and if you're still getting errors then you can uncomment the below block
+"""
 if has_dml():
     _cumsum = torch.cumsum
     _repeat_interleave = torch.repeat_interleave
     _multinomial = torch.multinomial
 
     _Tensor_new = torch.Tensor.new
     _Tensor_cumsum = torch.Tensor.cumsum
     _Tensor_repeat_interleave = torch.Tensor.repeat_interleave
     _Tensor_multinomial = torch.Tensor.multinomial
 
     torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
     torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
     torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
 
     torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
     torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
     torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
     torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
+"""
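A related Python caveat in the new signatures: a default like name=get_device_name() is evaluated once, when the function is defined at import time, not on each call, so a later set_device_name() override is not seen by callers that rely on the default argument. A self-contained illustration of the mechanism:

    import time

    def stamped(t=time.time()):  # evaluated once, at definition time
        return t

    a = stamped()
    time.sleep(0.01)
    print(a == stamped())  # True: both calls share the definition-time default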
@@ -13,15 +13,7 @@ import math
 import numpy as np
 import torch
 import torch as th
-from tqdm import tqdm
-
-def tqdm_override(arr, verbose=False, progress=None, desc=None):
-    if verbose and desc is not None:
-        print(desc)
-
-    if progress is None:
-        return tqdm(arr, disable=not verbose)
-    return progress.tqdm(arr, desc=f'{progress.msg_prefix} {desc}' if hasattr(progress, 'msg_prefix') else desc, track_tqdm=True)
+from tqdm.auto import tqdm
 
 def normal_kl(mean1, logvar1, mean2, logvar2):
     """
@@ -556,7 +548,6 @@ class GaussianDiffusion:
         model_kwargs=None,
         device=None,
         verbose=False,
-        progress=None,
         desc=None
     ):
         """
@@ -589,7 +580,6 @@ class GaussianDiffusion:
             model_kwargs=model_kwargs,
             device=device,
             verbose=verbose,
-            progress=progress,
             desc=desc
         ):
             final = sample
@@ -606,7 +596,6 @@ class GaussianDiffusion:
         model_kwargs=None,
         device=None,
         verbose=False,
-        progress=None,
         desc=None
     ):
         """
@@ -626,7 +615,7 @@ class GaussianDiffusion:
         img = th.randn(*shape, device=device)
         indices = list(range(self.num_timesteps))[::-1]
 
-        for i in tqdm_override(indices, verbose=verbose, desc=desc, progress=progress):
+        for i in tqdm(indices, desc=desc):
             t = th.tensor([i] * shape[0], device=device)
             with th.no_grad():
                 out = self.p_sample(
@@ -741,7 +730,6 @@ class GaussianDiffusion:
         device=None,
         verbose=False,
         eta=0.0,
-        progress=None,
         desc=None,
     ):
         """
@@ -761,7 +749,6 @@ class GaussianDiffusion:
             device=device,
             verbose=verbose,
             eta=eta,
-            progress=progress,
             desc=desc
         ):
             final = sample
@@ -779,7 +766,6 @@ class GaussianDiffusion:
         device=None,
         verbose=False,
         eta=0.0,
-        progress=None,
         desc=None,
     ):
         """
@@ -798,10 +784,7 @@ class GaussianDiffusion:
         indices = list(range(self.num_timesteps))[::-1]
 
         if verbose:
-            # Lazy import so that we don't depend on tqdm.
-            from tqdm.auto import tqdm
-
-            indices = tqdm_override(indices, verbose=verbose, desc=desc, progress=progress)
+            indices = tqdm(indices, desc=desc)
 
         for i in indices:
             t = th.tensor([i] * shape[0], device=device)
@@ -22,17 +22,19 @@ import os
 
 USE_STABLE_EMBEDDING = False
 try:
-    import bitsandbytes as bnb
     OVERRIDE_LINEAR = False
-    OVERRIDE_EMBEDDING = True
-    OVERRIDE_ADAM = True
-    OVERRIDE_ADAMW = True
+    OVERRIDE_EMBEDDING = False
+    OVERRIDE_ADAM = False
+    OVERRIDE_ADAMW = False
+
+    USE_STABLE_EMBEDDING = os.environ.get('BITSANDBYTES_USE_STABLE_EMBEDDING', '1' if USE_STABLE_EMBEDDING else '0') == '1'
+    OVERRIDE_LINEAR = os.environ.get('BITSANDBYTES_OVERRIDE_LINEAR', '1' if OVERRIDE_LINEAR else '0') == '1'
+    OVERRIDE_EMBEDDING = os.environ.get('BITSANDBYTES_OVERRIDE_EMBEDDING', '1' if OVERRIDE_EMBEDDING else '0') == '1'
+    OVERRIDE_ADAM = os.environ.get('BITSANDBYTES_OVERRIDE_ADAM', '1' if OVERRIDE_ADAM else '0') == '1'
+    OVERRIDE_ADAMW = os.environ.get('BITSANDBYTES_OVERRIDE_ADAMW', '1' if OVERRIDE_ADAMW else '0') == '1'
+
+    if OVERRIDE_LINEAR or OVERRIDE_EMBEDDING or OVERRIDE_ADAM or OVERRIDE_ADAMW:
+        import bitsandbytes as bnb
 except Exception as e:
     OVERRIDE_LINEAR = False
     OVERRIDE_EMBEDDING = False
@@ -144,7 +144,7 @@ class Wav2VecAlignment:
         non_redacted_intervals = []
         last_point = 0
         for i in range(len(fully_split)):
-            if i % 2 == 0:
+            if i % 2 == 0 and fully_split[i] != "": # Check for empty string fixes index error
                 end_interval = max(0, last_point + len(fully_split[i]) - 1)
                 non_redacted_intervals.append((last_point, end_interval))
                 last_point += len(fully_split[i])
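For context on why the empty-string check helps: splitting text around redaction markers yields a leading empty string whenever the text begins with a redacted span, and that empty even-indexed entry is what the new guard skips (the change note says it fixes an index error). A hypothetical illustration; the actual splitting in Wav2VecAlignment may differ in detail:

    import re

    text = "[redacted] hello world"
    # hypothetical split around bracketed redaction spans
    fully_split = re.split(r'\[|\]', text)
    print(fully_split)  # ['', 'redacted', ' hello world']
    # index 0 is an empty even-position entry, which the added
    # fully_split[i] != "" check now skips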