From 9ddfcb57aa09939714fee0906bf3082de26d888d Mon Sep 17 00:00:00 2001
From: HarkonCollider
Date: Sat, 9 Sep 2023 22:00:21 +0000
Subject: [PATCH] Update tortoise/api.py

My changed version with more presets: adds 'narration' and 'dialogue'
(fewer autoregressive samples, DDIM diffusion sampler), drops the
use_deepspeed plumbing, and reformats api.py.
---
 tortoise/api.py | 298 ++++++++++++++++++++++++++++--------------------
 1 file changed, 174 insertions(+), 124 deletions(-)

diff --git a/tortoise/api.py b/tortoise/api.py
index 88acb40..480c5c7 100755
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -43,7 +43,7 @@ MODELS = {
     'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
     'rlg_auto.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth',
     'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth',
-
+
     'bigvgan_base_24khz_100band.pth': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_base_24khz_100band.pth',
     'bigvgan_24khz_100band.pth': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.pth',
@@ -51,6 +51,7 @@ MODELS = {
     'bigvgan_24khz_100band.json': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.json',
 }
 
+
 def hash_file(path, algo="md5", buffer_size=0):
     import hashlib
 
@@ -77,12 +78,14 @@ def hash_file(path, algo="md5", buffer_size=0):
 
     return "{0}".format(hash.hexdigest())
 
+
 def check_for_kill_signal():
     global STOP_SIGNAL
     if STOP_SIGNAL:
         STOP_SIGNAL = False
         raise Exception("Kill signal detected")
 
+
 def download_models(specific_models=None):
     """
     Call to download all the models that Tortoise uses.
@@ -102,6 +105,7 @@ def download_models(specific_models=None):
         else:
             pbar.finish()
             pbar = None
+
     for model_name, url in MODELS.items():
         if specific_models is not None and model_name not in specific_models:
             continue
@@ -112,7 +116,7 @@ def download_models(specific_models=None):
 
         proxy = ProxyHandler({})
         opener = build_opener(proxy)
-        opener.addheaders = [('User-Agent','mrq/AI-Voice-Cloning')]
+        opener.addheaders = [('User-Agent', 'mrq/AI-Voice-Cloning')]
         install_opener(opener)
         request.urlretrieve(url, model_path, show_progress)
         print('Done.')
@@ -137,19 +141,23 @@ def pad_or_truncate(t, length):
     if t.shape[-1] == length:
         return t
     elif t.shape[-1] < length:
-        return F.pad(t, (0, length-t.shape[-1]))
+        return F.pad(t, (0, length - t.shape[-1]))
     else:
         return t[..., :length]
 
-def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True, cond_free_k=1):
+def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True,
+                                   cond_free_k=1):
     """
     Helper function to load a GaussianDiffusion instance configured for use as a vocoder.
""" - return SpacedDiffusion(use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]), model_mean_type='epsilon', - model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps), + return SpacedDiffusion(use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]), + model_mean_type='epsilon', + model_var_type='learned_range', loss_type='mse', + betas=get_named_beta_schedule('linear', trained_diffusion_steps), conditioning_free=cond_free, conditioning_free_k=cond_free_k) + @torch.inference_mode() def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050): """ @@ -165,6 +173,7 @@ def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=2 mel_clip = mel_clip.unsqueeze(0) return migrate_to_device(mel_clip, device) + def fix_autoregressive_output(codes, stop_token, complain=True): """ This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was @@ -194,23 +203,27 @@ def fix_autoregressive_output(codes, stop_token, complain=True): return codes + @torch.inference_mode() -def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000): +def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, + desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000): """ Uses the specified diffusion model to convert discrete codes into a spectrogram. """ with torch.no_grad(): - output_seq_len = latents.shape[1] * 4 * output_sample_rate // input_sample_rate # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal. + output_seq_len = latents.shape[ + 1] * 4 * output_sample_rate // input_sample_rate # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal. output_shape = (latents.shape[0], 100, output_seq_len) - precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False) + precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, + False) noise = torch.randn(output_shape, device=latents.device) * temperature - + diffuser.sampler = sampler.lower() mel = diffuser.sample_loop(diffusion_model, output_shape, noise=noise, - model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, desc=desc) + model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, desc=desc) - mel = denormalize_tacotron_mel(mel)[:,:,:output_seq_len] + mel = denormalize_tacotron_mel(mel)[:, :, :output_seq_len] if get_device_name() == "dml": mel = mel.cpu() return mel @@ -230,7 +243,8 @@ def classify_audio_clip(clip): results = F.softmax(classifier(clip), dim=-1) return results[0][0] -def migrate_to_device( t, device ): + +def migrate_to_device(t, device): if t is None: return t @@ -244,23 +258,23 @@ def migrate_to_device( t, device ): t.device = device t = t.to(device) - + do_gc() return t + class TextToSpeech: """ Main entry point into Tortoise. 
""" def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None, - minor_optimizations=True, - unsqueeze_sample_batches=False, - input_sample_rate=22050, output_sample_rate=24000, - autoregressive_model_path=None, diffusion_model_path=None, vocoder_model=None, tokenizer_json=None, -# ): - use_deepspeed=False): # Add use_deepspeed parameter + minor_optimizations=True, + unsqueeze_sample_batches=False, + input_sample_rate=22050, output_sample_rate=24000, + autoregressive_model_path=None, diffusion_model_path=None, vocoder_model=None, tokenizer_json=None, + ): """ Constructor :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing @@ -271,22 +285,21 @@ class TextToSpeech: (but are still rendered by the model). This can be used for prompt engineering. Default is true. :param device: Device to use when running the model. If omitted, the device will be automatically chosen. - """ + """ self.loading = True if device is None: device = get_device(verbose=True) - self.version = [2,4,4] # to-do, autograb this from setup.py, or have setup.py autograb this + self.version = [2, 4, 4] # to-do, autograb this from setup.py, or have setup.py autograb this self.input_sample_rate = input_sample_rate self.output_sample_rate = output_sample_rate self.minor_optimizations = minor_optimizations self.unsqueeze_sample_batches = unsqueeze_sample_batches - self.use_deepspeed = use_deepspeed # Store use_deepspeed as an instance variable - print(f'use_deepspeed api_debug {use_deepspeed}') + # for clarity, it's simpler to split these up and just predicate them on requesting VRAM-consuming optimizations self.preloaded_tensors = minor_optimizations self.use_kv_cache = minor_optimizations - if get_device_name() == "dml": # does not work with DirectML + if get_device_name() == "dml": # does not work with DirectML print("KV caching requested but not supported with the DirectML backend, disabling...") self.use_kv_cache = False @@ -315,13 +328,12 @@ class TextToSpeech: self.load_diffusion_model(diffusion_model_path) - self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20, text_seq_len=350, text_heads=12, num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430, use_xformers=True).cpu().eval() self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir))) - self.cvvp = None # CVVP model is only loaded if used. + self.cvvp = None # CVVP model is only loaded if used. 
         self.vocoder_model = vocoder_model
         self.load_vocoder_model(self.vocoder_model)
 
@@ -331,21 +343,23 @@ class TextToSpeech:
         self.rlg_diffusion = None
 
         if self.preloaded_tensors:
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
-            self.clvp = migrate_to_device( self.clvp, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
+            self.diffusion = migrate_to_device(self.diffusion, self.device)
+            self.clvp = migrate_to_device(self.clvp, self.device)
+            self.vocoder = migrate_to_device(self.vocoder, self.device)
 
         self.loading = False
 
     def load_autoregressive_model(self, autoregressive_model_path):
-        if hasattr(self,"autoregressive_model_path") and os.path.samefile(self.autoregressive_model_path, autoregressive_model_path):
+        if hasattr(self, "autoregressive_model_path") and os.path.samefile(self.autoregressive_model_path,
+                                                                           autoregressive_model_path):
            return
 
-        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
+        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(
+            autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
         new_hash = hash_file(self.autoregressive_model_path)
 
-        if hasattr(self,"autoregressive_model_hash") and self.autoregressive_model_hash == new_hash:
+        if hasattr(self, "autoregressive_model_hash") and self.autoregressive_model_hash == new_hash:
             return
 
         self.autoregressive_model_hash = new_hash
@@ -356,42 +370,44 @@ class TextToSpeech:
         if hasattr(self, 'autoregressive'):
             del self.autoregressive
 
-        self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
-                                          model_dim=1024,
-                                          heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                          train_solo_embeddings=False).cpu().eval()
+        self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2,
+                                           layers=30,
+                                           model_dim=1024,
+                                           heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
+                                           train_solo_embeddings=False).cpu().eval()
         self.autoregressive.load_state_dict(torch.load(self.autoregressive_model_path))
-        self.autoregressive.post_init_gpt2_config(use_deepspeed=self.use_deepspeed, kv_cache=self.use_kv_cache)
+        self.autoregressive.post_init_gpt2_config(kv_cache=self.use_kv_cache)
         if self.preloaded_tensors:
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
 
         self.loading = False
         print(f"Loaded autoregressive model")
 
     def load_diffusion_model(self, diffusion_model_path):
-        if hasattr(self,"diffusion_model_path") and os.path.samefile(self.diffusion_model_path, diffusion_model_path):
+        if hasattr(self, "diffusion_model_path") and os.path.samefile(self.diffusion_model_path, diffusion_model_path):
             return
 
         self.loading = True
 
-        self.diffusion_model_path = diffusion_model_path if diffusion_model_path and os.path.exists(diffusion_model_path) else get_model_path('diffusion_decoder.pth', self.models_dir)
+        self.diffusion_model_path = diffusion_model_path if diffusion_model_path and os.path.exists(
+            diffusion_model_path) else get_model_path('diffusion_decoder.pth', self.models_dir)
         self.diffusion_model_hash = hash_file(self.diffusion_model_path)
 
         if hasattr(self, 'diffusion'):
             del self.diffusion
 
         self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
-                                     in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
-                                     layer_drop=0, unconditioned_percentage=0).cpu().eval()
+                                      in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
+                                      layer_drop=0, unconditioned_percentage=0).cpu().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', self.models_dir)))
         if self.preloaded_tensors:
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
+            self.diffusion = migrate_to_device(self.diffusion, self.device)
 
         self.loading = False
         print(f"Loaded diffusion model")
 
     def load_vocoder_model(self, vocoder_model):
-        if hasattr(self,"vocoder_model_path") and os.path.samefile(self.vocoder_model_path, vocoder_model):
+        if hasattr(self, "vocoder_model_path") and os.path.samefile(self.vocoder_model_path, vocoder_model):
             return
 
         self.loading = True
@@ -415,27 +431,30 @@ class TextToSpeech:
             vocoder_config = get_model_path(vocoder_config, self.models_dir)
             self.vocoder = BigVGAN(config=vocoder_config).cpu()
-        #elif vocoder_model == "univnet":
+        # elif vocoder_model == "univnet":
         else:
             vocoder_key = 'model_g'
             self.vocoder_model_path = 'vocoder.pth'
             self.vocoder = UnivNetGenerator().cpu()
-        
+
         print(f"Loading vocoder model: {self.vocoder_model_path}")
-        self.vocoder.load_state_dict(torch.load(get_model_path(self.vocoder_model_path, self.models_dir), map_location=torch.device('cpu'))[vocoder_key])
+        self.vocoder.load_state_dict(
+            torch.load(get_model_path(self.vocoder_model_path, self.models_dir), map_location=torch.device('cpu'))[
+                vocoder_key])
         self.vocoder.eval(inference=True)
 
         if self.preloaded_tensors:
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
+            self.vocoder = migrate_to_device(self.vocoder, self.device)
 
         self.loading = False
         print(f"Loaded vocoder model")
 
     def load_tokenizer_json(self, tokenizer_json):
-        if hasattr(self,"tokenizer_json") and os.path.samefile(self.tokenizer_json, tokenizer_json):
+        if hasattr(self, "tokenizer_json") and os.path.samefile(self.tokenizer_json, tokenizer_json):
             return
-        
+
         self.loading = True
-        self.tokenizer_json = tokenizer_json if tokenizer_json else os.path.join(os.path.dirname(os.path.realpath(__file__)), '../tortoise/data/tokenizer.json')
+        self.tokenizer_json = tokenizer_json if tokenizer_json else os.path.join(
+            os.path.dirname(os.path.realpath(__file__)), '../tortoise/data/tokenizer.json')
         print("Loading tokenizer JSON:", self.tokenizer_json)
 
         if hasattr(self, 'tokenizer'):
@@ -448,20 +467,32 @@ class TextToSpeech:
 
     def load_cvvp(self):
         """Load CVVP model."""
-        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
+        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8,
+                         cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
         self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
-        
+
         if self.preloaded_tensors:
-            self.cvvp = migrate_to_device( self.cvvp, self.device )
+            self.cvvp = migrate_to_device(self.cvvp, self.device)
 
     @torch.inference_mode()
-    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False):
+    def get_conditioning_latents(
+            self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False,
+            original_ar=False, original_diffusion=False
+    ):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
         These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
         properties.
-        :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
+        :param force_cpu:
+        :param max_chunk_size:
+        :param slices:
+        :param verbose:
+        :param return_mels:
+        :param original_diffusion:
+        :param original_ar:
+        :param voice_samples: List of 2 or more ~10 second reference clips,
+            which should be torch tensors containing 22.05kHz waveform data.
         """
 
         with torch.no_grad():
@@ -472,7 +503,7 @@ class TextToSpeech:
 
             if not isinstance(voice_samples, list):
                 voice_samples = [voice_samples]
-            
+
             resampler_22K = torchaudio.transforms.Resample(
                 self.input_sample_rate,
                 22050,
@@ -491,7 +522,7 @@ class TextToSpeech:
                 beta=8.555504641634386,
             ).to(device)
 
-            voice_samples = [migrate_to_device(v, device)  for v in voice_samples]
+            voice_samples = [migrate_to_device(v, device) for v in voice_samples]
 
             auto_conds = []
             diffusion_conds = []
@@ -499,7 +530,8 @@ class TextToSpeech:
             if original_ar:
                 samples = [resampler_22K(sample) for sample in voice_samples]
                 for sample in tqdm(samples, desc="Computing AR conditioning latents..."):
-                    auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate, cond_length=132300))
+                    auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate,
+                                                          cond_length=132300))
             else:
                 samples = [resampler_22K(sample) for sample in voice_samples]
                 concat = torch.cat(samples, dim=-1)
@@ -516,32 +548,35 @@ class TextToSpeech:
                     chunk_size = chunks[0].shape[-1]
 
             for chunk in tqdm(chunks, desc="Computing AR conditioning latents..."):
-                auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate, cond_length=chunk_size))
-
+                auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate,
+                                                      cond_length=chunk_size))
 
             if original_diffusion:
                 samples = [resampler_24K(sample) for sample in voice_samples]
                 for sample in tqdm(samples, desc="Computing diffusion conditioning latents..."):
                     sample = pad_or_truncate(sample, 102400)
-                    cond_mel = wav_to_univnet_mel(migrate_to_device(sample, device), do_normalization=False, device=self.device)
+                    cond_mel = wav_to_univnet_mel(migrate_to_device(sample, device), do_normalization=False,
+                                                  device=self.device)
                     diffusion_conds.append(cond_mel)
             else:
                 samples = [resampler_24K(sample) for sample in voice_samples]
                 for chunk in tqdm(chunks, desc="Computing diffusion conditioning latents..."):
                     check_for_kill_signal()
                     chunk = pad_or_truncate(chunk, chunk_size)
-                    cond_mel = wav_to_univnet_mel(migrate_to_device( chunk, device ), do_normalization=False, device=device)
+                    cond_mel = wav_to_univnet_mel(migrate_to_device(chunk, device), do_normalization=False,
+                                                  device=device)
                     diffusion_conds.append(cond_mel)
 
             auto_conds = torch.stack(auto_conds, dim=1)
-            self.autoregressive = migrate_to_device( self.autoregressive, device )
+            self.autoregressive = migrate_to_device(self.autoregressive, device)
             auto_latent = self.autoregressive.get_conditioning(auto_conds)
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device if self.preloaded_tensors else 'cpu' )
+            self.autoregressive = migrate_to_device(self.autoregressive,
+                                                    self.device if self.preloaded_tensors else 'cpu')
 
             diffusion_conds = torch.stack(diffusion_conds, dim=1)
-            self.diffusion = migrate_to_device( self.diffusion, device )
+            self.diffusion = migrate_to_device(self.diffusion, device)
             diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
-            self.diffusion = migrate_to_device( self.diffusion, self.device if self.preloaded_tensors else 'cpu' )
+            self.diffusion = migrate_to_device(self.diffusion, self.device if self.preloaded_tensors else 'cpu')
 
         if return_mels:
             return auto_latent, diffusion_latent, auto_conds, diffusion_conds
@@ -552,9 +587,11 @@ class TextToSpeech:
         # Lazy-load the RLG models.
         if self.rlg_auto is None:
             self.rlg_auto = RandomLatentConverter(1024).eval()
-            self.rlg_auto.load_state_dict(torch.load(get_model_path('rlg_auto.pth', self.models_dir), map_location=torch.device('cpu')))
+            self.rlg_auto.load_state_dict(
+                torch.load(get_model_path('rlg_auto.pth', self.models_dir), map_location=torch.device('cpu')))
             self.rlg_diffusion = RandomLatentConverter(2048).eval()
-            self.rlg_diffusion.load_state_dict(torch.load(get_model_path('rlg_diffuser.pth', self.models_dir), map_location=torch.device('cpu')))
+            self.rlg_diffusion.load_state_dict(
+                torch.load(get_model_path('rlg_diffuser.pth', self.models_dir), map_location=torch.device('cpu')))
         with torch.no_grad():
             return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
@@ -576,16 +613,19 @@ class TextToSpeech:
             'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
             'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
+            'narration': {'num_autoregressive_samples': 30, 'diffusion_iterations': 80, "diffusion_sampler": "DDIM"},
+            'dialogue': {'num_autoregressive_samples': 60, 'diffusion_iterations': 120, "diffusion_sampler": "DDIM"}
         }
         settings.update(presets[preset])
-        settings.update(kwargs) # allow overriding of preset settings with kwargs
+        settings.update(kwargs)  # allow overriding of preset settings with kwargs
         return self.tts(text, **settings)
 
     @torch.inference_mode()
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
             return_deterministic_state=False,
             # autoregressive generation parameters follow
-            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
+            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8,
+            max_mel_tokens=500,
             sample_batch_size=None,
             autoregressive_model=None,
             diffusion_model=None,
@@ -667,14 +707,17 @@ class TextToSpeech:
             self.load_tokenizer_json(tokenizer_json)
 
         text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0)
-        text_tokens = migrate_to_device( text_tokens, self.device )
+        text_tokens = migrate_to_device(text_tokens, self.device)
 
         text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
-        assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
+        assert text_tokens.shape[
+                   -1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
 
         auto_conds = None
         if voice_samples is not None:
-            auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True, verbose=True)
+            auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples,
+                                                                                                     return_mels=True,
+                                                                                                     verbose=True)
         elif conditioning_latents is not None:
             latent_tuple = conditioning_latents
             if len(latent_tuple) == 2:
@@ -684,7 +727,8 @@ class TextToSpeech:
         else:
             auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
 
-        diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
+        diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free,
+                                                  cond_free_k=cond_free_k)
 
         self.autoregressive_batch_size = get_device_batch_size() if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size
 
@@ -696,12 +740,12 @@ class TextToSpeech:
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
 
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            auto_conditioning = migrate_to_device( auto_conditioning, self.device )
-            text_tokens = migrate_to_device( text_tokens, self.device )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
+            auto_conditioning = migrate_to_device(auto_conditioning, self.device)
+            text_tokens = migrate_to_device(text_tokens, self.device)
 
             with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
-                for b in tqdm(range(num_batches), desc="Generating autoregressive samples"):
+                for b in tqdm(range(num_batches), desc="Generating autoregressive samples", disable=not verbose):
                     check_for_kill_signal()
                     codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                                  do_sample=True,
@@ -717,76 +761,75 @@ class TextToSpeech:
                     samples.append(codes)
 
             if not self.preloaded_tensors:
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
+                self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
 
         if self.unsqueeze_sample_batches:
             new_samples = []
             for batch in samples:
-                for i in range(batch.shape[0]): 
+                for i in range(batch.shape[0]):
                     new_samples.append(batch[i].unsqueeze(0))
 
             samples = new_samples
 
         clip_results = []
         if auto_conds is not None:
-            auto_conditioning = migrate_to_device( auto_conditioning, self.device )
+            auto_conditioning = migrate_to_device(auto_conditioning, self.device)
 
         with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
             if not self.preloaded_tensors:
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
-                self.clvp = migrate_to_device( self.clvp, self.device )
+                self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
+                self.clvp = migrate_to_device(self.clvp, self.device)
 
             if cvvp_amount > 0:
                 if self.cvvp is None:
                     self.load_cvvp()
-                
+
                 if not self.preloaded_tensors:
-                    self.cvvp = migrate_to_device( self.cvvp, self.device )
-
-            desc="Computing best candidates"
+                    self.cvvp = migrate_to_device(self.cvvp, self.device)
+
+            desc = "Computing best candidates"
             if verbose:
                 if self.cvvp is None:
                     desc = "Computing best candidates using CLVP"
                 else:
-                    desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
+                    desc = f"Computing best candidates using CLVP {((1 - cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
 
-
-            for batch in tqdm(samples, desc=desc):
+            for batch in tqdm(samples, desc=desc, disable=not verbose):
                 check_for_kill_signal()
                 for i in range(batch.shape[0]):
                     batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
 
                 if cvvp_amount != 1:
                     clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                
+
                 if auto_conds is not None and cvvp_amount > 0:
                     cvvp_accumulator = 0
                     for cl in range(auto_conds.shape[1]):
-                        cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
+                        cvvp_accumulator = cvvp_accumulator + self.cvvp(
+                            auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
                     cvvp = cvvp_accumulator / auto_conds.shape[1]
                     if cvvp_amount == 1:
                         clip_results.append(cvvp)
                     else:
-                        clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount))
+                        clip_results.append(cvvp * cvvp_amount + clvp * (1 - cvvp_amount))
                 else:
                     clip_results.append(clvp)
 
             if not self.preloaded_tensors and auto_conds is not None:
-                auto_conds = migrate_to_device( auto_conds, 'cpu' )
+                auto_conds = migrate_to_device(auto_conds, 'cpu')
 
             clip_results = torch.cat(clip_results, dim=0)
             samples = torch.cat(samples, dim=0)
             best_results = samples[torch.topk(clip_results, k=k).indices]
-        
+
         if not self.preloaded_tensors:
-            self.clvp = migrate_to_device( self.clvp, 'cpu' )
-            self.cvvp = migrate_to_device( self.cvvp, 'cpu' )
-
+            self.clvp = migrate_to_device(self.clvp, 'cpu')
+            self.cvvp = migrate_to_device(self.cvvp, 'cpu')
 
         if get_device_name() == "dml":
-            text_tokens = migrate_to_device( text_tokens, 'cpu' )
-            best_results = migrate_to_device( best_results, 'cpu' )
-            auto_conditioning = migrate_to_device( auto_conditioning, 'cpu' )
-            self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
+            text_tokens = migrate_to_device(text_tokens, 'cpu')
+            best_results = migrate_to_device(best_results, 'cpu')
+            auto_conditioning = migrate_to_device(auto_conditioning, 'cpu')
+            self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
         else:
            auto_conditioning = auto_conditioning.to(self.device)
            self.autoregressive = self.autoregressive.to(self.device)
@@ -797,24 +840,27 @@ class TextToSpeech:
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
             best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                               torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                               torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                               torch.tensor([text_tokens.shape[-1]], device=text_tokens.device),
+                                               best_results,
+                                               torch.tensor([best_results.shape[
+                                                                 -1] * self.autoregressive.mel_length_compression],
+                                                            device=text_tokens.device),
                                                return_latent=True, clip_inputs=False)
-
-        diffusion_conditioning = migrate_to_device( diffusion_conditioning, self.device )
+
+        diffusion_conditioning = migrate_to_device(diffusion_conditioning, self.device)
 
         if get_device_name() == "dml":
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            best_results = migrate_to_device( best_results, self.device )
-            best_latents = migrate_to_device( best_latents, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
+            best_results = migrate_to_device(best_results, self.device)
+            best_latents = migrate_to_device(best_latents, self.device)
+            self.vocoder = migrate_to_device(self.vocoder, 'cpu')
         else:
             if not self.preloaded_tensors:
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
+                self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
+
+            self.diffusion = migrate_to_device(self.diffusion, self.device)
+            self.vocoder = migrate_to_device(self.vocoder, self.device)
 
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
-
         del text_tokens
         del auto_conditioning
 
@@ -835,22 +881,26 @@ class TextToSpeech:
                     break
 
                 mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
-                                               temperature=diffusion_temperature, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
-                                               input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)
+                                               temperature=diffusion_temperature,
+                                               desc="Transforming autoregressive outputs into audio..",
+                                               sampler=diffusion_sampler,
+                                               input_sample_rate=self.input_sample_rate,
+                                               output_sample_rate=self.output_sample_rate)
 
                 wav = self.vocoder.inference(mel)
                 wav_candidates.append(wav)
-            
+
             if not self.preloaded_tensors:
-                self.diffusion = migrate_to_device( self.diffusion, 'cpu' )
-                self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
+                self.diffusion = migrate_to_device(self.diffusion, 'cpu')
+                self.vocoder = migrate_to_device(self.vocoder, 'cpu')
 
             def potentially_redact(clip, text):
                 if self.enable_redaction:
                     t = clip.squeeze(1)
-                    t = migrate_to_device( t, 'cpu' if get_device_name() == "dml" else self.device)
+                    t = migrate_to_device(t, 'cpu' if get_device_name() == "dml" else self.device)
                     return self.aligner.redact(t, text, self.output_sample_rate).unsqueeze(1)
                 return clip
+
             wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]
 
             if len(wav_candidates) > 1:
@@ -876,4 +926,4 @@ class TextToSpeech:
 
     # Can't currently set this because of CUBLAS. TODO: potentially enable it if necessary.
     # torch.use_deterministic_algorithms(True)
-    return seed
\ No newline at end of file
+    return seed
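
-- 
Usage note (after the patch, not part of it): a minimal sketch of how the two
new presets can be selected, assuming the surrounding tortoise API is
unchanged; it assumes TextToSpeech.tts_with_preset is the method wrapping the
presets dict shown above and that tortoise.utils.audio.load_voice is the
stock voice loader. The voice name 'tom' is a placeholder.

    from tortoise.api import TextToSpeech
    from tortoise.utils.audio import load_voice

    tts = TextToSpeech()
    # Reference clips plus any cached conditioning latents for the chosen voice.
    voice_samples, conditioning_latents = load_voice('tom')

    # 'narration' draws 30 autoregressive samples with 80 DDIM diffusion steps;
    # 'dialogue' draws 60 samples with 120 steps. Both are cheaper than the
    # stock 'standard' preset (256 samples, 200 diffusion iterations).
    gen = tts.tts_with_preset("It is nice to meet you.",
                              voice_samples=voice_samples,
                              conditioning_latents=conditioning_latents,
                              preset='narration')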