diff --git a/tortoise/api.py b/tortoise/api.py index 07ce6b4..5231484 100644 --- a/tortoise/api.py +++ b/tortoise/api.py @@ -194,8 +194,7 @@ class TextToSpeech: self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30, model_dim=1024, heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False, - train_solo_embeddings=False, - average_conditioning_embeddings=True).cpu().eval() + train_solo_embeddings=False).cpu().eval() self.autoregressive.load_state_dict(torch.load(f'{models_dir}/autoregressive.pth')) self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200, @@ -244,7 +243,7 @@ class TextToSpeech: kwargs.update(presets[preset]) return self.tts(text, **kwargs) - def get_conditioning_latents(self, voice_samples): + def get_conditioning_latents(self, voice_samples, return_mels=False): """ Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent). These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic @@ -268,7 +267,7 @@ class TextToSpeech: # The diffuser operates at a sample rate of 24000 (except for the latent inputs) sample = torchaudio.functional.resample(sample, 22050, 24000) sample = pad_or_truncate(sample, 102400) - cond_mel = wav_to_univnet_mel(sample.to(voice_samples.device), do_normalization=False) + cond_mel = wav_to_univnet_mel(sample.to('cuda'), do_normalization=False) diffusion_conds.append(cond_mel) diffusion_conds = torch.stack(diffusion_conds, dim=1) @@ -276,7 +275,10 @@ class TextToSpeech: diffusion_latent = self.diffusion.get_conditioning(diffusion_conds) self.diffusion = self.diffusion.cpu() - return auto_latent, diffusion_latent, auto_conds + if return_mels: + return auto_latent, diffusion_latent, auto_conds, diffusion_conds + else: + return auto_latent, diffusion_latent def get_random_conditioning_latents(self): # Lazy-load the RLG models. @@ -295,7 +297,6 @@ class TextToSpeech: def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, # autoregressive generation parameters follow num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500, - typical_sampling=False, typical_mass=.9, # CLVP & CVVP parameters clvp_cvvp_slider=.5, # diffusion generation parameters follow @@ -354,13 +355,13 @@ class TextToSpeech: auto_conds = None if voice_samples is not None: - auto_conditioning, diffusion_conditioning, auto_conds = self.get_conditioning_latents(voice_samples) + auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True) elif conditioning_latents is not None: auto_conditioning, diffusion_conditioning = conditioning_latents else: auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents() - auto_conditioning = auto_conditioning.cuda() - diffusion_conditioning = diffusion_conditioning.cuda() + auto_conditioning = auto_conditioning.cuda() + diffusion_conditioning = diffusion_conditioning.cuda() diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k) diff --git a/tortoise/get_conditioning_latents.py b/tortoise/get_conditioning_latents.py index c96e608..aa7e9b7 100644 --- a/tortoise/get_conditioning_latents.py +++ b/tortoise/get_conditioning_latents.py @@ -11,8 +11,8 @@ other ML models, or can be augmented manually and fed back into Tortoise to affe """ if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat') - parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/conditioning_latents') + parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat2') + parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/conditioning_latents') args = parser.parse_args() os.makedirs(args.output_path, exist_ok=True) diff --git a/tortoise/models/autoregressive.py b/tortoise/models/autoregressive.py index e56ad27..757a7a8 100644 --- a/tortoise/models/autoregressive.py +++ b/tortoise/models/autoregressive.py @@ -280,8 +280,7 @@ class UnifiedVoice(nn.Module): mel_length_compression=1024, number_text_tokens=256, start_text_token=None, number_mel_codes=8194, start_mel_token=8192, stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True, - checkpointing=True, average_conditioning_embeddings=False, - types=1): + checkpointing=True, types=1): """ Args: layers: Number of layers in transformer stack. @@ -300,7 +299,6 @@ class UnifiedVoice(nn.Module): train_solo_embeddings: use_mel_codes_as_input: checkpointing: - average_conditioning_embeddings: Whether or not conditioning embeddings should be averaged, instead of fed piecewise into the model. """ super().__init__() @@ -318,7 +316,6 @@ class UnifiedVoice(nn.Module): self.max_conditioning_inputs = max_conditioning_inputs self.mel_length_compression = mel_length_compression self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads) - self.average_conditioning_embeddings = average_conditioning_embeddings self.text_embedding = nn.Embedding(self.number_text_tokens*types+1, model_dim) if use_mel_codes_as_input: self.mel_embedding = nn.Embedding(self.number_mel_codes, model_dim) @@ -397,8 +394,7 @@ class UnifiedVoice(nn.Module): for j in range(speech_conditioning_input.shape[1]): conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) conds = torch.stack(conds, dim=1) - if self.average_conditioning_embeddings: - conds = conds.mean(dim=1).unsqueeze(1) + conds = conds.mean(dim=1) return conds def forward(self, speech_conditioning_latent, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False, @@ -461,65 +457,6 @@ class UnifiedVoice(nn.Module): loss_mel = F.cross_entropy(mel_logits, mel_targets.long()) return loss_text.mean(), loss_mel.mean(), mel_logits - def text_forward(self, speech_conditioning_input, text_inputs, text_lengths): - """ - Performs autoregressive modeling on only text. Still requires a speech_conditioning_input due to the way the - model inputs are formatted. Just provide any audio clip (arguably, zeros could be provided). - """ - assert self.max_text_tokens >= text_inputs.shape[1], f'{text_inputs.shape[1]}' - - # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by - # chopping the inputs by the maximum actual length. - max_text_len = text_lengths.max() - text_inputs = F.pad(text_inputs[:, :max_text_len], (0,1), value=self.stop_text_token) - - speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input - conds = [] - for j in range(speech_conditioning_input.shape[1]): - conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) - conds = torch.stack(conds, dim=1) - if self.average_conditioning_embeddings: - conds = conds.mean(dim=1).unsqueeze(1) - - text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token) - text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + self.text_solo_embedding - text_logits = self.get_logits(conds, text_emb, self.text_head) - loss_text = F.cross_entropy(text_logits, text_targets.long()) - return loss_text.mean() - - def speech_forward(self, speech_conditioning_input, mel_codes, wav_lengths, raw_mels=None): - """ - Performs autoregressive modeling on only speech data. - """ - assert self.max_mel_tokens >= mel_codes.shape[1], f'{mel_codes.shape[1]}' - - # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by - # chopping the inputs by the maximum actual length. - max_mel_len = wav_lengths.max() // self.mel_length_compression - mel_codes = F.pad(mel_codes[:, :max_mel_len], (0,1), value=self.stop_mel_token) - mel_codes = self.set_mel_padding(mel_codes, wav_lengths) - if raw_mels is not None: - raw_mels = raw_mels[:, :, :max_mel_len*4] - - speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input - conds = [] - for j in range(speech_conditioning_input.shape[1]): - conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) - conds = torch.stack(conds, dim=1) - if self.average_conditioning_embeddings: - conds = conds.mean(dim=1).unsqueeze(1) - - mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token) - if raw_mels is not None: - mel_inp = F.pad(raw_mels, (0, 4)) - else: - mel_inp = mel_codes - mel_emb = self.mel_embedding(mel_inp) - mel_emb = mel_emb + self.mel_pos_embedding(mel_codes) + self.mel_solo_embedding - mel_logits = self.get_logits(conds, mel_emb, self.mel_head) - loss_mel = F.cross_entropy(mel_logits, mel_targets.long()) - return loss_mel.mean() - def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1, max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs): seq_length = self.max_mel_tokens + self.max_text_tokens + 2 diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py index a33abf1..5940eb8 100644 --- a/tortoise/utils/audio.py +++ b/tortoise/utils/audio.py @@ -87,7 +87,7 @@ def get_voices(): for sub in subs: subj = os.path.join('voices', sub) if os.path.isdir(subj): - voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.pth')) return voices @@ -111,6 +111,9 @@ def load_voices(voices): latents = [] clips = [] for voice in voices: + if voice == 'random': + print("Cannot combine a random voice with a non-random voice. Just using a random voice.") + return None, None latent, clip = load_voice(voice) if latent is None: assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." @@ -119,10 +122,10 @@ def load_voices(voices): assert len(voices) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." latents.append(latent) if len(latents) == 0: - return clips + return clips, None else: latents = torch.stack(latents, dim=0) - return latents.mean(dim=0) + return None, latents.mean(dim=0) class TacotronSTFT(torch.nn.Module):