From 81f6ea1afa843ab714e4a8b263da6164193487f9 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Mon, 4 Apr 2022 16:51:35 -0600
Subject: [PATCH] integrate new autoregressive model and fix new diffusion bug

---
 api.py                      |  7 ++++---
 do_tts.py                   |  4 ++--
 models/diffusion_decoder.py | 10 +++++-----
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/api.py b/api.py
index 6c3fb1e..7c33484 100644
--- a/api.py
+++ b/api.py
@@ -117,13 +117,14 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, mel_codes, conditioning_
         cond_mels.append(cond_mel)
     cond_mels = torch.stack(cond_mels, dim=1)
 
-    output_shape = (mel_codes.shape[0], 100, mel_codes.shape[-1]*4)
-    precomputed_embeddings = diffusion_model.timestep_independent(mel_codes, cond_mels, False)
+    output_seq_len = mel_codes.shape[-1]*4*24000//22050  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+    output_shape = (mel_codes.shape[0], 100, output_seq_len)
+    precomputed_embeddings = diffusion_model.timestep_independent(mel_codes, cond_mels, output_seq_len, False)
 
     noise = torch.randn(output_shape, device=mel_codes.device) * temperature
     mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
                                  model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings})
-    return denormalize_tacotron_mel(mel)[:,:,:mel_codes.shape[-1]*4]
+    return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
 
 
 class TextToSpeech:
diff --git a/do_tts.py b/do_tts.py
index af5c780..e48e9d5 100644
--- a/do_tts.py
+++ b/do_tts.py
@@ -5,7 +5,7 @@
 import torch
 import torch.nn.functional as F
 import torchaudio
-from api import TextToSpeech, load_conditioning
+from api_new_autoregressive import TextToSpeech, load_conditioning
 from utils.audio import load_audio
 from utils.tokenizer import VoiceBpeTokenizer
 
@@ -28,7 +28,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('-voice', type=str, help='Use a preset conditioning voice (defined above). Overrides cond_path.', default='dotrice,harris,lescault,otto,atkins,grace,kennard,mol')
-    parser.add_argument('-num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=512)
+    parser.add_argument('-num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=32)
     parser.add_argument('-batch_size', type=int, help='How many samples to process at once in the autoregressive model.', default=16)
    parser.add_argument('-num_diffusion_samples', type=int, help='Number of outputs that progress to the diffusion stage.', default=16)
     parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='results/')
diff --git a/models/diffusion_decoder.py b/models/diffusion_decoder.py
index cacdfc1..1baf809 100644
--- a/models/diffusion_decoder.py
+++ b/models/diffusion_decoder.py
@@ -212,7 +212,7 @@ class DiffusionTts(nn.Module):
         }
         return groups
 
-    def timestep_independent(self, aligned_conditioning, conditioning_input, return_code_pred):
+    def timestep_independent(self, aligned_conditioning, conditioning_input, expected_seq_len, return_code_pred):
         # Shuffle aligned_latent to BxCxS format
         if is_latent(aligned_conditioning):
             aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
@@ -227,7 +227,7 @@ class DiffusionTts(nn.Module):
         cond_emb = conds.mean(dim=-1)
         cond_scale, cond_shift = torch.chunk(cond_emb, 2, dim=1)
         if is_latent(aligned_conditioning):
-            code_emb = self.latent_converter(aligned_conditioning)
+            code_emb = self.autoregressive_latent_converter(aligned_conditioning)
         else:
             code_emb = self.code_embedding(aligned_conditioning).permute(0, 2, 1)
             code_emb = self.code_converter(code_emb)
@@ -240,7 +240,7 @@ class DiffusionTts(nn.Module):
                                                device=code_emb.device) < self.unconditioned_percentage
             code_emb = torch.where(unconditioned_batches, self.unconditioned_embedding.repeat(aligned_conditioning.shape[0], 1, 1),
                                    code_emb)
-        expanded_code_emb = F.interpolate(code_emb, size=aligned_conditioning.shape[-1]*4, mode='nearest')
+        expanded_code_emb = F.interpolate(code_emb, size=expected_seq_len, mode='nearest')
 
         if not return_code_pred:
             return expanded_code_emb
@@ -250,7 +250,6 @@ class DiffusionTts(nn.Module):
             mel_pred = mel_pred * unconditioned_batches.logical_not()
             return expanded_code_emb, mel_pred
 
-
     def forward(self, x, timesteps, aligned_conditioning=None, conditioning_input=None, precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False):
         """
         Apply the model to an input batch.
@@ -275,11 +274,12 @@ class DiffusionTts(nn.Module):
             if precomputed_aligned_embeddings is not None:
                 code_emb = precomputed_aligned_embeddings
             else:
-                code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_input, True)
+                code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_input, x.shape[-1], True)
                 if is_latent(aligned_conditioning):
                     unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters()))
                 else:
                     unused_params.extend(list(self.latent_converter.parameters()))
+                unused_params.append(self.unconditioned_embedding)
 
 
         time_emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
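
Note on the diffusion fix above (commentary, not part of the patch): the old code sized the decoder output as mel_codes.shape[-1]*4, but the autoregressive mel codes are derived from 22.05 kHz audio while the diffusion decoder emits a 24 kHz spectrogram, so the target length has to be rescaled by 24000/22050 and the same length passed into timestep_independent instead of being re-derived there. A minimal sketch of that arithmetic follows; the helper name and example value are illustrative assumptions, not code from the repository.

    # Illustrative sketch only; mirrors the output_seq_len expression added to api.py above.
    def expected_output_seq_len(num_mel_codes: int) -> int:
        frames_per_code = 4  # each autoregressive code expands to 4 spectrogram frames
        # Rescale from the 22.05 kHz code domain to the 24 kHz output domain.
        return num_mel_codes * frames_per_code * 24000 // 22050

    print(expected_output_seq_len(100))  # 435 frames, not the 400 the old sizing assumed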