forked from mrq/tortoise-tts
integrate new autoregressive model and fix new diffusion bug
parent 4747fae381
commit 81f6ea1afa
api.py | 7

api.py
@@ -117,13 +117,14 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, mel_codes, conditioning_
             cond_mels.append(cond_mel)
         cond_mels = torch.stack(cond_mels, dim=1)

-        output_shape = (mel_codes.shape[0], 100, mel_codes.shape[-1]*4)
-        precomputed_embeddings = diffusion_model.timestep_independent(mel_codes, cond_mels, False)
+        output_seq_len = mel_codes.shape[-1]*4*24000//22050  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+        output_shape = (mel_codes.shape[0], 100, output_seq_len)
+        precomputed_embeddings = diffusion_model.timestep_independent(mel_codes, cond_mels, output_seq_len, False)

         noise = torch.randn(output_shape, device=mel_codes.device) * temperature
         mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
                                      model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings})
-        return denormalize_tacotron_mel(mel)[:,:,:mel_codes.shape[-1]*4]
+        return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]


 class TextToSpeech:
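A note on the arithmetic in the new output_seq_len line (this note and snippet are not part of the commit): each mel code covers 4 spectrogram frames at the 22.05 kHz code rate, while the diffusion decoder emits frames at a 24 kHz rate, so the frame count is rescaled by 24000/22050. A minimal sketch with a made-up code length:

mel_codes_len = 100                           # hypothetical value of mel_codes.shape[-1]
old_len = mel_codes_len * 4                   # 400 frames, the pre-fix assumption
new_len = mel_codes_len * 4 * 24000 // 22050  # 435 frames at the 24 kHz output rate
print(old_len, new_len)                       # 400 435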
@@ -5,7 +5,7 @@ import torch
 import torch.nn.functional as F
 import torchaudio

-from api import TextToSpeech, load_conditioning
+from api_new_autoregressive import TextToSpeech, load_conditioning
 from utils.audio import load_audio
 from utils.tokenizer import VoiceBpeTokenizer

@@ -28,7 +28,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('-voice', type=str, help='Use a preset conditioning voice (defined above). Overrides cond_path.', default='dotrice,harris,lescault,otto,atkins,grace,kennard,mol')
-    parser.add_argument('-num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=512)
+    parser.add_argument('-num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=32)
     parser.add_argument('-batch_size', type=int, help='How many samples to process at once in the autoregressive model.', default=16)
     parser.add_argument('-num_diffusion_samples', type=int, help='Number of outputs that progress to the diffusion stage.', default=16)
     parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='results/')
@@ -212,7 +212,7 @@ class DiffusionTts(nn.Module):
         }
         return groups

-    def timestep_independent(self, aligned_conditioning, conditioning_input, return_code_pred):
+    def timestep_independent(self, aligned_conditioning, conditioning_input, expected_seq_len, return_code_pred):
         # Shuffle aligned_latent to BxCxS format
         if is_latent(aligned_conditioning):
             aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
@@ -227,7 +227,7 @@ class DiffusionTts(nn.Module):
         cond_emb = conds.mean(dim=-1)
         cond_scale, cond_shift = torch.chunk(cond_emb, 2, dim=1)
         if is_latent(aligned_conditioning):
-            code_emb = self.latent_converter(aligned_conditioning)
+            code_emb = self.autoregressive_latent_converter(aligned_conditioning)
         else:
             code_emb = self.code_embedding(aligned_conditioning).permute(0, 2, 1)
             code_emb = self.code_converter(code_emb)
@@ -240,7 +240,7 @@ class DiffusionTts(nn.Module):
                                                device=code_emb.device) < self.unconditioned_percentage
             code_emb = torch.where(unconditioned_batches, self.unconditioned_embedding.repeat(aligned_conditioning.shape[0], 1, 1),
                                    code_emb)
-        expanded_code_emb = F.interpolate(code_emb, size=aligned_conditioning.shape[-1]*4, mode='nearest')
+        expanded_code_emb = F.interpolate(code_emb, size=expected_seq_len, mode='nearest')

         if not return_code_pred:
             return expanded_code_emb
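As context for the F.interpolate change above (not part of the commit): code_emb is a (batch, channels, sequence) tensor, and nearest-neighbour interpolation stretches it along the last dimension to whatever expected_seq_len the caller requests, instead of hard-coding aligned_conditioning.shape[-1]*4. A small self-contained sketch with made-up sizes:

import torch
import torch.nn.functional as F

code_emb = torch.randn(1, 1024, 100)         # hypothetical (B, C, S) code embeddings
expected_seq_len = 100 * 4 * 24000 // 22050  # 435, matching the api.py computation above
expanded = F.interpolate(code_emb, size=expected_seq_len, mode='nearest')
print(expanded.shape)                        # torch.Size([1, 1024, 435])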
@@ -250,7 +250,6 @@ class DiffusionTts(nn.Module):
         mel_pred = mel_pred * unconditioned_batches.logical_not()
         return expanded_code_emb, mel_pred

-
     def forward(self, x, timesteps, aligned_conditioning=None, conditioning_input=None, precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False):
         """
         Apply the model to an input batch.
@@ -275,11 +274,12 @@ class DiffusionTts(nn.Module):
             if precomputed_aligned_embeddings is not None:
                 code_emb = precomputed_aligned_embeddings
             else:
-                code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_input, True)
+                code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_input, x.shape[-1], True)
                 if is_latent(aligned_conditioning):
                     unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters()))
                 else:
                     unused_params.extend(list(self.latent_converter.parameters()))
+
             unused_params.append(self.unconditioned_embedding)

         time_emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
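For context on why forward() takes precomputed_aligned_embeddings (again, not part of the commit): the conditioning embedding is timestep-independent, so do_spectrogram_diffusion computes it once and reuses it at every denoising step via model_kwargs, and the expected sequence length now has to be threaded through that precomputation. A rough toy sketch of the pattern, with placeholder modules and sizes rather than the real DiffusionTts or diffuser:

import torch
import torch.nn.functional as F
from torch import nn

class ToyDecoder(nn.Module):
    # Stand-in for DiffusionTts: a conditioning path that ignores the timestep,
    # plus a per-step denoising path that reuses the precomputed embedding.
    def __init__(self, channels=100):
        super().__init__()
        self.proj = nn.Conv1d(channels, channels, 1)

    def timestep_independent(self, codes, expected_seq_len):
        emb = codes.float().unsqueeze(1).repeat(1, 100, 1)   # fake (B, C, S) embedding
        return F.interpolate(emb, size=expected_seq_len, mode='nearest')

    def forward(self, x, timesteps, precomputed_aligned_embeddings):
        return self.proj(x + precomputed_aligned_embeddings)

codes = torch.randint(0, 8192, (1, 10))
seq_len = codes.shape[-1] * 4 * 24000 // 22050               # 43 for this toy length
model = ToyDecoder()
pre = model.timestep_independent(codes, seq_len)             # computed once, outside the loop
x = torch.randn(1, 100, seq_len)
for t in range(3):                                           # stand-in for diffuser.p_sample_loop
    x = model(x, torch.tensor([t]), precomputed_aligned_embeddings=pre)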