more fixes

commit ccf16f978e
parent 4836e1f792
Author: James Betker
Date:   2022-05-02 16:44:47 -06:00

4 changed files with 20 additions and 79 deletions


@@ -194,8 +194,7 @@ class TextToSpeech:
         self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
                                            model_dim=1024,
                                            heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                           train_solo_embeddings=False,
-                                           average_conditioning_embeddings=True).cpu().eval()
+                                           train_solo_embeddings=False).cpu().eval()
         self.autoregressive.load_state_dict(torch.load(f'{models_dir}/autoregressive.pth'))
         self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
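Note: the dropped `average_conditioning_embeddings` argument corresponds to the UnifiedVoice hunks further down, where averaging of conditioning embeddings becomes the only behavior; a shape sketch follows those hunks.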
@@ -244,7 +243,7 @@ class TextToSpeech:
         kwargs.update(presets[preset])
         return self.tts(text, **kwargs)

-    def get_conditioning_latents(self, voice_samples):
+    def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
         These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@@ -268,7 +267,7 @@ class TextToSpeech:
             # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
             sample = torchaudio.functional.resample(sample, 22050, 24000)
             sample = pad_or_truncate(sample, 102400)
-            cond_mel = wav_to_univnet_mel(sample.to(voice_samples.device), do_normalization=False)
+            cond_mel = wav_to_univnet_mel(sample.to('cuda'), do_normalization=False)
             diffusion_conds.append(cond_mel)
         diffusion_conds = torch.stack(diffusion_conds, dim=1)
@@ -276,7 +275,10 @@ class TextToSpeech:
         diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
         self.diffusion = self.diffusion.cpu()

-        return auto_latent, diffusion_latent, auto_conds
+        if return_mels:
+            return auto_latent, diffusion_latent, auto_conds, diffusion_conds
+        else:
+            return auto_latent, diffusion_latent

     def get_random_conditioning_latents(self):
         # Lazy-load the RLG models.
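The new `return_mels` flag keeps the public two-tuple contract for external callers while letting `tts()` (below) also retrieve the raw conditioning mels. A minimal usage sketch, assuming this repo is importable (import paths vary by revision), a CUDA device is available, and hypothetical clip paths:

    from api import TextToSpeech          # import path varies by revision of this repo
    from utils.audio import load_audio

    tts = TextToSpeech()
    clips = [load_audio(p, 22050) for p in ['voices/pat2/1.wav', 'voices/pat2/2.wav']]  # hypothetical paths

    # Default: exactly the two latents, suitable for saving as a reusable voice.
    auto_latent, diffusion_latent = tts.get_conditioning_latents(clips)

    # With return_mels=True (how tts() now calls it), the conditioning mels come back too.
    auto_latent, diffusion_latent, auto_conds, diffusion_conds = \
        tts.get_conditioning_latents(clips, return_mels=True)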
@@ -295,7 +297,6 @@ class TextToSpeech:
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True,
             # autoregressive generation parameters follow
             num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
-            typical_sampling=False, typical_mass=.9,
             # CLVP & CVVP parameters
             clvp_cvvp_slider=.5,
             # diffusion generation parameters follow
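Note: `typical_sampling` and `typical_mass` leave the public `tts()` signature here, but `inference_speech()` in the autoregressive model (below) still accepts them, so this only narrows the public API rather than removing the sampling mode.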
@@ -354,13 +355,13 @@ class TextToSpeech:
         auto_conds = None
         if voice_samples is not None:
-            auto_conditioning, diffusion_conditioning, auto_conds = self.get_conditioning_latents(voice_samples)
+            auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True)
         elif conditioning_latents is not None:
             auto_conditioning, diffusion_conditioning = conditioning_latents
         else:
             auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
         auto_conditioning = auto_conditioning.cuda()
         diffusion_conditioning = diffusion_conditioning.cuda()

         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
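Since `tts()` recomputes conditioning whenever it is given raw samples, callers can avoid that cost by passing latents directly. A sketch continuing the example above (file name hypothetical):

    import torch

    # Compute once, save, and reuse; the default return is exactly two tensors.
    torch.save(tts.get_conditioning_latents(clips), 'pat2_latents.pth')

    auto_latent, diffusion_latent = torch.load('pat2_latents.pth')
    gen = tts.tts('Hello world.', conditioning_latents=(auto_latent, diffusion_latent))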


@@ -11,8 +11,8 @@ other ML models, or can be augmented manually and fed back into Tortoise to affe
 """

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat')
-    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/conditioning_latents')
+    parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat2')
+    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/conditioning_latents')
     args = parser.parse_args()

     os.makedirs(args.output_path, exist_ok=True)
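These defaults feed the `.pth` voice support added in the audio utilities below: the script's output is a saved latent pair that can later stand in for a directory of audio clips. A minimal sketch of the assumed round trip (the shapes and file name are stand-ins, not the real latent sizes):

    import torch

    auto_latent = torch.randn(1, 1024)        # stand-in for the autoregressive conditioning latent
    diffusion_latent = torch.randn(1, 2048)   # stand-in for the diffusion conditioning latent

    torch.save((auto_latent, diffusion_latent), 'pat2.pth')   # what this script writes out
    auto_latent, diffusion_latent = torch.load('pat2.pth')    # what a .pth voice loads back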


@@ -280,8 +280,7 @@ class UnifiedVoice(nn.Module):
                  mel_length_compression=1024, number_text_tokens=256,
                  start_text_token=None, number_mel_codes=8194, start_mel_token=8192,
                  stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True,
-                 checkpointing=True, average_conditioning_embeddings=False,
-                 types=1):
+                 checkpointing=True, types=1):
         """
         Args:
             layers: Number of layers in transformer stack.
@@ -300,7 +299,6 @@ class UnifiedVoice(nn.Module):
             train_solo_embeddings:
             use_mel_codes_as_input:
             checkpointing:
-            average_conditioning_embeddings: Whether or not conditioning embeddings should be averaged, instead of fed piecewise into the model.
         """
         super().__init__()
@@ -318,7 +316,6 @@ class UnifiedVoice(nn.Module):
         self.max_conditioning_inputs = max_conditioning_inputs
         self.mel_length_compression = mel_length_compression
         self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads)
-        self.average_conditioning_embeddings = average_conditioning_embeddings
         self.text_embedding = nn.Embedding(self.number_text_tokens*types+1, model_dim)
         if use_mel_codes_as_input:
             self.mel_embedding = nn.Embedding(self.number_mel_codes, model_dim)
@@ -397,8 +394,7 @@ class UnifiedVoice(nn.Module):
         for j in range(speech_conditioning_input.shape[1]):
             conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
         conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
+        conds = conds.mean(dim=1)
         return conds

     def forward(self, speech_conditioning_latent, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
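With the flag gone, `get_conditioning` always averages the per-clip embeddings, so the returned latent loses its clip axis. A shape sketch with hypothetical sizes:

    import torch

    b, n_clips, d = 2, 3, 1024             # hypothetical batch, conditioning clips, model_dim
    per_clip = torch.randn(b, n_clips, d)  # stacked ConditioningEncoder outputs
    latent = per_clip.mean(dim=1)          # new behavior: always averaged -> (2, 1024)
    # old behavior (flag enabled): per_clip.mean(dim=1).unsqueeze(1) -> (2, 1, 1024)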
@@ -461,65 +457,6 @@ class UnifiedVoice(nn.Module):
         loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
         return loss_text.mean(), loss_mel.mean(), mel_logits

-    def text_forward(self, speech_conditioning_input, text_inputs, text_lengths):
-        """
-        Performs autoregressive modeling on only text. Still requires a speech_conditioning_input due to the way the
-        model inputs are formatted. Just provide any audio clip (arguably, zeros could be provided).
-        """
-        assert self.max_text_tokens >= text_inputs.shape[1], f'{text_inputs.shape[1]}'
-        # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by
-        # chopping the inputs by the maximum actual length.
-        max_text_len = text_lengths.max()
-        text_inputs = F.pad(text_inputs[:, :max_text_len], (0,1), value=self.stop_text_token)
-
-        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
-        conds = []
-        for j in range(speech_conditioning_input.shape[1]):
-            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
-        conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
-
-        text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
-        text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + self.text_solo_embedding
-        text_logits = self.get_logits(conds, text_emb, self.text_head)
-        loss_text = F.cross_entropy(text_logits, text_targets.long())
-        return loss_text.mean()
-
-    def speech_forward(self, speech_conditioning_input, mel_codes, wav_lengths, raw_mels=None):
-        """
-        Performs autoregressive modeling on only speech data.
-        """
-        assert self.max_mel_tokens >= mel_codes.shape[1], f'{mel_codes.shape[1]}'
-        # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by
-        # chopping the inputs by the maximum actual length.
-        max_mel_len = wav_lengths.max() // self.mel_length_compression
-        mel_codes = F.pad(mel_codes[:, :max_mel_len], (0,1), value=self.stop_mel_token)
-        mel_codes = self.set_mel_padding(mel_codes, wav_lengths)
-        if raw_mels is not None:
-            raw_mels = raw_mels[:, :, :max_mel_len*4]
-
-        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
-        conds = []
-        for j in range(speech_conditioning_input.shape[1]):
-            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
-        conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
-
-        mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token)
-        if raw_mels is not None:
-            mel_inp = F.pad(raw_mels, (0, 4))
-        else:
-            mel_inp = mel_codes
-        mel_emb = self.mel_embedding(mel_inp)
-        mel_emb = mel_emb + self.mel_pos_embedding(mel_codes) + self.mel_solo_embedding
-        mel_logits = self.get_logits(conds, mel_emb, self.mel_head)
-        loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
-        return loss_mel.mean()
-
     def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
                          max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
         seq_length = self.max_mel_tokens + self.max_text_tokens + 2


@@ -87,7 +87,7 @@ def get_voices():
     for sub in subs:
         subj = os.path.join('voices', sub)
         if os.path.isdir(subj):
-            voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3'))
+            voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.pth'))
     return voices
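Globbing `.pth` here lets a voice directory hold a single saved latent pair instead of (or alongside) audio clips; the round-trip sketch after the script hunk above shows the assumed file contents.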
@@ -111,6 +111,9 @@ def load_voices(voices):
     latents = []
     clips = []
     for voice in voices:
+        if voice == 'random':
+            print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
+            return None, None
         latent, clip = load_voice(voice)
         if latent is None:
             assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
@@ -119,10 +122,10 @@ def load_voices(voices):
             assert len(voices) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
             latents.append(latent)
     if len(latents) == 0:
-        return clips
+        return clips, None
     else:
         latents = torch.stack(latents, dim=0)
-        return latents.mean(dim=0)
+        return None, latents.mean(dim=0)


 class TacotronSTFT(torch.nn.Module):
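After this change, `load_voices` consistently returns a `(clips, latents)` pair with at most one side populated, instead of a bare list or tensor. A self-contained sketch of the new contract (the inputs below are stand-ins, not real loader output):

    import torch

    def unpack(result):
        clips, latents = result
        if clips is not None:
            return f'raw audio clips: {len(clips)}'
        if latents is not None:
            return f'averaged latent of shape {tuple(latents.shape)}'
        return 'random voice requested alongside others'

    print(unpack((['a.wav', 'b.wav'], None)))   # audio-clip voices -> (clips, None)
    print(unpack((None, torch.randn(1024))))    # latent voices -> (None, mean latent)
    print(unpack((None, None)))                 # 'random' short-circuit -> (None, None)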