232 lines
10 KiB
Python
232 lines
10 KiB
Python
import random
|
|
|
|
import torch
|
|
import torch.nn.functional as F
|
|
import torchaudio
|
|
|
|
from trainer.inject import Injector
|
|
from utils.util import opt_get, load_model_from_config, pad_or_truncate
|
|
|
|
TACOTRON_MEL_MAX = 2.3143386840820312
|
|
TACOTRON_MEL_MIN = -11.512925148010254
|
|
|
|
def normalize_tacotron_mel(mel):
|
|
return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1
|
|
|
|
def denormalize_tacotron_mel(norm_mel):
|
|
return ((norm_mel+1)/2)*(TACOTRON_MEL_MAX-TACOTRON_MEL_MIN)+TACOTRON_MEL_MIN
|
|
|
|
class MelSpectrogramInjector(Injector):
|
|
def __init__(self, opt, env):
|
|
super().__init__(opt, env)
|
|
from models.audio.tts.tacotron2 import TacotronSTFT
|
|
# These are the default tacotron values for the MEL spectrogram.
|
|
filter_length = opt_get(opt, ['filter_length'], 1024)
|
|
hop_length = opt_get(opt, ['hop_length'], 256)
|
|
win_length = opt_get(opt, ['win_length'], 1024)
|
|
n_mel_channels = opt_get(opt, ['n_mel_channels'], 80)
|
|
mel_fmin = opt_get(opt, ['mel_fmin'], 0)
|
|
mel_fmax = opt_get(opt, ['mel_fmax'], 8000)
|
|
sampling_rate = opt_get(opt, ['sampling_rate'], 22050)
|
|
self.stft = TacotronSTFT(filter_length, hop_length, win_length, n_mel_channels, sampling_rate, mel_fmin, mel_fmax)
|
|
self.do_normalization = opt_get(opt, ['do_normalization'], None) # This is different from the TorchMelSpectrogramInjector. This just normalizes to the range [-1,1]
|
|
|
|
def forward(self, state):
|
|
inp = state[self.input]
|
|
if len(inp.shape) == 3: # Automatically squeeze out the channels dimension if it is present (assuming mono-audio)
|
|
inp = inp.squeeze(1)
|
|
assert len(inp.shape) == 2
|
|
self.stft = self.stft.to(inp.device)
|
|
mel = self.stft.mel_spectrogram(inp)
|
|
if self.do_normalization:
|
|
mel = normalize_tacotron_mel(mel)
|
|
return {self.output: mel}
|
|
|
|
|
|
class TorchMelSpectrogramInjector(Injector):
|
|
def __init__(self, opt, env):
|
|
super().__init__(opt, env)
|
|
# These are the default tacotron values for the MEL spectrogram.
|
|
self.filter_length = opt_get(opt, ['filter_length'], 1024)
|
|
self.hop_length = opt_get(opt, ['hop_length'], 256)
|
|
self.win_length = opt_get(opt, ['win_length'], 1024)
|
|
self.n_mel_channels = opt_get(opt, ['n_mel_channels'], 80)
|
|
self.mel_fmin = opt_get(opt, ['mel_fmin'], 0)
|
|
self.mel_fmax = opt_get(opt, ['mel_fmax'], 8000)
|
|
self.sampling_rate = opt_get(opt, ['sampling_rate'], 22050)
|
|
norm = opt_get(opt, ['normalize'], False)
|
|
self.mel_stft = torchaudio.transforms.MelSpectrogram(n_fft=self.filter_length, hop_length=self.hop_length,
|
|
win_length=self.win_length, power=2, normalized=norm,
|
|
sample_rate=self.sampling_rate, f_min=self.mel_fmin,
|
|
f_max=self.mel_fmax, n_mels=self.n_mel_channels,
|
|
norm="slaney")
|
|
self.mel_norm_file = opt_get(opt, ['mel_norm_file'], None)
|
|
if self.mel_norm_file is not None:
|
|
self.mel_norms = torch.load(self.mel_norm_file)
|
|
else:
|
|
self.mel_norms = None
|
|
|
|
def forward(self, state):
|
|
inp = state[self.input]
|
|
if len(inp.shape) == 3: # Automatically squeeze out the channels dimension if it is present (assuming mono-audio)
|
|
inp = inp.squeeze(1)
|
|
assert len(inp.shape) == 2
|
|
self.mel_stft = self.mel_stft.to(inp.device)
|
|
mel = self.mel_stft(inp)
|
|
# Perform dynamic range compression
|
|
mel = torch.log(torch.clamp(mel, min=1e-5))
|
|
if self.mel_norms is not None:
|
|
self.mel_norms = self.mel_norms.to(mel.device)
|
|
mel = mel / self.mel_norms.unsqueeze(0).unsqueeze(-1)
|
|
return {self.output: mel}
|
|
|
|
|
|
class RandomAudioCropInjector(Injector):
|
|
def __init__(self, opt, env):
|
|
super().__init__(opt, env)
|
|
self.crop_sz = opt['crop_size']
|
|
self.lengths_key = opt['lengths_key']
|
|
|
|
def forward(self, state):
|
|
inp = state[self.input]
|
|
lens = state[self.lengths_key]
|
|
len = torch.min(lens)
|
|
margin = len - self.crop_sz
|
|
if margin < 0:
|
|
return {self.output: inp}
|
|
start = random.randint(0, margin)
|
|
return {self.output: inp[:, :, start:start+self.crop_sz]}
|
|
|
|
|
|
class AudioClipInjector(Injector):
|
|
def __init__(self, opt, env):
|
|
super().__init__(opt, env)
|
|
self.clip_size = opt['clip_size']
|
|
self.ctc_codes = opt['ctc_codes_key']
|
|
self.output_ctc = opt['ctc_out_key']
|
|
|
|
def forward(self, state):
|
|
inp = state[self.input]
|
|
ctc = state[self.ctc_codes]
|
|
len = inp.shape[-1]
|
|
if len > self.clip_size:
|
|
proportion_inp_remaining = self.clip_size/len
|
|
inp = inp[:, :, :self.clip_size]
|
|
ctc = ctc[:,:int(proportion_inp_remaining*ctc.shape[-1])]
|
|
return {self.output: inp, self.output_ctc: ctc}
|
|
|
|
|
|
class AudioResampleInjector(Injector):
|
|
def __init__(self, opt, env):
|
|
super().__init__(opt, env)
|
|
self.input_sr = opt['input_sample_rate']
|
|
self.output_sr = opt['output_sample_rate']
|
|
|
|
def forward(self, state):
|
|
inp = state[self.input]
|
|
return {self.output: torchaudio.functional.resample(inp, self.input_sr, self.output_sr)}
|
|
|
|
|
|
class DiscreteTokenInjector(Injector):
|
|
def __init__(self, opt, env):
|
|
super().__init__(opt, env)
|
|
cfg = opt_get(opt, ['dvae_config'], "../experiments/train_diffusion_vocoder_22k_level.yml")
|
|
dvae_name = opt_get(opt, ['dvae_name'], 'dvae')
|
|
self.dvae = load_model_from_config(cfg, dvae_name, device=f'cuda:{env["device"]}').eval()
|
|
|
|
def forward(self, state):
|
|
inp = state[self.input]
|
|
with torch.no_grad():
|
|
self.dvae = self.dvae.to(inp.device)
|
|
codes = self.dvae.get_codebook_indices(inp)
|
|
return {self.output: codes}
|
|
|
|
|
|
class GptVoiceLatentInjector(Injector):
|
|
"""
|
|
This injector does all the legwork to generate latents out of a UnifiedVoice model, including encoding all audio
|
|
inputs into a MEL spectrogram and discretizing the inputs.
|
|
"""
|
|
def __init__(self, opt, env):
|
|
super().__init__(opt, env)
|
|
# For discrete tokenization.
|
|
cfg = opt_get(opt, ['dvae_config'], "../experiments/train_diffusion_vocoder_22k_level.yml")
|
|
dvae_name = opt_get(opt, ['dvae_name'], 'dvae')
|
|
self.dvae = load_model_from_config(cfg, dvae_name).cuda().eval()
|
|
# The unified_voice model.
|
|
cfg = opt_get(opt, ['gpt_config'], "../experiments/train_gpt_tts_unified.yml")
|
|
model_name = opt_get(opt, ['gpt_name'], 'gpt')
|
|
pretrained_path = opt['gpt_path']
|
|
self.gpt = load_model_from_config(cfg, model_name=model_name,
|
|
also_load_savepoint=False, load_path=pretrained_path).cuda().eval()
|
|
self.needs_move = True
|
|
# Mel converter
|
|
self.mel_inj = TorchMelSpectrogramInjector({'in': 'wav', 'out': 'mel', 'mel_norm_file': '../experiments/clips_mel_norms.pth'},{})
|
|
# Aux input keys.
|
|
self.conditioning_key = opt['conditioning_clip']
|
|
self.text_input_key = opt['text']
|
|
self.text_lengths_key = opt['text_lengths']
|
|
self.input_lengths_key = opt['input_lengths']
|
|
|
|
def to_mel(self, t):
|
|
return self.mel_inj({'wav': t})['mel']
|
|
|
|
def forward(self, state):
|
|
with torch.no_grad():
|
|
mel_inputs = self.to_mel(state[self.input])
|
|
state_cond = pad_or_truncate(state[self.conditioning_key], 132300)
|
|
mel_conds = []
|
|
for k in range(state_cond.shape[1]):
|
|
mel_conds.append(self.to_mel(state_cond[:, k]))
|
|
mel_conds = torch.stack(mel_conds, dim=1)
|
|
|
|
if self.needs_move:
|
|
self.dvae = self.dvae.to(mel_inputs.device)
|
|
self.gpt = self.gpt.to(mel_inputs.device)
|
|
codes = self.dvae.get_codebook_indices(mel_inputs)
|
|
latents = self.gpt(mel_conds, state[self.text_input_key],
|
|
state[self.text_lengths_key], codes, state[self.input_lengths_key],
|
|
text_first=True, raw_mels=None, return_attentions=False, return_latent=True,
|
|
clip_inputs=False)
|
|
assert latents.shape[1] == codes.shape[1]
|
|
return {self.output: latents}
|
|
|
|
|
|
class ReverseUnivnetInjector(Injector):
|
|
"""
|
|
This injector specifically builds inputs and labels for a univnet detector.g
|
|
"""
|
|
def __init__(self, opt, env):
|
|
super().__init__(opt, env)
|
|
from scripts.audio.gen.speech_synthesis_utils import load_univnet_vocoder
|
|
self.univnet = load_univnet_vocoder().cuda()
|
|
self.mel_input_key = opt['mel']
|
|
self.label_output_key = opt['labels']
|
|
self.do_augmentations = opt_get(opt, ['do_aug'], True)
|
|
|
|
def forward(self, state):
|
|
with torch.no_grad():
|
|
original_audio = state[self.input]
|
|
mel = state[self.mel_input_key]
|
|
decoded_mel = self.univnet.inference(mel)[:,:,:original_audio.shape[-1]]
|
|
|
|
if self.do_augmentations:
|
|
original_audio = original_audio + torch.rand_like(original_audio) * random.random() * .005
|
|
decoded_mel = decoded_mel + torch.rand_like(decoded_mel) * random.random() * .005
|
|
if(random.random() < .5):
|
|
original_audio = torchaudio.functional.resample(torchaudio.functional.resample(original_audio, 24000, 10000), 10000, 24000)
|
|
if(random.random() < .5):
|
|
decoded_mel = torchaudio.functional.resample(torchaudio.functional.resample(decoded_mel, 24000, 10000), 10000, 24000)
|
|
if(random.random() < .5):
|
|
original_audio = torchaudio.functional.resample(original_audio, 24000, 22000 + random.randint(0,2000))
|
|
if(random.random() < .5):
|
|
decoded_mel = torchaudio.functional.resample(decoded_mel, 24000, 22000 + random.randint(0,2000))
|
|
|
|
smallest_dim = min(original_audio.shape[-1], decoded_mel.shape[-1])
|
|
original_audio = original_audio[:,:,:smallest_dim]
|
|
decoded_mel = decoded_mel[:,:,:smallest_dim]
|
|
|
|
labels = (torch.rand(mel.shape[0], 1, 1, device=mel.device) > .5)
|
|
output = torch.where(labels, original_audio, decoded_mel)
|
|
|
|
return {self.output: output, self.label_output_key: labels[:,0,0].long()} |