From cfd284f42595b3138395f0c72f7c29dd30e45f8e Mon Sep 17 00:00:00 2001
From: James Betker
Date: Fri, 13 Aug 2021 18:35:55 -0600
Subject: [PATCH] Fix up some stuff that allows the MEL to be computed on-GPU

---
Review notes (text in this section is ignored by `git am`):

* This patch was recovered from a copy whose line breaks had been collapsed.
  Leading whitespace on context lines was reconstructed from the Python
  structure; verify against the target tree before applying (or apply with
  `git apply --ignore-whitespace`).
* nv_tacotron_dataset.py, first hunk: the `fetcher_fn = None` default is
  removed. NOTE(review): if no later branch (outside this hunk) handles an
  unrecognised `fetcher_mode`, the failure presumably becomes a NameError at
  first use of `fetcher_fn` -- confirm.
* nv_tacotron_dataset.py, second hunk: 'filenames' changes from `[p]` to `p`.
  The new dump_mels_to_disk() iterates `b['filenames']` per batch element,
  so it presumably relies on the collate step producing one name per item
  -- confirm.
* dump_mels_to_disk() hard-codes Windows paths and joins `output_path` with
  each `fname`. NOTE(review): if `fname` is an absolute path,
  os.path.join() discards `output_path` -- confirm what the dataset puts in
  'filenames'.
* base_injectors.py: the header comment on MelSpectrogramInjector and the
  "Noone" typo were corrected; comment text only, no code tokens changed.

 codes/data/audio/nv_tacotron_dataset.py   | 34 +++++++++++++++++++++--
 codes/trainer/injectors/base_injectors.py | 20 +++++++++++++
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/codes/data/audio/nv_tacotron_dataset.py b/codes/data/audio/nv_tacotron_dataset.py
index 2623ad3c..7ea2f115 100644
--- a/codes/data/audio/nv_tacotron_dataset.py
+++ b/codes/data/audio/nv_tacotron_dataset.py
@@ -31,7 +31,6 @@ class TextMelLoader(torch.utils.data.Dataset):
     def __init__(self, hparams):
         self.path = os.path.dirname(hparams['path'])
         fetcher_mode = opt_get(hparams, ['fetcher_mode'], 'lj')
-        fetcher_fn = None
         if fetcher_mode == 'lj':
             fetcher_fn = load_filepaths_and_text
         elif fetcher_mode == 'mozilla_cv':
@@ -128,7 +127,7 @@ class TextMelLoader(torch.utils.data.Dataset):
                 'input_lengths': torch.tensor(orig_text_len, dtype=torch.long),
                 'padded_mel': m,
                 'output_lengths': torch.tensor(orig_output, dtype=torch.long),
-                'filenames': [p]
+                'filenames': p
             }
         return t, m, p
 
@@ -181,7 +180,6 @@ class TextMelCollate():
             gate_padded[i, mel.size(1)-1:] = 1
             output_lengths[i] = mel.size(1)
 
-
         return {
             'padded_text': text_padded,
             'input_lengths': input_lengths,
@@ -192,7 +190,36 @@ class TextMelCollate():
         }
 
 
+def dump_mels_to_disk():
+    params = {
+        'mode': 'nv_tacotron',
+        'path': 'E:\\audio\\MozillaCommonVoice\\en\\test.tsv',
+        'phase': 'train',
+        'n_workers': 0,
+        'batch_size': 32,
+        'fetcher_mode': 'mozilla_cv',
+        'needs_collate': False,
+        'max_mel_length': 255800,
+        'max_text_length': 200,
+        'return_wavs': True,
+        #'return_wavs': True,
+        #'input_sample_rate': 22050,
+        #'sampling_rate': 8000
+    }
+    output_path = 'D:\\mozcv_mels'
+    from data import create_dataset, create_dataloader
+    ds, c = create_dataset(params, return_collate=True)
+    dl = create_dataloader(ds, params, collate_fn=c)
+    for i, b in tqdm(enumerate(dl)):
+        mels = b['padded_mel']
+        fnames = b['filenames']
+        for j, fname in enumerate(fnames):
+            torch.save(mels[j], f'{os.path.join(output_path, fname)}_mel.pth')
+
+
 if __name__ == '__main__':
+    dump_mels_to_disk()
+    '''
     params = {
         'mode': 'nv_tacotron',
         'path': 'E:\\audio\\MozillaCommonVoice\\en\\train.tsv',
@@ -220,3 +247,4 @@ if __name__ == '__main__':
             pm = torch.nn.functional.pad(pm, (0, 800-pm.shape[-1]))
         m = pm if m is None else torch.cat([m, pm], dim=0)
     print(m.mean(), m.std())
+    '''
\ No newline at end of file
diff --git a/codes/trainer/injectors/base_injectors.py b/codes/trainer/injectors/base_injectors.py
index 3eeb8749..9d9ae00a 100644
--- a/codes/trainer/injectors/base_injectors.py
+++ b/codes/trainer/injectors/base_injectors.py
@@ -515,6 +515,26 @@ class DenormalizeInjector(Injector):
         return {self.output: out}
 
 
+# Computes a MEL spectrogram of the input audio with TacotronSTFT, on whatever device the input lives on.
+class MelSpectrogramInjector(Injector):
+    def __init__(self, opt, env):
+        super().__init__(opt, env)
+        from models.tacotron2.layers import TacotronSTFT
+        from munch import munchify
+        from models.tacotron2 import hparams
+        hp = munchify(hparams.create_hparams())  # Just use the default tacotron values for the MEL spectrogram. No one uses anything else anyway.
+        self.stft = TacotronSTFT(hp.filter_length, hp.hop_length, hp.win_length,
+                                 hp.n_mel_channels, hp.sampling_rate, hp.mel_fmin, hp.mel_fmax)
+
+    def forward(self, state):
+        inp = state[self.input]
+        if len(inp.shape) == 3:  # Automatically squeeze out the channels dimension if it is present (assuming mono-audio)
+            inp = inp.squeeze(1)
+        assert len(inp.shape) == 2
+        self.stft = self.stft.to(inp.device)
+        return {self.output: self.stft.mel_spectrogram(inp)}
+
+
 if __name__ == '__main__':
     inj = DecomposeDimensionInjector({'dim':2, 'in': 'x', 'out': 'y'}, None)