From cfd284f42595b3138395f0c72f7c29dd30e45f8e Mon Sep 17 00:00:00 2001
From: James Betker <jbetker@gmail.com>
Date: Fri, 13 Aug 2021 18:35:55 -0600
Subject: [PATCH] Fix up some stuff that allows the MEL to be computed on-GPU

---
 codes/data/audio/nv_tacotron_dataset.py   | 34 +++++++++++++++++++++--
 codes/trainer/injectors/base_injectors.py | 20 +++++++++++++
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/codes/data/audio/nv_tacotron_dataset.py b/codes/data/audio/nv_tacotron_dataset.py
index 2623ad3c..7ea2f115 100644
--- a/codes/data/audio/nv_tacotron_dataset.py
+++ b/codes/data/audio/nv_tacotron_dataset.py
@@ -31,7 +31,6 @@ class TextMelLoader(torch.utils.data.Dataset):
     def __init__(self, hparams):
         self.path = os.path.dirname(hparams['path'])
         fetcher_mode = opt_get(hparams, ['fetcher_mode'], 'lj')
-        fetcher_fn = None
         if fetcher_mode == 'lj':
             fetcher_fn = load_filepaths_and_text
         elif fetcher_mode == 'mozilla_cv':
@@ -128,7 +127,7 @@ class TextMelLoader(torch.utils.data.Dataset):
                 'input_lengths': torch.tensor(orig_text_len, dtype=torch.long),
                 'padded_mel': m,
                 'output_lengths': torch.tensor(orig_output, dtype=torch.long),
-                'filenames': [p]
+                'filenames': p
             }
         return t, m, p
 
@@ -181,7 +180,6 @@ class TextMelCollate():
             gate_padded[i, mel.size(1)-1:] = 1
             output_lengths[i] = mel.size(1)
 
-
         return {
             'padded_text': text_padded,
             'input_lengths': input_lengths,
@@ -192,7 +190,36 @@ class TextMelCollate():
         }
 
 
+def dump_mels_to_disk():  # Debug utility: walks the hard-coded dataset below and saves every batch entry's 'padded_mel' tensor to disk.
+    params = {  # Dataset/dataloader options; passed to both create_dataset() and create_dataloader() below.
+        'mode': 'nv_tacotron',
+        'path': 'E:\\audio\\MozillaCommonVoice\\en\\test.tsv',
+        'phase': 'train',
+        'n_workers': 0,
+        'batch_size': 32,
+        'fetcher_mode': 'mozilla_cv',
+        'needs_collate': False,
+        'max_mel_length': 255800,
+        'max_text_length': 200,
+        'return_wavs': True,
+        #'return_wavs': True,
+        #'input_sample_rate': 22050,
+        #'sampling_rate': 8000
+    }
+    output_path = 'D:\\mozcv_mels'  # Hard-coded output directory; each tensor is written to '<output_path>/<fname>_mel.pth'.
+    from data import create_dataset, create_dataloader  # Imported locally. NOTE(review): presumably to avoid a circular import - confirm.
+    ds, c = create_dataset(params, return_collate=True)  # c is the collate function handed to the dataloader on the next line.
+    dl = create_dataloader(ds, params, collate_fn=c)
+    for i, b in tqdm(enumerate(dl)):  # NOTE(review): i is unused, and tqdm cannot infer a total from enumerate(), so no ETA is shown.
+        mels = b['padded_mel']
+        fnames = b['filenames']
+        for j, fname in enumerate(fnames):  # One output file per filename in the batch; mels[j] is paired with fnames[j].
+            torch.save(mels[j], f'{os.path.join(output_path, fname)}_mel.pth')  # NOTE(review): assumes the target directory already exists - confirm.
+
+
 if __name__ == '__main__':
+    dump_mels_to_disk()
+    '''
     params = {
         'mode': 'nv_tacotron',
         'path': 'E:\\audio\\MozillaCommonVoice\\en\\train.tsv',
@@ -220,3 +247,4 @@ if __name__ == '__main__':
             pm = torch.nn.functional.pad(pm, (0, 800-pm.shape[-1]))
             m = pm if m is None else torch.cat([m, pm], dim=0)
             print(m.mean(), m.std())
+    '''
\ No newline at end of file
diff --git a/codes/trainer/injectors/base_injectors.py b/codes/trainer/injectors/base_injectors.py
index 3eeb8749..9d9ae00a 100644
--- a/codes/trainer/injectors/base_injectors.py
+++ b/codes/trainer/injectors/base_injectors.py
@@ -515,6 +515,26 @@ class DenormalizeInjector(Injector):
         return {self.output: out}
 
 
+# Computes a MEL spectrogram from the input audio using a TacotronSTFT built from the default tacotron hparams.
+class MelSpectrogramInjector(Injector):
+    def __init__(self, opt, env):
+        super().__init__(opt, env)
+        from models.tacotron2.layers import TacotronSTFT
+        from munch import munchify
+        from models.tacotron2 import hparams
+        hp = munchify(hparams.create_hparams())  # Just use the default tacotron values for the MEL spectrogram. No one uses anything else anyway.
+        self.stft = TacotronSTFT(hp.filter_length, hp.hop_length, hp.win_length,
+            hp.n_mel_channels, hp.sampling_rate, hp.mel_fmin, hp.mel_fmax)
+
+    def forward(self, state):
+        inp = state[self.input]
+        if len(inp.shape) == 3:  # Automatically squeeze out the channels dimension if it is present (assuming mono-audio)
+            inp = inp.squeeze(1)
+        assert len(inp.shape) == 2  # Must be 2-D at this point. NOTE(review): presumably (batch, samples) - confirm.
+        self.stft = self.stft.to(inp.device)  # Move the STFT to the input's device so the MEL is computed there (e.g. on-GPU).
+        return {self.output: self.stft.mel_spectrogram(inp)}  # The MEL is returned under this injector's output key.
+
+
 
 if __name__ == '__main__':
     inj = DecomposeDimensionInjector({'dim':2, 'in': 'x', 'out': 'y'}, None)