forked from mrq/DL-Art-School
Fix up some stuff that allows the MEL to be computed on-GPU
This commit is contained in:
parent
cdee31c60b
commit
cfd284f425
|
@ -31,7 +31,6 @@ class TextMelLoader(torch.utils.data.Dataset):
|
||||||
def __init__(self, hparams):
|
def __init__(self, hparams):
|
||||||
self.path = os.path.dirname(hparams['path'])
|
self.path = os.path.dirname(hparams['path'])
|
||||||
fetcher_mode = opt_get(hparams, ['fetcher_mode'], 'lj')
|
fetcher_mode = opt_get(hparams, ['fetcher_mode'], 'lj')
|
||||||
fetcher_fn = None
|
|
||||||
if fetcher_mode == 'lj':
|
if fetcher_mode == 'lj':
|
||||||
fetcher_fn = load_filepaths_and_text
|
fetcher_fn = load_filepaths_and_text
|
||||||
elif fetcher_mode == 'mozilla_cv':
|
elif fetcher_mode == 'mozilla_cv':
|
||||||
|
@ -128,7 +127,7 @@ class TextMelLoader(torch.utils.data.Dataset):
|
||||||
'input_lengths': torch.tensor(orig_text_len, dtype=torch.long),
|
'input_lengths': torch.tensor(orig_text_len, dtype=torch.long),
|
||||||
'padded_mel': m,
|
'padded_mel': m,
|
||||||
'output_lengths': torch.tensor(orig_output, dtype=torch.long),
|
'output_lengths': torch.tensor(orig_output, dtype=torch.long),
|
||||||
'filenames': [p]
|
'filenames': p
|
||||||
}
|
}
|
||||||
return t, m, p
|
return t, m, p
|
||||||
|
|
||||||
|
@ -181,7 +180,6 @@ class TextMelCollate():
|
||||||
gate_padded[i, mel.size(1)-1:] = 1
|
gate_padded[i, mel.size(1)-1:] = 1
|
||||||
output_lengths[i] = mel.size(1)
|
output_lengths[i] = mel.size(1)
|
||||||
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'padded_text': text_padded,
|
'padded_text': text_padded,
|
||||||
'input_lengths': input_lengths,
|
'input_lengths': input_lengths,
|
||||||
|
@ -192,7 +190,36 @@ class TextMelCollate():
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def dump_mels_to_disk(path='E:\\audio\\MozillaCommonVoice\\en\\test.tsv', output_path='D:\\mozcv_mels'):
    """Run the dataset once and save each batch entry's 'padded_mel' tensor to disk.

    A dataset and dataloader are built from a fixed 'nv_tacotron' / 'mozilla_cv'
    configuration, then every batch is walked and one file named
    '<filename>_mel.pth' is written under `output_path` per entry.

    Args:
        path: Index file handed to the dataset via the 'path' option.
        output_path: Directory that receives the saved tensors. It is created
            if it does not exist yet.
    """
    params = {
        'mode': 'nv_tacotron',
        'path': path,
        'phase': 'train',
        'n_workers': 0,
        'batch_size': 32,
        'fetcher_mode': 'mozilla_cv',
        'needs_collate': False,
        'max_mel_length': 255800,
        'max_text_length': 200,
        # NOTE(review): with 'return_wavs' enabled the 'padded_mel' entry may hold raw
        # audio rather than a MEL spectrogram - confirm against the dataset implementation.
        'return_wavs': True,
        #'input_sample_rate': 22050,
        #'sampling_rate': 8000
    }
    # Imported here rather than at module level, as in the original code.
    from data import create_dataset, create_dataloader
    ds, c = create_dataset(params, return_collate=True)
    dl = create_dataloader(ds, params, collate_fn=c)

    # torch.save() does not create missing directories.
    os.makedirs(output_path, exist_ok=True)

    # Wrap the loader itself (not enumerate(loader)) so tqdm can read its length
    # and show a total / ETA.
    for b in tqdm(dl):
        mels = b['padded_mel']
        fnames = b['filenames']
        for j, fname in enumerate(fnames):
            # mels[j] is a view into the batch tensor; saving a view serializes the
            # entire underlying storage, so clone to write only this entry's data.
            torch.save(mels[j].clone(), f'{os.path.join(output_path, fname)}_mel.pth')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
dump_mels_to_disk()
|
||||||
|
'''
|
||||||
params = {
|
params = {
|
||||||
'mode': 'nv_tacotron',
|
'mode': 'nv_tacotron',
|
||||||
'path': 'E:\\audio\\MozillaCommonVoice\\en\\train.tsv',
|
'path': 'E:\\audio\\MozillaCommonVoice\\en\\train.tsv',
|
||||||
|
@ -220,3 +247,4 @@ if __name__ == '__main__':
|
||||||
pm = torch.nn.functional.pad(pm, (0, 800-pm.shape[-1]))
|
pm = torch.nn.functional.pad(pm, (0, 800-pm.shape[-1]))
|
||||||
m = pm if m is None else torch.cat([m, pm], dim=0)
|
m = pm if m is None else torch.cat([m, pm], dim=0)
|
||||||
print(m.mean(), m.std())
|
print(m.mean(), m.std())
|
||||||
|
'''
|
|
@ -515,6 +515,26 @@ class DenormalizeInjector(Injector):
|
||||||
return {self.output: out}
|
return {self.output: out}
|
||||||
|
|
||||||
|
|
||||||
|
# Computes a MEL spectrogram from an audio tensor using the default Tacotron STFT settings.
class MelSpectrogramInjector(Injector):
    """Injector that turns the audio found at `self.input` into a MEL spectrogram.

    The spectrogram is produced by a `TacotronSTFT` built from the default
    tacotron2 hyperparameters; none of those values are read from `opt` here.
    """

    def __init__(self, opt, env):
        super().__init__(opt, env)
        # These dependencies are imported inside the constructor, not at module level.
        from models.tacotron2.layers import TacotronSTFT
        from munch import munchify
        from models.tacotron2 import hparams

        # Just use the default tacotron values for the MEL spectrogram. No one uses anything else anyway.
        defaults = munchify(hparams.create_hparams())
        self.stft = TacotronSTFT(
            defaults.filter_length,
            defaults.hop_length,
            defaults.win_length,
            defaults.n_mel_channels,
            defaults.sampling_rate,
            defaults.mel_fmin,
            defaults.mel_fmax,
        )

    def forward(self, state):
        """Return `{self.output: mel}` computed from `state[self.input]`.

        A 3-dimensional input has dimension 1 squeezed out (mono audio is
        assumed); after that the input must be 2-dimensional.
        """
        audio = state[self.input]
        if audio.ndim == 3:
            # Automatically drop the channels dimension if it is present.
            audio = audio.squeeze(1)
        assert audio.ndim == 2
        # Move the STFT module to the input's device before every call.
        self.stft = self.stft.to(audio.device)
        mel = self.stft.mel_spectrogram(audio)
        return {self.output: mel}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
inj = DecomposeDimensionInjector({'dim':2, 'in': 'x', 'out': 'y'}, None)
|
inj = DecomposeDimensionInjector({'dim':2, 'in': 'x', 'out': 'y'}, None)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user