Allow usage of pre-rendered mels saved to npy files

This commit is contained in:
James Betker 2021-08-14 23:38:15 -06:00
parent ad3391bd96
commit c28f657ab8
2 changed files with 10 additions and 13 deletions

View File

@ -51,7 +51,7 @@ class TextMelLoader(torch.utils.data.Dataset):
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.load_mel_from_disk = hparams.load_mel_from_disk
self.load_mel_from_disk = opt_get(hparams, ['load_mel_from_disk'], False)
self.return_wavs = opt_get(hparams, ['return_wavs'], False)
self.input_sample_rate = opt_get(hparams, ['input_sample_rate'], self.sampling_rate)
assert not (self.load_mel_from_disk and self.return_wavs)
@ -76,7 +76,11 @@ class TextMelLoader(torch.utils.data.Dataset):
return (text_seq, mel, text, audiopath_and_text[0])
def get_mel(self, filename):
if not self.load_mel_from_disk:
if self.load_mel_from_disk and os.path.exists(f'{filename}_mel.npy'):
melspec = torch.from_numpy(np.load(f'{filename}_mel.npy'))
assert melspec.size(0) == self.stft.n_mel_channels, (
'Mel dimension mismatch: given {}, expected {}'.format(melspec.size(0), self.stft.n_mel_channels))
else:
if filename.endswith('.wav'):
audio, sampling_rate = load_wav_to_torch(filename)
audio = (audio / self.max_wav_value)
@ -105,9 +109,6 @@ class TextMelLoader(torch.utils.data.Dataset):
melspec = self.stft.mel_spectrogram(audio_norm)
melspec = torch.squeeze(melspec, 0)
else:
melspec = torch.from_numpy(np.load(filename))
assert melspec.size(0) == self.stft.n_mel_channels, (
'Mel dimension mismatch: given {}, expected {}'.format(melspec.size(0), self.stft.n_mel_channels))
return melspec
@ -209,10 +210,6 @@ def save_mel_buffer_to_file(mel, path):
np.save(path, mel.numpy())
def load_mel_buffer_from_file(path):
return torch.tensor(np.load(path))
def dump_mels_to_disk():
params = {
'mode': 'nv_tacotron',

View File

@ -9,11 +9,11 @@ import numpy as np
if __name__ == '__main__':
src_dir = 'O:\\podcast_dumps'
src_dir = 'P:\\Audiobooks-Podcasts'
#src_dir = 'E:\\audio\\books'
output_dir = 'D:\\data\\audio\\podcasts-split'
output_dir_lq = 'D:\\data\\audio\\podcasts-split-with-bg'
output_dir_garbage = 'D:\\data\\audio\\podcasts-split-garbage'
output_dir = 'D:\\data\\audio\\misc-split'
output_dir_lq = 'D:\\data\\audio\\misc-split-with-bg'
output_dir_garbage = 'D:\\data\\audio\\misc-split-garbage'
#output_dir = 'E:\\audio\\books-clips'
clip_length = 5 # In seconds
sparsity = .1 # Only this proportion of the total clips are extracted as wavs.