diff --git a/codes/data/audio/nv_tacotron_dataset.py b/codes/data/audio/nv_tacotron_dataset.py index 98f085af..7dadb7ad 100644 --- a/codes/data/audio/nv_tacotron_dataset.py +++ b/codes/data/audio/nv_tacotron_dataset.py @@ -51,7 +51,7 @@ class TextMelLoader(torch.utils.data.Dataset): self.text_cleaners = hparams.text_cleaners self.max_wav_value = hparams.max_wav_value self.sampling_rate = hparams.sampling_rate - self.load_mel_from_disk = hparams.load_mel_from_disk + self.load_mel_from_disk = opt_get(hparams, ['load_mel_from_disk'], False) self.return_wavs = opt_get(hparams, ['return_wavs'], False) self.input_sample_rate = opt_get(hparams, ['input_sample_rate'], self.sampling_rate) assert not (self.load_mel_from_disk and self.return_wavs) @@ -76,7 +76,11 @@ class TextMelLoader(torch.utils.data.Dataset): return (text_seq, mel, text, audiopath_and_text[0]) def get_mel(self, filename): - if not self.load_mel_from_disk: + if self.load_mel_from_disk and os.path.exists(f'{filename}_mel.npy'): + melspec = torch.from_numpy(np.load(f'{filename}_mel.npy')) + assert melspec.size(0) == self.stft.n_mel_channels, ( + 'Mel dimension mismatch: given {}, expected {}'.format(melspec.size(0), self.stft.n_mel_channels)) + else: if filename.endswith('.wav'): audio, sampling_rate = load_wav_to_torch(filename) audio = (audio / self.max_wav_value) @@ -105,9 +109,6 @@ class TextMelLoader(torch.utils.data.Dataset): melspec = self.stft.mel_spectrogram(audio_norm) melspec = torch.squeeze(melspec, 0) else: - melspec = torch.from_numpy(np.load(filename)) - assert melspec.size(0) == self.stft.n_mel_channels, ( - 'Mel dimension mismatch: given {}, expected {}'.format(melspec.size(0), self.stft.n_mel_channels)) return melspec @@ -209,10 +210,6 @@ def save_mel_buffer_to_file(mel, path): np.save(path, mel.numpy()) -def load_mel_buffer_from_file(path): - return torch.tensor(np.load(path)) - - def dump_mels_to_disk(): params = { 'mode': 'nv_tacotron', diff --git a/codes/scripts/audio/random_mp3_splitter.py b/codes/scripts/audio/random_mp3_splitter.py index 758bf6d0..1c0e278c 100644 --- a/codes/scripts/audio/random_mp3_splitter.py +++ b/codes/scripts/audio/random_mp3_splitter.py @@ -9,11 +9,11 @@ import numpy as np if __name__ == '__main__': - src_dir = 'O:\\podcast_dumps' + src_dir = 'P:\\Audiobooks-Podcasts' #src_dir = 'E:\\audio\\books' - output_dir = 'D:\\data\\audio\\podcasts-split' - output_dir_lq = 'D:\\data\\audio\\podcasts-split-with-bg' - output_dir_garbage = 'D:\\data\\audio\\podcasts-split-garbage' + output_dir = 'D:\\data\\audio\\misc-split' + output_dir_lq = 'D:\\data\\audio\\misc-split-with-bg' + output_dir_garbage = 'D:\\data\\audio\\misc-split-garbage' #output_dir = 'E:\\audio\\books-clips' clip_length = 5 # In seconds sparsity = .1 # Only this proportion of the total clips are extracted as wavs.