From f7d0901ce6e0dbaf1abc44732b3f7b611ebe4cd4 Mon Sep 17 00:00:00 2001 From: James Betker <jbetker@gmail.com> Date: Sun, 31 Oct 2021 15:01:38 -0600 Subject: [PATCH] Decouple MEL from nv_tacotron_dataset --- codes/data/__init__.py | 4 +- codes/data/audio/nv_tacotron_dataset.py | 180 ++++++------------------ 2 files changed, 45 insertions(+), 139 deletions(-) diff --git a/codes/data/__init__.py b/codes/data/__init__.py index 4feca367..1ede1635 100644 --- a/codes/data/__init__.py +++ b/codes/data/__init__.py @@ -61,14 +61,14 @@ def create_dataset(dataset_opt, return_collate=False): elif mode == 'zipfile': from data.zip_file_dataset import ZipFileDataset as D elif mode == 'nv_tacotron': - from data.audio.nv_tacotron_dataset import TextMelLoader as D + from data.audio.nv_tacotron_dataset import TextWavLoader as D from data.audio.nv_tacotron_dataset import TextMelCollate as C from models.tacotron2.hparams import create_hparams default_params = create_hparams() default_params.update(dataset_opt) dataset_opt = munchify(default_params) if opt_get(dataset_opt, ['needs_collate'], True): - collate = C(dataset_opt.n_frames_per_step) + collate = C() elif mode == 'gpt_tts': from data.audio.gpt_tts_dataset import GptTtsDataset as D from data.audio.gpt_tts_dataset import GptTtsCollater as C diff --git a/codes/data/audio/nv_tacotron_dataset.py b/codes/data/audio/nv_tacotron_dataset.py index 66e28ae9..a9b22196 100644 --- a/codes/data/audio/nv_tacotron_dataset.py +++ b/codes/data/audio/nv_tacotron_dataset.py @@ -6,9 +6,11 @@ import numpy as np import torch import torch.utils.data import torch.nn.functional as F +import torchaudio from tqdm import tqdm import models.tacotron2.layers as layers +from data.audio.unsupervised_audio_dataset import load_audio from models.tacotron2.taco_utils import load_wav_to_torch, load_filepaths_and_text from models.tacotron2.text import text_to_sequence @@ -37,12 +39,7 @@ def load_voxpopuli(filename): return filepaths_and_text -class TextMelLoader(torch.utils.data.Dataset): - """ - 1) loads audio,text pairs - 2) normalizes text and converts them to sequences of one-hot vectors - 3) computes mel-spectrograms from audio files. - """ +class TextWavLoader(torch.utils.data.Dataset): def __init__(self, hparams): self.path = hparams['path'] if not isinstance(self.path, list): @@ -65,117 +62,67 @@ class TextMelLoader(torch.utils.data.Dataset): raise NotImplementedError() self.audiopaths_and_text.extend(fetcher_fn(p)) self.text_cleaners = hparams.text_cleaners - self.sampling_rate = hparams.sampling_rate - self.load_mel_from_disk = opt_get(hparams, ['load_mel_from_disk'], False) - self.return_wavs = opt_get(hparams, ['return_wavs'], False) - self.input_sample_rate = opt_get(hparams, ['input_sample_rate'], self.sampling_rate) - assert not (self.load_mel_from_disk and self.return_wavs) - self.stft = layers.TacotronSTFT( - hparams.filter_length, hparams.hop_length, hparams.win_length, - hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, - hparams.mel_fmax) + self.sample_rate = hparams.sample_rate random.seed(hparams.seed) random.shuffle(self.audiopaths_and_text) - self.max_mel_len = opt_get(hparams, ['max_mel_length'], None) + self.max_wav_len = opt_get(hparams, ['max_wav_length'], None) self.max_text_len = opt_get(hparams, ['max_text_length'], None) # If needs_collate=False, all outputs will be aligned and padded at maximum length. self.needs_collate = opt_get(hparams, ['needs_collate'], True) if not self.needs_collate: - assert self.max_mel_len is not None and self.max_text_len is not None + assert self.max_wav_len is not None and self.max_text_len is not None - def get_mel_text_pair(self, audiopath_and_text): + def get_wav_text_pair(self, audiopath_and_text): # separate filename and text audiopath, text = audiopath_and_text[0], audiopath_and_text[1] text_seq = self.get_text(text) - mel = self.get_mel(audiopath) - return (text_seq, mel, text, audiopath_and_text[0]) - - def get_mel(self, filename): - if self.load_mel_from_disk and os.path.exists(f'{filename}_mel.npy'): - melspec = torch.from_numpy(np.load(f'{filename}_mel.npy')) - assert melspec.size(0) == self.stft.n_mel_channels, ( - 'Mel dimension mismatch: given {}, expected {}'.format(melspec.size(0), self.stft.n_mel_channels)) - else: - if filename.endswith('.wav'): - audio, sampling_rate = load_wav_to_torch(filename) - elif filename.endswith('.mp3'): - # https://github.com/neonbjb/pyfastmp3decoder - Definitely worth it. - from pyfastmp3decoder.mp3decoder import load_mp3 - audio, sampling_rate = load_mp3(filename, self.input_sample_rate) - audio = torch.FloatTensor(audio) - else: - audio, sampling_rate = audio2numpy.audio_from_file(filename) - audio = torch.tensor(audio) - - if sampling_rate != self.input_sample_rate: - if sampling_rate < self.input_sample_rate: - print(f'{filename} has a sample rate of {sampling_rate} which is lower than the requested sample rate of {self.input_sample_rate}. This is not a good idea.') - audio_norm = torch.nn.functional.interpolate(audio.unsqueeze(0).unsqueeze(1), scale_factor=self.input_sample_rate/sampling_rate, mode='nearest', recompute_scale_factor=False).squeeze() - else: - audio_norm = audio - if audio_norm.std() > 1: - print(f"Something is very wrong with the given audio. std_dev={audio_norm.std()}. file={filename}") - return None - audio_norm.clip_(-1, 1) - audio_norm = audio_norm.unsqueeze(0) - audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) - if self.input_sample_rate != self.sampling_rate: - ratio = self.sampling_rate / self.input_sample_rate - audio_norm = torch.nn.functional.interpolate(audio_norm.unsqueeze(0), scale_factor=ratio, mode='area').squeeze(0) - if self.return_wavs: - melspec = audio_norm - else: - melspec = self.stft.mel_spectrogram(audio_norm) - melspec = torch.squeeze(melspec, 0) - - return melspec + wav = load_audio(audiopath, self.sample_rate) + return (text_seq, wav, text, audiopath_and_text[0]) def get_text(self, text): text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners)) return text_norm def __getitem__(self, index): - tseq, mel, text, path = self.get_mel_text_pair(self.audiopaths_and_text[index]) - if mel is None or \ - (self.max_mel_len is not None and mel.shape[-1] > self.max_mel_len) or \ + tseq, wav, text, path = self.get_wav_text_pair(self.audiopaths_and_text[index]) + if wav is None or \ + (self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len) or \ (self.max_text_len is not None and tseq.shape[0] > self.max_text_len): - #if mel is not None: - # print(f"Exception {index} mel_len:{mel.shape[-1]} text_len:{tseq.shape[0]} fname: {path}") + # Basically, this audio file is nonexistent or too long to be supported by the dataset. # It's hard to handle this situation properly. Best bet is to return the a random valid token and skew the dataset somewhat as a result. + #if wav is not None: + # print(f"Exception {index} wav_len:{wav.shape[-1]} text_len:{tseq.shape[0]} fname: {path}") rv = random.randint(0,len(self)-1) return self[rv] - orig_output = mel.shape[-1] + orig_output = wav.shape[-1] orig_text_len = tseq.shape[0] if not self.needs_collate: - if mel.shape[-1] != self.max_mel_len: - mel = F.pad(mel, (0, self.max_mel_len - mel.shape[-1])) + if wav.shape[-1] != self.max_wav_len: + wav = F.pad(wav, (0, self.max_wav_len - wav.shape[-1])) if tseq.shape[0] != self.max_text_len: tseq = F.pad(tseq, (0, self.max_text_len - tseq.shape[0])) return { 'real_text': text, 'padded_text': tseq, 'input_lengths': torch.tensor(orig_text_len, dtype=torch.long), - 'padded_mel': mel, + 'wav': wav, 'output_lengths': torch.tensor(orig_output, dtype=torch.long), 'filenames': path } - return tseq, mel, path, text + return tseq, wav, path, text def __len__(self): return len(self.audiopaths_and_text) class TextMelCollate(): - """ Zero-pads model inputs and targets based on number of frames per setep + """ Zero-pads model inputs and targets based on number of frames per step """ - def __init__(self, n_frames_per_step): - self.n_frames_per_step = n_frames_per_step - def __call__(self, batch): - """Collate's training batch from normalized text and mel-spectrogram + """Collate's training batch from normalized text and wav PARAMS ------ - batch: [text_normalized, mel_normalized, filename] + batch: [text_normalized, wav, filename, text] """ # Right zero-pad all one-hot text sequences to max input length input_lengths, ids_sorted_decreasing = torch.sort( @@ -193,81 +140,42 @@ class TextMelCollate(): filenames.append(batch[ids_sorted_decreasing[i]][2]) real_text.append(batch[ids_sorted_decreasing[i]][3]) - # Right zero-pad mel-spec - num_mels = batch[0][1].size(0) + # Right zero-pad wav + num_wavs = batch[0][1].size(0) max_target_len = max([x[1].size(1) for x in batch]) - if max_target_len % self.n_frames_per_step != 0: - max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step - assert max_target_len % self.n_frames_per_step == 0 # include mel padded and gate padded - mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) - mel_padded.zero_() - gate_padded = torch.FloatTensor(len(batch), max_target_len) - gate_padded.zero_() + wav_padded = torch.FloatTensor(len(batch), num_wavs, max_target_len) + wav_padded.zero_() output_lengths = torch.LongTensor(len(batch)) for i in range(len(ids_sorted_decreasing)): - mel = batch[ids_sorted_decreasing[i]][1] - mel_padded[i, :, :mel.size(1)] = mel - gate_padded[i, mel.size(1)-1:] = 1 - output_lengths[i] = mel.size(1) + wav = batch[ids_sorted_decreasing[i]][1] + wav_padded[i, :, :wav.size(1)] = wav + output_lengths[i] = wav.size(1) return { 'padded_text': text_padded, 'input_lengths': input_lengths, - 'padded_mel': mel_padded, - 'padded_gate': gate_padded, + 'wav': wav_padded, 'output_lengths': output_lengths, 'filenames': filenames, 'real_text': real_text, } -def save_mel_buffer_to_file(mel, path): - np.save(path, mel.cpu().numpy()) - - -def dump_mels_to_disk(): - params = { - 'mode': 'nv_tacotron', - 'path': ['Z:\\mozcv\\en\\train.tsv'], - 'fetcher_mode': ['mozilla_cv'], - 'phase': 'train', - 'n_workers': 8, - 'batch_size': 1, - 'needs_collate': True, - 'max_mel_length': 10000, - 'max_text_length': 1000, - #'return_wavs': True, - #'input_sample_rate': 22050, - #'sampling_rate': 8000 - } - from data import create_dataset, create_dataloader - ds, c = create_dataset(params, return_collate=True) - dl = create_dataloader(ds, params, collate_fn=c) - for b in tqdm(dl): - mels = b['padded_mel'] - fnames = b['filenames'] - for j, fname in enumerate(fnames): - save_mel_buffer_to_file(mels[j], f'{fname}_mel.npy') - - if __name__ == '__main__': - dump_mels_to_disk() - ''' + batch_sz = 32 params = { 'mode': 'nv_tacotron', - 'path': 'E:\\audio\\MozillaCommonVoice\\en\\train.tsv', + 'path': 'E:\\audio\\MozillaCommonVoice\\en\\test.tsv', 'phase': 'train', - 'n_workers': 12, - 'batch_size': 32, + 'n_workers': 0, + 'batch_size': batch_sz, 'fetcher_mode': 'mozilla_cv', - 'needs_collate': False, - 'max_mel_length': 800, - 'max_text_length': 200, - #'return_wavs': True, - #'input_sample_rate': 22050, - #'sampling_rate': 8000 + 'needs_collate': True, + #'max_wav_length': 256000, + #'max_text_length': 200, + 'sample_rate': 22050, } from data import create_dataset, create_dataloader @@ -277,9 +185,7 @@ if __name__ == '__main__': m = None for k in range(1000): for i, b in tqdm(enumerate(dl)): - continue - pm = b['padded_mel'] - pm = torch.nn.functional.pad(pm, (0, 800-pm.shape[-1])) - m = pm if m is None else torch.cat([m, pm], dim=0) - print(m.mean(), m.std()) - ''' \ No newline at end of file + w = b['wav'] + for ib in range(batch_sz): + print(f'{i} {ib} {b["real_text"][ib]}') + torchaudio.save(f'{i}_clip_{ib}.wav', b['wav'][ib], ds.sample_rate)