Remove collating from paired_voice_audio_dataset

Collation will now be done at the model level, which is more efficient.
2022-01-06 10:29:39 -07:00 · 2022-01-06 10:29:39 -07:00 · 06c1093090
commit 06c1093090
parent e7a705fe6e
1 changed files with 16 additions and 77 deletions
--- a/codes/data/audio/paired_voice_audio_dataset.py
+++ b/codes/data/audio/paired_voice_audio_dataset.py
@ -89,9 +89,6 @@ class TextWavLoader(torch.utils.data.Dataset):
        random.shuffle(self.audiopaths_and_text)
        self.max_wav_len = opt_get(hparams, ['max_wav_length'], None)
        self.max_text_len = opt_get(hparams, ['max_text_length'], None)
        # If needs_collate=False, all outputs will be aligned and padded at maximum length.
        self.needs_collate = opt_get(hparams, ['needs_collate'], True)
        if not self.needs_collate:
        assert self.max_wav_len is not None and self.max_text_len is not None
        self.use_bpe_tokenizer = opt_get(hparams, ['use_bpe_tokenizer'], True)
        if self.use_bpe_tokenizer:
@ -137,7 +134,6 @@ class TextWavLoader(torch.utils.data.Dataset):
            return self[rv]
        orig_output = wav.shape[-1]
        orig_text_len = tseq.shape[0]
        if not self.needs_collate:
        if wav.shape[-1] != self.max_wav_len:
            wav = F.pad(wav, (0, self.max_wav_len - wav.shape[-1]))
        if tseq.shape[0] != self.max_text_len:
@ -153,67 +149,11 @@ class TextWavLoader(torch.utils.data.Dataset):
        if self.load_conditioning:
            res['conditioning'] = cond
        return res
        return tseq, wav, path, text, cond
    def __len__(self):
        """Return the number of entries held in ``self.audiopaths_and_text``."""
        entries = self.audiopaths_and_text
        return len(entries)
 class TextMelCollate():
    """Collate callable that zero-pads a list of samples into batch tensors.

    Samples are reordered by descending text length before being padded.
    """

    def __call__(self, batch):
        """Collate a training batch from normalized text and wav samples.

        PARAMS
        ------
        batch: list of samples, each indexed as
            [text_normalized, wav, filename, text, conditioning].
            The conditioning entry may be None.

        RETURNS
        -------
        dict with 'padded_text', 'text_lengths', 'wav', 'wav_lengths',
        'filenames' and 'real_text', all ordered by descending text length.
        'conditioning' is only present when at least one sample supplies a
        conditioning tensor.
        """
        # Order samples from the longest to the shortest text sequence.
        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([len(x[0]) for x in batch]),
            dim=0, descending=True)
        ordered = [batch[idx] for idx in ids_sorted_decreasing.tolist()]

        # Right zero-pad every text sequence to the longest one in the batch.
        max_input_len = int(input_lengths[0])
        text_padded = torch.zeros(len(ordered), max_input_len, dtype=torch.long)
        for row, sample in enumerate(ordered):
            text = sample[0]
            text_padded[row, :text.size(0)] = text

        filenames = [sample[2] for sample in ordered]
        real_text = [sample[3] for sample in ordered]
        # NOTE: samples whose conditioning is None are dropped here, so if only
        # some samples carry conditioning the stacked rows no longer line up
        # with the other batch entries.
        conds = [sample[4] for sample in ordered if sample[4] is not None]

        # Right zero-pad every wav along its last dimension.
        num_wavs = batch[0][1].size(0)
        max_target_len = max(x[1].size(1) for x in batch)
        wav_padded = torch.zeros(len(ordered), num_wavs, max_target_len,
                                 dtype=torch.float32)
        for row, sample in enumerate(ordered):
            wav = sample[1]
            wav_padded[row, :, :wav.size(1)] = wav
        output_lengths = torch.tensor([sample[1].size(1) for sample in ordered],
                                      dtype=torch.long)

        res = {
            'padded_text': text_padded,
            'text_lengths': input_lengths,
            'wav': wav_padded,
            'wav_lengths': output_lengths,
            'filenames': filenames,
            'real_text': real_text,
        }
        if conds:
            res['conditioning'] = torch.stack(conds)
        return res
 if __name__ == '__main__':
    batch_sz = 8
    params = {
@ -223,7 +163,6 @@ if __name__ == '__main__':
        'phase': 'train',
        'n_workers': 0,
        'batch_size': batch_sz,
        'needs_collate': True,
        'max_wav_length': 255995,
        'max_text_length': 200,
        'sample_rate': 22050,