import os
import pathlib
import random

from munch import munchify
from torch.utils.data import Dataset
import torch
from tqdm import tqdm

from data.audio.nv_tacotron_dataset import save_mel_buffer_to_file
from models.tacotron2 import hparams
from models.tacotron2.layers import TacotronSTFT
from models.tacotron2.taco_utils import load_wav_to_torch
from utils.util import opt_get


# A dataset that consumes the output of the `produce_libri_stretched_dataset` script, which itself is a combined
# set of equal-length clips from the librivox corpus with the sentence alignments labeled.
class StopPredictionDataset(Dataset):
    def __init__(self, opt):
        path = opt['path']
        label_compaction = opt_get(opt, ['label_compaction'], 1)
        hp = munchify(hparams.create_hparams())
        cache_path = os.path.join(path, 'cache.pth')
        # Cache the file listing so repeated runs do not have to re-glob the dataset directory.
        if os.path.exists(cache_path):
            self.files = torch.load(cache_path)
        else:
            print("Building cache..")
            self.files = list(pathlib.Path(path).glob('*.wav'))
            torch.save(self.files, cache_path)
        self.sampling_rate = 22050  # Fixed since the underlying data is also fixed at this SR.
        self.mel_length = 2000
        self.stft = TacotronSTFT(
            hp.filter_length, hp.hop_length, hp.win_length,
            hp.n_mel_channels, hp.sampling_rate, hp.mel_fmin,
            hp.mel_fmax)
        self.label_compaction = label_compaction
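
    # Note on label resolution: the mel transform reduces the audio by a factor of 256 (see __getitem__), so each
    # mel frame covers 256 samples (~11.6ms at 22050Hz). With label_compaction=N, each start/end label slot
    # covers N mel frames, and the 2000-frame clip yields 2000 // N label slots.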

    def __getitem__(self, index):
        audio, _ = load_wav_to_torch(self.files[index])
        # The sentence start/end alignments are stored alongside each clip in a '<name>_se.pth' file.
        starts, ends = torch.load(str(self.files[index]).replace('.wav', '_se.pth'))

        # Sanity check: the loader is expected to return audio normalized to roughly [-1, 1].
        if audio.std() > 1:
            print(f"Something is very wrong with the given audio. std_dev={audio.std()}. file={self.files[index]}")
            return None
        audio.clip_(-1, 1)
        mels = self.stft.mel_spectrogram(audio.unsqueeze(0))[:, :, :self.mel_length].squeeze(0)

        # Form labels.
        labels_start = torch.zeros((2000 // self.label_compaction,), dtype=torch.long)
        for s in starts:
            # The mel transform compacts the audio at a ratio of 1/256; the dataset allows further compaction on top of that.
            s = s // (256 * self.label_compaction)
            if s >= 2000 // self.label_compaction:
                continue
            labels_start[s] = 1
        labels_end = torch.zeros((2000 // self.label_compaction,), dtype=torch.long)
        for e in ends:
            e = e // (256 * self.label_compaction)
            if e >= 2000 // self.label_compaction:
                continue
            labels_end[e] = 1

        return {
            'mels': mels,
            'labels_start': labels_start,
            'labels_end': labels_end,
        }

    def __len__(self):
        return len(self.files)


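# Illustrative sketch (not part of the original module): __getitem__ returns None for clips whose audio fails
# the sanity check above, and torch's default collate cannot handle None. If this dataset is fed to a
# DataLoader, a small collate_fn like the hypothetical helper below can drop those items first.
def _drop_none_collate(batch):
    from torch.utils.data.dataloader import default_collate
    # Filter out failed items before batching the remaining dicts.
    batch = [b for b in batch if b is not None]
    return default_collate(batch)

# Example: DataLoader(StopPredictionDataset(opt), batch_size=8, collate_fn=_drop_none_collate)

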
if __name__ == '__main__':
    opt = {
        'path': 'D:\\data\\audio\\libritts\\stop_dataset',
        'label_compaction': 4,
    }
    ds = StopPredictionDataset(opt)
    j = 0
    for i in tqdm(range(100)):
        b = ds[random.randint(0, len(ds) - 1)]  # randint is inclusive on both ends, so cap at len(ds) - 1.
        start_indices = torch.nonzero(b['labels_start']).squeeze(1)
        end_indices = torch.nonzero(b['labels_end']).squeeze(1)
        assert len(end_indices) <= len(start_indices)  # There should always be at least as many START tokens as END tokens.
        for k in range(len(end_indices)):
            # Convert label slots back to mel frame indices (label_compaction is 4 above).
            s = start_indices[k].item() * 4
            e = end_indices[k].item() * 4
            m = b['mels'][:, s:e]
            save_mel_buffer_to_file(m, f'{j}.npy')
            j += 1