Improve efficiency of audio_with_noise_dataset

James Betker 2022-03-08 15:50:13 -07:00
parent b3def182de
commit 38fd9fc985
2 changed files with 13 additions and 8 deletions


@@ -8,6 +8,7 @@ import torchaudio
from torch.utils.data import Dataset
from tqdm import tqdm
import torch.nn.functional as F
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
from data.util import load_paths_from_cache, find_files_of_type, is_audio_file
@@ -95,6 +96,8 @@ class AudioWithNoiseDataset(Dataset):
out = self.underlying_dataset[item]
clip = out['clip']
dlen = clip.shape[-1]
clip = clip[:, :out['clip_lengths']]
augpath = ''
augvol = 0
try:
@@ -149,15 +152,17 @@ class AudioWithNoiseDataset(Dataset):
# Apply the GSM codec to simulate cellular phone audio.
clip = torchaudio.functional.apply_codec(clip, self.underlying_dataset.sampling_rate, format="gsm")
except:
print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
print(sys.exc_info())
#print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
#print(sys.exc_info())
#raise # Uncomment to surface exceptions.
return self[item]
clip.clip_(-1, 1)
# Restore padding.
clip = F.pad(clip, (0, dlen-clip.shape[-1]))
out['clip'] = clip
out['label'] = label
out['aug'] = aug
#out['aug'] = aug
out['augpath'] = augpath
out['augvol'] = augvol
out['clipvol'] = clipvol
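
The efficiency gain in this hunk comes from trimming the clip to its true length before running augmentations (including the GSM codec), then restoring the fixed padded length with F.pad afterwards. A minimal sketch of that trim-then-repad pattern, assuming clip is a (channels, samples) tensor padded to a fixed size and true_len is the unpadded sample count (the function name and signature below are illustrative, not the repository's API):

import torch
import torch.nn.functional as F
import torchaudio

def augment_trimmed(clip: torch.Tensor, true_len: int, sampling_rate: int) -> torch.Tensor:
    padded_len = clip.shape[-1]
    # Work on the real audio only; augmenting zero padding wastes compute.
    clip = clip[:, :true_len]
    # Apply the GSM codec to simulate cellular phone audio.
    clip = torchaudio.functional.apply_codec(clip, sampling_rate, format="gsm")
    clip.clip_(-1, 1)
    # Restore the fixed length expected by downstream batching.
    return F.pad(clip, (0, padded_len - clip.shape[-1]))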
@@ -170,10 +175,10 @@ class AudioWithNoiseDataset(Dataset):
if __name__ == '__main__':
params = {
'mode': 'unsupervised_audio_with_noise',
'path': ['\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned\\books0'],
'cache_path': 'E:\\audio\\remote-cache3.pth',
'path': ['y:/clips/books1'],
'cache_path': 'E:\\audio\\remote-cache4.pth',
'sampling_rate': 22050,
'pad_to_samples': 80960,
'pad_to_samples': 400000,
'phase': 'train',
'n_workers': 0,
'batch_size': 4,
@@ -191,7 +196,7 @@ if __name__ == '__main__':
i = 0
for b in tqdm(dl):
for b_ in range(b['clip'].shape[0]):
#torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
#torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
i += 1
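
One thing worth noting in the except branch above: because the underlying dataset returns different data for the same index, return self[item] resamples a fresh clip rather than retrying the identical one, so transient augmentation failures are simply skipped. A sketch of that retry-on-failed-aug pattern (the wrapper class below is hypothetical, for illustration only):

import torch
from torch.utils.data import Dataset

class ResampleOnFailure(Dataset):
    def __init__(self, underlying, augment_fn):
        self.underlying = underlying
        self.augment_fn = augment_fn

    def __len__(self):
        return len(self.underlying)

    def __getitem__(self, item):
        out = self.underlying[item]
        try:
            out['clip'] = self.augment_fn(out['clip'])
        except Exception:
            # Augs fail occasionally (e.g. codec errors). Since the underlying
            # dataset is stochastic, indexing again yields a fresh sample; note
            # this recurses forever if failures are deterministic.
            return self[item]
        return out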


@@ -37,7 +37,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
In practice, this means two things:
1) Index {i} of this dataset means nothing: fetching from the same index will almost always return different data.
As a result, this dataset should not be used for validation or test runs.
As a result, this dataset should not be used for validation or test runs. Use the PairedVoiceAudio dataset instead.
2) This dataset has a slight bias for items with longer text or longer filenames.
The upshot is that this dataset loads extremely quickly and consumes almost no system memory.
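
To make the indexing caveat concrete, here is a toy dataset (not the actual FastPairedVoiceDataset implementation) whose __getitem__ ignores the requested index; a validation loop that fetches ds[0] twice would see two different items:

import random
from torch.utils.data import Dataset

class RandomPairedDataset(Dataset):
    def __init__(self, paths):
        self.paths = paths

    def __len__(self):
        # Length is nominal; it only controls epoch size.
        return len(self.paths)

    def __getitem__(self, i):
        # The index is ignored: every fetch draws a fresh random item,
        # which is why fixed-index validation fetches are meaningless.
        return random.choice(self.paths)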