From 726e30c4f7dd182b8ee2c087df9ff2d388857bb2 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Wed, 9 Mar 2022 09:43:10 -0700
Subject: [PATCH] Update noise augmentation dataset to include voices that are
 appended at the end of another clip.

---
 codes/data/audio/audio_with_noise_dataset.py | 55 +++++++++++++-------
 1 file changed, 36 insertions(+), 19 deletions(-)

diff --git a/codes/data/audio/audio_with_noise_dataset.py b/codes/data/audio/audio_with_noise_dataset.py
index 3d8ee895..d3d07a00 100644
--- a/codes/data/audio/audio_with_noise_dataset.py
+++ b/codes/data/audio/audio_with_noise_dataset.py
@@ -98,6 +98,7 @@ class AudioWithNoiseDataset(Dataset):
         clip = out['clip']
         dlen = clip.shape[-1]
         clip = clip[:, :out['clip_lengths']]
+        padding_room = dlen - clip.shape[-1]
         augpath = ''
         augvol = 0
         try:
@@ -106,9 +107,9 @@ class AudioWithNoiseDataset(Dataset):
             clip = clip * clipvol
             label = random.randint(0, 4)  # Current excludes GSM corruption.
-            #label = 2
-            aug = torch.zeros_like(clip)
+            label = 3
             if label > 0 and label < 4:  # 0 is basically "leave it alone"
+                aug_needed = True
                 augvol = (random.random() * (self.max_volume-self.min_volume) + self.min_volume)
                 if label == 1:  # Add environmental noise.
@@ -120,22 +121,38 @@ class AudioWithNoiseDataset(Dataset):
                     intg_fns = [_integration_fn_fully_enabled]
                     augvol *= .5  # Music is often severely in the background.
                 elif label == 3:
-                    # Add another voice.
                     augpath = random.choice(self.underlying_dataset.audiopaths)
-                    intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
-                aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
-                if aug.shape[1] > clip.shape[1]:
-                    n, cn = aug.shape[1], clip.shape[1]
-                    gap = n-cn
-                    placement = random.randint(0, gap)
-                    aug = aug[:, placement:placement+cn]
-                aug = random.choice(intg_fns)(aug.shape[1]) * aug
-                aug = aug * augvol
-                if aug.shape[1] < clip.shape[1]:
-                    gap = clip.shape[1] - aug.shape[1]
-                    placement = random.randint(0, gap-1)
-                    aug = torch.nn.functional.pad(aug, (placement, gap-placement))
-                clip = clip + aug
+                    # This can take two forms:
+                    if padding_room < 22000 or random.random() < .5:
+                        # (1) The voices talk over one another. If there is no padding room, we always take this choice.
+                        intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
+                    else:
+                        # (2) There are simply two voices in the clip, separated from one another.
+                        # This is a special case that does not use the same logic as the rest of the augmentations.
+                        aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
+                        # Pad with some random silence
+                        aug = F.pad(aug, (random.randint(20,4000), 0))
+                        # Fit what we can given the padding room we have.
+                        aug = aug[:, :padding_room]
+                        clip = torch.cat([clip, aug], dim=1)
+                        # Restore some meta-parameters.
+                        padding_room = dlen - clip.shape[-1]
+                        out['clip_lengths'] = clip.shape[-1]
+                        aug_needed = False
+                if aug_needed:
+                    aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
+                    if aug.shape[1] > clip.shape[1]:
+                        n, cn = aug.shape[1], clip.shape[1]
+                        gap = n-cn
+                        placement = random.randint(0, gap)
+                        aug = aug[:, placement:placement+cn]
+                    aug = random.choice(intg_fns)(aug.shape[1]) * aug
+                    aug = aug * augvol
+                    if aug.shape[1] < clip.shape[1]:
+                        gap = clip.shape[1] - aug.shape[1]
+                        placement = random.randint(0, gap-1)
+                        aug = torch.nn.functional.pad(aug, (placement, gap-placement))
+                    clip = clip + aug
             elif label == 4:  # Perform reverb (to simulate being in a large room with an omni-mic). This is performed by convolving
                               # impulse recordings from openair over the input clip.
@@ -159,7 +176,7 @@ class AudioWithNoiseDataset(Dataset):
             clip.clip_(-1, 1)
         # Restore padding.
-        clip = F.pad(clip, (0, dlen-clip.shape[-1]))
+        clip = F.pad(clip, (0, padding_room))
         out['clip'] = clip
         out['label'] = label
         #out['aug'] = aug
@@ -196,7 +213,7 @@ if __name__ == '__main__':
     i = 0
     for b in tqdm(dl):
         for b_ in range(b['clip'].shape[0]):
-            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
+            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_][:, :b['clip_lengths'][b_]], ds.sampling_rate)
             #torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
             print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
             i += 1
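
Illustrative sketch (not from the patched file): the label == 3 branch now takes two forms, overlaying a second voice or appending it after the real audio when there is enough trailing padding room. The helper below imitates that behaviour on a (channels, samples) tensor whose real audio fills the first clip_length samples; the name mix_in_second_voice and the min_padding_for_append argument are hypothetical, and the augvol scaling plus the _integration_fn_* envelopes used by the real dataset are omitted for brevity.

import random

import torch
import torch.nn.functional as F


def mix_in_second_voice(clip, voice, clip_length, min_padding_for_append=22000):
    # `clip` is (channels, dlen); only the first `clip_length` samples are real audio,
    # the rest is the trailing "padding room" the patch tracks.
    dlen = clip.shape[-1]
    real = clip[:, :clip_length]
    padding_room = dlen - clip_length

    if padding_room < min_padding_for_append or random.random() < .5:
        # Form (1): the two voices talk over one another.
        if voice.shape[-1] > real.shape[-1]:
            # Crop a random window of the second voice down to the clip length.
            start = random.randint(0, voice.shape[-1] - real.shape[-1])
            voice = voice[:, start:start + real.shape[-1]]
        elif voice.shape[-1] < real.shape[-1]:
            # Drop the shorter voice at a random offset inside the clip.
            gap = real.shape[-1] - voice.shape[-1]
            start = random.randint(0, gap - 1)
            voice = F.pad(voice, (start, gap - start))
        mixed = real + voice
    else:
        # Form (2): the second voice follows the first, separated by a short silence.
        voice = F.pad(voice, (random.randint(20, 4000), 0))  # leading silence
        voice = voice[:, :padding_room]                       # fit the available padding room
        mixed = torch.cat([real, voice], dim=1)

    new_length = mixed.shape[-1]
    # Re-pad to the original tensor length so fixed-size batching still works.
    mixed = F.pad(mixed, (0, dlen - new_length))
    return mixed, new_length


# Example usage with made-up sample counts.
clip = F.pad(torch.randn(1, 16000) * 0.1, (0, 24000))   # 16k real samples, 24k padding room
other = torch.randn(1, 12000) * 0.1
mixed, real_len = mix_in_second_voice(clip, other, clip_length=16000)
assert mixed.shape[-1] == clip.shape[-1] and real_len >= 16000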
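
A second sketch of the bookkeeping the patch relies on: after appending a voice, clip_lengths and padding_room must be recomputed so that the final F.pad(clip, (0, padding_room)) restores the collated length dlen, and so the __main__ loop can trim saved clips back to their real length. All sample counts below are made up for illustration.

import torch
import torch.nn.functional as F

dlen = 32000                                  # padded length the collator expects
clip_lengths = 20000                          # real samples in this clip (hypothetical)
clip = torch.randn(1, dlen)[:, :clip_lengths]
padding_room = dlen - clip.shape[-1]          # 12000 samples of trailing room

second_voice = torch.randn(1, 7000)           # stand-in for the appended voice
clip = torch.cat([clip, second_voice[:, :padding_room]], dim=1)

# Refresh the meta-parameters exactly as the patch does after the concat.
clip_lengths = clip.shape[-1]                 # now 27000
padding_room = dlen - clip.shape[-1]          # now 5000

clip = F.pad(clip, (0, padding_room))         # pad back out to dlen
assert clip.shape[-1] == dlen                 # fixed-size batching still works
real_audio = clip[:, :clip_lengths]           # what the __main__ loop now saves
assert real_audio.shape[-1] == 27000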