import random
import sys
from math import pi

import librosa
import torch
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset
from tqdm import tqdm

from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
from data.util import load_paths_from_cache, find_files_of_type, is_audio_file
from utils.util import opt_get


def _integration_fn_fully_enabled(n):
    """Return a length-`n` mask that is all ones (the augmentation is applied everywhere)."""
    return torch.ones((n,))


def _integration_fn_spiky(n):
    """Return a length-`n` mask where up to 5 randomly placed blocks are 1 and the rest is 0.

    Each block is between n//8 and n//2 samples long and is truncated at the end of the mask.
    Blocks may overlap, and a block that starts at position `n` is empty.
    """
    fn = torch.zeros((n,))
    spikes = random.randint(1, 5)
    for _ in range(spikes):
        sz = random.randint(n // 8, n // 2)
        pos = random.randint(0, n)
        extent = min(n, sz + pos)
        fn[pos:extent] = 1
    return fn


def _integration_fn_smooth(n):
    """Return a length-`n` mask with a sinusoidal ramp up, a held peak of 1, and a sinusoidal ramp down.

    The start of the peak, its duration and both ramp lengths are chosen at random. Ramps that would
    extend past either end of the mask are truncated. `random.randint(1, n-2)` raises ValueError
    when n < 3.
    """
    center = random.randint(1, n - 2)
    max_duration = n - center - 1
    duration = random.randint(max_duration // 4, max_duration)
    end = center + duration

    ramp_up_sz = random.randint(n // 16, n // 4)
    ramp_up = torch.sin(pi * torch.arange(0, ramp_up_sz) / (2 * ramp_up_sz))
    if ramp_up_sz > center:
        # Not enough room before the peak: keep only the tail of the ramp.
        ramp_up = ramp_up[(ramp_up_sz - center):]
        ramp_up_sz = center

    ramp_down_sz = random.randint(n // 16, n // 4)
    ramp_down = torch.flip(torch.sin(pi * torch.arange(0, ramp_down_sz) / (2 * ramp_down_sz)), dims=[0])
    if ramp_down_sz > (n - end):
        # Not enough room after the peak: keep only the head of the ramp.
        ramp_down = ramp_down[:(n - end)]
        ramp_down_sz = n - end

    fn = torch.zeros((n,))
    fn[(center - ramp_up_sz):center] = ramp_up
    fn[center:end] = 1
    fn[end:(end + ramp_down_sz)] = ramp_down
    return fn


def load_rir(path, sr, max_sz):
    """Load a room impulse response and prepare it for use as a conv1d kernel.

    The audio at `path` is loaded with `load_audio(path, sr)`, rectified with `abs()`, truncated to
    at most `max_sz` samples along the last dimension, scaled to unit L2 norm and flipped along
    dimension 1.

    NOTE(review): the flip is presumably there because `conv1d` computes a cross-correlation, so a
    reversed kernel yields a true convolution - confirm.
    """
    rir = load_audio(path, sr).abs()
    if rir.shape[-1] > max_sz:
        rir = rir[:, :max_sz]
    rir = (rir / torch.norm(rir, p=2)).flip([1])
    return rir


class AudioWithNoiseDataset(Dataset):
    """Wraps an UnsupervisedAudioDataset and applies noise to the output clips, then provides labels
    depending on what noise was added.

    Labels written to `out['label']`:
      0 - clip left alone (only the random volume adjustment is applied)
      1 - environmental noise mixed in
      2 - music mixed in (at half the sampled noise volume)
      3 - a second clip from the underlying dataset, either mixed over the clip or appended after it
      4 - reverb, by convolving an openair impulse response over the clip
      5 - GSM codec (handled by the code but not currently sampled)

    Options read from `opt` (besides whatever UnsupervisedAudioDataset consumes):
      env_noise_paths / env_noise_cache, music_paths / music_cache, openair_path,
      min_noise_volume (default .2), max_noise_volume (default .5),
      use_gpu_for_reverb_compute (default True), max_aug_attempts (default 10).
    """

    def __init__(self, opt):
        self.underlying_dataset = UnsupervisedAudioDataset(opt)
        self.env_noise_paths = load_paths_from_cache(opt['env_noise_paths'], opt['env_noise_cache'])
        self.music_paths = load_paths_from_cache(opt['music_paths'], opt['music_cache'])
        self.openair_paths = find_files_of_type('img', opt['openair_path'], qualifier=is_audio_file)[0]
        self.min_volume = opt_get(opt, ['min_noise_volume'], .2)
        self.max_volume = opt_get(opt, ['max_noise_volume'], .5)
        self.sampling_rate = self.underlying_dataset.sampling_rate
        self.use_gpu_for_reverb_compute = opt_get(opt, ['use_gpu_for_reverb_compute'], True)
        # How many times __getitem__ tries to augment an item before giving up. Always at least 1.
        self.max_aug_attempts = max(1, opt_get(opt, ['max_aug_attempts'], 10))
        self.openair_kernels = None

    def load_openair_kernels(self):
        """Load every openair impulse response as a CUDA tensor, once, if GPU reverb is enabled.

        Kernels are truncated to two seconds (2 * sampling_rate samples). Does nothing when GPU
        reverb is disabled or the kernels are already loaded.
        """
        if self.use_gpu_for_reverb_compute and self.openair_kernels is None:
            # Load the openair reverbs as CUDA tensors.
            self.openair_kernels = [
                load_rir(oa, self.sampling_rate, self.sampling_rate * 2).cuda()
                for oa in self.openair_paths
            ]

    def _augment(self, out, clip, dlen):
        """Apply one randomly selected augmentation to `clip`.

        `out` is the item dict from the underlying dataset (its 'clip_lengths' entry is updated when
        a second voice is appended), `clip` is the un-padded audio and `dlen` is the padded length.

        Returns a tuple `(clip, label, augpath, augvol, clipvol, padding_room)`. Any exception from
        loading or mixing the augmentation propagates to the caller, which decides whether to retry.
        """
        sr = self.sampling_rate
        padding_room = dlen - clip.shape[-1]
        augpath = ''
        augvol = 0

        # Randomly adjust clip volume, regardless of the selection, to between .5 and .8.
        clipvol = random.random() * (.8 - .5) + .5
        clip = clip * clipvol

        label = random.randint(0, 4)  # Currently excludes GSM corruption.
        if 0 < label < 4:  # 0 is basically "leave it alone"
            aug_needed = True
            augvol = random.random() * (self.max_volume - self.min_volume) + self.min_volume
            if label == 1:
                # Add environmental noise.
                augpath = random.choice(self.env_noise_paths)
                intg_fns = [_integration_fn_fully_enabled]
            elif label == 2:
                # Add music.
                augpath = random.choice(self.music_paths)
                intg_fns = [_integration_fn_fully_enabled]
                augvol *= .5  # Music is often severely in the background.
            elif label == 3:
                augpath = random.choice(self.underlying_dataset.audiopaths)
                # This can take two forms:
                if padding_room < 22000 or random.random() < .5:
                    # (1) The voices talk over one another. If there is no padding room, we always
                    # take this choice.
                    intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
                else:
                    # (2) There are simply two voices in the clip, separated from one another.
                    # This is a special case that does not use the same logic as the rest of the
                    # augmentations.
                    aug = load_audio(augpath, sr)
                    # Pad with some random silence.
                    aug = F.pad(aug, (random.randint(20, 4000), 0))
                    # Fit what we can given the padding room we have.
                    aug = aug[:, :padding_room]
                    clip = torch.cat([clip, aug], dim=1)
                    # Restore some meta-parameters.
                    padding_room = dlen - clip.shape[-1]
                    out['clip_lengths'] = clip.shape[-1]
                    aug_needed = False

            if aug_needed:
                aug = load_audio(augpath, sr)
                if aug.shape[1] > clip.shape[1]:
                    # The augmentation is longer than the clip: take a random window of it.
                    n, cn = aug.shape[1], clip.shape[1]
                    gap = n - cn
                    placement = random.randint(0, gap)
                    aug = aug[:, placement:placement + cn]
                aug = random.choice(intg_fns)(aug.shape[1]) * aug
                aug = aug * augvol
                if aug.shape[1] < clip.shape[1]:
                    # The augmentation is shorter than the clip: place it at a random offset.
                    gap = clip.shape[1] - aug.shape[1]
                    placement = random.randint(0, gap - 1)
                    aug = F.pad(aug, (placement, gap - placement))
                clip = clip + aug
        elif label == 4:
            # Perform reverb (to simulate being in a large room with an omni-mic). This is performed
            # by convolving impulse recordings from openair over the input clip.
            if self.use_gpu_for_reverb_compute:
                rir = random.choice(self.openair_kernels)
            else:
                augpath = random.choice(self.openair_paths)
                rir = load_rir(augpath, sr, clip.shape[-1])
            # Left-pad so the convolution output has the same length as the clip.
            clip = F.pad(clip, (rir.shape[1] - 1, 0))
            if self.use_gpu_for_reverb_compute:
                clip = clip.cuda()
            clip = F.conv1d(clip.unsqueeze(0), rir.unsqueeze(0)).squeeze(0).cpu()
        elif label == 5:
            # Apply the GSM codec to simulate cellular phone audio.
            clip = torchaudio.functional.apply_codec(clip, sr, format="gsm")

        return clip, label, augpath, augvol, clipvol, padding_room

    def __getitem__(self, item):
        """Fetch `item` from the underlying dataset, augment it and attach the augmentation metadata.

        The returned dict is the underlying item with 'clip' replaced by the augmented audio (clamped
        to [-1, 1] and re-padded to the original length) plus 'label', 'augpath', 'augvol' and
        'clipvol'.

        A failed augmentation is retried with a fresh random selection, up to `max_aug_attempts`
        times, because failures are often specific to one augmentation file.

        Raises:
            RuntimeError: if every attempt failed; the last underlying error is chained as the cause.
        """
        # Load on the fly to prevent GPU memory sharing across process errors.
        self.load_openair_kernels()

        last_error = None
        for _ in range(self.max_aug_attempts):
            out = self.underlying_dataset[item]
            padded_clip = out['clip']
            dlen = padded_clip.shape[-1]
            clip = padded_clip[:, :out['clip_lengths']]
            try:
                clip, label, augpath, augvol, clipvol, padding_room = self._augment(out, clip, dlen)
            except Exception as e:  # Deliberately broad: any failed augmentation is retried.
                last_error = e
                continue

            clip.clip_(-1, 1)
            # Restore padding.
            clip = F.pad(clip, (0, padding_room))
            out['clip'] = clip
            out['label'] = label
            #out['aug'] = aug
            out['augpath'] = augpath
            out['augvol'] = augvol
            out['clipvol'] = clipvol
            return out

        raise RuntimeError(
            f'Failed to augment item {item} after {self.max_aug_attempts} attempts'
        ) from last_error

    def __len__(self):
        return len(self.underlying_dataset)


if __name__ == '__main__':
    # Debug script: dumps augmented clips to .wav files in the working directory.
    params = {
        'mode': 'unsupervised_audio_with_noise',
        'path': ['y:/clips/books1'],
        'cache_path': 'E:\\audio\\remote-cache4.pth',
        'sampling_rate': 22050,
        'pad_to_samples': 400000,
        'phase': 'train',
        'n_workers': 0,
        'batch_size': 4,
        'extra_samples': 4,
        'env_noise_paths': ['E:\\audio\\UrbanSound\\filtered', 'E:\\audio\\UrbanSound\\MSSND'],
        'env_noise_cache': 'E:\\audio\\UrbanSound\\cache.pth',
        'music_paths': ['E:\\audio\\music\\FMA\\fma_large', 'E:\\audio\\music\\maestro\\maestro-v3.0.0'],
        'music_cache': 'E:\\audio\\music\\cache.pth',
        'openair_path': 'D:\\data\\audio\\openair\\resampled'
    }
    from data import create_dataset, create_dataloader, util

    ds = create_dataset(params)
    dl = create_dataloader(ds, params)
    # `i` is the batch index and `b_` the index of the clip within the batch.
    for i, b in enumerate(tqdm(dl)):
        for b_ in range(b['clip'].shape[0]):
            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav',
                            b['clip'][b_][:, :b['clip_lengths'][b_]], ds.sampling_rate)
            #torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
            print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')