diff --git a/codes/data/audio/audio_with_noise_dataset.py b/codes/data/audio/audio_with_noise_dataset.py
new file mode 100644
index 00000000..834c5d95
--- /dev/null
+++ b/codes/data/audio/audio_with_noise_dataset.py
@@ -0,0 +1,154 @@
+import random
+from math import pi
+
+import torch
+import torchaudio
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
+from data.util import load_paths_from_cache
+
+
+from utils.util import opt_get
+
+
+# Just all ones.
+def _integration_fn_fully_enabled(n):
+    return torch.ones((n,))
+
+
+# Randomly assigns up to 5 blocks of the output tensor the value '1'. The rest is zero.
+def _integration_fn_spiky(n):
+    fn = torch.zeros((n,))
+    spikes = random.randint(1,5)
+    for _ in range(spikes):
+        sz = random.randint(n//8, n//2)
+        pos = random.randint(0, n)
+        extent = min(n, sz+pos)
+        fn[pos:extent] = 1
+    return fn
+
+
+# Uses a sinusoidal ramp up and down (of random length) to a peak which is held for a random duration.
+def _integration_fn_smooth(n):
+    center = random.randint(1, n-2)
+    max_duration = n-center-1
+    duration = random.randint(max_duration//4, max_duration)
+    end = center+duration
+
+    ramp_up_sz = random.randint(n//16, n//4)
+    ramp_up = torch.sin(pi*torch.arange(0, ramp_up_sz)/(2*ramp_up_sz))
+    if ramp_up_sz > center:
+        ramp_up = ramp_up[(ramp_up_sz-center):]
+        ramp_up_sz = center
+
+    ramp_down_sz = random.randint(n//16, n//4)
+    ramp_down = torch.flip(torch.sin(pi*torch.arange(0, ramp_down_sz)/(2*ramp_down_sz)), dims=[0])
+    if ramp_down_sz > (n-end):
+        ramp_down = ramp_down[:(n-end)]
+        ramp_down_sz = n-end
+
+    fn = torch.zeros((n,))
+    fn[(center-ramp_up_sz):center] = ramp_up
+    fn[center:end] = 1
+    fn[end:(end+ramp_down_sz)] = ramp_down
+
+    return fn
+
+
+'''
+Wraps an unsupervised_audio_dataset and applies noise to the output clips, then provides a label indicating what
+noise was added: 0=no noise, 1=environmental noise, 2=music, 3=another voice clip.
+'''
+class AudioWithNoiseDataset(Dataset):
+    def __init__(self, opt):
+        self.underlying_dataset = UnsupervisedAudioDataset(opt)
+        self.env_noise_paths = load_paths_from_cache(opt['env_noise_paths'], opt['env_noise_cache'])
+        self.music_paths = load_paths_from_cache(opt['music_paths'], opt['music_cache'])
+        self.min_volume = opt_get(opt, ['min_noise_volume'], .2)
+        self.max_volume = opt_get(opt, ['max_noise_volume'], .5)
+        self.sampling_rate = self.underlying_dataset.sampling_rate
+
+    def __getitem__(self, item):
+        out = self.underlying_dataset[item]
+        clip = out['clip']
+        augpath = ''
+        augvol = 0
+        try:
+            # Randomly adjust clip volume, regardless of the selection, between .5 and .8.
+            clipvol = (random.random() * (.8-.5) + .5)
+            clip = clip * clipvol
+
+            label = random.randint(0, 3)
+            aug = torch.zeros_like(clip)
+            if label != 0:  # 0 is basically "leave it alone"
+                augvol = (random.random() * (self.max_volume-self.min_volume) + self.min_volume)
+                if label == 1:
+                    augpath = random.choice(self.env_noise_paths)
+                    intg_fns = [_integration_fn_fully_enabled]
+                elif label == 2:
+                    augpath = random.choice(self.music_paths)
+                    intg_fns = [_integration_fn_fully_enabled]
+                    augvol *= .5  # Music is often far in the background.
+                elif label == 3:
+                    augpath = random.choice(self.underlying_dataset.audiopaths)
+                    intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
+                aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
+                if aug.shape[1] > clip.shape[1]:
+                    n, cn = aug.shape[1], clip.shape[1]
+                    gap = n-cn
+                    placement = random.randint(0, gap)
+                    aug = aug[:, placement:placement+cn]
+                aug = random.choice(intg_fns)(aug.shape[1]) * aug
+                aug = aug * augvol
+                if aug.shape[1] < clip.shape[1]:
+                    gap = clip.shape[1] - aug.shape[1]
+                    placement = random.randint(0, gap-1)
+                    aug = torch.nn.functional.pad(aug, (placement, gap-placement))
+                clip = clip + aug
+            clip.clip_(-1, 1)
+        except:
+            print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
+            return self[item]
+
+        out['clip'] = clip
+        out['label'] = label
+        out['aug'] = aug
+        out['augpath'] = augpath
+        out['augvol'] = augvol
+        out['clipvol'] = clipvol
+        return out
+
+    def __len__(self):
+        return len(self.underlying_dataset)
+
+
+if __name__ == '__main__':
+    params = {
+        'mode': 'unsupervised_audio_with_noise',
+        'path': ['\\\\192.168.5.3\\rtx3080_audio_y\\split\\books2', '\\\\192.168.5.3\\rtx3080_audio\\split\\books1', '\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned-2'],
+        'cache_path': 'E:\\audio\\remote-cache2.pth',
+        'sampling_rate': 22050,
+        'pad_to_samples': 80960,
+        'phase': 'train',
+        'n_workers': 0,
+        'batch_size': 16,
+        'extra_samples': 4,
+        'env_noise_paths': ['E:\\audio\\UrbanSound\\filtered', 'E:\\audio\\UrbanSound\\MSSND'],
+        'env_noise_cache': 'E:\\audio\\UrbanSound\\cache.pth',
+        'music_paths': ['E:\\audio\\music\\FMA\\fma_large', 'E:\\audio\\music\\maestro\\maestro-v3.0.0'],
+        'music_cache': 'E:\\audio\\music\\cache.pth',
+    }
+    from data import create_dataset, create_dataloader, util
+
+    ds = create_dataset(params)
+    dl = create_dataloader(ds, params)
+    i = 0
+    for b in tqdm(dl):
+        for b_ in range(b['clip'].shape[0]):
+            #pass
+            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
+            torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
+            print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
+        i += 1
diff --git a/codes/data/audio/unsupervised_audio_dataset.py b/codes/data/audio/unsupervised_audio_dataset.py
index bd0bb181..c3465964 100644
--- a/codes/data/audio/unsupervised_audio_dataset.py
+++ b/codes/data/audio/unsupervised_audio_dataset.py
@@ -1,6 +1,7 @@
 import os
 import pathlib
 import random
+import sys
 
 import torch
 import torch.utils.data
@@ -10,7 +11,7 @@
 from audio2numpy import open_audio
 from tqdm import tqdm
 
 from data.audio.wav_aug import WavAugmentor
-from data.util import find_files_of_type, is_wav_file, is_audio_file
+from data.util import find_files_of_type, is_wav_file, is_audio_file, load_paths_from_cache
 from models.tacotron2.taco_utils import load_wav_to_torch
 from utils.util import opt_get
@@ -49,16 +50,7 @@ class UnsupervisedAudioDataset(torch.utils.data.Dataset):
     def __init__(self, opt):
         path = opt['path']
         cache_path = opt['cache_path']  # Will fail when multiple paths specified, must be specified in this case.
-        if not isinstance(path, list):
-            path = [path]
-        if os.path.exists(cache_path):
-            self.audiopaths = torch.load(cache_path)
-        else:
-            print("Building cache..")
-            self.audiopaths = []
-            for p in path:
-                self.audiopaths.extend(find_files_of_type('img', p, qualifier=is_audio_file)[0])
-            torch.save(self.audiopaths, cache_path)
+        self.audiopaths = load_paths_from_cache(path, cache_path)
 
         # Parse options
         self.sampling_rate = opt_get(opt, ['sampling_rate'], 22050)
@@ -113,7 +105,7 @@ class UnsupervisedAudioDataset(torch.utils.data.Dataset):
             audio_norm, filename = self.get_audio_for_index(index)
             alt_files, actual_samples = self.get_related_audio_for_index(index)
         except:
-            print(f"Error loading audio for file {self.audiopaths[index]}")
+            print(f"Error loading audio for file {self.audiopaths[index]} {sys.exc_info()}")
             return self[index+1]
 
         # This is required when training to make sure all clips align.
diff --git a/codes/models/tacotron2/taco_utils.py b/codes/models/tacotron2/taco_utils.py
index 566436fb..c6297bfe 100644
--- a/codes/models/tacotron2/taco_utils.py
+++ b/codes/models/tacotron2/taco_utils.py
@@ -1,8 +1,9 @@
 import os.path
 
 import numpy as np
-from scipy.io.wavfile import read
 import torch
+from scipy.io.wavfile import read
+
 
 def get_mask_from_lengths(lengths, max_len=None):
     if max_len is None:
@@ -14,8 +15,10 @@ def get_mask_from_lengths(lengths, max_len=None):
 
 def load_wav_to_torch(full_path):
     sampling_rate, data = read(full_path)
-    if data.dtype == np.int16:
-        norm_fix = 32768
+    if data.dtype == np.int32:
+        norm_fix = 2 ** 31
+    elif data.dtype == np.int16:
+        norm_fix = 2 ** 15
     elif data.dtype == np.float16 or data.dtype == np.float32:
         norm_fix = 1.
     else:
diff --git a/codes/utils/filter_urbansounds.py b/codes/utils/filter_urbansounds.py
new file mode 100644
index 00000000..7e3dea11
--- /dev/null
+++ b/codes/utils/filter_urbansounds.py
@@ -0,0 +1,33 @@
+import os
+import shutil
+
+from scipy.io.wavfile import read
+from tqdm import tqdm
+import numpy as np
+
+if __name__ == '__main__':
+    apath = 'E:\\audio\\UrbanSound\\UrbanSound8K\\audio\\'
+    csv_file = open('E:\\audio\\UrbanSound\\UrbanSound8K\\metadata\\UrbanSound8K.csv', 'r')
+    csv = csv_file.read()
+    csv_file.close()
+    for it, line in tqdm(enumerate(csv.splitlines(keepends=False))):
+        if it == 0:
+            continue
+        l = line.split(',')
+        f = os.path.join(apath, f'fold{l[5]}', l[0])
+        c = l[7]
+        try:
+            if c in ['children_playing', 'street_music', 'gun_shot']:
+                continue
+            sampling_rate, data = read(f)  # Decoded only to verify the clip is readable and has a supported dtype.
+            if data.dtype == np.int32:
+                norm_fix = 2 ** 31
+            elif data.dtype == np.int16:
+                norm_fix = 2 ** 15
+            elif data.dtype == np.float16 or data.dtype == np.float32:
+                norm_fix = 1.
+            else:
+                raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")
+            shutil.copy(f, os.path.join('E:\\audio\\UrbanSound\\filtered', l[0]))
+        except:
+            pass
\ No newline at end of file
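
Note: both dataset files above now depend on load_paths_from_cache() from data/util.py, which is referenced but not included in this diff. For orientation only, here is a minimal sketch of what that helper presumably does, inferred from the inline cache-building logic it replaces in UnsupervisedAudioDataset.__init__; the signature and behavior are assumptions, not the actual data/util.py source.

    # Hypothetical sketch, imagined as living in codes/data/util.py next to the
    # existing find_files_of_type() / is_audio_file() helpers it uses.
    import os
    import torch

    def load_paths_from_cache(paths, cache_path):
        # Accept either a single path or a list of paths.
        if not isinstance(paths, list):
            paths = [paths]
        # Reuse a previously built path list when the cache file exists.
        if os.path.exists(cache_path):
            return torch.load(cache_path)
        print("Building cache..")
        audiopaths = []
        for p in paths:
            audiopaths.extend(find_files_of_type('img', p, qualifier=is_audio_file)[0])
        torch.save(audiopaths, cache_path)
        return audiopaths

AudioWithNoiseDataset calls this once per noise source (env_noise_paths/env_noise_cache and music_paths/music_cache), so each noise source maintains its own cached path list.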