Initial implementation of audio_with_noise dataset
This commit is contained in:
parent
9a3e89ec53
commit
06ea6191a9
154
codes/data/audio/audio_with_noise_dataset.py
Normal file
154
codes/data/audio/audio_with_noise_dataset.py
Normal file
|
@ -0,0 +1,154 @@
|
||||||
|
import random
|
||||||
|
from math import pi
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torchaudio
|
||||||
|
from torch.utils.data import Dataset
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
|
||||||
|
from data.util import load_paths_from_cache
|
||||||
|
|
||||||
|
|
||||||
|
# Just all ones.
|
||||||
|
from utils.util import opt_get
|
||||||
|
|
||||||
|
|
||||||
|
def _integration_fn_fully_enabled(n):
    """Return an integration envelope that keeps the noise on for the whole clip.

    :param n: number of samples the envelope must cover.
    :return: 1-D float tensor of length `n` in which every element is 1.
    """
    envelope = torch.ones(n)
    return envelope
|
||||||
|
|
||||||
|
|
||||||
|
def _integration_fn_spiky(n):
    """Build a 0/1 integration envelope made of 1 to 5 randomly placed spikes.

    Each spike is a contiguous run of ones; everything outside the spikes is
    zero. Spikes may overlap and are truncated at the end of the envelope.

    :param n: number of samples the envelope must cover.
    :return: 1-D float tensor of length `n` containing only 0s and 1s. For
             n > 0 at least one element is guaranteed to be 1.
    """
    fn = torch.zeros((n,))
    if n == 0:
        # Nothing to place a spike on; also keeps randint(0, n - 1) valid below.
        return fn

    spikes = random.randint(1, 5)
    # Spike width is between n/8 and n/2 samples. Clamp to at least one sample
    # so that very short envelopes (n < 8) cannot get zero-width spikes.
    min_sz = max(1, n // 8)
    max_sz = max(min_sz, n // 2)
    for _ in range(spikes):
        sz = random.randint(min_sz, max_sz)
        # random.randint is inclusive on both ends, so the last valid start
        # index is n - 1. Using n here would occasionally yield an empty spike.
        pos = random.randint(0, n - 1)
        extent = min(n, sz + pos)
        fn[pos:extent] = 1
    return fn
|
||||||
|
|
||||||
|
|
||||||
|
def _integration_fn_smooth(n):
    """Build a smooth integration envelope: sine ramp up, held peak, sine ramp down.

    The envelope is zero everywhere except for a quarter-sine rise, a plateau
    of ones and a mirrored quarter-sine fall. The plateau start, the plateau
    length and both ramp lengths are drawn at random (in that order). A ramp
    that would run past either end of the envelope is truncated to fit.

    :param n: number of samples the envelope must cover. random.randint(1, n-2)
              is used for the plateau start, so n must be at least 3.
    :return: 1-D float tensor of length `n` with values in [0, 1].
    """
    # Plateau: starts somewhere inside the clip and lasts between a quarter of
    # the remaining room and all of it (always leaving the last sample free).
    hold_start = random.randint(1, n - 2)
    longest_hold = n - hold_start - 1
    hold_end = hold_start + random.randint(longest_hold // 4, longest_hold)

    # Rising quarter sine; keep only its tail if it would start before index 0.
    rise_len = random.randint(n // 16, n // 4)
    rise = torch.sin(pi * torch.arange(0, rise_len) / (2 * rise_len))
    if rise_len > hold_start:
        rise = rise[rise_len - hold_start:]
        rise_len = hold_start

    # Falling quarter sine (the rising shape reversed); keep only its head if
    # it would run past the end of the envelope.
    fall_len = random.randint(n // 16, n // 4)
    fall = torch.sin(pi * torch.arange(0, fall_len) / (2 * fall_len))
    fall = torch.flip(fall, dims=[0])
    room_after_hold = n - hold_end
    if fall_len > room_after_hold:
        fall = fall[:room_after_hold]
        fall_len = room_after_hold

    envelope = torch.zeros((n,))
    envelope[hold_start - rise_len:hold_start] = rise
    envelope[hold_start:hold_end] = 1
    envelope[hold_end:hold_end + fall_len] = fall
    return envelope
|
||||||
|
|
||||||
|
|
||||||
|
class AudioWithNoiseDataset(Dataset):
    '''
    Wraps a unsupervised_audio_dataset and applies noise to the output clips, then provides labels depending on what
    noise was added.

    Labels produced by __getitem__:
      0 - nothing was added (only the clip volume is adjusted)
      1 - a file from `env_noise_paths` was mixed in
      2 - a file from `music_paths` was mixed in (at half the drawn noise volume)
      3 - another file from the underlying dataset was mixed in

    Options read from `opt` (in addition to whatever UnsupervisedAudioDataset reads):
      env_noise_paths / env_noise_cache - passed to load_paths_from_cache for the environmental noise file list
      music_paths / music_cache         - passed to load_paths_from_cache for the music file list
      min_noise_volume                  - lower bound of the noise volume multiplier, default .2
      max_noise_volume                  - upper bound of the noise volume multiplier, default .5
    '''
    def __init__(self, opt):
        self.underlying_dataset = UnsupervisedAudioDataset(opt)
        self.env_noise_paths = load_paths_from_cache(opt['env_noise_paths'], opt['env_noise_cache'])
        self.music_paths = load_paths_from_cache(opt['music_paths'], opt['music_cache'])
        self.min_volume = opt_get(opt, ['min_noise_volume'], .2)
        self.max_volume = opt_get(opt, ['max_noise_volume'], .5)
        self.sampling_rate = self.underlying_dataset.sampling_rate

    def __getitem__(self, item):
        '''
        Fetches `item` from the underlying dataset and returns it with a (possibly) noised clip.

        The returned dict is the underlying dataset's dict with 'clip' replaced and these keys added:
        'label' (0-3, see class docstring), 'aug' (the noise signal that was added, zeros for label 0),
        'augpath' (file the noise came from, '' for label 0), 'augvol' (noise volume multiplier, 0 for label 0)
        and 'clipvol' (multiplier applied to the clean clip).

        Any exception raised while augmenting is printed and the item is re-tried with fresh random choices.
        '''
        out = self.underlying_dataset[item]
        # NOTE(review): shape[1] is treated as the time axis below, so clips are presumably (channels, samples) - confirm.
        clip = out['clip']
        augpath = ''
        augvol = 0
        try:
            # Randomly adjust clip volume, regardless of the selection, between .5 and .8.
            clipvol = (random.random() * (.8-.5) + .5)
            clip = clip * clipvol

            label = random.randint(0, 3)
            aug = torch.zeros_like(clip)
            if label != 0:  # 0 is basically "leave it alone"
                augvol = (random.random() * (self.max_volume-self.min_volume) + self.min_volume)
                if label == 1:
                    augpath = random.choice(self.env_noise_paths)
                    intg_fns = [_integration_fn_fully_enabled]
                elif label == 2:
                    augpath = random.choice(self.music_paths)
                    intg_fns = [_integration_fn_fully_enabled]
                    augvol *= .5  # Music is often severely in the background.
                elif label == 3:
                    augpath = random.choice(self.underlying_dataset.audiopaths)
                    intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
                aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
                if aug.shape[1] > clip.shape[1]:
                    # Noise is longer than the clip: take a random window of the clip's length.
                    n, cn = aug.shape[1], clip.shape[1]
                    gap = n-cn
                    placement = random.randint(0, gap)
                    aug = aug[:, placement:placement+cn]
                # Shape the noise with a randomly chosen envelope, then scale it.
                aug = random.choice(intg_fns)(aug.shape[1]) * aug
                aug = aug * augvol
                if aug.shape[1] < clip.shape[1]:
                    # Noise is shorter than the clip: zero-pad it at a random offset.
                    gap = clip.shape[1] - aug.shape[1]
                    # randint is inclusive, so (0, gap) also allows the fully right-aligned placement,
                    # matching the crop branch above.
                    placement = random.randint(0, gap)
                    aug = torch.nn.functional.pad(aug, (placement, gap-placement))
                clip = clip + aug
                clip.clip_(-1, 1)
        except Exception as e:
            # `except Exception` (not a bare except) so Ctrl-C and interpreter shutdown are not swallowed by the retry.
            print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug. ({e!r})")
            return self[item]

        out['clip'] = clip
        out['label'] = label
        out['aug'] = aug
        out['augpath'] = augpath
        out['augvol'] = augvol
        out['clipvol'] = clipvol
        return out

    def __len__(self):
        return len(self.underlying_dataset)
|
||||||
|
|
||||||
|
|
||||||
|
# Manual smoke test: builds the dataset through the project's factory functions, iterates it with a
# dataloader and writes every noised clip plus the noise that was added to .wav files in the working directory.
if __name__ == '__main__':
    # Options consumed by create_dataset / create_dataloader and by the dataset classes. The paths point at
    # the author's local machine and network share and must be changed to run this elsewhere.
    params = {
        'mode': 'unsupervised_audio_with_noise',
        'path': ['\\\\192.168.5.3\\rtx3080_audio_y\\split\\books2', '\\\\192.168.5.3\\rtx3080_audio\\split\\books1', '\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned-2'],
        'cache_path': 'E:\\audio\\remote-cache2.pth',
        'sampling_rate': 22050,
        'pad_to_samples': 80960,
        'phase': 'train',
        'n_workers': 0,
        'batch_size': 16,
        'extra_samples': 4,
        'env_noise_paths': ['E:\\audio\\UrbanSound\\filtered', 'E:\\audio\\UrbanSound\\MSSND'],
        'env_noise_cache': 'E:\\audio\\UrbanSound\\cache.pth',
        'music_paths': ['E:\\audio\\music\\FMA\\fma_large', 'E:\\audio\\music\\maestro\\maestro-v3.0.0'],
        'music_cache': 'E:\\audio\\music\\cache.pth',
    }
    # Imported here rather than at the top of the file; presumably to avoid a circular import - confirm.
    from data import create_dataset, create_dataloader, util

    ds = create_dataset(params)
    dl = create_dataloader(ds, params)
    i = 0
    for b in tqdm(dl):
        # b is a collated batch; walk over its individual elements.
        for b_ in range(b['clip'].shape[0]):
            #pass
            # File names encode the running counter, the index within the batch and the noise label.
            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
            torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
            print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
            # NOTE(review): assumed to be a per-clip counter (inner loop); the nesting level of this line
            # should be confirmed against the original file.
            i += 1
|
|
@ -1,6 +1,7 @@
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import random
|
import random
|
||||||
|
import sys
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.data
|
import torch.utils.data
|
||||||
|
@ -10,7 +11,7 @@ from audio2numpy import open_audio
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from data.audio.wav_aug import WavAugmentor
|
from data.audio.wav_aug import WavAugmentor
|
||||||
from data.util import find_files_of_type, is_wav_file, is_audio_file
|
from data.util import find_files_of_type, is_wav_file, is_audio_file, load_paths_from_cache
|
||||||
from models.tacotron2.taco_utils import load_wav_to_torch
|
from models.tacotron2.taco_utils import load_wav_to_torch
|
||||||
from utils.util import opt_get
|
from utils.util import opt_get
|
||||||
|
|
||||||
|
@ -49,16 +50,7 @@ class UnsupervisedAudioDataset(torch.utils.data.Dataset):
|
||||||
def __init__(self, opt):
|
def __init__(self, opt):
|
||||||
path = opt['path']
|
path = opt['path']
|
||||||
cache_path = opt['cache_path'] # Will fail when multiple paths specified, must be specified in this case.
|
cache_path = opt['cache_path'] # Will fail when multiple paths specified, must be specified in this case.
|
||||||
if not isinstance(path, list):
|
self.audiopaths = load_paths_from_cache(path, cache_path)
|
||||||
path = [path]
|
|
||||||
if os.path.exists(cache_path):
|
|
||||||
self.audiopaths = torch.load(cache_path)
|
|
||||||
else:
|
|
||||||
print("Building cache..")
|
|
||||||
self.audiopaths = []
|
|
||||||
for p in path:
|
|
||||||
self.audiopaths.extend(find_files_of_type('img', p, qualifier=is_audio_file)[0])
|
|
||||||
torch.save(self.audiopaths, cache_path)
|
|
||||||
|
|
||||||
# Parse options
|
# Parse options
|
||||||
self.sampling_rate = opt_get(opt, ['sampling_rate'], 22050)
|
self.sampling_rate = opt_get(opt, ['sampling_rate'], 22050)
|
||||||
|
@ -113,7 +105,7 @@ class UnsupervisedAudioDataset(torch.utils.data.Dataset):
|
||||||
audio_norm, filename = self.get_audio_for_index(index)
|
audio_norm, filename = self.get_audio_for_index(index)
|
||||||
alt_files, actual_samples = self.get_related_audio_for_index(index)
|
alt_files, actual_samples = self.get_related_audio_for_index(index)
|
||||||
except:
|
except:
|
||||||
print(f"Error loading audio for file {self.audiopaths[index]}")
|
print(f"Error loading audio for file {self.audiopaths[index]} {sys.exc_info()}")
|
||||||
return self[index+1]
|
return self[index+1]
|
||||||
|
|
||||||
# This is required when training to make sure all clips align.
|
# This is required when training to make sure all clips align.
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
import os.path
|
import os.path
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy.io.wavfile import read
|
|
||||||
import torch
|
import torch
|
||||||
|
from scipy.io.wavfile import read
|
||||||
|
|
||||||
|
|
||||||
def get_mask_from_lengths(lengths, max_len=None):
|
def get_mask_from_lengths(lengths, max_len=None):
|
||||||
if max_len is None:
|
if max_len is None:
|
||||||
|
@ -14,8 +15,10 @@ def get_mask_from_lengths(lengths, max_len=None):
|
||||||
|
|
||||||
def load_wav_to_torch(full_path):
|
def load_wav_to_torch(full_path):
|
||||||
sampling_rate, data = read(full_path)
|
sampling_rate, data = read(full_path)
|
||||||
if data.dtype == np.int16:
|
if data.dtype == np.int32:
|
||||||
norm_fix = 32768
|
norm_fix = 2 ** 31
|
||||||
|
elif data.dtype == np.int16:
|
||||||
|
norm_fix = 2 ** 15
|
||||||
elif data.dtype == np.float16 or data.dtype == np.float32:
|
elif data.dtype == np.float16 or data.dtype == np.float32:
|
||||||
norm_fix = 1.
|
norm_fix = 1.
|
||||||
else:
|
else:
|
||||||
|
|
33
codes/utils/filter_urbansounds.py
Normal file
33
codes/utils/filter_urbansounds.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from scipy.io.wavfile import read
|
||||||
|
from tqdm import tqdm
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# One-off utility: copies UrbanSound8K clips into a flat "filtered" directory, leaving out a few sound
# classes and any file that cannot be read or whose sample dtype is not one of the supported ones.
if __name__ == '__main__':
    apath = 'E:\\audio\\UrbanSound\\UrbanSound8K\\audio\\'
    out_dir = 'E:\\audio\\UrbanSound\\filtered'
    # Sound classes that are deliberately not copied.
    excluded_classes = ('children_playing', 'street_music', 'gun_shot')
    # Sample dtypes a clip must have in order to be copied.
    supported_dtypes = (np.int32, np.int16, np.float16, np.float32)

    # `with` guarantees the metadata file is closed even if reading fails.
    with open('E:\\audio\\UrbanSound\\UrbanSound8K\\metadata\\UrbanSound8K.csv', 'r') as csv_file:
        rows = csv_file.read().splitlines(keepends=False)

    # rows[0] is the CSV header.
    for line in tqdm(rows[1:]):
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
        l = line.split(',')
        # Column 0 is the file name, column 5 the fold number, column 7 the class name.
        f = os.path.join(apath, f'fold{l[5]}', l[0])
        c = l[7]
        if c in excluded_classes:
            continue
        try:
            # Reading the file doubles as a validity check; only the dtype of the data is inspected.
            sampling_rate, data = read(f)
            if data.dtype not in supported_dtypes:
                # NotImplementedError is the exception type; `NotImplemented` is a non-callable constant.
                raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")
            shutil.copy(f, os.path.join(out_dir, l[0]))
        except Exception as e:
            # Best effort: a bad file must not abort the whole run, but say which one was skipped and why.
            tqdm.write(f'Skipping {f}: {e!r}')
|
Loading…
Reference in New Issue
Block a user