Move UnsupervisedAudioDataset to use my new mp3 loader

James Betker 2021-10-28 22:33:12 -06:00
parent 2afea126d7
commit 579f0a70ee
3 changed files with 62 additions and 14 deletions

File 1 of 3: the AudioWithNoiseDataset module

@@ -1,14 +1,15 @@
 import random
+import sys
 from math import pi
 import librosa
 import torch
 import torchaudio
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
-from data.util import load_paths_from_cache
+from data.util import load_paths_from_cache, find_files_of_type, is_audio_file
 # Just all ones.
 from utils.util import opt_get
@@ -57,6 +58,14 @@ def _integration_fn_smooth(n):
     return fn
+
+def load_rir(path, sr, max_sz):
+    rir = load_audio(path, sr).abs()
+    if rir.shape[-1] > max_sz:
+        rir = rir[:, :max_sz]
+    rir = (rir / torch.norm(rir, p=2)).flip([1])
+    return rir
+
 '''
 Wraps a unsupervised_audio_dataset and applies noise to the output clips, then provides labels depending on what
 noise was added.
@@ -66,11 +75,24 @@ class AudioWithNoiseDataset(Dataset):
         self.underlying_dataset = UnsupervisedAudioDataset(opt)
         self.env_noise_paths = load_paths_from_cache(opt['env_noise_paths'], opt['env_noise_cache'])
         self.music_paths = load_paths_from_cache(opt['music_paths'], opt['music_cache'])
+        self.openair_paths = find_files_of_type('img', opt['openair_path'], qualifier=is_audio_file)[0]
         self.min_volume = opt_get(opt, ['min_noise_volume'], .2)
         self.max_volume = opt_get(opt, ['max_noise_volume'], .5)
         self.sampling_rate = self.underlying_dataset.sampling_rate
+        self.use_gpu_for_reverb_compute = opt_get(opt, ['use_gpu_for_reverb_compute'], True)
+        self.openair_kernels = None
+
+    def load_openair_kernels(self):
+        if self.use_gpu_for_reverb_compute and self.openair_kernels is None:
+            # Load the openair reverbs as CUDA tensors.
+            self.openair_kernels = []
+            for oa in self.openair_paths:
+                self.openair_kernels.append(load_rir(oa, self.underlying_dataset.sampling_rate, self.underlying_dataset.sampling_rate*2).cuda())
+
     def __getitem__(self, item):
+        # Load on the fly to prevent GPU memory sharing across process errors.
+        self.load_openair_kernels()
         out = self.underlying_dataset[item]
         clip = out['clip']
         augpath = ''
@@ -80,18 +102,22 @@ class AudioWithNoiseDataset(Dataset):
             clipvol = (random.random() * (.8-.5) + .5)
             clip = clip * clipvol
-            label = random.randint(0,3)
+            label = random.randint(0, 4) # Current excludes GSM corruption.
+            #label = 2
             aug = torch.zeros_like(clip)
-            if label != 0: # 0 is basically "leave it alone"
+            if label > 0 and label < 4: # 0 is basically "leave it alone"
                 augvol = (random.random() * (self.max_volume-self.min_volume) + self.min_volume)
                 if label == 1:
+                    # Add environmental noise.
                     augpath = random.choice(self.env_noise_paths)
                     intg_fns = [_integration_fn_fully_enabled]
                 elif label == 2:
+                    # Add music
                     augpath = random.choice(self.music_paths)
                     intg_fns = [_integration_fn_fully_enabled]
                     augvol *= .5 # Music is often severely in the background.
                 elif label == 3:
+                    # Add another voice.
                     augpath = random.choice(self.underlying_dataset.audiopaths)
                     intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
                 aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
@@ -107,11 +133,28 @@ class AudioWithNoiseDataset(Dataset):
                     placement = random.randint(0, gap-1)
                     aug = torch.nn.functional.pad(aug, (placement, gap-placement))
                 clip = clip + aug
-                clip.clip_(-1, 1)
+            elif label == 4:
+                # Perform reverb (to simulate being in a large room with an omni-mic). This is performed by convolving
+                # impulse recordings from openair over the input clip.
+                if self.use_gpu_for_reverb_compute:
+                    rir = random.choice(self.openair_kernels)
+                else:
+                    augpath = random.choice(self.openair_paths)
+                    rir = load_rir(augpath, self.underlying_dataset.sampling_rate, clip.shape[-1])
+                clip = torch.nn.functional.pad(clip, (rir.shape[1]-1, 0))
+                if self.use_gpu_for_reverb_compute:
+                    clip = clip.cuda()
+                clip = torch.nn.functional.conv1d(clip.unsqueeze(0), rir.unsqueeze(0)).squeeze(0).cpu()
+            elif label == 5:
+                # Apply the GSM codec to simulate cellular phone audio.
+                clip = torchaudio.functional.apply_codec(clip, self.underlying_dataset.sampling_rate, format="gsm")
         except:
             print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
+            print(sys.exc_info())
+            #raise # Uncomment to surface exceptions.
             return self[item]
+        clip.clip_(-1, 1)
         out['clip'] = clip
         out['label'] = label
         out['aug'] = aug
@@ -127,18 +170,19 @@ class AudioWithNoiseDataset(Dataset):
 if __name__ == '__main__':
     params = {
         'mode': 'unsupervised_audio_with_noise',
-        'path': ['\\\\192.168.5.3\\rtx3080_audio_y\\split\\books2', '\\\\192.168.5.3\\rtx3080_audio\\split\\books1', '\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned-2'],
-        'cache_path': 'E:\\audio\\remote-cache2.pth',
+        'path': ['\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned\\books0'],
+        'cache_path': 'E:\\audio\\remote-cache3.pth',
         'sampling_rate': 22050,
         'pad_to_samples': 80960,
         'phase': 'train',
         'n_workers': 0,
-        'batch_size': 16,
+        'batch_size': 4,
         'extra_samples': 4,
         'env_noise_paths': ['E:\\audio\\UrbanSound\\filtered', 'E:\\audio\\UrbanSound\\MSSND'],
         'env_noise_cache': 'E:\\audio\\UrbanSound\\cache.pth',
         'music_paths': ['E:\\audio\\music\\FMA\\fma_large', 'E:\\audio\\music\\maestro\\maestro-v3.0.0'],
         'music_cache': 'E:\\audio\\music\\cache.pth',
+        'openair_path': 'D:\\data\\audio\\openair\\resampled'
     }
     from data import create_dataset, create_dataloader, util
@@ -147,8 +191,7 @@ if __name__ == '__main__':
     i = 0
     for b in tqdm(dl):
         for b_ in range(b['clip'].shape[0]):
-            #pass
-            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
-            torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
+            #torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
+            #torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
         print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
         i += 1
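
A note on the reverb path added above (label 4): the technique is plain FIR convolution with a measured room impulse response. The sketch below reproduces it outside the dataset, assuming a mono RIR file that torchaudio can read; torchaudio.load and torchaudio.functional.resample stand in for the repo's own load_audio helper, so treat this as an illustration rather than the repo's exact code.

import torch
import torchaudio

def load_rir_sketch(path, sr, max_sz):
    # Load and resample the impulse response (the repo uses its own load_audio here).
    rir, rir_sr = torchaudio.load(path)
    rir = torchaudio.functional.resample(rir, rir_sr, sr).abs()
    if rir.shape[-1] > max_sz:
        rir = rir[:, :max_sz]
    # L2-normalize so the reverb roughly preserves signal energy, then flip in
    # time: conv1d computes cross-correlation, so a flipped kernel gives a true convolution.
    return (rir / torch.norm(rir, p=2)).flip([1])

def apply_reverb_sketch(clip, rir):
    # clip: (1, T) mono float tensor; rir: (1, L).
    # Left-padding by L-1 keeps the output the same length as the input.
    clip = torch.nn.functional.pad(clip, (rir.shape[1] - 1, 0))
    return torch.nn.functional.conv1d(clip.unsqueeze(0), rir.unsqueeze(0)).squeeze(0)

This also explains the lazy load_openair_kernels call in __getitem__: when use_gpu_for_reverb_compute is enabled, every openair kernel (capped at sampling_rate*2 samples, i.e. two seconds) is preloaded as a CUDA tensor, and building those in the parent process and sharing them with DataLoader workers is a common source of errors.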

File 2 of 3: data/audio/unsupervised_audio_dataset.py

@@ -20,6 +20,11 @@ from utils.util import opt_get
 def load_audio(audiopath, sampling_rate):
     if audiopath[-4:] == '.wav':
         audio, lsr = load_wav_to_torch(audiopath)
+    elif audiopath[-4:] == '.mp3':
+        # https://github.com/neonbjb/pyfastmp3decoder - Definitely worth it.
+        from pyfastmp3decoder.mp3decoder import load_mp3
+        audio, lsr = load_mp3(audiopath, sampling_rate)
+        audio = torch.FloatTensor(audio)
     else:
         audio, lsr = open_audio(audiopath)
         audio = torch.FloatTensor(audio)
@@ -149,8 +154,8 @@ class UnsupervisedAudioDataset(torch.utils.data.Dataset):
 if __name__ == '__main__':
     params = {
         'mode': 'unsupervised_audio',
-        'path': ['\\\\192.168.5.3\\rtx3080_audio_y\\split\\books2', '\\\\192.168.5.3\\rtx3080_audio\\split\\books1', '\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned-2'],
-        'cache_path': 'E:\\audio\\remote-cache2.pth',
+        'path': ['\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned\\books0'],
+        'cache_path': 'E:\\audio\\remote-cache3.pth',
         'sampling_rate': 22050,
         'pad_to_samples': 40960,
         'phase': 'train',
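
The new elif branch above is the point of the commit: .mp3 files now bypass open_audio and go through pyfastmp3decoder's load_mp3, which, per its call signature here, decodes and returns audio at the requested sampling rate. Below is a minimal sketch of the same dispatch under that assumption, with torchaudio.load standing in for the repo's load_wav_to_torch/open_audio helpers (which this diff doesn't show).

import torch
import torchaudio

def load_audio_sketch(audiopath, sampling_rate):
    if audiopath.endswith('.mp3'):
        # Imported lazily, so pyfastmp3decoder stays an optional dependency.
        from pyfastmp3decoder.mp3decoder import load_mp3
        audio, lsr = load_mp3(audiopath, sampling_rate)  # decodes straight to the target rate
        return torch.FloatTensor(audio)
    # Fallback for wav/other formats; torchaudio.load is a stand-in here.
    audio, lsr = torchaudio.load(audiopath)
    if lsr != sampling_rate:
        audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
    return audio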

File 3 of 3: the diffusion vocoder test script

@ -81,7 +81,7 @@ if __name__ == "__main__":
torch.backends.cudnn.benchmark = True torch.backends.cudnn.benchmark = True
want_metrics = False want_metrics = False
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-25.yml') parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-28.yml')
opt = option.parse(parser.parse_args().opt, is_train=False) opt = option.parse(parser.parse_args().opt, is_train=False)
opt = option.dict_to_nonedict(opt) opt = option.dict_to_nonedict(opt)
utils.util.loaded_options = opt utils.util.loaded_options = opt