forked from mrq/DL-Art-School
Move UnsupervisedAudioDataset to use my new mp3 loader
This commit is contained in:
parent
2afea126d7
commit
579f0a70ee
|
@ -1,14 +1,15 @@
|
||||||
import random
|
import random
|
||||||
|
import sys
|
||||||
from math import pi
|
from math import pi
|
||||||
|
|
||||||
|
import librosa
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from torch.utils.data import Dataset
|
from torch.utils.data import Dataset
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
|
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
|
||||||
from data.util import load_paths_from_cache
|
from data.util import load_paths_from_cache, find_files_of_type, is_audio_file
|
||||||
|
|
||||||
|
|
||||||
# Just all ones.
|
# Just all ones.
|
||||||
from utils.util import opt_get
|
from utils.util import opt_get
|
||||||
|
@ -57,6 +58,14 @@ def _integration_fn_smooth(n):
|
||||||
return fn
|
return fn
|
||||||
|
|
||||||
|
|
||||||
|
def load_rir(path, sr, max_sz):
    """Load a room impulse response (RIR) and prepare it as a convolution kernel.

    The recording at `path` is read with load_audio() at sampling rate `sr`,
    rectified with abs(), cropped to at most `max_sz` samples along the last
    axis, scaled to unit L2 norm and finally reversed along dim 1.

    Args:
        path: Location of the impulse response recording.
        sr: Sampling rate handed to load_audio().
        max_sz: Maximum number of samples kept from the start of the recording.

    Returns:
        The processed impulse response tensor. The slicing and the flip along
        dim 1 imply a 2-D tensor - presumably (channels, samples); confirm
        against load_audio().
    """
    rir = load_audio(path, sr).abs()
    if rir.shape[-1] > max_sz:
        rir = rir[:, :max_sz]
    # A silent (all-zero) recording has zero norm; dividing by it would fill the
    # kernel with NaNs that then leak into every clip convolved with it. Flooring
    # the norm leaves a silent kernel silent and does not affect normal input.
    energy = torch.norm(rir, p=2).clamp(min=1e-8)
    # NOTE(review): the time reversal is presumably there because torch's conv1d
    # computes a cross-correlation - confirm against the caller.
    return (rir / energy).flip([1])
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Wraps an unsupervised_audio_dataset and applies noise to the output clips, then provides labels depending on what
|
Wraps an unsupervised_audio_dataset and applies noise to the output clips, then provides labels depending on what
|
||||||
noise was added.
|
noise was added.
|
||||||
|
@ -66,11 +75,24 @@ class AudioWithNoiseDataset(Dataset):
|
||||||
self.underlying_dataset = UnsupervisedAudioDataset(opt)
|
self.underlying_dataset = UnsupervisedAudioDataset(opt)
|
||||||
self.env_noise_paths = load_paths_from_cache(opt['env_noise_paths'], opt['env_noise_cache'])
|
self.env_noise_paths = load_paths_from_cache(opt['env_noise_paths'], opt['env_noise_cache'])
|
||||||
self.music_paths = load_paths_from_cache(opt['music_paths'], opt['music_cache'])
|
self.music_paths = load_paths_from_cache(opt['music_paths'], opt['music_cache'])
|
||||||
|
self.openair_paths = find_files_of_type('img', opt['openair_path'], qualifier=is_audio_file)[0]
|
||||||
self.min_volume = opt_get(opt, ['min_noise_volume'], .2)
|
self.min_volume = opt_get(opt, ['min_noise_volume'], .2)
|
||||||
self.max_volume = opt_get(opt, ['max_noise_volume'], .5)
|
self.max_volume = opt_get(opt, ['max_noise_volume'], .5)
|
||||||
self.sampling_rate = self.underlying_dataset.sampling_rate
|
self.sampling_rate = self.underlying_dataset.sampling_rate
|
||||||
|
self.use_gpu_for_reverb_compute = opt_get(opt, ['use_gpu_for_reverb_compute'], True)
|
||||||
|
self.openair_kernels = None
|
||||||
|
|
||||||
|
def load_openair_kernels(self):
    """Lazily load every openair impulse response as a CUDA tensor.

    Does nothing when GPU reverb is disabled or when the kernels have already
    been loaded. Otherwise each path in self.openair_paths is loaded through
    load_rir() at the underlying dataset's sampling rate, capped at two seconds
    worth of samples (sampling_rate * 2), and moved to the GPU.

    The list is assembled locally and only assigned to self.openair_kernels once
    every file has loaded. If loading fails part-way the attribute stays None, so
    the next call retries instead of permanently caching a partial or empty list.
    """
    if not self.use_gpu_for_reverb_compute or self.openair_kernels is not None:
        return
    sr = self.underlying_dataset.sampling_rate
    # Load the openair reverbs as CUDA tensors.
    kernels = [load_rir(oa, sr, sr * 2).cuda() for oa in self.openair_paths]
    self.openair_kernels = kernels
|
||||||
|
|
||||||
def __getitem__(self, item):
|
def __getitem__(self, item):
|
||||||
|
# Load on the fly to prevent GPU memory sharing across process errors.
|
||||||
|
self.load_openair_kernels()
|
||||||
|
|
||||||
out = self.underlying_dataset[item]
|
out = self.underlying_dataset[item]
|
||||||
clip = out['clip']
|
clip = out['clip']
|
||||||
augpath = ''
|
augpath = ''
|
||||||
|
@ -80,18 +102,22 @@ class AudioWithNoiseDataset(Dataset):
|
||||||
clipvol = (random.random() * (.8-.5) + .5)
|
clipvol = (random.random() * (.8-.5) + .5)
|
||||||
clip = clip * clipvol
|
clip = clip * clipvol
|
||||||
|
|
||||||
label = random.randint(0,3)
|
label = random.randint(0, 4) # Current excludes GSM corruption.
|
||||||
|
#label = 2
|
||||||
aug = torch.zeros_like(clip)
|
aug = torch.zeros_like(clip)
|
||||||
if label != 0: # 0 is basically "leave it alone"
|
if label > 0 and label < 4: # 0 is basically "leave it alone"
|
||||||
augvol = (random.random() * (self.max_volume-self.min_volume) + self.min_volume)
|
augvol = (random.random() * (self.max_volume-self.min_volume) + self.min_volume)
|
||||||
if label == 1:
|
if label == 1:
|
||||||
|
# Add environmental noise.
|
||||||
augpath = random.choice(self.env_noise_paths)
|
augpath = random.choice(self.env_noise_paths)
|
||||||
intg_fns = [_integration_fn_fully_enabled]
|
intg_fns = [_integration_fn_fully_enabled]
|
||||||
elif label == 2:
|
elif label == 2:
|
||||||
|
# Add music
|
||||||
augpath = random.choice(self.music_paths)
|
augpath = random.choice(self.music_paths)
|
||||||
intg_fns = [_integration_fn_fully_enabled]
|
intg_fns = [_integration_fn_fully_enabled]
|
||||||
augvol *= .5 # Music is often severely in the background.
|
augvol *= .5 # Music is often severely in the background.
|
||||||
elif label == 3:
|
elif label == 3:
|
||||||
|
# Add another voice.
|
||||||
augpath = random.choice(self.underlying_dataset.audiopaths)
|
augpath = random.choice(self.underlying_dataset.audiopaths)
|
||||||
intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
|
intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
|
||||||
aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
|
aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
|
||||||
|
@ -107,11 +133,28 @@ class AudioWithNoiseDataset(Dataset):
|
||||||
placement = random.randint(0, gap-1)
|
placement = random.randint(0, gap-1)
|
||||||
aug = torch.nn.functional.pad(aug, (placement, gap-placement))
|
aug = torch.nn.functional.pad(aug, (placement, gap-placement))
|
||||||
clip = clip + aug
|
clip = clip + aug
|
||||||
clip.clip_(-1, 1)
|
elif label == 4:
|
||||||
|
# Perform reverb (to simulate being in a large room with an omni-mic). This is performed by convolving
|
||||||
|
# impulse recordings from openair over the input clip.
|
||||||
|
if self.use_gpu_for_reverb_compute:
|
||||||
|
rir = random.choice(self.openair_kernels)
|
||||||
|
else:
|
||||||
|
augpath = random.choice(self.openair_paths)
|
||||||
|
rir = load_rir(augpath, self.underlying_dataset.sampling_rate, clip.shape[-1])
|
||||||
|
clip = torch.nn.functional.pad(clip, (rir.shape[1]-1, 0))
|
||||||
|
if self.use_gpu_for_reverb_compute:
|
||||||
|
clip = clip.cuda()
|
||||||
|
clip = torch.nn.functional.conv1d(clip.unsqueeze(0), rir.unsqueeze(0)).squeeze(0).cpu()
|
||||||
|
elif label == 5:
|
||||||
|
# Apply the GSM codec to simulate cellular phone audio.
|
||||||
|
clip = torchaudio.functional.apply_codec(clip, self.underlying_dataset.sampling_rate, format="gsm")
|
||||||
except:
|
except:
|
||||||
print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
|
print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
|
||||||
|
print(sys.exc_info())
|
||||||
|
#raise # Uncomment to surface exceptions.
|
||||||
return self[item]
|
return self[item]
|
||||||
|
|
||||||
|
clip.clip_(-1, 1)
|
||||||
out['clip'] = clip
|
out['clip'] = clip
|
||||||
out['label'] = label
|
out['label'] = label
|
||||||
out['aug'] = aug
|
out['aug'] = aug
|
||||||
|
@ -127,18 +170,19 @@ class AudioWithNoiseDataset(Dataset):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
params = {
|
params = {
|
||||||
'mode': 'unsupervised_audio_with_noise',
|
'mode': 'unsupervised_audio_with_noise',
|
||||||
'path': ['\\\\192.168.5.3\\rtx3080_audio_y\\split\\books2', '\\\\192.168.5.3\\rtx3080_audio\\split\\books1', '\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned-2'],
|
'path': ['\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned\\books0'],
|
||||||
'cache_path': 'E:\\audio\\remote-cache2.pth',
|
'cache_path': 'E:\\audio\\remote-cache3.pth',
|
||||||
'sampling_rate': 22050,
|
'sampling_rate': 22050,
|
||||||
'pad_to_samples': 80960,
|
'pad_to_samples': 80960,
|
||||||
'phase': 'train',
|
'phase': 'train',
|
||||||
'n_workers': 0,
|
'n_workers': 0,
|
||||||
'batch_size': 16,
|
'batch_size': 4,
|
||||||
'extra_samples': 4,
|
'extra_samples': 4,
|
||||||
'env_noise_paths': ['E:\\audio\\UrbanSound\\filtered', 'E:\\audio\\UrbanSound\\MSSND'],
|
'env_noise_paths': ['E:\\audio\\UrbanSound\\filtered', 'E:\\audio\\UrbanSound\\MSSND'],
|
||||||
'env_noise_cache': 'E:\\audio\\UrbanSound\\cache.pth',
|
'env_noise_cache': 'E:\\audio\\UrbanSound\\cache.pth',
|
||||||
'music_paths': ['E:\\audio\\music\\FMA\\fma_large', 'E:\\audio\\music\\maestro\\maestro-v3.0.0'],
|
'music_paths': ['E:\\audio\\music\\FMA\\fma_large', 'E:\\audio\\music\\maestro\\maestro-v3.0.0'],
|
||||||
'music_cache': 'E:\\audio\\music\\cache.pth',
|
'music_cache': 'E:\\audio\\music\\cache.pth',
|
||||||
|
'openair_path': 'D:\\data\\audio\\openair\\resampled'
|
||||||
}
|
}
|
||||||
from data import create_dataset, create_dataloader, util
|
from data import create_dataset, create_dataloader, util
|
||||||
|
|
||||||
|
@ -147,8 +191,7 @@ if __name__ == '__main__':
|
||||||
i = 0
|
i = 0
|
||||||
for b in tqdm(dl):
|
for b in tqdm(dl):
|
||||||
for b_ in range(b['clip'].shape[0]):
|
for b_ in range(b['clip'].shape[0]):
|
||||||
#pass
|
#torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
|
||||||
torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
|
#torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
|
||||||
torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
|
|
||||||
print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
|
print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
|
||||||
i += 1
|
i += 1
|
||||||
|
|
|
@ -20,6 +20,11 @@ from utils.util import opt_get
|
||||||
def load_audio(audiopath, sampling_rate):
|
def load_audio(audiopath, sampling_rate):
|
||||||
if audiopath[-4:] == '.wav':
|
if audiopath[-4:] == '.wav':
|
||||||
audio, lsr = load_wav_to_torch(audiopath)
|
audio, lsr = load_wav_to_torch(audiopath)
|
||||||
|
elif audiopath[-4:] == '.mp3':
|
||||||
|
# https://github.com/neonbjb/pyfastmp3decoder - Definitely worth it.
|
||||||
|
from pyfastmp3decoder.mp3decoder import load_mp3
|
||||||
|
audio, lsr = load_mp3(audiopath, sampling_rate)
|
||||||
|
audio = torch.FloatTensor(audio)
|
||||||
else:
|
else:
|
||||||
audio, lsr = open_audio(audiopath)
|
audio, lsr = open_audio(audiopath)
|
||||||
audio = torch.FloatTensor(audio)
|
audio = torch.FloatTensor(audio)
|
||||||
|
@ -149,8 +154,8 @@ class UnsupervisedAudioDataset(torch.utils.data.Dataset):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
params = {
|
params = {
|
||||||
'mode': 'unsupervised_audio',
|
'mode': 'unsupervised_audio',
|
||||||
'path': ['\\\\192.168.5.3\\rtx3080_audio_y\\split\\books2', '\\\\192.168.5.3\\rtx3080_audio\\split\\books1', '\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned-2'],
|
'path': ['\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned\\books0'],
|
||||||
'cache_path': 'E:\\audio\\remote-cache2.pth',
|
'cache_path': 'E:\\audio\\remote-cache3.pth',
|
||||||
'sampling_rate': 22050,
|
'sampling_rate': 22050,
|
||||||
'pad_to_samples': 40960,
|
'pad_to_samples': 40960,
|
||||||
'phase': 'train',
|
'phase': 'train',
|
||||||
|
|
|
@ -81,7 +81,7 @@ if __name__ == "__main__":
|
||||||
torch.backends.cudnn.benchmark = True
|
torch.backends.cudnn.benchmark = True
|
||||||
want_metrics = False
|
want_metrics = False
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-25.yml')
|
parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-28.yml')
|
||||||
opt = option.parse(parser.parse_args().opt, is_train=False)
|
opt = option.parse(parser.parse_args().opt, is_train=False)
|
||||||
opt = option.dict_to_nonedict(opt)
|
opt = option.dict_to_nonedict(opt)
|
||||||
utils.util.loaded_options = opt
|
utils.util.loaded_options = opt
|
||||||
|
|
Loading…
Reference in New Issue
Block a user