Add audio augmentation to wavfile_dataset, utility to test audio similarity
This commit is contained in:
parent c0f61a2e15
commit d120e1aa99
60  codes/data/audio/wav_aug.py  Normal file
@@ -0,0 +1,60 @@
+import random
+
+import torch
+import torchaudio.sox_effects
+
+from models.tacotron2.taco_utils import load_wav_to_torch
+
+
+# Returns random double on [l,h] as a string
+def rdstr(l=0,h=1):
+    assert h > l
+    i=h-l
+    return str(random.random() * i + l)
+
+
+# Returns a randint on [s,e] as a string
+def rdi(e, s=0):
+    return str(random.randint(s,e))
+
+
+class WavAugmentor:
+    def __init__(self):
+        pass
+
+    def augment(self, wav, sample_rate):
+        speed_effect = ['speed', rdstr(.7, 1)]
+        band_effects = [
+            ['reverb', '-w'],
+            ['reverb'],
+            ['band', rdi(8000, 3000), rdi(1000, 100)],
+            ['bandpass', rdi(8000, 3000), rdi(1000, 100)],
+            ['bass', rdi(20,-20)],
+            ['treble', rdi(20,-20)],
+            ['dither'],
+            ['equalizer', rdi(3000, 100), rdi(1000, 100), rdi(10, -10)],
+            ['hilbert'],
+            ['sinc', '3k'],
+            ['sinc', '-4k'],
+            ['sinc', '3k-4k']
+        ]
+        band_effect = random.choice(band_effects)
+        volume_effects = [
+            ['loudness', rdi(10,-10)],
+            ['overdrive', rdi(20,0), rdi(20,0)],
+        ]
+        vol_effect = random.choice(volume_effects)
+        effects = [speed_effect, band_effect, vol_effect]
+        out, sr = torchaudio.sox_effects.apply_effects_tensor(wav, sample_rate, effects)
+        # Add a variable amount of noise
+        out = out + torch.rand_like(out) * random.random() * .05
+        return out
+
+
+if __name__ == '__main__':
+    sample, _ = load_wav_to_torch('obama1.wav')
+    sample = sample.permute(1,0) / 32768.0
+    aug = WavAugmentor()
+    for j in range(10):
+        out = aug.augment(sample, 24000)
+        torchaudio.save(f'out{j}.wav', out, 24000)
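Note, an illustrative sketch (not part of the commit) of the sox effect-chain semantics WavAugmentor relies on, applied to a made-up 440 Hz test tone instead of a real clip:

import math
import torch
import torchaudio.sox_effects

sr = 24000
t = torch.arange(sr, dtype=torch.float32) / sr
wav = torch.sin(2 * math.pi * 440.0 * t).unsqueeze(0)  # (channels, samples)

# Each effect is a list of sox CLI tokens; the chain applies left to right,
# mirroring the [speed, band, volume] triple that augment() assembles.
effects = [['speed', '0.85'], ['bandpass', '3000', '500'], ['loudness', '-5']]
out, out_sr = torchaudio.sox_effects.apply_effects_tensor(wav, sr, effects)
# 'speed' works by changing the sample rate, so out_sr can differ from sr;
# augment() above keeps only the tensor and discards the returned rate.
print(out.shape, out_sr)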
codes/data/audio/wavfile_dataset.py
@@ -3,10 +3,13 @@ import random
 import torch
 import torch.utils.data
+import torchaudio
 from tqdm import tqdm
 
+from data.audio.wav_aug import WavAugmentor
 from data.util import get_image_paths, is_wav_file
 from models.tacotron2.taco_utils import load_wav_to_torch
+from utils.util import opt_get
 
 
 class WavfileDataset(torch.utils.data.Dataset):
@@ -20,9 +23,15 @@ class WavfileDataset(torch.utils.data.Dataset):
             print("Building cache..")
             self.audiopaths = get_image_paths('img', opt['path'], qualifier=is_wav_file)[0]
             torch.save(self.audiopaths, cache_path)
+
+        # Parse options
+        self.sampling_rate = opt_get(opt, ['sampling_rate'], 24000)
+        self.augment = opt_get(opt, ['do_augmentation'], False)
         self.max_wav_value = 32768.0
-        self.sampling_rate = 24000
         self.window = 2 * self.sampling_rate
+        if self.augment:
+            self.augmentor = WavAugmentor()
 
     def get_audio_for_index(self, index):
         audiopath = self.audiopaths[index]
@@ -46,8 +55,12 @@ class WavfileDataset(torch.utils.data.Dataset):
                 continue
             j = random.randint(0, audio_norm.shape[0] - self.window)
             clip1 = audio_norm[j:j+self.window]
+            if self.augment:
+                clip1 = self.augmentor.augment(clip1, self.sampling_rate)
             j = random.randint(0, audio_norm.shape[0]-self.window)
             clip2 = audio_norm[j:j+self.window]
+            if self.augment:
+                clip2 = self.augmentor.augment(clip2, self.sampling_rate)
 
             return {
                 'clip1': clip1.unsqueeze(0),
@@ -66,16 +79,14 @@ if __name__ == '__main__':
         'phase': 'train',
         'n_workers': 0,
         'batch_size': 16,
+        'do_augmentation': True,
     }
     from data import create_dataset, create_dataloader, util
 
     ds, c = create_dataset(params, return_collate=True)
     dl = create_dataloader(ds, params, collate_fn=c)
     i = 0
-    m = []
-    max_text = 0
-    max_mel = 0
     for b in tqdm(dl):
-        pass
-    m=torch.stack(m)
-    print(m.mean(), m.std())
+        torchaudio.save(f'{i}_clip1.wav', b['clip1'], ds.sampling_rate)
+        torchaudio.save(f'{i}_clip2.wav', b['clip2'], ds.sampling_rate)
+        i += 1
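The dataset now reads its configuration through opt_get. A minimal sketch of that lookup pattern, assuming a dict-style opt; this mirrors (is not copied from) utils.util.opt_get:

# Walk nested keys, falling back to a default when any level is missing.
def opt_get(opt, keys, default=None):
    v = opt
    for k in keys:
        if not isinstance(v, dict) or k not in v:
            return default
        v = v[k]
    return v

opt = {'sampling_rate': 22050}
print(opt_get(opt, ['sampling_rate'], 24000))    # -> 22050
print(opt_get(opt, ['do_augmentation'], False))  # -> False (key absent, default used)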
codes/models/audio_resnet.py
@@ -216,7 +216,7 @@ class ResNet(nn.Module):
 
         return nn.Sequential(*layers)
 
-    def _forward_impl(self, x: Tensor) -> Tensor:
+    def _forward_impl(self, x: Tensor, return_pool) -> Tensor:
         # See note [TorchScript super()]
         x = self.conv1(x)
         x = self.bn1(x)
@@ -230,12 +230,14 @@ class ResNet(nn.Module):
 
         x = self.avgpool(x)
         x = torch.flatten(x, 1)
+        if return_pool:
+            return x
         x = self.fc(x)
 
         return x
 
-    def forward(self, x: Tensor) -> Tensor:
-        return self._forward_impl(x)
+    def forward(self, x: Tensor, return_pool=False) -> Tensor:
+        return self._forward_impl(x, return_pool)
 
 
 def _resnet(
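For reference, a sketch of what the new flag does at the call site; the random batch and its shape are illustrative assumptions, not taken from the repo:

import torch
from models.audio_resnet import resnet34

net = resnet34()
x = torch.randn(4, 1, 48000)      # hypothetical batch of mono clips
feats = net(x, return_pool=True)  # post-avgpool embedding, fc head skipped
logits = net(x)                   # default path, behavior unchanged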
codes/models/tacotron2/taco_utils.py
@@ -2,7 +2,6 @@ import numpy as np
 from scipy.io.wavfile import read
 import torch
 
-
 def get_mask_from_lengths(lengths, max_len=None):
     if max_len is None:
         max_len = torch.max(lengths).item()
40  codes/scripts/audio/test_audio_similarity.py  Normal file
@@ -0,0 +1,40 @@
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from data.util import is_wav_file, get_image_paths
+from models.audio_resnet import resnet34
+from models.tacotron2.taco_utils import load_wav_to_torch
+from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict
+
+if __name__ == '__main__':
+    window = 48000
+    root_path = 'D:\\tmp\\clips'
+    paths = get_image_paths('img', root_path, qualifier=is_wav_file)[0]
+    clips = []
+    for path in paths:
+        clip, sr = load_wav_to_torch(os.path.join(root_path, path))
+        if len(clip.shape) > 1:
+            clip = clip[:,0]
+        clip = clip[:window].unsqueeze(0)
+        clip = clip / 32768.0  # Normalize
+        assert sr == 24000
+        clips.append(clip)
+    clips = torch.stack(clips, dim=0)
+
+    resnet = resnet34()
+    sd = torch.load('../experiments/train_byol_audio_clips/models/66000_generator.pth')
+    sd = extract_byol_model_from_state_dict(sd)
+    resnet.load_state_dict(sd)
+    embedding = resnet(clips, return_pool=True)
+
+    for i, path in enumerate(paths):
+        print(f'Using a baseline of {path}..')
+        for j, cpath in enumerate(paths):
+            if i == j:
+                continue
+            l2 = F.mse_loss(embedding[j], embedding[i])
+            print(f'Compared to {cpath}: {l2}')
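The nested loop above compares embeddings one pair at a time. Assuming embedding is a 2-D (N, C) tensor, the same numbers can be computed in one vectorized call; an illustrative equivalent, not part of the commit:

import torch
# Squared L2 distance divided by C matches F.mse_loss's default mean reduction.
dists = torch.cdist(embedding, embedding, p=2).pow(2) / embedding.shape[1]
# dists[i, j] == F.mse_loss(embedding[j], embedding[i]) up to float error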
codes/train.py
@@ -300,7 +300,7 @@ class Trainer:
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_gpt_tts_lj.yml')
+    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_byol_audio_clips.yml')
     parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
     parser.add_argument('--local_rank', type=int, default=0)
     args = parser.parse_args()