Add audio augmentation to wavfile_dataset, utility to test audio similarity
parent c0f61a2e15
commit d120e1aa99
codes/data/audio/wav_aug.py (new file, 60 lines)
@@ -0,0 +1,60 @@
import random

import torch
import torchaudio.sox_effects

from models.tacotron2.taco_utils import load_wav_to_torch


# Returns a random double on [l,h] as a string.
def rdstr(l=0, h=1):
    assert h > l
    i = h - l
    return str(random.random() * i + l)


# Returns a random int on [s,e] as a string.
def rdi(e, s=0):
    return str(random.randint(s, e))


class WavAugmentor:
    def __init__(self):
        pass

    def augment(self, wav, sample_rate):
        # Random speed perturbation on [.7, 1].
        speed_effect = ['speed', rdstr(.7, 1)]
        # One randomly chosen spectral/band effect.
        band_effects = [
            ['reverb', '-w'],
            ['reverb'],
            ['band', rdi(8000, 3000), rdi(1000, 100)],
            ['bandpass', rdi(8000, 3000), rdi(1000, 100)],
            ['bass', rdi(20, -20)],
            ['treble', rdi(20, -20)],
            ['dither'],
            ['equalizer', rdi(3000, 100), rdi(1000, 100), rdi(10, -10)],
            ['hilbert'],
            ['sinc', '3k'],
            ['sinc', '-4k'],
            ['sinc', '3k-4k']
        ]
        band_effect = random.choice(band_effects)
        # One randomly chosen volume effect.
        volume_effects = [
            ['loudness', rdi(10, -10)],
            ['overdrive', rdi(20, 0), rdi(20, 0)],
        ]
        vol_effect = random.choice(volume_effects)
        effects = [speed_effect, band_effect, vol_effect]
        out, sr = torchaudio.sox_effects.apply_effects_tensor(wav, sample_rate, effects)
        # Add a variable amount of noise.
        out = out + torch.rand_like(out) * random.random() * .05
        return out


if __name__ == '__main__':
    sample, _ = load_wav_to_torch('obama1.wav')
    sample = sample.permute(1, 0) / 32768.0
    aug = WavAugmentor()
    for j in range(10):
        out = aug.augment(sample, 24000)
        torchaudio.save(f'out{j}.wav', out, 24000)
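Note (not part of the diff): a minimal usage sketch for the augmentor above. The input path and the use of torchaudio.load are assumptions for illustration; apply_effects_tensor expects a (channels, frames) float tensor.

# Sketch only: run WavAugmentor on a clip loaded with torchaudio.
# 'my_clip.wav' is a hypothetical path, not a file from this repo.
import torchaudio
from data.audio.wav_aug import WavAugmentor

wav, sr = torchaudio.load('my_clip.wav')   # wav: (channels, frames) float tensor
augmented = WavAugmentor().augment(wav, sr)
torchaudio.save('my_clip_aug.wav', augmented, sr)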
codes/data/audio/wavfile_dataset.py
@@ -3,10 +3,13 @@ import random
 import torch
 import torch.utils.data
+import torchaudio
 from tqdm import tqdm

+from data.audio.wav_aug import WavAugmentor
 from data.util import get_image_paths, is_wav_file
 from models.tacotron2.taco_utils import load_wav_to_torch
 from utils.util import opt_get


 class WavfileDataset(torch.utils.data.Dataset):
@@ -20,9 +23,15 @@ class WavfileDataset(torch.utils.data.Dataset):
             print("Building cache..")
             self.audiopaths = get_image_paths('img', opt['path'], qualifier=is_wav_file)[0]
             torch.save(self.audiopaths, cache_path)

+        # Parse options
+        self.sampling_rate = opt_get(opt, ['sampling_rate'], 24000)
+        self.augment = opt_get(opt, ['do_augmentation'], False)
         self.max_wav_value = 32768.0
-        self.sampling_rate = 24000

         self.window = 2 * self.sampling_rate
+        if self.augment:
+            self.augmentor = WavAugmentor()

     def get_audio_for_index(self, index):
         audiopath = self.audiopaths[index]
@@ -46,8 +55,12 @@ class WavfileDataset(torch.utils.data.Dataset):
                 continue
             j = random.randint(0, audio_norm.shape[0] - self.window)
             clip1 = audio_norm[j:j+self.window]
+            if self.augment:
+                clip1 = self.augmentor.augment(clip1, self.sampling_rate)
             j = random.randint(0, audio_norm.shape[0] - self.window)
             clip2 = audio_norm[j:j+self.window]
+            if self.augment:
+                clip2 = self.augmentor.augment(clip2, self.sampling_rate)

             return {
                 'clip1': clip1.unsqueeze(0),
@@ -66,16 +79,14 @@ if __name__ == '__main__':
         'phase': 'train',
         'n_workers': 0,
         'batch_size': 16,
+        'do_augmentation': True,
     }
     from data import create_dataset, create_dataloader, util

     ds, c = create_dataset(params, return_collate=True)
     dl = create_dataloader(ds, params, collate_fn=c)
     i = 0
-    m = []
-    max_text = 0
-    max_mel = 0
     for b in tqdm(dl):
-        pass
-    m = torch.stack(m)
-    print(m.mean(), m.std())
+        torchaudio.save(f'{i}_clip1.wav', b['clip1'], ds.sampling_rate)
+        torchaudio.save(f'{i}_clip2.wav', b['clip2'], ds.sampling_rate)
+        i += 1
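For context (not part of the diff): each item now yields two windows sampled independently from the same file, each augmented independently when do_augmentation is set, i.e. a positive pair for BYOL-style training. A minimal sketch of fetching one pair; the option values here are assumptions:

# Sketch only; directory and cache paths are hypothetical.
opt = {
    'path': ['/data/clips'],          # hypothetical audio directory
    'cache_path': '/data/clips.pth',  # hypothetical path cache
    'sampling_rate': 24000,
    'do_augmentation': True,
}
ds = WavfileDataset(opt)
item = ds[0]
print(item['clip1'].shape, item['clip2'].shape)  # each (1, 2 * sampling_rate)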
codes/models/audio_resnet.py
@@ -216,7 +216,7 @@ class ResNet(nn.Module):

         return nn.Sequential(*layers)

-    def _forward_impl(self, x: Tensor) -> Tensor:
+    def _forward_impl(self, x: Tensor, return_pool) -> Tensor:
         # See note [TorchScript super()]
         x = self.conv1(x)
         x = self.bn1(x)
@@ -230,12 +230,14 @@ class ResNet(nn.Module):

         x = self.avgpool(x)
         x = torch.flatten(x, 1)
+        if return_pool:
+            return x
         x = self.fc(x)

         return x

-    def forward(self, x: Tensor) -> Tensor:
-        return self._forward_impl(x)
+    def forward(self, x: Tensor, return_pool=False) -> Tensor:
+        return self._forward_impl(x, return_pool)


 def _resnet(
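Aside (not part of the diff): the return_pool flag lets one network serve both classification and embedding extraction. A sketch, under the assumption (based on the test script below) that the audio ResNet takes (batch, 1, samples) inputs:

# Sketch only: pooled features vs. logits via the new flag.
import torch
from models.audio_resnet import resnet34

net = resnet34().eval()
clips = torch.randn(4, 1, 48000)              # hypothetical batch of 2s clips at 24kHz
with torch.no_grad():
    features = net(clips, return_pool=True)   # output of avgpool+flatten, pre-fc
    logits = net(clips)                       # default path runs through fc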
codes/models/tacotron2/taco_utils.py
@@ -2,7 +2,6 @@ import numpy as np
 from scipy.io.wavfile import read
 import torch


 def get_mask_from_lengths(lengths, max_len=None):
     if max_len is None:
         max_len = torch.max(lengths).item()
codes/scripts/audio/test_audio_similarity.py (new file, 40 lines)
@@ -0,0 +1,40 @@
import os

import torch
import torch.nn as nn
import torch.nn.functional as F

from data.util import is_wav_file, get_image_paths
from models.audio_resnet import resnet34
from models.tacotron2.taco_utils import load_wav_to_torch
from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict

if __name__ == '__main__':
    window = 48000
    root_path = 'D:\\tmp\\clips'
    paths = get_image_paths('img', root_path, qualifier=is_wav_file)[0]
    clips = []
    for path in paths:
        clip, sr = load_wav_to_torch(os.path.join(root_path, path))
        if len(clip.shape) > 1:  # Keep only the first channel of multi-channel clips.
            clip = clip[:, 0]
        clip = clip[:window].unsqueeze(0)
        clip = clip / 32768.0  # Normalize
        assert sr == 24000
        clips.append(clip)
    clips = torch.stack(clips, dim=0)

    resnet = resnet34()
    sd = torch.load('../experiments/train_byol_audio_clips/models/66000_generator.pth')
    sd = extract_byol_model_from_state_dict(sd)
    resnet.load_state_dict(sd)
    embedding = resnet(clips, return_pool=True)

    for i, path in enumerate(paths):
        print(f'Using a baseline of {path}..')
        for j, cpath in enumerate(paths):
            if i == j:
                continue
            l2 = F.mse_loss(embedding[j], embedding[i])
            print(f'Compared to {cpath}: {l2}')
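Note (not part of the diff): F.mse_loss between embeddings is scale-sensitive; cosine similarity is a common scale-invariant alternative for this kind of comparison. A one-line sketch against the embedding tensor above:

# Sketch only: scale-invariant comparison of two embeddings.
sim = F.cosine_similarity(embedding[j].unsqueeze(0), embedding[i].unsqueeze(0)).item()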
codes/train.py
@@ -300,7 +300,7 @@ class Trainer:

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_gpt_tts_lj.yml')
+    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_byol_audio_clips.yml')
     parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
     parser.add_argument('--local_rank', type=int, default=0)
     args = parser.parse_args()