diff --git a/codes/data/audio/wav_aug.py b/codes/data/audio/wav_aug.py
index a35646d0..74723a44 100644
--- a/codes/data/audio/wav_aug.py
+++ b/codes/data/audio/wav_aug.py
@@ -23,7 +23,9 @@ class WavAugmentor:
         pass
 
     def augment(self, wav, sample_rate):
-        speed_effect = ['speed', rdstr(.7, 1)]
+        speed_effect = ['speed', rdstr(.8, 1)]
+        '''
+        Band effects are disabled until I can audit them better.
         band_effects = [
             ['reverb', '-w'],
             ['reverb'],
@@ -39,15 +41,16 @@ class WavAugmentor:
             ['sinc', '3k-4k']
         ]
         band_effect = random.choice(band_effects)
+        '''
         volume_effects = [
             ['loudness', rdi(10,-2)],
             ['overdrive', rdi(20,0), rdi(20,0)],
         ]
         vol_effect = random.choice(volume_effects)
-        effects = [speed_effect, band_effect, vol_effect]
+        effects = [speed_effect, vol_effect]
         out, sr = torchaudio.sox_effects.apply_effects_tensor(wav, sample_rate, effects)
         # Add a variable amount of noise
-        out = out + torch.rand_like(out) * random.random() * .05
+        out = out + torch.rand_like(out) * random.random() * .03
         return out
 
 
diff --git a/codes/scripts/audio/test_audio_gen.py b/codes/scripts/audio/test_audio_gen.py
index ce441c11..fd3c97f8 100644
--- a/codes/scripts/audio/test_audio_gen.py
+++ b/codes/scripts/audio/test_audio_gen.py
@@ -51,7 +51,7 @@ if __name__ == "__main__":
     torch.backends.cudnn.benchmark = True
     want_metrics = False
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_vqvae_audio_lj.yml')
+    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_gpt_tts_lj.yml')
     opt = option.parse(parser.parse_args().opt, is_train=False)
     opt = option.dict_to_nonedict(opt)
     utils.util.loaded_options = opt
diff --git a/codes/scripts/audio/test_audio_similarity.py b/codes/scripts/audio/test_audio_similarity.py
index 34d6fb3d..2ffaeb3d 100644
--- a/codes/scripts/audio/test_audio_similarity.py
+++ b/codes/scripts/audio/test_audio_similarity.py
@@ -20,12 +20,13 @@ if __name__ == '__main__':
         clip = clip[:,0]
         clip = clip[:window].unsqueeze(0)
         clip = clip / 32768.0 # Normalize
+        clip = clip + torch.rand_like(clip) * .03 # Noise (this is how the model was trained)
         assert sr == 24000
         clips.append(clip)
     clips = torch.stack(clips, dim=0)
 
     resnet = resnet34()
-    sd = torch.load('../experiments/train_byol_audio_clips/models/66000_generator.pth')
+    sd = torch.load('../experiments/train_byol_audio_clips/models/57000_generator.pth')
     sd = extract_byol_model_from_state_dict(sd)
     resnet.load_state_dict(sd)
     embedding = resnet(clips, return_pool=True)
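
For context, a minimal runnable sketch of what `augment` does after this patch. The `rdstr`/`rdi` helpers are not shown in the diff; the definitions below are assumptions based on their names and on the fact that sox effect arguments must be passed as strings.

```python
import random

import torch
import torchaudio.sox_effects

def rdstr(lo=0., hi=1.):
    # Hypothetical stand-in: random float in [lo, hi), rendered as a string.
    return str(random.uniform(lo, hi))

def rdi(hi, lo=0):
    # Hypothetical stand-in: random integer in [lo, hi], rendered as a string.
    return str(random.randint(lo, hi))

def augment(wav, sample_rate):
    # One speed perturbation (now bounded at .8x instead of .7x) plus one
    # randomly chosen volume effect; band effects are skipped entirely.
    speed_effect = ['speed', rdstr(.8, 1)]
    volume_effects = [
        ['loudness', rdi(10, -2)],
        ['overdrive', rdi(20, 0), rdi(20, 0)],
    ]
    effects = [speed_effect, random.choice(volume_effects)]
    # apply_effects_tensor returns the processed tensor and its resulting
    # sample rate (sox's speed effect can change the latter).
    out, sr = torchaudio.sox_effects.apply_effects_tensor(wav, sample_rate, effects)
    # Additive uniform noise with a random amplitude in [0, .03).
    return out + torch.rand_like(out) * random.random() * .03

clip = torch.rand(1, 24000) * 2 - 1  # one second of fake audio at 24kHz
print(augment(clip, 24000).shape)
```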
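
And a sketch of how the noise-matched clips from `test_audio_similarity.py` might be prepared and compared. The `window=48000` default and the pairwise cosine-similarity step are assumptions; the hunk neither defines `window` nor shows how `embedding` is consumed.

```python
import torch
import torch.nn.functional as F

def preprocess(clip, window=48000):
    # Mirror the script above: take one channel, crop to a fixed window,
    # scale int16 samples into [-1, 1], then add the same uniform noise
    # the BYOL model saw during training.
    if clip.dim() > 1:
        clip = clip[:, 0]
    clip = clip[:window].unsqueeze(0)
    clip = clip / 32768.0
    return clip + torch.rand_like(clip) * .03

def pairwise_similarity(embeddings):
    # Cosine similarity between every pair of clip embeddings.
    e = F.normalize(embeddings, dim=-1)
    return e @ e.t()

fake = torch.randint(-32768, 32767, (60000, 2)).float()  # stereo int16-range audio
print(preprocess(fake).shape)                    # torch.Size([1, 48000])
print(pairwise_similarity(torch.randn(3, 512)))  # 3x3 similarity matrix
```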