From b12f47b36d4b613e0d0bcbda5fc9d49b32131a21 Mon Sep 17 00:00:00 2001
From: James Betker <jbetker@gmail.com>
Date: Wed, 29 Dec 2021 13:56:30 -0700
Subject: [PATCH] Add some noise to voice_voice_clip

---
 codes/models/gpt_voice/voice_voice_clip.py | 27 +++++++++++++++++++++-
 codes/trainer/injectors/spec_augment.py    |  2 +-
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/codes/models/gpt_voice/voice_voice_clip.py b/codes/models/gpt_voice/voice_voice_clip.py
index c128be0e..1447e99f 100644
--- a/codes/models/gpt_voice/voice_voice_clip.py
+++ b/codes/models/gpt_voice/voice_voice_clip.py
@@ -1,3 +1,4 @@
+import random
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -6,6 +7,7 @@ from torch import einsum
 
 from models.gpt_voice.mini_encoder import AudioMiniEncoder
 from models.lucidrains.dalle.transformer import Transformer
+from trainer.injectors.spec_augment import spec_augment
 from trainer.networks import register_model
 from utils.util import opt_get
 
@@ -47,10 +49,33 @@ class VoiceCLIP(nn.Module):
         return_loss=True
     ):
         half_length = min(speech_mels.shape[-1], torch.min(speech_lengths).item() // self.mel_compression_ratio) // 2
-        half_length = (half_length // 4) * 4  # Must be a multiple of 4.
 
+        # Extract two speech MELs from the same clip, apply some random noise to them and also apply specaugment to them.
         first_half = speech_mels[:, :, :half_length]
+        first_half = first_half + torch.rand_like(first_half) * .00001
+        first_half = spec_augment(first_half)
         second_half = speech_mels[:, :, half_length:half_length*2]
+        second_half = second_half + torch.rand_like(second_half) * .00001
+        second_half = spec_augment(second_half)
+
+        # Introduce a random gap between the two clips: drop `gap` trailing positions of the first and `gap` leading positions of the second.
+        potential_gap = half_length // 4
+        if potential_gap > 0:
+            gap = random.randint(0, potential_gap)  # Inclusive on both ends, so gap may be 0.
+            first_half = first_half[:, :, :first_half.shape[-1] - gap]  # Explicit end index: `:-gap` would be `:0` (empty) when gap == 0.
+            second_half = second_half[:, :, gap:]
+
+        # The clips must be multiples of 4.
+        if first_half.shape[-1] % 4 != 0:
+            first_half = first_half[:, :, :first_half.shape[-1] // 4 * 4]
+        if second_half.shape[-1] % 4 != 0:
+            second_half = second_half[:, :, :second_half.shape[-1] // 4 * 4]
+
+        # Flip the clips randomly
+        if random.random() < .5:
+            t = first_half
+            first_half = second_half
+            second_half = t
 
         first_emb = self.encoder(first_half)
         first_latents = self.to_latent(first_emb)
diff --git a/codes/trainer/injectors/spec_augment.py b/codes/trainer/injectors/spec_augment.py
index 63fa7c6a..6a277216 100644
--- a/codes/trainer/injectors/spec_augment.py
+++ b/codes/trainer/injectors/spec_augment.py
@@ -11,7 +11,7 @@ from trainer.inject import Injector
 from utils.util import opt_get
 
 
-def spec_augment(mel_spectrogram, frequency_masking_para=27, time_masking_para=70, frequency_mask_num=1, time_mask_num=1):
+def spec_augment(mel_spectrogram, frequency_masking_para=27, time_masking_para=5, frequency_mask_num=1, time_mask_num=1):
 
     v = mel_spectrogram.shape[1]
     tau = mel_spectrogram.shape[2]