From b12f47b36d4b613e0d0bcbda5fc9d49b32131a21 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Wed, 29 Dec 2021 13:56:30 -0700
Subject: [PATCH] Add some noise to voice_voice_clip

---
 codes/models/gpt_voice/voice_voice_clip.py | 27 +++++++++++++++++++++-
 codes/trainer/injectors/spec_augment.py    |  2 +-
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/codes/models/gpt_voice/voice_voice_clip.py b/codes/models/gpt_voice/voice_voice_clip.py
index c128be0e..1447e99f 100644
--- a/codes/models/gpt_voice/voice_voice_clip.py
+++ b/codes/models/gpt_voice/voice_voice_clip.py
@@ -1,3 +1,4 @@
+import random
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -6,6 +7,7 @@ from torch import einsum
 
 from models.gpt_voice.mini_encoder import AudioMiniEncoder
 from models.lucidrains.dalle.transformer import Transformer
+from trainer.injectors.spec_augment import spec_augment
 from trainer.networks import register_model
 from utils.util import opt_get
 
@@ -47,10 +49,33 @@ class VoiceCLIP(nn.Module):
             return_loss=True
     ):
         half_length = min(speech_mels.shape[-1], torch.min(speech_lengths).item() // self.mel_compression_ratio) // 2
-        half_length = (half_length // 4) * 4  # Must be a multiple of 4.
 
+        # Extract two speech MELs from the same clip, apply some random noise to them and also apply specaugment to them.
         first_half = speech_mels[:, :, :half_length]
+        first_half = first_half + torch.rand_like(first_half) * .00001
+        first_half = spec_augment(first_half)
         second_half = speech_mels[:, :, half_length:half_length*2]
+        second_half = second_half + torch.rand_like(second_half) * .00001
+        second_half = spec_augment(second_half)
+
+        # Introduce a random gap between the two clips.
+        potential_gap = half_length // 4
+        if potential_gap > 0:
+            gap = random.randint(0, potential_gap)
+            first_half = first_half[:, :, :first_half.shape[-1] - gap]  # Explicit end index: `:-gap` would yield an empty clip when gap == 0.
+            second_half = second_half[:, :, gap:]
+
+        # The clips must be multiples of 4.
+        if first_half.shape[-1] % 4 != 0:
+            first_half = first_half[:, :, :first_half.shape[-1] // 4 * 4]
+        if second_half.shape[-1] % 4 != 0:
+            second_half = second_half[:, :, :second_half.shape[-1] // 4 * 4]
+
+        # Flip the clips randomly
+        if random.random() < .5:
+            t = first_half
+            first_half = second_half
+            second_half = t
 
         first_emb = self.encoder(first_half)
         first_latents = self.to_latent(first_emb)
diff --git a/codes/trainer/injectors/spec_augment.py b/codes/trainer/injectors/spec_augment.py
index 63fa7c6a..6a277216 100644
--- a/codes/trainer/injectors/spec_augment.py
+++ b/codes/trainer/injectors/spec_augment.py
@@ -11,7 +11,7 @@ from trainer.inject import Injector
 from utils.util import opt_get
 
 
-def spec_augment(mel_spectrogram, frequency_masking_para=27, time_masking_para=70, frequency_mask_num=1, time_mask_num=1):
+def spec_augment(mel_spectrogram, frequency_masking_para=27, time_masking_para=5, frequency_mask_num=1, time_mask_num=1):
     v = mel_spectrogram.shape[1]
     tau = mel_spectrogram.shape[2]
 