From d0b2f931bf9140239606e3e8b5cb6b3f554c9ec8 Mon Sep 17 00:00:00 2001
From: James Betker <jbetker@gmail.com>
Date: Tue, 7 Dec 2021 09:22:30 -0700
Subject: [PATCH] Add feature to diffusion vocoder where the spectrogram
 conditioning layers can be re-trained apart from the rest of the model

---
 .../gpt_voice/unet_diffusion_vocoder_with_ref.py  | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
index 93a7d496..8b6acb88 100644
--- a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
+++ b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
@@ -91,6 +91,7 @@ class DiffusionVocoderWithRef(nn.Module):
             conditioning_inputs_provided=True,
             conditioning_input_dim=80,
             time_embed_dim_multiplier=4,
+            only_train_dvae_connection_layers=False,
     ):
         super().__init__()
 
@@ -131,6 +132,7 @@ class DiffusionVocoderWithRef(nn.Module):
                 )
             ]
         )
+        spectrogram_blocks = []
         self._feature_size = model_channels
         input_block_chans = [model_channels]
         ch = model_channels
@@ -138,7 +140,9 @@ class DiffusionVocoderWithRef(nn.Module):
 
         for level, (mult, num_blocks) in enumerate(zip(channel_mult, num_res_blocks)):
             if ds in spectrogram_conditioning_resolutions:
-                self.input_blocks.append(DiscreteSpectrogramConditioningBlock(discrete_codes, ch))
+                spec_cond_block = DiscreteSpectrogramConditioningBlock(discrete_codes, ch)
+                self.input_blocks.append(spec_cond_block)
+                spectrogram_blocks.append(spec_cond_block)
                 ch *= 2
 
             for _ in range(num_blocks):
@@ -268,6 +272,15 @@ class DiffusionVocoderWithRef(nn.Module):
             zero_module(conv_nd(dims, model_channels, out_channels, kernel_size, padding=padding)),
         )
 
+        if only_train_dvae_connection_layers:
+            for p in self.parameters():
+                p.DO_NOT_TRAIN = True
+                p.requires_grad = False
+            for sb in spectrogram_blocks:
+                for p in sb.parameters():
+                    del p.DO_NOT_TRAIN
+                    p.requires_grad = True
+
     def convert_to_fp16(self):
         """
         Convert the torso of the model to float16.