Add an option to the diffusion vocoder to re-train only the spectrogram conditioning layers while the rest of the model is frozen

2021-12-07 09:22:30 -07:00 · 2021-12-07 09:22:30 -07:00 · d0b2f931bf
commit d0b2f931bf
parent 662920bde3
1 changed file with 14 additions and 1 deletion
--- a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
+++ b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
@@ -91,6 +91,7 @@ class DiffusionVocoderWithRef(nn.Module):
            conditioning_inputs_provided=True,
            conditioning_input_dim=80,
            time_embed_dim_multiplier=4,
+            only_train_dvae_connection_layers=False,
    ):
        super().__init__()

@@ -131,6 +132,7 @@ class DiffusionVocoderWithRef(nn.Module):
                )
            ]
        )
+        spectrogram_blocks = []
        self._feature_size = model_channels
        input_block_chans = [model_channels]
        ch = model_channels
@@ -138,7 +140,9 @@ class DiffusionVocoderWithRef(nn.Module):

        for level, (mult, num_blocks) in enumerate(zip(channel_mult, num_res_blocks)):
            if ds in spectrogram_conditioning_resolutions:
-                self.input_blocks.append(DiscreteSpectrogramConditioningBlock(discrete_codes, ch))
+                spec_cond_block = DiscreteSpectrogramConditioningBlock(discrete_codes, ch)
+                self.input_blocks.append(spec_cond_block)
+                spectrogram_blocks.append(spec_cond_block)
                ch *= 2

            for _ in range(num_blocks):
@@ -268,6 +272,15 @@ class DiffusionVocoderWithRef(nn.Module):
            zero_module(conv_nd(dims, model_channels, out_channels, kernel_size, padding=padding)),
        )

+        if only_train_dvae_connection_layers:
+            for p in self.parameters():
+                p.DO_NOT_TRAIN = True
+                p.requires_grad = False
+            for sb in spectrogram_blocks:
+                for p in sb.parameters():
+                    del p.DO_NOT_TRAIN
+                    p.requires_grad = True
+
    def convert_to_fp16(self):
        """
        Convert the torso of the model to float16.