From 3e073cff850adebf4963ae850a7dfee7471f51a6 Mon Sep 17 00:00:00 2001 From: James Betker Date: Wed, 1 Sep 2021 08:33:46 -0600 Subject: [PATCH] Set kernel_size in diffusion_vocoder --- codes/models/diffusion/unet_diffusion.py | 2 +- codes/models/diffusion/unet_diffusion_vocoder.py | 13 +++++++++++-- codes/trainer/ExtensibleTrainer.py | 2 -- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/codes/models/diffusion/unet_diffusion.py b/codes/models/diffusion/unet_diffusion.py index 2446f913..c6db013f 100644 --- a/codes/models/diffusion/unet_diffusion.py +++ b/codes/models/diffusion/unet_diffusion.py @@ -187,7 +187,6 @@ class ResBlock(TimestepBlock): up=False, down=False, kernel_size=3, - padding=1, ): super().__init__() self.channels = channels @@ -196,6 +195,7 @@ class ResBlock(TimestepBlock): self.out_channels = out_channels or channels self.use_conv = use_conv self.use_scale_shift_norm = use_scale_shift_norm + padding = 1 if kernel_size == 3 else 2 self.in_layers = nn.Sequential( normalization(channels), diff --git a/codes/models/diffusion/unet_diffusion_vocoder.py b/codes/models/diffusion/unet_diffusion_vocoder.py index 27324f03..8472b658 100644 --- a/codes/models/diffusion/unet_diffusion_vocoder.py +++ b/codes/models/diffusion/unet_diffusion_vocoder.py @@ -61,6 +61,7 @@ class DiffusionVocoder(nn.Module): use_scale_shift_norm=False, resblock_updown=False, use_new_attention_order=False, + kernel_size=5, ): super().__init__() @@ -82,6 +83,8 @@ class DiffusionVocoder(nn.Module): self.num_heads_upsample = num_heads_upsample self.dims = dims + padding = 1 if kernel_size == 3 else 2 + time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( linear(model_channels, time_embed_dim), @@ -92,7 +95,7 @@ class DiffusionVocoder(nn.Module): self.input_blocks = nn.ModuleList( [ TimestepEmbedSequential( - conv_nd(dims, in_channels, model_channels, 3, padding=1) + conv_nd(dims, in_channels, model_channels, kernel_size, padding=padding) ) ] ) @@ -127,6 +130,7 @@ class DiffusionVocoder(nn.Module): out_channels=mult * model_channels, dims=dims, use_scale_shift_norm=use_scale_shift_norm, + kernel_size=kernel_size, ) ] ch = mult * model_channels @@ -154,6 +158,7 @@ class DiffusionVocoder(nn.Module): dims=dims, use_scale_shift_norm=use_scale_shift_norm, down=True, + kernel_size=kernel_size, ) if resblock_updown else Downsample( @@ -176,6 +181,7 @@ class DiffusionVocoder(nn.Module): dropout, dims=dims, use_scale_shift_norm=use_scale_shift_norm, + kernel_size=kernel_size, ), AttentionBlock( ch, @@ -189,6 +195,7 @@ class DiffusionVocoder(nn.Module): dropout, dims=dims, use_scale_shift_norm=use_scale_shift_norm, + kernel_size=kernel_size, ), ) self._feature_size += ch @@ -205,6 +212,7 @@ class DiffusionVocoder(nn.Module): out_channels=model_channels * mult, dims=dims, use_scale_shift_norm=use_scale_shift_norm, + kernel_size=kernel_size, ) ] ch = model_channels * mult @@ -228,6 +236,7 @@ class DiffusionVocoder(nn.Module): dims=dims, use_scale_shift_norm=use_scale_shift_norm, up=True, + kernel_size=kernel_size, ) if resblock_updown else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch) @@ -239,7 +248,7 @@ class DiffusionVocoder(nn.Module): self.out = nn.Sequential( normalization(ch), nn.SiLU(), - zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)), + zero_module(conv_nd(dims, model_channels, out_channels, kernel_size, padding=padding)), ) def convert_to_fp16(self): diff --git a/codes/trainer/ExtensibleTrainer.py b/codes/trainer/ExtensibleTrainer.py index a327bdb3..ffe3b8f6 100644 --- a/codes/trainer/ExtensibleTrainer.py +++ b/codes/trainer/ExtensibleTrainer.py @@ -383,8 +383,6 @@ class ExtensibleTrainer(BaseModel): def load(self): for netdict in [self.netsG, self.netsD]: for name, net in netdict.items(): - if not self.opt['networks'][name]['trainable']: - continue load_path = self.opt['path']['pretrain_model_%s' % (name,)] if load_path is None: return