diff --git a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
index 40eb5645..a6ffecfc 100644
--- a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
+++ b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
@@ -64,15 +64,15 @@ class DiffusionVocoderWithRef(nn.Module):
     def __init__(
             self,
             model_channels,
-            num_res_blocks,
             in_channels=1,
             out_channels=2,  # mean and variance
             discrete_codes=8192,
             dropout=0,
             # 38400 -> 19200 -> 9600 -> 4800 -> 2400 -> 1200 -> 600 -> 300 -> 150 for ~2secs@22050Hz
-            channel_mult=(1, 1, 2, 2, 4, 8, 16, 32, 64),
-            spectrogram_conditioning_resolutions=(4,8,16,32),
-            attention_resolutions=(64,128,256),
+            channel_mult=(1, 1, 2, 2, 4, 6, 8, 12, 16, 24, 32, 48, 64),
+            num_res_blocks=(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2),
+            spectrogram_conditioning_resolutions=(512,),
+            attention_resolutions=(512,1024,2048,4096),
             conv_resample=True,
             dims=1,
             use_fp16=False,
@@ -95,7 +95,6 @@ class DiffusionVocoderWithRef(nn.Module):
         self.in_channels = in_channels
         self.model_channels = model_channels
         self.out_channels = out_channels
-        self.num_res_blocks = num_res_blocks
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
@@ -134,12 +133,12 @@ class DiffusionVocoderWithRef(nn.Module):
         ch = model_channels
         ds = 1
 
-        for level, mult in enumerate(channel_mult):
+        for level, (mult, num_blocks) in enumerate(zip(channel_mult, num_res_blocks)):
             if ds in spectrogram_conditioning_resolutions:
                 self.input_blocks.append(DiscreteSpectrogramConditioningBlock(discrete_codes, ch))
                 ch *= 2
 
-            for _ in range(num_res_blocks):
+            for _ in range(num_blocks):
                 layers = [
                     ResBlock(
                         ch,
@@ -216,8 +215,8 @@ class DiffusionVocoderWithRef(nn.Module):
             self._feature_size += ch
 
         self.output_blocks = nn.ModuleList([])
-        for level, mult in list(enumerate(channel_mult))[::-1]:
-            for i in range(num_res_blocks + 1):
+        for level, (mult, num_blocks) in list(enumerate(zip(channel_mult, num_res_blocks)))[::-1]:
+            for i in range(num_blocks + 1):
                 ich = input_block_chans.pop()
                 layers = [
                     ResBlock(
@@ -240,7 +239,7 @@ class DiffusionVocoderWithRef(nn.Module):
                         use_new_attention_order=use_new_attention_order,
                     )
                 )
-                if level and i == num_res_blocks:
+                if level and i == num_blocks:
                     out_ch = ch
                     layers.append(
                         ResBlock(
@@ -328,8 +327,8 @@ def register_unet_diffusion_vocoder_with_ref(opt_net, opt):
 # Test for ~4 second audio clip at 22050Hz
 if __name__ == '__main__':
     clip = torch.randn(2, 1, 81920)
-    spec = torch.randint(8192, (2, 500,))
+    spec = torch.randint(8192, (2, 160,))
     cond = torch.randn(2, 4, 80, 600)
     ts = torch.LongTensor([555, 556])
-    model = DiffusionVocoderWithRef(32, 2)
+    model = DiffusionVocoderWithRef(32, conditioning_inputs_provided=False)
     print(model(clip, ts, spec, cond, 4).shape)
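
For context on the pattern this diff introduces: `num_res_blocks` changes from a single scalar applied at every level to a per-level tuple zipped against `channel_mult`, so deeper levels can spend more residual blocks. The sketch below is illustrative only (the print-based walkthrough and simplified downsampling bookkeeping are not code from the repository); it just demonstrates how the two tuples pair up in the encoder and decoder loops:

```python
# Minimal sketch of the per-level block-count pattern from the diff above.
# Tuple values mirror the new defaults; everything else is illustrative.
channel_mult = (1, 1, 2, 2, 4, 6, 8, 12, 16, 24, 32, 48, 64)
num_res_blocks = (1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2)
spectrogram_conditioning_resolutions = (512,)
model_channels = 32

# Encoder: each level is paired with its own residual-block count via zip(),
# replacing the old single global num_res_blocks.
ds = 1  # cumulative downsampling factor; doubles after every level but the last
for level, (mult, num_blocks) in enumerate(zip(channel_mult, num_res_blocks)):
    if ds in spectrogram_conditioning_resolutions:
        print(f"  inject discrete spectrogram conditioning at ds={ds}")
    ch = model_channels * mult
    print(f"down level {level:2d}: ds={ds:5d} ch={ch:4d} res_blocks={num_blocks}")
    if level != len(channel_mult) - 1:
        ds *= 2

# Decoder: the same pairs walked in reverse; each level runs num_blocks + 1
# iterations because the extra pass consumes the encoder's skip connection.
for level, (mult, num_blocks) in list(enumerate(zip(channel_mult, num_res_blocks)))[::-1]:
    print(f"up   level {level:2d}: res_blocks={num_blocks + 1}")
```

One design note: `zip()` silently truncates to the shorter input, so if `channel_mult` and `num_res_blocks` ever drift out of sync the model would quietly lose levels; an explicit length assertion in `__init__` would catch that early.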