From ee8ceed6dae18c720a51a37e4d363154a0a32dde Mon Sep 17 00:00:00 2001
From: James Betker
Date: Wed, 20 Jul 2022 23:28:29 -0600
Subject: [PATCH] rework tfd13 further

- use a gated activation layer for both attention & convs
- add a relative learned position bias. I believe this is similar to the
  T5 position encodings, but it is simpler and learned
- get rid of prepending to the attention matrix - this doesn't really work
  that well: the model eventually learns to attend to these blocks with one
  of its heads, so why not just concatenate if that is what it is doing?
---
 codes/models/arch_util.py                      | 57 ++++++++++---
 .../audio/music/transformer_diffusion13.py     | 81 ++++++++++---------
 codes/trainer/eval/music_diffusion_fid.py      | 28 +++++--
 3 files changed, 111 insertions(+), 55 deletions(-)

diff --git a/codes/models/arch_util.py b/codes/models/arch_util.py
index e2c4c4a8..c2c8903a 100644
--- a/codes/models/arch_util.py
+++ b/codes/models/arch_util.py
@@ -341,6 +341,20 @@ class Downsample(nn.Module):
         return self.op(x)
 
 
+class cGLU(nn.Module):
+    """
+    Gated GELU for channel-first architectures.
+    """
+    def __init__(self, dim_in, dim_out=None):
+        super().__init__()
+        dim_out = dim_in if dim_out is None else dim_out
+        self.proj = nn.Conv1d(dim_in, dim_out * 2, 1)
+
+    def forward(self, x):
+        x, gate = self.proj(x).chunk(2, dim=1)
+        return x * F.gelu(gate)
+
+
 class ResBlock(nn.Module):
     """
     A residual block that can optionally change the number of channels.
@@ -439,7 +453,7 @@ class ResBlock(nn.Module):
         return self.skip_connection(x) + h
 
 
-def build_local_attention_mask(n, l, fixed_region):
+def build_local_attention_mask(n, l, fixed_region=0):
     """
     Builds an attention mask that focuses attention on local region
     Includes provisions for a "fixed_region" at the start of the sequence where full attention weights will be applied.
@@ -465,6 +479,24 @@ def test_local_attention_mask():
     print(build_local_attention_mask(9,4,1))
 
 
+class RelativeQKBias(nn.Module):
+    """
+    Very simple relative position bias scheme which should be directly added to QK matrix. This bias simply applies to
+    the distance from the given element.
+    """
+    def __init__(self, l, max_positions=4000):
+        super().__init__()
+        self.emb = nn.Parameter(torch.randn(l+1) * .01)
+        o = torch.arange(0,max_positions)
+        c = o.unsqueeze(-1).repeat(1,max_positions)
+        r = o.unsqueeze(0).repeat(max_positions,1)
+        M = ((-(r-c).abs())+l).clamp(0,l)
+        self.register_buffer('M', M, persistent=False)
+
+    def forward(self, n):
+        return self.emb[self.M[:n, :n]].view(1,n,n)
+
+
 class AttentionBlock(nn.Module):
     """
     An attention block that allows spatial positions to attend to each other.
@@ -507,16 +539,22 @@ class AttentionBlock(nn.Module):
         self.x_proj = nn.Identity() if out_channels == channels else conv_nd(1, channels, out_channels, 1)
         self.proj_out = zero_module(conv_nd(1, out_channels, out_channels, 1))
 
-    def forward(self, x, mask=None):
+    def forward(self, x, mask=None, qk_bias=None):
         if self.do_checkpoint:
-            if mask is not None:
-                return checkpoint(self._forward, x, mask)
+            if mask is None:
+                if qk_bias is None:
+                    return checkpoint(self._forward, x)
+                else:
+                    assert False, 'unsupported: qk_bias but no mask'
             else:
-                return checkpoint(self._forward, x)
+                if qk_bias is None:
+                    return checkpoint(self._forward, x, mask)
+                else:
+                    return checkpoint(self._forward, x, mask, qk_bias)
         else:
             return self._forward(x, mask)
 
-    def _forward(self, x, mask=None):
+    def _forward(self, x, mask=None, qk_bias=0):
         b, c, *spatial = x.shape
         if mask is not None:
             if len(mask.shape) == 2:
@@ -529,7 +567,7 @@ class AttentionBlock(nn.Module):
         if self.do_activation:
             x = F.silu(x, inplace=True)
         qkv = self.qkv(x)
-        h = self.attention(qkv, mask)
+        h = self.attention(qkv, mask, qk_bias)
         h = self.proj_out(h)
         xp = self.x_proj(x)
         return (xp + h).reshape(b, xp.shape[1], *spatial)
@@ -544,7 +582,7 @@ class QKVAttentionLegacy(nn.Module):
         super().__init__()
         self.n_heads = n_heads
 
-    def forward(self, qkv, mask=None):
+    def forward(self, qkv, mask=None, qk_bias=0):
         """
         Apply QKV attention.
 
@@ -559,6 +597,7 @@ class QKVAttentionLegacy(nn.Module):
         weight = torch.einsum(
             "bct,bcs->bts", q * scale, k * scale
         )  # More stable with f16 than dividing afterwards
+        weight = weight + qk_bias
         if mask is not None:
             mask = mask.repeat(self.n_heads, 1, 1)
             weight[mask.logical_not()] = -torch.inf
@@ -577,7 +616,7 @@ class QKVAttention(nn.Module):
         super().__init__()
         self.n_heads = n_heads
 
-    def forward(self, qkv, mask=None):
+    def forward(self, qkv, mask=None, qk_bias=0):
         """
         Apply QKV attention.
 
diff --git a/codes/models/audio/music/transformer_diffusion13.py b/codes/models/audio/music/transformer_diffusion13.py
index 43c29428..a78e3cac 100644
--- a/codes/models/audio/music/transformer_diffusion13.py
+++ b/codes/models/audio/music/transformer_diffusion13.py
@@ -6,7 +6,8 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from models.arch_util import ResBlock, TimestepEmbedSequential, AttentionBlock, build_local_attention_mask
+from models.arch_util import ResBlock, TimestepEmbedSequential, AttentionBlock, build_local_attention_mask, cGLU, \
+    RelativeQKBias
 from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear
 from models.diffusion.unet_diffusion import TimestepBlock
 from trainer.networks import register_model
@@ -22,73 +23,73 @@ def is_sequence(t):
 
 
 class SubBlock(nn.Module):
-    def __init__(self, inp_dim, contraction_dim, blk_dim, heads, dropout):
+    def __init__(self, inp_dim, contraction_dim, heads, dropout):
         super().__init__()
         self.dropout = nn.Dropout(p=dropout)
-        self.blk_emb_proj = nn.Conv1d(blk_dim, inp_dim, 1)
         self.attn = AttentionBlock(inp_dim, out_channels=contraction_dim, num_heads=heads)
+        self.register_buffer('mask', build_local_attention_mask(n=4000, l=64), persistent=False)
+        self.pos_bias = RelativeQKBias(l=64)
+        self.attn_glu = cGLU(contraction_dim)
         self.attnorm = nn.GroupNorm(8, contraction_dim)
         self.ff = nn.Conv1d(inp_dim+contraction_dim, contraction_dim, kernel_size=3, padding=1)
+        self.ff_glu = cGLU(contraction_dim)
         self.ffnorm = nn.GroupNorm(8, contraction_dim)
-        self.mask = build_local_attention_mask(n=4000, l=64, fixed_region=8)
-        self.mask_initialized = False
 
-    def forward(self, x, blk_emb):
-        if self.mask is not None and not self.mask_initialized:
-            self.mask = self.mask.to(x.device)
-            self.mask_initialized = True
-        blk_enc = self.blk_emb_proj(blk_emb)
-        ah = self.dropout(self.attn(torch.cat([blk_enc, x], dim=-1), mask=self.mask))
-        ah = ah[:,:,blk_enc.shape[-1]:] # Strip off the blk_emc used for attention and re-align with x.
-        ah = F.gelu(self.attnorm(ah))
+    def forward(self, x):
+        ah = self.dropout(self.attn(x, mask=self.mask, qk_bias=self.pos_bias(x.shape[-1])))
+        ah = self.attn_glu(self.attnorm(ah))
         h = torch.cat([ah, x], dim=1)
         hf = self.dropout(checkpoint(self.ff, h))
-        hf = F.gelu(self.ffnorm(hf))
+        hf = self.ff_glu(self.ffnorm(hf))
         h = torch.cat([h, hf], dim=1)
         return h
 
 
 class ConcatAttentionBlock(TimestepBlock):
-    def __init__(self, trunk_dim, contraction_dim, heads, dropout):
+    def __init__(self, trunk_dim, contraction_dim, blk_dim, heads, dropout):
         super().__init__()
+        self.contraction_dim = contraction_dim
         self.prenorm = nn.GroupNorm(8, trunk_dim)
-        self.block1 = SubBlock(trunk_dim, contraction_dim, trunk_dim, heads, dropout)
-        self.block2 = SubBlock(trunk_dim+contraction_dim*2, contraction_dim, trunk_dim, heads, dropout)
+        self.block1 = SubBlock(trunk_dim+blk_dim, contraction_dim, heads, dropout)
+        self.block2 = SubBlock(trunk_dim+blk_dim+contraction_dim*2, contraction_dim, heads, dropout)
         self.out = nn.Conv1d(contraction_dim*4, trunk_dim, kernel_size=1, bias=False)
         self.out.weight.data.zero_()
 
     def forward(self, x, blk_emb):
         h = self.prenorm(x)
-        h = self.block1(h, blk_emb)
-        h = self.block2(h, blk_emb)
-        h = self.out(h[:,x.shape[1]:])
+        h = torch.cat([h, blk_emb.unsqueeze(-1).repeat(1,1,x.shape[-1])], dim=1)
+        h = self.block1(h)
+        h = self.block2(h)
+        h = self.out(h[:,-self.contraction_dim*4:])
         return h + x
 
 
 class ConditioningEncoder(nn.Module):
     def __init__(self,
                  spec_dim,
-                 embedding_dim,
+                 hidden_dim,
+                 out_dim,
                  num_resolutions,
                  attn_blocks=6,
                  num_attn_heads=4,
                  do_checkpointing=False):
         super().__init__()
         attn = []
-        self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=5, stride=2)
-        self.resolution_embedding = nn.Embedding(num_resolutions, embedding_dim)
+        self.init = nn.Conv1d(spec_dim, hidden_dim, kernel_size=5, stride=2)
+        self.resolution_embedding = nn.Embedding(num_resolutions, hidden_dim)
         self.resolution_embedding.weight.data.mul(.1) # Reduces the relative influence of this embedding from the start.
         for a in range(attn_blocks):
-            attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=do_checkpointing))
-            attn.append(ResBlock(embedding_dim, dims=1, checkpointing_enabled=do_checkpointing))
+            attn.append(AttentionBlock(hidden_dim, num_attn_heads, do_checkpoint=do_checkpointing))
+            attn.append(ResBlock(hidden_dim, dims=1, checkpointing_enabled=do_checkpointing))
         self.attn = nn.Sequential(*attn)
-        self.dim = embedding_dim
+        self.out = nn.Linear(hidden_dim, out_dim, bias=False)
+        self.dim = hidden_dim
         self.do_checkpointing = do_checkpointing
 
     def forward(self, x, resolution):
         h = self.init(x) + self.resolution_embedding(resolution).unsqueeze(-1)
         h = self.attn(h)
-        return h[:, :, :5]
+        return self.out(h[:, :, 0])
 
 
 class TransformerDiffusion(nn.Module):
@@ -97,7 +98,6 @@ class TransformerDiffusion(nn.Module):
     """
     def __init__(
             self,
-            time_embed_dim=256,
             resolution_steps=8,
             max_window=384,
             model_channels=1024,
@@ -106,6 +106,9 @@
             in_channels=256,
            input_vec_dim=1024,
             out_channels=512, # mean and variance
+            time_embed_dim=256,
+            time_proj_dim=64,
+            cond_proj_dim=256,
             num_heads=4,
             dropout=0,
             use_fp16=False,
@@ -128,19 +131,20 @@
         self.time_embed = nn.Sequential(
             linear(time_embed_dim, time_embed_dim),
             nn.SiLU(),
-            linear(time_embed_dim, model_channels),
+            linear(time_embed_dim, time_proj_dim),
         )
         self.prior_time_embed = nn.Sequential(
             linear(time_embed_dim, time_embed_dim),
             nn.SiLU(),
-            linear(time_embed_dim, model_channels),
+            linear(time_embed_dim, time_proj_dim),
         )
-        self.resolution_embed = nn.Embedding(resolution_steps, model_channels)
-        self.conditioning_encoder = ConditioningEncoder(in_channels, model_channels, resolution_steps, num_attn_heads=model_channels//64)
-        self.unconditioned_embedding = nn.Parameter(torch.randn(1,model_channels,5))
+        self.resolution_embed = nn.Embedding(resolution_steps, time_proj_dim)
+        self.conditioning_encoder = ConditioningEncoder(in_channels, model_channels, cond_proj_dim, resolution_steps, num_attn_heads=model_channels//64)
+        self.unconditioned_embedding = nn.Parameter(torch.randn(1,cond_proj_dim))
 
         self.inp_block = conv_nd(1, in_channels+input_vec_dim, model_channels, 3, 1, 1)
-        self.layers = TimestepEmbedSequential(*[ConcatAttentionBlock(model_channels, contraction_dim, num_heads, dropout) for _ in range(num_layers)])
+        self.layers = TimestepEmbedSequential(*[ConcatAttentionBlock(model_channels, contraction_dim, time_proj_dim*3 + cond_proj_dim,
+                                                                     num_heads, dropout) for _ in range(num_layers)])
 
         self.out = nn.Sequential(
             normalization(model_channels),
@@ -246,15 +250,14 @@ class TransformerDiffusion(nn.Module):
 
         # Mask out the conditioning input and x_prior inputs for whole batch elements, implementing something similar to classifier-free guidance.
         if self.training and self.unconditioned_percentage > 0:
-            unconditioned_batches = torch.rand((x.shape[0], 1, 1),
-                                               device=x.device) < self.unconditioned_percentage
-            code_emb = torch.where(unconditioned_batches, self.unconditioned_embedding.repeat(code_emb.shape[0], 1, 1), code_emb)
+            unconditioned_batches = torch.rand((x.shape[0], 1), device=x.device) < self.unconditioned_percentage
+            code_emb = torch.where(unconditioned_batches, self.unconditioned_embedding.repeat(code_emb.shape[0], 1), code_emb)
 
         with torch.autocast(x.device.type, enabled=self.enable_fp16):
             time_emb = self.time_embed(timestep_embedding(timesteps, self.time_embed_dim))
             prior_time_emb = self.prior_time_embed(timestep_embedding(prior_timesteps, self.time_embed_dim))
             res_emb = self.resolution_embed(resolution)
-            blk_emb = torch.cat([time_emb.unsqueeze(-1), prior_time_emb.unsqueeze(-1), res_emb.unsqueeze(-1), code_emb], dim=-1)
+            blk_emb = torch.cat([time_emb, prior_time_emb, res_emb, code_emb], dim=1)
 
             h = torch.cat([x, x_prior], dim=1)
             h = self.inp_block(h)
@@ -304,5 +307,5 @@ def remove_conditioning(sd_path):
 
 
 if __name__ == '__main__':
-    remove_conditioning('X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr_pre\\models\\12500_generator.pth')
+    #remove_conditioning('X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr_pre\\models\\12500_generator.pth')
     test_tfd()
diff --git a/codes/trainer/eval/music_diffusion_fid.py b/codes/trainer/eval/music_diffusion_fid.py
index 8b8c10fa..3fda8f72 100644
--- a/codes/trainer/eval/music_diffusion_fid.py
+++ b/codes/trainer/eval/music_diffusion_fid.py
@@ -146,7 +146,7 @@ class MusicDiffusionFid(evaluator.Evaluator):
         # x = x.clamp(-s, s) / s
         # return x
         sampler = self.diffuser.ddim_sample_loop if self.ddim else self.diffuser.p_sample_loop
-        gen_mel = sampler(self.model, mel_norm.shape, model_kwargs={'truth_mel': mel_norm})
+        gen_mel = sampler(self.model, mel_norm.shape, model_kwargs={'truth_mel': mel_norm}, eta=.8)
         gen_mel_denorm = denormalize_torch_mel(gen_mel)
 
         output_shape = (1,16,audio.shape[-1]//16)
@@ -230,7 +230,6 @@ class MusicDiffusionFid(evaluator.Evaluator):
         audio = audio.unsqueeze(0)
         mel = self.spec_fn({'in': audio})['out']
         mel_norm = normalize_torch_mel(mel)
-        #mel_norm = mel_norm[:,:,:448*4] # restricts first stage to optimal training window.
         conditioning = mel_norm[:,:,:1200]
         downsampled = F.interpolate(mel_norm, scale_factor=1/16, mode='nearest')
         stage1_shape = (1, 256, downsampled.shape[-1]*4)
@@ -323,19 +322,34 @@ class MusicDiffusionFid(evaluator.Evaluator):
 
 
 if __name__ == '__main__':
+    """
+    # For multilevel SR:
     diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr.yml', 'generator',
                                        also_load_savepoint=False, strict_load=False,
-                                       load_path='X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr\\models\\22000_generator.pth'
+                                       load_path='X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr\\models\\4000_generator.pth'
                                        ).cuda()
     opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :)
                 #'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety.
                 'diffusion_steps': 128, # basis: 192
                 'conditioning_free': False, 'conditioning_free_k': 1, 'use_ddim': False, 'clip_audio': False,
-                'diffusion_schedule': 'linear', 'diffusion_type': 'chained_sr',
-                #'causal': True, 'causal_slope': 4,
-                #'partial_low': 128, 'partial_high': 192
+                'diffusion_schedule': 'cosine', 'diffusion_type': 'chained_sr',
                 }
-    env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 1, 'device': 'cuda', 'opt': {}}
+    """
+
+    # For TFD+cheater trainer
+    diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_diffusion_tfd_and_cheater.yml', 'generator',
+                                       also_load_savepoint=False, strict_load=False,
+                                       load_path='X:\\dlas\\experiments\\train_music_diffusion_tfd14_and_cheater_g2\\models\\20000_generator.pth'
+                                       ).cuda()
+    opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :)
+                #'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety.
+                'diffusion_steps': 128, # basis: 192
+                'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': True, 'clip_audio': True,
+                'diffusion_schedule': 'linear', 'diffusion_type': 'from_codes_quant',
+                }
+
+
+    env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 6, 'device': 'cuda', 'opt': {}}
     eval = MusicDiffusionFid(diffusion, opt_eval, env)
     fds = []
     for i in range(2):
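
A minimal standalone sketch (not part of the patch) of how the pieces above combine: a clamped-distance lookup like RelativeQKBias is added to the QK logits, the local mask suppresses distant positions as in QKVAttentionLegacy's `weight = weight + qk_bias` path, and the last lines show the cGLU-style gated activation. The `toy_*` helpers, window size, and shapes are illustrative assumptions, not code from this repository.

# Illustrative only -- assumes torch >= 1.12 for torch.inf.
import torch
import torch.nn.functional as F

def toy_relative_bias(n, l, emb):
    # emb holds l+1 learned scalars; index them by clamped token distance,
    # mirroring the M buffer built in RelativeQKBias.__init__.
    o = torch.arange(n)
    dist = (o.unsqueeze(0) - o.unsqueeze(-1)).abs()
    return emb[(l - dist).clamp(0, l)].view(1, n, n)   # distance 0 -> index l, >= l -> index 0

def toy_local_mask(n, l):
    # True where attention is allowed: a simple |i - j| < l window.
    o = torch.arange(n)
    return (o.unsqueeze(0) - o.unsqueeze(-1)).abs() < l

n, ch, l = 16, 8, 4
emb = torch.randn(l + 1) * .01            # analogous to RelativeQKBias.emb
q = torch.randn(1, ch, n)                 # (batch*heads, channels, tokens)
k = torch.randn(1, ch, n)

scale = ch ** -0.25
weight = torch.einsum("bct,bcs->bts", q * scale, k * scale)
weight = weight + toy_relative_bias(n, l, emb)             # qk_bias path
weight[~toy_local_mask(n, l).unsqueeze(0)] = -torch.inf    # local mask path
attn = torch.softmax(weight, dim=-1)                       # (1, n, n)

# cGLU-style gated activation: project to 2x channels, gate one half with GELU.
h = torch.randn(1, 2 * ch, n)             # stand-in for cGLU's 1x1 conv output
x_part, gate = h.chunk(2, dim=1)
out = x_part * F.gelu(gate)               # (1, ch, n)
print(attn.shape, out.shape)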