diff --git a/codes/models/audio/music/tfdpc_v3.py b/codes/models/audio/music/tfdpc_v3.py index 71059686..59531c32 100644 --- a/codes/models/audio/music/tfdpc_v3.py +++ b/codes/models/audio/music/tfdpc_v3.py @@ -286,7 +286,7 @@ def inference_tfdpc3_with_cheater(): model = TransformerDiffusionWithConditioningEncoder(in_channels=256, out_channels=512, model_channels=1024, contraction_dim=512, num_heads=8, num_layers=12, dropout=0, use_fp16=False, unconditioned_percentage=0).eval().cuda() - model.load_state_dict(torch.load('x:/dlas/experiments/train_music_cheater_gen_v3/models/59000_generator_ema.pth')) + model.load_state_dict(torch.load('x:/dlas/experiments/train_music_cheater_gen_v3/models/61000_generator_ema.pth')) from trainer.injectors.audio_injectors import TorchMelSpectrogramInjector spec_fn = TorchMelSpectrogramInjector({'n_mel_channels': 256, 'mel_fmax': 11000, 'filter_length': 16000, 'true_normalization': True, diff --git a/codes/models/audio/music/tfdpc_v4.py b/codes/models/audio/music/tfdpc_v4.py index 01dd376f..49066a66 100644 --- a/codes/models/audio/music/tfdpc_v4.py +++ b/codes/models/audio/music/tfdpc_v4.py @@ -229,8 +229,8 @@ class TransformerDiffusionWithPointConditioning(nn.Module): return out def before_step(self, step): - scaled_grad_parameters = list(itertools.chain.from_iterable([lyr.out.parameters() for lyr in self.diff.layers])) + \ - list(itertools.chain.from_iterable([lyr.prenorm.parameters() for lyr in self.diff.layers])) + scaled_grad_parameters = list(itertools.chain.from_iterable([lyr.out.parameters() for lyr in self.layers])) + \ + list(itertools.chain.from_iterable([lyr.prenorm.parameters() for lyr in self.layers])) # Scale back the gradients of the blkout and prenorm layers by a constant factor. These get two orders of magnitudes # higher gradients. Ideally we would use parameter groups, but ZeroRedundancyOptimizer makes this trickier than # directly fiddling with the gradients. @@ -251,7 +251,7 @@ def test_cheater_model(): # For music: model = TransformerDiffusionWithPointConditioning(in_channels=256, out_channels=512, model_channels=1024, - contraction_dim=384, num_heads=6, num_layers=18, dropout=0, + contraction_dim=512, num_heads=8, num_layers=15, dropout=0, unconditioned_percentage=.4) print_network(model) o = model(clip, ts, cl)