From 4d53c66602659e3d0d117fa2528e7fcbea3ed15a Mon Sep 17 00:00:00 2001
From: James Betker
Date: Thu, 14 Jul 2022 21:25:03 -0600
Subject: [PATCH] simplify span selecting logic in tfdpc

---
 codes/models/audio/music/tfdpc_v5.py       | 18 +++++++++---------
 codes/trainer/eval/music_diffusion_fid.py  | 12 ++++++------
 codes/trainer/injectors/audio_injectors.py | 12 +++++++-----
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/codes/models/audio/music/tfdpc_v5.py b/codes/models/audio/music/tfdpc_v5.py
index 9f590db6..96f25251 100644
--- a/codes/models/audio/music/tfdpc_v5.py
+++ b/codes/models/audio/music/tfdpc_v5.py
@@ -84,7 +84,6 @@ class ConditioningEncoder(nn.Module):
                  do_checkpointing=False,
                  time_proj=True):
         super().__init__()
-        attn = []
         self.init = nn.Conv1d(cond_dim, embedding_dim, kernel_size=1)
         self.time_proj = time_proj
         if time_proj:
@@ -211,10 +210,11 @@ class TransformerDiffusionWithPointConditioning(nn.Module):
             conditioning_input[:,:,tstart:tstart+tclip] = 0
 
         if cond_left is None and self.new_cond:
-            cond_left = conditioning_input[:,:,:max(cond_start, 20)]
-            left_pt = cond_start-1
-            cond_right = conditioning_input[:,:,min(N+cond_start, conditioning_input.shape[-1]-20):]
-            right_pt = min(cond_right.shape[-1]-1, cond_right.shape[-1] - (conditioning_input.shape[-1] - (N+cond_start)))
+            assert cond_start > 20 and (cond_start+N+20 <= conditioning_input.shape[-1])
+            cond_left = conditioning_input[:,:,:cond_start]
+            left_pt = -1
+            cond_right = conditioning_input[:,:,cond_start+N:]
+            right_pt = 0
         elif cond_left is None:
             assert conditioning_input.shape[-1] - cond_start - N >= 0, f'Some sort of conditioning misalignment, {conditioning_input.shape[-1], cond_start, N}'
             cond_pre = conditioning_input[:,:,:cond_start]
@@ -292,13 +292,13 @@ class TransformerDiffusionWithPointConditioning(nn.Module):
     def before_step(self, step):
         scaled_grad_parameters = list(itertools.chain.from_iterable([lyr.out.parameters() for lyr in self.layers])) + \
                                  list(itertools.chain.from_iterable([lyr.prenorm.parameters() for lyr in self.layers]))
+
         # Scale back the gradients of the blkout and prenorm layers by a constant factor. These get two orders of magnitudes
         # higher gradients. Ideally we would use parameter groups, but ZeroRedundancyOptimizer makes this trickier than
         # directly fiddling with the gradients.
-        if not self.new_cond: # Not really related, I just don't want to add a new config.
-            for p in scaled_grad_parameters:
-                if hasattr(p, 'grad') and p.grad is not None:
-                    p.grad *= .2
+        for p in scaled_grad_parameters:
+            if hasattr(p, 'grad') and p.grad is not None:
+                p.grad *= .2
 
 
 @register_model
diff --git a/codes/trainer/eval/music_diffusion_fid.py b/codes/trainer/eval/music_diffusion_fid.py
index ae3a056a..8fb8b2c0 100644
--- a/codes/trainer/eval/music_diffusion_fid.py
+++ b/codes/trainer/eval/music_diffusion_fid.py
@@ -434,20 +434,20 @@ class MusicDiffusionFid(evaluator.Evaluator):
 
 
 if __name__ == '__main__':
-    diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_cheater_gen_ar_prior.yml', 'generator',
+    diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_cheater_gen.yml', 'generator',
                                        also_load_savepoint=False,
-                                       load_path='X:\\dlas\\experiments\\train_music_diffusion_tfd12_cheater_gen_ar_prior\\models\\43500_generator_ema.pth'
+                                       load_path='X:\\dlas\\experiments\\train_music_cheater_gen_v5_cosine_40_lyr\\models\\18500_generator_ema.pth'
                                        ).cuda()
     opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :)
                 #'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety.
-                'diffusion_steps': 256, # basis: 192
-                'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': True, 'clip_audio': False,
-                'diffusion_schedule': 'linear', 'diffusion_type': 'from_ar_prior',
+                'diffusion_steps': 128, # basis: 192
+                'conditioning_free': False, 'conditioning_free_k': 1, 'use_ddim': True, 'clip_audio': False,
+                'diffusion_schedule': 'cosine', 'diffusion_type': 'cheater_gen',
                 # Slope 1: 1.03x, 2: 1.06, 4: 1.135, 8: 1.27, 16: 1.54
                 #'causal': True, 'causal_slope': 4, # DONT FORGET TO INCREMENT THE STEP!
                 #'partial_low': 128, 'partial_high': 192
                 }
-    env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 200, 'device': 'cuda', 'opt': {}}
+    env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 201, 'device': 'cuda', 'opt': {}}
     eval = MusicDiffusionFid(diffusion, opt_eval, env)
     fds = []
     for i in range(2):
diff --git a/codes/trainer/injectors/audio_injectors.py b/codes/trainer/injectors/audio_injectors.py
index ef8ecbe3..fc81421e 100644
--- a/codes/trainer/injectors/audio_injectors.py
+++ b/codes/trainer/injectors/audio_injectors.py
@@ -98,12 +98,12 @@ class RandomAudioCropInjector(Injector):
         self.max_crop_sz = opt['max_crop_size']
         self.lengths_key = opt['lengths_key']
         self.crop_start_key = opt['crop_start_key']
+        self.min_buffer = opt_get(opt, ['min_buffer'], 0)
         self.rand_buffer_ptr=9999
         self.rand_buffer_sz=5000
 
 
     def forward(self, state):
-        crop_sz = random.randint(self.min_crop_sz, self.max_crop_sz)
         inp = state[self.input]
         if torch.distributed.get_world_size() > 1:
             # All processes should agree, otherwise all processes wait to process max_crop_sz (effectively). But agreeing too often
@@ -121,12 +121,14 @@
             len = torch.min(lens)
         else:
             len = inp.shape[-1]
-        margin = len - crop_sz
+
+        margin = len - crop_sz - self.min_buffer
         if margin < 0:
-            res = {self.output: inp}
+            start = self.min_buffer
         else:
-            start = random.randint(0, margin)
-            res = {self.output: inp[:, :, start:start+crop_sz]}
+            start = random.randint(0, margin) + self.min_buffer
+
+        res = {self.output: inp[:, :, start:start+crop_sz]}
         if self.crop_start_key is not None:
             res[self.crop_start_key] = start
         return res
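
For reference, here is a minimal standalone sketch of what the simplified span-selection branch added to tfdpc_v5.py does. The function name and the MIN_MARGIN constant are illustrative only (they do not exist in the repository); the slicing and anchor points mirror the added lines above, assuming conditioning_input is a (batch, channels, time) tensor.

# Illustrative sketch only -- select_conditioning_spans and MIN_MARGIN are hypothetical
# names; the slicing follows the new `if cond_left is None and self.new_cond` branch.
import torch

MIN_MARGIN = 20  # the patch asserts at least 20 frames of context on each side of the target span

def select_conditioning_spans(conditioning_input: torch.Tensor, cond_start: int, N: int):
    """Split a (B, C, T) conditioning tensor around a target span of length N starting at cond_start.

    Returns (cond_left, left_pt, cond_right, right_pt), where left_pt/right_pt are the
    frame indices used as anchor points: the last frame of the left context and the
    first frame of the right context.
    """
    T = conditioning_input.shape[-1]
    assert cond_start > MIN_MARGIN and cond_start + N + MIN_MARGIN <= T
    cond_left = conditioning_input[:, :, :cond_start]        # everything before the span
    left_pt = -1                                             # anchor: final frame of cond_left
    cond_right = conditioning_input[:, :, cond_start + N:]   # everything after the span
    right_pt = 0                                             # anchor: first frame of cond_right
    return cond_left, left_pt, cond_right, right_pt

if __name__ == '__main__':
    cond = torch.randn(2, 256, 400)
    left, lp, right, rp = select_conditioning_spans(cond, cond_start=100, N=150)
    print(left.shape, right.shape)  # torch.Size([2, 256, 100]) torch.Size([2, 256, 150])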