diff --git a/codes/models/audio/music/tfdpc_v5.py b/codes/models/audio/music/tfdpc_v5.py index 8d327baa..9f590db6 100644 --- a/codes/models/audio/music/tfdpc_v5.py +++ b/codes/models/audio/music/tfdpc_v5.py @@ -313,7 +313,7 @@ def test_cheater_model(): # For music: model = TransformerDiffusionWithPointConditioning(in_channels=256, out_channels=512, model_channels=1024, - contraction_dim=512, num_heads=8, num_layers=40, dropout=0, + contraction_dim=512, num_heads=8, num_layers=32, dropout=0, unconditioned_percentage=.4, checkpoint_conditioning=False, regularization=True, new_cond=True) print_network(model) diff --git a/codes/scripts/audio/gen/music_joiner.py b/codes/scripts/audio/gen/music_joiner.py index 02c46583..8000e661 100644 --- a/codes/scripts/audio/gen/music_joiner.py +++ b/codes/scripts/audio/gen/music_joiner.py @@ -4,11 +4,12 @@ import torch import numpy as np import torchaudio import torchvision +from tqdm import tqdm from models.audio.music.tfdpc_v5 import TransformerDiffusionWithPointConditioning -from utils.music_utils import get_cheater_decoder +from utils.music_utils import get_cheater_decoder, get_mel2wav_v3_model from utils.util import load_audio -from trainer.injectors.audio_injectors import TorchMelSpectrogramInjector +from trainer.injectors.audio_injectors import TorchMelSpectrogramInjector, denormalize_mel, pixel_shuffle_1d from trainer.injectors.audio_injectors import MusicCheaterLatentInjector from models.diffusion.respace import SpacedDiffusion from models.diffusion.respace import space_timesteps @@ -22,66 +23,61 @@ def join_music(clip1, clip1_cut, clip2, clip2_cut, mix_time, results_dir): cheater_encoder = MusicCheaterLatentInjector({'in': 'in', 'out': 'out'}, {}).cuda() model = TransformerDiffusionWithPointConditioning(in_channels=256, out_channels=512, model_channels=1024, contraction_dim=512, num_heads=8, num_layers=12, dropout=0, - use_fp16=False, unconditioned_percentage=0).eval().cuda() + use_fp16=False, unconditioned_percentage=0, time_proj=True).eval().cuda() diffuser = SpacedDiffusion(use_timesteps=space_timesteps(4000, [256]), model_mean_type='epsilon', - model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000), - conditioning_free=True, conditioning_free_k=2) - model.load_state_dict(torch.load('x:/dlas/experiments/train_music_cheater_gen_v5/models/72000_generator_ema.pth')) - clip1 = load_audio(clip1, 22050)[:-(clip1_cut*22050)].cuda() + model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000), + conditioning_free=True, conditioning_free_k=1) + model.load_state_dict(torch.load('x:/dlas/experiments/train_music_cheater_gen_v5/models/206000_generator_ema.pth')) + clip1 = load_audio(clip1, 22050).cuda() clip1_mel = spec_fn({'in': clip1.unsqueeze(0)})['out'] clip1_cheater = cheater_encoder({'in': clip1_mel})['out'] - clip2 = load_audio(clip2, 22050)[clip2_cut*22050:].cuda() + clip1_leadin = clip1_cheater[:,:,-60:] + clip1_cheater = clip1_cheater[:,:,-260:-60] + clip2 = load_audio(clip2, 22050).cuda() clip2_mel = spec_fn({'in': clip2.unsqueeze(0)})['out'] clip2_cheater = cheater_encoder({'in': clip2_mel})['out'] + clip2_leadin = clip2_cheater[:,:,:60] + clip2_cheater = clip2_cheater[:,:,60:260] + inp = torch.cat([clip1_leadin, torch.zeros(1,256,240, device='cuda'), clip2_leadin], dim=-1) + mask = torch.ones_like(inp) + mask[:,:,60:-60] = 0 + gen_cheater = diffuser.p_sample_loop_with_guidance(model, inp, mask, # causal=True, causal_slope=4, + model_kwargs={'cond_left': clip1_cheater, 'cond_right': clip2_cheater}) - blank_cheater_sz = (22050*mix_time//4096) - sample_template = torch.cat([clip1_cheater[:,:,-25:], - torch.zeros(1,256,blank_cheater_sz, device='cuda'), - clip2_cheater[:,:,:25]], dim=-1) - mask = torch.ones_like(sample_template) - mask[:,:,25:-25] = 0 - - def custom_conditioning_endpoint_fetch(cond_enc, ts): - clip_sz = 100 - combined_cheater = torch.cat([clip1_cheater[:,:,-clip_sz:], clip2_cheater[:,:,:clip_sz]], dim=-1) - enc = cond_enc(combined_cheater, ts) - start_cond = enc[:,:,clip_sz-25] # About 5 seconds back into the clip. - end_cond = enc[:,:,clip_sz+25] - return start_cond, end_cond - - - gen_cheater = diffuser.p_sample_loop_with_guidance(model, sample_template, mask, - model_kwargs={'custom_conditioning_fetcher': custom_conditioning_endpoint_fetch}) - - cheater_decoder_diffuser = SpacedDiffusion(use_timesteps=space_timesteps(4000, [64]), model_mean_type='epsilon', - model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000), - conditioning_free=True, conditioning_free_k=1) cheater_to_mel = get_cheater_decoder().diff.cuda() - gen_mel = cheater_decoder_diffuser.ddim_sample_loop(cheater_to_mel, (1,256,gen_cheater.shape[-1]*16), progress=True, - model_kwargs={'codes': gen_cheater.permute(0,2,1)}) - torchvision.utils.save_image((gen_mel + 1)/2, f'{results_dir}/mel.png') - - from utils.music_utils import get_mel2wav_v3_model + cheater_decoder_diffuser = SpacedDiffusion(use_timesteps=space_timesteps(4000, [64]), model_mean_type='epsilon', + model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000), + conditioning_free=True, conditioning_free_k=1) m2w = get_mel2wav_v3_model().cuda() spectral_diffuser = SpacedDiffusion(use_timesteps=space_timesteps(4000, [32]), model_mean_type='epsilon', - model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000), - conditioning_free=True, conditioning_free_k=1) - from trainer.injectors.audio_injectors import denormalize_mel - gen_mel_denorm = denormalize_mel(gen_mel) - output_shape = (1,16,gen_mel_denorm.shape[-1]*256//16) - gen_wav = spectral_diffuser.ddim_sample_loop(m2w, output_shape, progress=True, model_kwargs={'codes': gen_mel_denorm}) - from trainer.injectors.audio_injectors import pixel_shuffle_1d - gen_wav = pixel_shuffle_1d(gen_wav, 16) + model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000), + conditioning_free=True, conditioning_free_k=1) + MAX_CONTEXT = 30 * 22050 // 4096 + chunks = torch.split(gen_cheater, MAX_CONTEXT, dim=-1) + gen_wavs = [] + for i, chunk_cheater in enumerate(tqdm(chunks)): + gen_mel = cheater_decoder_diffuser.ddim_sample_loop(cheater_to_mel, (1,256,chunk_cheater.shape[-1]*16), progress=True, + model_kwargs={'codes': chunk_cheater.permute(0,2,1)}) + torchvision.utils.save_image((gen_mel + 1)/2, f'{results_dir}/mel_{i}.png') + + gen_mel_denorm = denormalize_mel(gen_mel) + output_shape = (1,16,gen_mel_denorm.shape[-1]*256//16) + wav = spectral_diffuser.ddim_sample_loop(m2w, output_shape, progress=True, model_kwargs={'codes': gen_mel_denorm}) + gen_wavs.append(pixel_shuffle_1d(wav, 16)) + + gen_wav = torch.cat(gen_wavs, dim=-1) torchaudio.save(f'{results_dir}/out.wav', gen_wav.squeeze(1).cpu(), 22050) if __name__ == '__main__': results_dir = '../results/audio_joiner' - clip1 = 'Y:\\sources\\manual_podcasts_music\\2\\The Glitch Mob - Discography\\2014 - Love, Death Immortality\\2. Our Demons (feat. Aja Volkman).mp3' + #clip1 = 'Y:\\sources\\music\\manual_podcasts_music\\2\\The Glitch Mob - Discography\\2014 - Love, Death Immortality\\2. Our Demons (feat. Aja Volkman).mp3' + clip1 = 'Y:\\separated\\bt-music-5\\[2002] Gutterflower\\02 - Think About Me\\00000\\no_vocals.wav' clip1_cut = 35 # Seconds - clip2 = 'Y:\\sources\\manual_podcasts_music\\2\\The Glitch Mob - Discography\\2014 - Love, Death Immortality\\9. Carry The Sun.mp3' + #clip2 = 'Y:\\sources\\music\\manual_podcasts_music\\2\\The Glitch Mob - Discography\\2014 - Love, Death Immortality\\9. Carry The Sun.mp3' + clip2 = 'Y:\\separated\\bt-music-5\\[2002] Gutterflower\\02 - Think About Me\\00003\\no_vocals.wav' clip2_cut = 1 mix_time = 10 os.makedirs(results_dir, exist_ok=True) diff --git a/codes/scripts/audio/prep_music/test_contrastive_music_pairer.py b/codes/scripts/audio/prep_music/test_contrastive_music_pairer.py deleted file mode 100644 index e69de29b..00000000 diff --git a/codes/trainer/eval/music_diffusion_fid.py b/codes/trainer/eval/music_diffusion_fid.py index b29309be..ae3a056a 100644 --- a/codes/trainer/eval/music_diffusion_fid.py +++ b/codes/trainer/eval/music_diffusion_fid.py @@ -434,20 +434,20 @@ class MusicDiffusionFid(evaluator.Evaluator): if __name__ == '__main__': - diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_cheater_gen.yml', 'generator', + diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_cheater_gen_ar_prior.yml', 'generator', also_load_savepoint=False, - load_path='X:\\dlas\\experiments\\train_music_cheater_gen_v5_causal_retrain\\models\\80500_generator_ema.pth' + load_path='X:\\dlas\\experiments\\train_music_diffusion_tfd12_cheater_gen_ar_prior\\models\\43500_generator_ema.pth' ).cuda() opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :) #'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety. 'diffusion_steps': 256, # basis: 192 - 'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': False, 'clip_audio': False, - 'diffusion_schedule': 'linear', 'diffusion_type': 'cheater_gen', + 'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': True, 'clip_audio': False, + 'diffusion_schedule': 'linear', 'diffusion_type': 'from_ar_prior', # Slope 1: 1.03x, 2: 1.06, 4: 1.135, 8: 1.27, 16: 1.54 - 'causal': True, 'causal_slope': 4, # DONT FORGET TO INCREMENT THE STEP! + #'causal': True, 'causal_slope': 4, # DONT FORGET TO INCREMENT THE STEP! #'partial_low': 128, 'partial_high': 192 } - env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 104, 'device': 'cuda', 'opt': {}} + env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 200, 'device': 'cuda', 'opt': {}} eval = MusicDiffusionFid(diffusion, opt_eval, env) fds = [] for i in range(2):