music script

This commit is contained in:
James Betker 2022-07-13 21:26:55 -06:00
parent e23c322089
commit 711c53c1f0
4 changed files with 48 additions and 52 deletions

View File

@ -313,7 +313,7 @@ def test_cheater_model():
# For music:
model = TransformerDiffusionWithPointConditioning(in_channels=256, out_channels=512, model_channels=1024,
contraction_dim=512, num_heads=8, num_layers=40, dropout=0,
contraction_dim=512, num_heads=8, num_layers=32, dropout=0,
unconditioned_percentage=.4, checkpoint_conditioning=False,
regularization=True, new_cond=True)
print_network(model)

View File

@ -4,11 +4,12 @@ import torch
import numpy as np
import torchaudio
import torchvision
from tqdm import tqdm
from models.audio.music.tfdpc_v5 import TransformerDiffusionWithPointConditioning
from utils.music_utils import get_cheater_decoder
from utils.music_utils import get_cheater_decoder, get_mel2wav_v3_model
from utils.util import load_audio
from trainer.injectors.audio_injectors import TorchMelSpectrogramInjector
from trainer.injectors.audio_injectors import TorchMelSpectrogramInjector, denormalize_mel, pixel_shuffle_1d
from trainer.injectors.audio_injectors import MusicCheaterLatentInjector
from models.diffusion.respace import SpacedDiffusion
from models.diffusion.respace import space_timesteps
@ -22,66 +23,61 @@ def join_music(clip1, clip1_cut, clip2, clip2_cut, mix_time, results_dir):
cheater_encoder = MusicCheaterLatentInjector({'in': 'in', 'out': 'out'}, {}).cuda()
model = TransformerDiffusionWithPointConditioning(in_channels=256, out_channels=512, model_channels=1024,
contraction_dim=512, num_heads=8, num_layers=12, dropout=0,
use_fp16=False, unconditioned_percentage=0).eval().cuda()
use_fp16=False, unconditioned_percentage=0, time_proj=True).eval().cuda()
diffuser = SpacedDiffusion(use_timesteps=space_timesteps(4000, [256]), model_mean_type='epsilon',
model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000),
conditioning_free=True, conditioning_free_k=2)
model.load_state_dict(torch.load('x:/dlas/experiments/train_music_cheater_gen_v5/models/72000_generator_ema.pth'))
clip1 = load_audio(clip1, 22050)[:-(clip1_cut*22050)].cuda()
model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000),
conditioning_free=True, conditioning_free_k=1)
model.load_state_dict(torch.load('x:/dlas/experiments/train_music_cheater_gen_v5/models/206000_generator_ema.pth'))
clip1 = load_audio(clip1, 22050).cuda()
clip1_mel = spec_fn({'in': clip1.unsqueeze(0)})['out']
clip1_cheater = cheater_encoder({'in': clip1_mel})['out']
clip2 = load_audio(clip2, 22050)[clip2_cut*22050:].cuda()
clip1_leadin = clip1_cheater[:,:,-60:]
clip1_cheater = clip1_cheater[:,:,-260:-60]
clip2 = load_audio(clip2, 22050).cuda()
clip2_mel = spec_fn({'in': clip2.unsqueeze(0)})['out']
clip2_cheater = cheater_encoder({'in': clip2_mel})['out']
clip2_leadin = clip2_cheater[:,:,:60]
clip2_cheater = clip2_cheater[:,:,60:260]
inp = torch.cat([clip1_leadin, torch.zeros(1,256,240, device='cuda'), clip2_leadin], dim=-1)
mask = torch.ones_like(inp)
mask[:,:,60:-60] = 0
gen_cheater = diffuser.p_sample_loop_with_guidance(model, inp, mask, # causal=True, causal_slope=4,
model_kwargs={'cond_left': clip1_cheater, 'cond_right': clip2_cheater})
blank_cheater_sz = (22050*mix_time//4096)
sample_template = torch.cat([clip1_cheater[:,:,-25:],
torch.zeros(1,256,blank_cheater_sz, device='cuda'),
clip2_cheater[:,:,:25]], dim=-1)
mask = torch.ones_like(sample_template)
mask[:,:,25:-25] = 0
def custom_conditioning_endpoint_fetch(cond_enc, ts):
clip_sz = 100
combined_cheater = torch.cat([clip1_cheater[:,:,-clip_sz:], clip2_cheater[:,:,:clip_sz]], dim=-1)
enc = cond_enc(combined_cheater, ts)
start_cond = enc[:,:,clip_sz-25] # About 5 seconds back into the clip.
end_cond = enc[:,:,clip_sz+25]
return start_cond, end_cond
gen_cheater = diffuser.p_sample_loop_with_guidance(model, sample_template, mask,
model_kwargs={'custom_conditioning_fetcher': custom_conditioning_endpoint_fetch})
cheater_decoder_diffuser = SpacedDiffusion(use_timesteps=space_timesteps(4000, [64]), model_mean_type='epsilon',
model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000),
conditioning_free=True, conditioning_free_k=1)
cheater_to_mel = get_cheater_decoder().diff.cuda()
gen_mel = cheater_decoder_diffuser.ddim_sample_loop(cheater_to_mel, (1,256,gen_cheater.shape[-1]*16), progress=True,
model_kwargs={'codes': gen_cheater.permute(0,2,1)})
torchvision.utils.save_image((gen_mel + 1)/2, f'{results_dir}/mel.png')
from utils.music_utils import get_mel2wav_v3_model
cheater_decoder_diffuser = SpacedDiffusion(use_timesteps=space_timesteps(4000, [64]), model_mean_type='epsilon',
model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000),
conditioning_free=True, conditioning_free_k=1)
m2w = get_mel2wav_v3_model().cuda()
spectral_diffuser = SpacedDiffusion(use_timesteps=space_timesteps(4000, [32]), model_mean_type='epsilon',
model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000),
conditioning_free=True, conditioning_free_k=1)
from trainer.injectors.audio_injectors import denormalize_mel
gen_mel_denorm = denormalize_mel(gen_mel)
output_shape = (1,16,gen_mel_denorm.shape[-1]*256//16)
gen_wav = spectral_diffuser.ddim_sample_loop(m2w, output_shape, progress=True, model_kwargs={'codes': gen_mel_denorm})
from trainer.injectors.audio_injectors import pixel_shuffle_1d
gen_wav = pixel_shuffle_1d(gen_wav, 16)
model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 4000),
conditioning_free=True, conditioning_free_k=1)
MAX_CONTEXT = 30 * 22050 // 4096
chunks = torch.split(gen_cheater, MAX_CONTEXT, dim=-1)
gen_wavs = []
for i, chunk_cheater in enumerate(tqdm(chunks)):
gen_mel = cheater_decoder_diffuser.ddim_sample_loop(cheater_to_mel, (1,256,chunk_cheater.shape[-1]*16), progress=True,
model_kwargs={'codes': chunk_cheater.permute(0,2,1)})
torchvision.utils.save_image((gen_mel + 1)/2, f'{results_dir}/mel_{i}.png')
gen_mel_denorm = denormalize_mel(gen_mel)
output_shape = (1,16,gen_mel_denorm.shape[-1]*256//16)
wav = spectral_diffuser.ddim_sample_loop(m2w, output_shape, progress=True, model_kwargs={'codes': gen_mel_denorm})
gen_wavs.append(pixel_shuffle_1d(wav, 16))
gen_wav = torch.cat(gen_wavs, dim=-1)
torchaudio.save(f'{results_dir}/out.wav', gen_wav.squeeze(1).cpu(), 22050)
if __name__ == '__main__':
results_dir = '../results/audio_joiner'
clip1 = 'Y:\\sources\\manual_podcasts_music\\2\\The Glitch Mob - Discography\\2014 - Love, Death Immortality\\2. Our Demons (feat. Aja Volkman).mp3'
#clip1 = 'Y:\\sources\\music\\manual_podcasts_music\\2\\The Glitch Mob - Discography\\2014 - Love, Death Immortality\\2. Our Demons (feat. Aja Volkman).mp3'
clip1 = 'Y:\\separated\\bt-music-5\\[2002] Gutterflower\\02 - Think About Me\\00000\\no_vocals.wav'
clip1_cut = 35 # Seconds
clip2 = 'Y:\\sources\\manual_podcasts_music\\2\\The Glitch Mob - Discography\\2014 - Love, Death Immortality\\9. Carry The Sun.mp3'
#clip2 = 'Y:\\sources\\music\\manual_podcasts_music\\2\\The Glitch Mob - Discography\\2014 - Love, Death Immortality\\9. Carry The Sun.mp3'
clip2 = 'Y:\\separated\\bt-music-5\\[2002] Gutterflower\\02 - Think About Me\\00003\\no_vocals.wav'
clip2_cut = 1
mix_time = 10
os.makedirs(results_dir, exist_ok=True)

View File

@ -434,20 +434,20 @@ class MusicDiffusionFid(evaluator.Evaluator):
if __name__ == '__main__':
diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_cheater_gen.yml', 'generator',
diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_cheater_gen_ar_prior.yml', 'generator',
also_load_savepoint=False,
load_path='X:\\dlas\\experiments\\train_music_cheater_gen_v5_causal_retrain\\models\\80500_generator_ema.pth'
load_path='X:\\dlas\\experiments\\train_music_diffusion_tfd12_cheater_gen_ar_prior\\models\\43500_generator_ema.pth'
).cuda()
opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :)
#'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety.
'diffusion_steps': 256, # basis: 192
'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': False, 'clip_audio': False,
'diffusion_schedule': 'linear', 'diffusion_type': 'cheater_gen',
'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': True, 'clip_audio': False,
'diffusion_schedule': 'linear', 'diffusion_type': 'from_ar_prior',
# Slope 1: 1.03x, 2: 1.06, 4: 1.135, 8: 1.27, 16: 1.54
'causal': True, 'causal_slope': 4, # DONT FORGET TO INCREMENT THE STEP!
#'causal': True, 'causal_slope': 4, # DONT FORGET TO INCREMENT THE STEP!
#'partial_low': 128, 'partial_high': 192
}
env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 104, 'device': 'cuda', 'opt': {}}
env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 200, 'device': 'cuda', 'opt': {}}
eval = MusicDiffusionFid(diffusion, opt_eval, env)
fds = []
for i in range(2):