2022-07-01 06:44:48 +00:00
import os
import torch
import numpy as np
import torchaudio
import torchvision
2022-07-14 03:26:55 +00:00
from tqdm import tqdm
2022-07-01 06:44:48 +00:00
from models . audio . music . tfdpc_v5 import TransformerDiffusionWithPointConditioning
2022-07-14 03:26:55 +00:00
from utils . music_utils import get_cheater_decoder , get_mel2wav_v3_model
2022-07-01 06:44:48 +00:00
from utils . util import load_audio
2022-07-19 17:11:46 +00:00
from trainer . injectors . audio_injectors import TorchMelSpectrogramInjector , denormalize_torch_mel , pixel_shuffle_1d
2022-07-01 06:44:48 +00:00
from trainer . injectors . audio_injectors import MusicCheaterLatentInjector
from models . diffusion . respace import SpacedDiffusion
from models . diffusion . respace import space_timesteps
from models . diffusion . gaussian_diffusion import get_named_beta_schedule
2022-07-19 00:40:25 +00:00
def join_music_with_cheaters(clip1_cheater, clip2_cheater, results_dir):
    """Generate a bridge between two cheater latents and render it to audio.

    The last 60 frames of ``clip1_cheater`` and the first 60 frames of
    ``clip2_cheater`` are kept as fixed lead-ins. A 240-frame zero-filled gap
    between them is masked out and filled in by the point-conditioned
    diffusion model, which receives the remaining frames of each clip as
    ``cond_left`` / ``cond_right``. The generated latent is then decoded to a
    MEL spectrogram and on to a waveform, chunk by chunk.

    Args:
        clip1_cheater: Latent for the clip on the left of the join. It is
            sliced on its last dimension and concatenated with a
            ``(1, 256, 240)`` CUDA tensor, so it must be a CUDA tensor of
            shape ``(1, 256, T)``.
        clip2_cheater: Latent for the clip on the right of the join, with the
            same layout as ``clip1_cheater``.
        results_dir: Existing directory that receives ``mel_<i>.png`` for each
            decoded chunk and the final ``out.wav``.
    """
    lead_frames = 60   # frames of each clip kept verbatim around the gap
    gap_frames = 240   # frames the diffusion model has to invent

    # join_music() already calls this function under no_grad; owning the
    # context here gives direct callers the same inference-only behaviour.
    with torch.no_grad():
        clip1_leadin = clip1_cheater[:, :, -lead_frames:]
        clip1_cheater = clip1_cheater[:, :, :-lead_frames]
        clip2_leadin = clip2_cheater[:, :, :lead_frames]
        clip2_cheater = clip2_cheater[:, :, lead_frames:]

        # Only the model that is actually sampled from is built. The earlier
        # 12-layer, linear-schedule checkpoint used to be constructed and
        # loaded here as well, only to be overwritten immediately.
        model = TransformerDiffusionWithPointConditioning(
            in_channels=256, out_channels=512, model_channels=1024,
            contraction_dim=512, num_heads=8, num_layers=32, dropout=0,
            use_fp16=False, unconditioned_percentage=0, time_proj=False,
            new_cond=True, regularization=False).eval().cuda()
        model.load_state_dict(torch.load(
            'x:/dlas/experiments/train_music_cheater_gen_v5_cosine_40_lyr/models/64000_generator_ema.pth'))
        diffusion_type = 'cosine'
        diffuser = SpacedDiffusion(
            use_timesteps=space_timesteps(4000, [256]), model_mean_type='epsilon',
            model_var_type='learned_range', loss_type='mse',
            betas=get_named_beta_schedule(diffusion_type, 4000),
            conditioning_free=True, conditioning_free_k=2)

        # Guidance input: [clip1 lead-in | zeros | clip2 lead-in]. The mask is
        # 1 over the lead-ins and 0 over the gap.
        gap = torch.zeros(1, 256, gap_frames, device='cuda')
        inp = torch.cat([clip1_leadin, gap, clip2_leadin], dim=-1)
        mask = torch.ones_like(inp)
        mask[:, :, lead_frames:-lead_frames] = 0
        gen_cheater = diffuser.ddim_sample_loop_with_guidance(
            model, inp, mask,
            model_kwargs={'cond_left': clip1_cheater,
                          'cond_right': clip2_cheater})

        # Stage 2: cheater latent -> MEL spectrogram.
        cheater_to_mel = get_cheater_decoder().diff.cuda()
        cheater_decoder_diffuser = SpacedDiffusion(
            use_timesteps=space_timesteps(4000, [64]), model_mean_type='epsilon',
            model_var_type='learned_range', loss_type='mse',
            betas=get_named_beta_schedule('linear', 4000),
            conditioning_free=True, conditioning_free_k=1)

        # Stage 3: MEL spectrogram -> waveform.
        m2w = get_mel2wav_v3_model().cuda()
        spectral_diffuser = SpacedDiffusion(
            use_timesteps=space_timesteps(4000, [32]), model_mean_type='epsilon',
            model_var_type='learned_range', loss_type='mse',
            betas=get_named_beta_schedule('linear', 4000),
            conditioning_free=True, conditioning_free_k=1)

        # Decode in chunks of at most MAX_CONTEXT latent frames. Each latent
        # frame becomes 16 MEL frames and each MEL frame 256 samples, i.e.
        # 4096 samples per latent frame; the audio is saved at 22050 Hz, so
        # one chunk is just under 30 seconds.
        MAX_CONTEXT = 30 * 22050 // 4096
        chunks = torch.split(gen_cheater, MAX_CONTEXT, dim=-1)
        gen_wavs = []
        for i, chunk_cheater in enumerate(tqdm(chunks)):
            gen_mel = cheater_decoder_diffuser.ddim_sample_loop(
                cheater_to_mel, (1, 256, chunk_cheater.shape[-1] * 16),
                progress=True,
                model_kwargs={'codes': chunk_cheater.permute(0, 2, 1)})
            # Shift by +1 and halve before saving the MEL as an image.
            torchvision.utils.save_image((gen_mel + 1) / 2, f'{results_dir}/mel_{i}.png')

            gen_mel_denorm = denormalize_torch_mel(gen_mel)
            # The vocoder is sampled with 16 channels, which
            # pixel_shuffle_1d(wav, 16) then rearranges.
            output_shape = (1, 16, gen_mel_denorm.shape[-1] * 256 // 16)
            wav = spectral_diffuser.ddim_sample_loop(
                m2w, output_shape, progress=True,
                model_kwargs={'codes': gen_mel_denorm})
            gen_wavs.append(pixel_shuffle_1d(wav, 16))

        gen_wav = torch.cat(gen_wavs, dim=-1)
        torchaudio.save(f'{results_dir}/out.wav', gen_wav.squeeze(1).cpu(), 22050)


def _load_clip_as_cheater(path, spec_fn, cheater_encoder):
    """Load the audio file at ``path`` and encode it into a cheater latent."""
    audio = load_audio(path, 22050).cuda()
    # Both injectors take and return dicts keyed by their configured
    # 'in' / 'out' names.
    mel = spec_fn({'in': audio.unsqueeze(0)})['out']
    return cheater_encoder({'in': mel})['out']


def join_music(clip1, clip2, results_dir):
    """Join two audio files by generating a transition between them.

    Each file is loaded at 22050 Hz, converted to a 256-channel MEL
    spectrogram and encoded into a cheater latent; the two latents are then
    handed to ``join_music_with_cheaters``, which writes its outputs into
    ``results_dir``.

    Args:
        clip1: Path of the audio file on the left of the join.
        clip2: Path of the audio file on the right of the join.
        results_dir: Existing directory for the generated images and audio.
    """
    with torch.no_grad():
        spec_fn = TorchMelSpectrogramInjector(
            {'n_mel_channels': 256, 'mel_fmax': 11000, 'filter_length': 16000,
             'true_normalization': True, 'normalize': True,
             'in': 'in', 'out': 'out'}, {}).cuda()
        cheater_encoder = MusicCheaterLatentInjector({'in': 'in', 'out': 'out'}, {}).cuda()

        # Encode the clips one after the other, clip1 first.
        clip1_cheater = _load_clip_as_cheater(clip1, spec_fn, cheater_encoder)
        clip2_cheater = _load_clip_as_cheater(clip2, spec_fn, cheater_encoder)

        join_music_with_cheaters(clip1_cheater, clip2_cheater, results_dir)


2022-07-14 03:26:55 +00:00
2022-07-01 06:44:48 +00:00
if __name__ == '__main__':
    # Run name -> (directory holding numbered song segments, index of the
    # segment used as the left-hand clip). Keys must be unique: a repeated
    # key in a dict literal silently replaces the earlier entry.
    things_to_try = {
        'goo': ('Y:\\separated\\bt-music-5\\[2002] Gutterflower\\02 - Think About Me', 0),
        'sm1': ('Y:\\separated\\silk\\MonstercatSilkShowcase\\910', 79),
        'sm2': ('Y:\\separated\\silk\\MonstercatSilkShowcase\\1025', 105),
        'sm3': ('Y:\\separated\\silk\\MonstercatSilkShowcase\\1026', 43),
        'sm4': ('Y:\\separated\\silk\\MonstercatSilkShowcase\\1026', 77),
        'sm5': ('Y:\\separated\\silk\\MonstercatSilkShowcase\\1027', 8),
        'sm6': ('Y:\\separated\\silk\\MonstercatSilkShowcase\\1027', 28),
        # Previously also keyed 'sm6', which discarded the entry above.
        'sm7': ('Y:\\separated\\silk\\MonstercatSilkShowcase\\1017', 90),
        'tron': ('Y:\\separated\\bt-music-2\\2011 - TRON Legacy - Translucence (EP) - (320 kbps)\\01 Derezzed', 0),
        'lateralus': ('Y:\\separated\\bt-music-2\\Lateralus\\09 - lateralus\\00011', 11),
        'streets_have_no_name': ('Y:\\separated\\bt-music-2\\U2 - (1987) The Joshua Tree\\01-Where The Streets Have No Name', 1),
        'shinra': ('Y:\\separated\\bt-music-1\\final_fantasy_vii_soundtrack\\20-Infiltrating Shinra Tower', 1),
        'bombing_run': ('Y:\\separated\\bt-music-1\\final_fantasy_vii_soundtrack\\02-Opening ~ Bombing Mission', 2),
        'machine_gun': ('Y:\\separated\\bt-music-1\\ff8-fithos_lusec_wecos_vinosec-1999\\08 - The Man with the Machine Gun', 2),
    }
    for k, v in things_to_try.items():
        results_dir = f'../results/audio_joiner/{k}'
        src_path, start = v
        # The right-hand clip is the segment two indices after the left-hand
        # one; segment directories are zero-padded to five digits.
        clip1 = f'{src_path}\\{start:05d}\\no_vocals.wav'
        clip2 = f'{src_path}\\{(start + 2):05d}\\no_vocals.wav'
        os.makedirs(results_dir, exist_ok=True)
        join_music(clip1, clip2, results_dir)

    # Second experiment: join the first and last 230 frames of a pre-computed
    # cheater latent, skipping the audio -> latent encoding step.
    results_dir = '../results/audio_joiner/machine_gun_from_cheater'
    os.makedirs(results_dir, exist_ok=True)
    cheater_path = ('Y:\\separated\\large_mel_cheaters\\bt-music-1\\ff8-fithos_lusec_wecos_vinosec-1999'
                    '\\08 - The Man with the Machine Gun\\0.npz')
    # Read the array inside the context manager so the .npz handle is closed.
    with np.load(cheater_path) as npz:
        cheater = torch.tensor(npz['arr_0']).cuda()
    join_music_with_cheaters(cheater[:, :, :230], cheater[:, :, -230:], results_dir)