iq checkin
yeah - I'm at it again...
This commit is contained in:
parent
a073fbfcb8
commit
20ef9cc6b4
170
codes/models/audio/music/instrument_quantizer.py
Normal file
170
codes/models/audio/music/instrument_quantizer.py
Normal file
|
@ -0,0 +1,170 @@
|
|||
import functools
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from models.diffusion.nn import timestep_embedding
|
||||
from models.lucidrains.x_transformers import FeedForward, Attention, Decoder, RMSScaleShiftNorm
|
||||
from trainer.networks import register_model
|
||||
from utils.util import checkpoint
|
||||
|
||||
|
||||
class SelfClassifyingHead(nn.Module):
|
||||
def __init__(self, dim, classes, out_dim, head_depth, seq_len, dropout, init_temperature):
|
||||
super().__init__()
|
||||
self.seq_len = seq_len
|
||||
self.num_classes = classes
|
||||
self.temperature = init_temperature
|
||||
self.dec = Decoder(dim=dim, depth=head_depth, heads=2, ff_dropout=dropout, ff_mult=2, attn_dropout=dropout,
|
||||
use_rmsnorm=True, ff_glu=True, rotary_pos_emb=True)
|
||||
self.to_classes = nn.Linear(dim, classes)
|
||||
self.feedback_codebooks = nn.Linear(classes, dim, bias=False)
|
||||
self.codebooks = nn.Linear(classes, out_dim, bias=False)
|
||||
|
||||
@staticmethod
|
||||
def _compute_perplexity(probs, mask=None):
|
||||
if mask is not None:
|
||||
mask_extended = mask.flatten()[:, None, None].expand(probs.shape)
|
||||
probs = torch.where(mask_extended, probs, torch.zeros_like(probs))
|
||||
marginal_probs = probs.sum(dim=0) / mask.sum()
|
||||
else:
|
||||
marginal_probs = probs.mean(dim=0)
|
||||
|
||||
perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
|
||||
return perplexity
|
||||
|
||||
def do_ar_step(self, x, used_codes):
|
||||
h = self.dec(x)
|
||||
h = self.to_classes(h[:,-1])
|
||||
for uc in used_codes:
|
||||
mask = torch.arange(0, self.num_classes, device=x.device).unsqueeze(0).repeat(x.shape[0],1) == uc.unsqueeze(1)
|
||||
h[mask] = -torch.inf
|
||||
c = F.gumbel_softmax(h, tau=self.temperature, hard=self.temperature==1)\
|
||||
|
||||
soft_c = torch.softmax(h, dim=-1)
|
||||
perplexity = self._compute_perplexity(soft_c)
|
||||
return c, perplexity
|
||||
|
||||
def forward(self, x):
|
||||
with torch.no_grad():
|
||||
# Force one of the codebook weights to zero, allowing the model to "skip" any classes it chooses.
|
||||
self.codebooks.weight.data[:, 0] = 0
|
||||
|
||||
# manually perform ar regression over sequence_length=self.seq_len
|
||||
stack = [x]
|
||||
outputs = []
|
||||
results = []
|
||||
codes = []
|
||||
total_perplexity = 0
|
||||
for i in range(self.seq_len):
|
||||
nc, perp = checkpoint(functools.partial(self.do_ar_step, used_codes=codes), torch.stack(stack, dim=1))
|
||||
codes.append(nc.argmax(-1))
|
||||
stack.append(self.feedback_codebooks(nc))
|
||||
outputs.append(self.codebooks(nc))
|
||||
results.append(torch.stack(outputs, dim=1).sum(1))
|
||||
total_perplexity = total_perplexity + perp
|
||||
return results, total_perplexity / self.seq_len, torch.cat(codes, dim=-1)
|
||||
|
||||
|
||||
class VectorResBlock(nn.Module):
|
||||
def __init__(self, dim, dropout):
|
||||
super().__init__()
|
||||
self.norm = nn.BatchNorm1d(dim)
|
||||
self.ff = FeedForward(dim, mult=2, glu=True, dropout=dropout, zero_init_output=True)
|
||||
|
||||
def forward(self, x):
|
||||
h = self.norm(x.unsqueeze(-1)).squeeze(-1)
|
||||
h = self.ff(h)
|
||||
return h + x
|
||||
|
||||
|
||||
class InstrumentQuantizer(nn.Module):
|
||||
def __init__(self, op_dim, dim, num_classes, enc_depth, head_depth, class_seq_len=5, dropout=.1,
|
||||
min_temp=1, max_temp=10, temp_decay=.999):
|
||||
"""
|
||||
|
||||
Args:
|
||||
op_dim:
|
||||
dim:
|
||||
num_classes:
|
||||
enc_depth:
|
||||
head_depth:
|
||||
class_seq_len:
|
||||
dropout:
|
||||
min_temp:
|
||||
max_temp:
|
||||
temp_decay: Temperature decay. Default value of .999 decays ~50% in 1000 steps.
|
||||
"""
|
||||
super().__init__()
|
||||
self.op_dim = op_dim
|
||||
self.proj = nn.Linear(op_dim, dim)
|
||||
self.encoder = nn.ModuleList([VectorResBlock(dim, dropout) for _ in range(enc_depth)])
|
||||
self.heads = SelfClassifyingHead(dim, num_classes, op_dim, head_depth, class_seq_len, dropout, max_temp)
|
||||
self.min_gumbel_temperature = min_temp
|
||||
self.max_gumbel_temperature = max_temp
|
||||
self.gumbel_temperature_decay = temp_decay
|
||||
|
||||
self.codes = torch.zeros((3000000,), dtype=torch.long)
|
||||
self.internal_step = 0
|
||||
self.code_ind = 0
|
||||
self.total_codes = 0
|
||||
|
||||
def forward(self, x):
|
||||
b, c, s = x.shape
|
||||
px = x.permute(0,2,1) # B,S,C shape
|
||||
f = px.reshape(-1, self.op_dim)
|
||||
h = self.proj(f)
|
||||
for lyr in self.encoder:
|
||||
h = lyr(h)
|
||||
|
||||
reconstructions, perplexity, codes = self.heads(h)
|
||||
reconstruction_losses = torch.stack([F.mse_loss(r.reshape(b, s, c), px) for r in reconstructions])
|
||||
r_follow = torch.arange(1, reconstruction_losses.shape[0]+1, device=x.device)
|
||||
reconstruction_losses = (reconstruction_losses * r_follow / r_follow.shape[0])
|
||||
self.log_codes(codes)
|
||||
|
||||
return reconstruction_losses, perplexity
|
||||
|
||||
def log_codes(self, codes):
|
||||
if self.internal_step % 5 == 0:
|
||||
l = codes.shape[0]
|
||||
i = self.code_ind if (self.codes.shape[0] - self.code_ind) > l else self.codes.shape[0] - l
|
||||
self.codes[i:i+l] = codes.cpu()
|
||||
self.code_ind = self.code_ind + l
|
||||
if self.code_ind >= self.codes.shape[0]:
|
||||
self.code_ind = 0
|
||||
self.total_codes += 1
|
||||
|
||||
def get_debug_values(self, step, __):
|
||||
if self.total_codes > 0:
|
||||
return {'histogram_codes': self.codes[:self.total_codes],
|
||||
'temperature': self.heads.temperature}
|
||||
else:
|
||||
return {}
|
||||
|
||||
def update_for_step(self, step, *args):
|
||||
self.internal_step = step
|
||||
self.heads.temperature = max(
|
||||
self.max_gumbel_temperature * self.gumbel_temperature_decay**step,
|
||||
self.min_gumbel_temperature,
|
||||
)
|
||||
|
||||
def get_grad_norm_parameter_groups(self):
|
||||
groups = {
|
||||
'encoder': list(self.encoder.parameters()),
|
||||
'heads': list(self.heads.parameters()),
|
||||
'proj': list(self.proj.parameters()),
|
||||
}
|
||||
return groups
|
||||
|
||||
|
||||
@register_model
|
||||
def register_instrument_quantizer(opt_net, opt):
|
||||
return InstrumentQuantizer(**opt_net['kwargs'])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
inp = torch.randn((4,256,200))
|
||||
model = InstrumentQuantizer(256, 512, 4096, 8, 3)
|
||||
model(inp)
|
|
@ -366,7 +366,7 @@ class RMSScaleShiftNorm(nn.Module):
|
|||
|
||||
ss_emb = self.scale_shift_process(norm_scale_shift_inp)
|
||||
scale, shift = torch.chunk(ss_emb, 2, dim=-1)
|
||||
if len(scale.shape) == 2:
|
||||
if len(scale.shape) == 2 and len(x.shape) == 3:
|
||||
scale = scale.unsqueeze(1)
|
||||
shift = shift.unsqueeze(1)
|
||||
h = norm * (1 + scale) + shift
|
||||
|
|
|
@ -340,7 +340,7 @@ class Trainer:
|
|||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_music_cheater_gen.yml')
|
||||
parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_music_instrument_quantizer.yml')
|
||||
parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
|
||||
args = parser.parse_args()
|
||||
opt = option.parse(args.opt, is_train=True)
|
||||
|
|
|
@ -438,18 +438,18 @@ class MusicDiffusionFid(evaluator.Evaluator):
|
|||
if __name__ == '__main__':
|
||||
diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_cheater_gen.yml', 'generator',
|
||||
also_load_savepoint=False,
|
||||
load_path='X:\\dlas\\experiments\\train_music_cheater_gen_v5_cosine_40_lyr\\models\\19500_generator.pth'
|
||||
load_path='X:\\dlas\\experiments\\train_music_cheater_gen_v5_cosine_40_lyr\\models\\40000_generator_ema.pth'
|
||||
).cuda()
|
||||
opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :)
|
||||
#'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety.
|
||||
'diffusion_steps': 128, # basis: 192
|
||||
'diffusion_steps': 256, # basis: 192
|
||||
'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': False, 'clip_audio': False,
|
||||
'diffusion_schedule': 'cosine', 'diffusion_type': 'cheater_gen',
|
||||
# Slope 1: 1.03x, 2: 1.06, 4: 1.135, 8: 1.27, 16: 1.54
|
||||
#'causal': True, 'causal_slope': 4, # DONT FORGET TO INCREMENT THE STEP!
|
||||
#'partial_low': 128, 'partial_high': 192
|
||||
}
|
||||
env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 204, 'device': 'cuda', 'opt': {}}
|
||||
env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 5, 'device': 'cuda', 'opt': {}}
|
||||
eval = MusicDiffusionFid(diffusion, opt_eval, env)
|
||||
fds = []
|
||||
for i in range(2):
|
||||
|
|
Loading…
Reference in New Issue
Block a user