From f8108cfdb20da59ab6e9053c787efd585bcbc419 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Sun, 24 Jul 2022 23:43:25 -0600
Subject: [PATCH] update environment and fix a bunch of deps

---
 .idea/.gitignore                              |   2 -
 .idea/dlas.iml                                |   8 +
 .idea/inspectionProfiles/Project_Default.xml  |  19 +++
 .idea/misc.xml                                |   5 +-
 .idea/mmsr.iml                                |  19 ---
 .idea/modules.xml                             |   2 +-
 .idea/other.xml                               |   6 -
 .idea/vcs.xml                                 |   5 +-
 .idea/workspace.xml                           | 150 ++++++++++++++++++
 codes/models/arch_util.py                     |  22 ++-
 .../audio/music/unet_diffusion_music_codes.py |   7 +-
 codes/models/audio/tts/diffusion_encoder.py   |   2 +-
 codes/models/diffusion/unet_diffusion.py      |   8 +-
 codes/models/lucidrains/x_transformers.py     |  51 ------
 codes/requirements.txt                        |   5 +-
 codes/trainer/eval/music_diffusion_fid.py     |  14 +-
 16 files changed, 209 insertions(+), 116 deletions(-)
 delete mode 100644 .idea/.gitignore
 create mode 100644 .idea/dlas.iml
 create mode 100644 .idea/inspectionProfiles/Project_Default.xml
 delete mode 100644 .idea/mmsr.iml
 delete mode 100644 .idea/other.xml
 create mode 100644 .idea/workspace.xml

diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index e7e9d11d..00000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# Default ignored files
-/workspace.xml
diff --git a/.idea/dlas.iml b/.idea/dlas.iml
new file mode 100644
index 00000000..d4ae707c
--- /dev/null
+++ b/.idea/dlas.iml
@@ -0,0 +1,8 @@
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 00000000..b7c8774c
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,19 @@
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 0adf3fba..a1f64f16 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,7 +1,4 @@
diff --git a/.idea/mmsr.iml b/.idea/mmsr.iml
deleted file mode 100644
index b06487d4..00000000
--- a/.idea/mmsr.iml
+++ /dev/null
@@ -1,19 +0,0 @@
diff --git a/.idea/modules.xml b/.idea/modules.xml
index e88fd2ba..64a3ac23 100644
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -2,7 +2,7 @@
diff --git a/.idea/other.xml b/.idea/other.xml
deleted file mode 100644
index 58daadce..00000000
--- a/.idea/other.xml
+++ /dev/null
@@ -1,6 +0,0 @@
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index 8d59ed0d..94a25f7f 100644
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -1,9 +1,6 @@
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 00000000..9741d16f
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,150 @@
diff --git a/codes/models/arch_util.py b/codes/models/arch_util.py
index bae2ba9c..9383ac7b 100644
--- a/codes/models/arch_util.py
+++ b/codes/models/arch_util.py
@@ -483,16 +483,24 @@ class RelativeQKBias(nn.Module):
     """
     Very simple relative position bias scheme which should be directly added to QK matrix.
     This bias simply applies to the distance from the given element.
+
+    If symmetric=False, a different bias is applied to each side of the input element, otherwise the bias is symmetric.
     """
-    def __init__(self, l, max_positions=4000):
+    def __init__(self, l, max_positions=4000, symmetric=True):
         super().__init__()
-        self.emb = nn.Parameter(torch.randn(l+1) * .01)
-        o = torch.arange(0,max_positions)
-        c = o.unsqueeze(-1).repeat(1,max_positions)
-        r = o.unsqueeze(0).repeat(max_positions,1)
-        M = ((-(r-c).abs())+l).clamp(0,l)
+        if symmetric:
+            self.emb = nn.Parameter(torch.randn(l+1) * .01)
+            o = torch.arange(0,max_positions)
+            c = o.unsqueeze(-1).repeat(1,max_positions)
+            r = o.unsqueeze(0).repeat(max_positions,1)
+            M = ((-(r-c).abs())+l).clamp(0,l)
+        else:
+            self.emb = nn.Parameter(torch.randn(l*2+2) * .01)
+            a = torch.arange(0,max_positions)
+            c = a.unsqueeze(-1) - a
+            m = (c >= -l).logical_and(c <= l)
+            M = (l+c+1)*m
         self.register_buffer('M', M, persistent=False)
-        self.initted = False

     def forward(self, n):
         # Ideally, I'd return this:
diff --git a/codes/models/audio/music/unet_diffusion_music_codes.py b/codes/models/audio/music/unet_diffusion_music_codes.py
index 50f2ed27..d0818744 100644
--- a/codes/models/audio/music/unet_diffusion_music_codes.py
+++ b/codes/models/audio/music/unet_diffusion_music_codes.py
@@ -8,7 +8,6 @@ import torch as th
 import torch.nn as nn
 import torch.nn.functional as F
 import torchvision # For debugging, not actually used.
-from x_transformers.x_transformers import RelativePositionBias

 from models.audio.music.gpt_music import GptMusicLower
 from models.audio.music.music_quantizer import MusicQuantizer
@@ -291,10 +290,6 @@ class AttentionBlock(nn.Module):
             self.attention = QKVAttentionLegacy(self.num_heads)

         self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
-        if relative_pos_embeddings:
-            self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64)
-        else:
-            self.relative_pos_embeddings = None

     def forward(self, x, mask=None):
         if self.do_checkpoint:
@@ -306,7 +301,7 @@
         b, c, *spatial = x.shape
         x = x.reshape(b, c, -1)
         qkv = self.qkv(self.norm(x))
-        h = self.attention(qkv, mask, self.relative_pos_embeddings)
+        h = self.attention(qkv, mask)
         h = self.proj_out(h)
         return (x + h).reshape(b, c, *spatial)

diff --git a/codes/models/audio/tts/diffusion_encoder.py b/codes/models/audio/tts/diffusion_encoder.py
index 56d25d61..1dc91c0d 100644
--- a/codes/models/audio/tts/diffusion_encoder.py
+++ b/codes/models/audio/tts/diffusion_encoder.py
@@ -6,7 +6,7 @@ from functools import partial
 import torch
 import torch.nn as nn
 from x_transformers.x_transformers import groupby_prefix_and_trim, FixedPositionalEmbedding, default, RotaryEmbedding, \
-    DEFAULT_DIM_HEAD, RelativePositionBias, LearnedAlibiPositionalBias, AlibiPositionalBias, ScaleNorm, RMSNorm, Rezero, \
+    DEFAULT_DIM_HEAD, RelativePositionBias, LearnedAlibiPositionalBias, AlibiPositionalBias, ScaleNorm, RMSNorm, \
     exists, Attention, FeedForward, Scale, ShiftTokens, GRUGating, Residual, cast_tuple, equals, LayerIntermediates, \
     AttentionLayers, not_equals

diff --git a/codes/models/diffusion/unet_diffusion.py b/codes/models/diffusion/unet_diffusion.py
index d99430e7..e2ed97b7 100644
--- a/codes/models/diffusion/unet_diffusion.py
+++ b/codes/models/diffusion/unet_diffusion.py
@@ -8,7 +8,6 @@ import torch as th
 import torch.nn as nn
 import torch.nn.functional as F
 import torchvision # For debugging, not actually used.
-from x_transformers.x_transformers import RelativePositionBias

 from models.diffusion.fp16_util import convert_module_to_f16, convert_module_to_f32
 from models.diffusion.nn import (
@@ -298,7 +297,6 @@
         num_head_channels=-1,
         use_new_attention_order=False,
         do_checkpoint=True,
-        relative_pos_embeddings=False,
     ):
         super().__init__()
         self.channels = channels
@@ -320,10 +318,6 @@
             self.attention = QKVAttentionLegacy(self.num_heads)

         self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
-        if relative_pos_embeddings:
-            self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64)
-        else:
-            self.relative_pos_embeddings = None

     def forward(self, x, mask=None):
         if self.do_checkpoint:
@@ -335,7 +329,7 @@
         b, c, *spatial = x.shape
         x = x.reshape(b, c, -1)
         qkv = self.qkv(self.norm(x))
-        h = self.attention(qkv, mask, self.relative_pos_embeddings)
+        h = self.attention(qkv, mask)
         h = self.proj_out(h)
         return (x + h).reshape(b, c, *spatial)

diff --git a/codes/models/lucidrains/x_transformers.py b/codes/models/lucidrains/x_transformers.py
index f4139f1c..32ad19e7 100644
--- a/codes/models/lucidrains/x_transformers.py
+++ b/codes/models/lucidrains/x_transformers.py
@@ -10,11 +10,8 @@ from collections import namedtuple
 from einops import rearrange, repeat, reduce
 from einops.layers.torch import Rearrange

-from entmax import entmax15
 from torch.utils.checkpoint import checkpoint

-from x_transformers.autoregressive_wrapper import AutoregressiveWrapper
-
 DEFAULT_DIM_HEAD = 64

 Intermediates = namedtuple('Intermediates', [
@@ -1274,51 +1271,3 @@ class ContinuousTransformerWrapper(nn.Module):
         if len(res) > 1:
             return tuple(res)
         return res[0]
-
-
-class XTransformer(nn.Module):
-    def __init__(
-            self,
-            *,
-            dim,
-            tie_token_emb=False,
-            **kwargs
-    ):
-        super().__init__()
-        enc_kwargs, kwargs = groupby_prefix_and_trim('enc_', kwargs)
-        dec_kwargs, kwargs = groupby_prefix_and_trim('dec_', kwargs)
-
-        assert 'dim' not in enc_kwargs and 'dim' not in dec_kwargs, 'dimension of either encoder or decoder must be set with `dim` keyword'
-        enc_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], enc_kwargs)
-        enc_transformer_kwargs['emb_dropout'] = enc_kwargs.pop('emb_dropout', 0)
-        enc_transformer_kwargs['num_memory_tokens'] = enc_kwargs.pop('num_memory_tokens', None)
-        enc_transformer_kwargs['use_pos_emb'] = enc_kwargs.pop('use_pos_emb', True)
-
-        dec_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], dec_kwargs)
-        dec_transformer_kwargs['emb_dropout'] = dec_kwargs.pop('emb_dropout', 0)
-        dec_transformer_kwargs['use_pos_emb'] = dec_kwargs.pop('use_pos_emb', True)
-
-        self.encoder = TransformerWrapper(
-            **enc_transformer_kwargs,
-            attn_layers=Encoder(dim=dim, **enc_kwargs)
-        )
-
-        self.decoder = TransformerWrapper(
-            **dec_transformer_kwargs,
-            attn_layers=Decoder(dim=dim, cross_attend=True, **dec_kwargs)
-        )
-
-        if tie_token_emb:
-            self.decoder.token_emb = self.encoder.token_emb
-
-        self.decoder = AutoregressiveWrapper(self.decoder)
-
-    @torch.no_grad()
-    def generate(self, seq_in, seq_out_start, seq_len, src_mask=None, src_attn_mask=None, **kwargs):
-        encodings = self.encoder(seq_in, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
-        return self.decoder.generate(seq_out_start, seq_len, context=encodings, context_mask=src_mask, **kwargs)
-
-    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_attn_mask=None):
-        enc = self.encoder(src, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
-        out = self.decoder(tgt, context=enc, mask=tgt_mask, context_mask=src_mask)
-        return out
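
Note (illustrative, not part of the patch): this file is a vendored copy of lucidrains'
x-transformers library, and the XTransformer wrapper removed above appears to mirror the class of
the same name in the upstream package, which the patch adds to codes/requirements.txt below. If an
encoder/decoder wrapper is still needed, the upstream class can be used instead. A sketch adapted
from the upstream README; the constructor arguments are the upstream package's and have not been
checked against whichever version the unpinned requirement resolves to:

    import torch
    from x_transformers import XTransformer

    # Tied-embedding encoder/decoder transformer, as in the upstream README example.
    model = XTransformer(
        dim=512,
        enc_num_tokens=256, enc_depth=6, enc_heads=8, enc_max_seq_len=1024,
        dec_num_tokens=256, dec_depth=6, dec_heads=8, dec_max_seq_len=1024,
        tie_token_emb=True,
    )

    src = torch.randint(0, 256, (1, 1024))      # encoder token ids
    src_mask = torch.ones_like(src).bool()      # padding mask for the source
    tgt = torch.randint(0, 256, (1, 1024))      # decoder token ids

    loss = model(src, tgt, mask=src_mask)       # autoregressive LM loss over tgt
    loss.backward()
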
diff --git a/codes/requirements.txt b/codes/requirements.txt
index c4b32d28..c79c7909 100644
--- a/codes/requirements.txt
+++ b/codes/requirements.txt
@@ -30,11 +30,13 @@ Unidecode==1.0.22
 tgt == 1.4.4
 pyworld == 0.2.10
 audio2numpy
+SoundFile

 # For text stuff
 transformers
 tokenizers
 jiwer # calculating WER
+omegaconf

 # lucidrains stuff
 vector_quantize_pytorch
@@ -42,4 +44,5 @@ linear_attention_transformer
 rotary-embedding-torch
 axial_positional_embedding
 g-mlp-pytorch
-x-clip
\ No newline at end of file
+x-clip
+x_transformers
\ No newline at end of file
diff --git a/codes/trainer/eval/music_diffusion_fid.py b/codes/trainer/eval/music_diffusion_fid.py
index 08ff3fe1..c4b05356 100644
--- a/codes/trainer/eval/music_diffusion_fid.py
+++ b/codes/trainer/eval/music_diffusion_fid.py
@@ -315,14 +315,15 @@ class MusicDiffusionFid(evaluator.Evaluator):

 if __name__ == '__main__':
     # For multilevel SR:
+    """
     diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr.yml', 'generator',
                                        also_load_savepoint=False, strict_load=False,
-                                       load_path='X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr\\models\\18000_generator.pth'
+                                       load_path='X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr_archived_prev2\\models\\18000_generator.pth'
                                        ).cuda()
     opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :)
                 #'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety.
-                'diffusion_steps': 256, # basis: 192
-                'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': False, 'clip_audio': True,
+                'diffusion_steps': 64, # basis: 192
+                'conditioning_free': False, 'conditioning_free_k': 1, 'use_ddim': True, 'clip_audio': True,
                 'diffusion_schedule': 'cosine', 'diffusion_type': 'chained_sr',
                 }

@@ -334,13 +335,12 @@
                                        ).cuda()
     opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :)
                 #'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety.
-                'diffusion_steps': 256, # basis: 192
-                'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': False, 'clip_audio': True,
+                'diffusion_steps': 64, # basis: 192
+                'conditioning_free': False, 'conditioning_free_k': 1, 'use_ddim': True, 'clip_audio': True,
                 'diffusion_schedule': 'cosine', 'diffusion_type': 'from_codes_quant',
                 }
-    """

-    env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 11, 'device': 'cuda', 'opt': {}}
+    env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 12, 'device': 'cuda', 'opt': {}}
     eval = MusicDiffusionFid(diffusion, opt_eval, env)
     fds = []
     for i in range(2):