diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index e7e9d11d..00000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# Default ignored files
-/workspace.xml
diff --git a/.idea/dlas.iml b/.idea/dlas.iml
new file mode 100644
index 00000000..d4ae707c
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 00000000..b7c8774c
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 0adf3fba..a1f64f16 100644
diff --git a/.idea/mmsr.iml b/.idea/mmsr.iml
deleted file mode 100644
index b06487d4..00000000
diff --git a/.idea/modules.xml b/.idea/modules.xml
index e88fd2ba..64a3ac23 100644
diff --git a/.idea/other.xml b/.idea/other.xml
deleted file mode 100644
index 58daadce..00000000
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index 8d59ed0d..94a25f7f 100644
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 00000000..9741d16f
diff --git a/codes/models/arch_util.py b/codes/models/arch_util.py
index bae2ba9c..9383ac7b 100644
--- a/codes/models/arch_util.py
+++ b/codes/models/arch_util.py
@@ -483,16 +483,24 @@ class RelativeQKBias(nn.Module):
"""
Very simple relative position bias scheme which should be added directly to the QK matrix. The bias is applied
according to the distance from the given element.
+
+ If symmetric=False, a different bias is learned for each side of the given element; otherwise the bias is symmetric.
"""
- def __init__(self, l, max_positions=4000):
+ def __init__(self, l, max_positions=4000, symmetric=True):
super().__init__()
- self.emb = nn.Parameter(torch.randn(l+1) * .01)
- o = torch.arange(0,max_positions)
- c = o.unsqueeze(-1).repeat(1,max_positions)
- r = o.unsqueeze(0).repeat(max_positions,1)
- M = ((-(r-c).abs())+l).clamp(0,l)
+ if symmetric:
+ self.emb = nn.Parameter(torch.randn(l+1) * .01)
+ o = torch.arange(0,max_positions)
+ c = o.unsqueeze(-1).repeat(1,max_positions)
+ r = o.unsqueeze(0).repeat(max_positions,1)
+ M = ((-(r-c).abs())+l).clamp(0,l)
+ else:
+ self.emb = nn.Parameter(torch.randn(l*2+2) * .01)
+ a = torch.arange(0,max_positions)
+ c = a.unsqueeze(-1) - a
+ m = (c >= -l).logical_and(c <= l)
+ M = (l+c+1)*m
self.register_buffer('M', M, persistent=False)
- self.initted = False
def forward(self, n):
# Ideally, I'd return this:
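
A quick illustration of the asymmetric branch above, not part of the patch: with hypothetical values l=2 and
max_positions=5, the index matrix M maps signed offsets to embedding slots, with slot 0 reserved for positions
outside the +/- l window.

    import torch

    l, max_positions = 2, 5
    a = torch.arange(0, max_positions)
    c = a.unsqueeze(-1) - a             # c[i, j] = i - j, the signed offset
    m = (c >= -l).logical_and(c <= l)   # True only inside the +/- l window
    M = (l + c + 1) * m                 # offsets -l..l map to slots 1..2l+1, everything else to slot 0
    emb = torch.randn(l * 2 + 2) * .01  # 2l+2 learned biases; slot 0 is the shared out-of-window bias
    bias = emb[M]                       # (max_positions, max_positions) matrix added to the QK logits
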
diff --git a/codes/models/audio/music/unet_diffusion_music_codes.py b/codes/models/audio/music/unet_diffusion_music_codes.py
index 50f2ed27..d0818744 100644
--- a/codes/models/audio/music/unet_diffusion_music_codes.py
+++ b/codes/models/audio/music/unet_diffusion_music_codes.py
@@ -8,7 +8,6 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torchvision # For debugging, not actually used.
-from x_transformers.x_transformers import RelativePositionBias
from models.audio.music.gpt_music import GptMusicLower
from models.audio.music.music_quantizer import MusicQuantizer
@@ -291,10 +290,6 @@ class AttentionBlock(nn.Module):
self.attention = QKVAttentionLegacy(self.num_heads)
self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
- if relative_pos_embeddings:
- self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64)
- else:
- self.relative_pos_embeddings = None
def forward(self, x, mask=None):
if self.do_checkpoint:
@@ -306,7 +301,7 @@ class AttentionBlock(nn.Module):
b, c, *spatial = x.shape
x = x.reshape(b, c, -1)
qkv = self.qkv(self.norm(x))
- h = self.attention(qkv, mask, self.relative_pos_embeddings)
+ h = self.attention(qkv, mask)
h = self.proj_out(h)
return (x + h).reshape(b, c, *spatial)
diff --git a/codes/models/audio/tts/diffusion_encoder.py b/codes/models/audio/tts/diffusion_encoder.py
index 56d25d61..1dc91c0d 100644
--- a/codes/models/audio/tts/diffusion_encoder.py
+++ b/codes/models/audio/tts/diffusion_encoder.py
@@ -6,7 +6,7 @@ from functools import partial
import torch
import torch.nn as nn
from x_transformers.x_transformers import groupby_prefix_and_trim, FixedPositionalEmbedding, default, RotaryEmbedding, \
- DEFAULT_DIM_HEAD, RelativePositionBias, LearnedAlibiPositionalBias, AlibiPositionalBias, ScaleNorm, RMSNorm, Rezero, \
+ DEFAULT_DIM_HEAD, RelativePositionBias, LearnedAlibiPositionalBias, AlibiPositionalBias, ScaleNorm, RMSNorm, \
exists, Attention, FeedForward, Scale, ShiftTokens, GRUGating, Residual, cast_tuple, equals, LayerIntermediates, \
AttentionLayers, not_equals
diff --git a/codes/models/diffusion/unet_diffusion.py b/codes/models/diffusion/unet_diffusion.py
index d99430e7..e2ed97b7 100644
--- a/codes/models/diffusion/unet_diffusion.py
+++ b/codes/models/diffusion/unet_diffusion.py
@@ -8,7 +8,6 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torchvision # For debugging, not actually used.
-from x_transformers.x_transformers import RelativePositionBias
from models.diffusion.fp16_util import convert_module_to_f16, convert_module_to_f32
from models.diffusion.nn import (
@@ -298,7 +297,6 @@ class AttentionBlock(nn.Module):
num_head_channels=-1,
use_new_attention_order=False,
do_checkpoint=True,
- relative_pos_embeddings=False,
):
super().__init__()
self.channels = channels
@@ -320,10 +318,6 @@ class AttentionBlock(nn.Module):
self.attention = QKVAttentionLegacy(self.num_heads)
self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
- if relative_pos_embeddings:
- self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64)
- else:
- self.relative_pos_embeddings = None
def forward(self, x, mask=None):
if self.do_checkpoint:
@@ -335,7 +329,7 @@ class AttentionBlock(nn.Module):
b, c, *spatial = x.shape
x = x.reshape(b, c, -1)
qkv = self.qkv(self.norm(x))
- h = self.attention(qkv, mask, self.relative_pos_embeddings)
+ h = self.attention(qkv, mask)
h = self.proj_out(h)
return (x + h).reshape(b, c, *spatial)
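
With these two changes, AttentionBlock in both UNet files runs plain QKV attention with only an optional mask;
in unet_diffusion.py the relative_pos_embeddings keyword is also removed from the constructor, so configs that
still pass it will fail. A minimal usage sketch (sizes are illustrative, not taken from the patch):

    import torch

    block = AttentionBlock(channels=256, num_heads=4)  # hypothetical sizes
    x = torch.randn(2, 256, 1024)                      # (batch, channels, sequence)
    y = block(x)                                       # same shape as x; no positional bias is applied
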
diff --git a/codes/models/lucidrains/x_transformers.py b/codes/models/lucidrains/x_transformers.py
index f4139f1c..32ad19e7 100644
--- a/codes/models/lucidrains/x_transformers.py
+++ b/codes/models/lucidrains/x_transformers.py
@@ -10,11 +10,8 @@ from collections import namedtuple
from einops import rearrange, repeat, reduce
from einops.layers.torch import Rearrange
-from entmax import entmax15
from torch.utils.checkpoint import checkpoint
-from x_transformers.autoregressive_wrapper import AutoregressiveWrapper
-
DEFAULT_DIM_HEAD = 64
Intermediates = namedtuple('Intermediates', [
@@ -1274,51 +1271,3 @@ class ContinuousTransformerWrapper(nn.Module):
if len(res) > 1:
return tuple(res)
return res[0]
-
-
-class XTransformer(nn.Module):
- def __init__(
- self,
- *,
- dim,
- tie_token_emb=False,
- **kwargs
- ):
- super().__init__()
- enc_kwargs, kwargs = groupby_prefix_and_trim('enc_', kwargs)
- dec_kwargs, kwargs = groupby_prefix_and_trim('dec_', kwargs)
-
- assert 'dim' not in enc_kwargs and 'dim' not in dec_kwargs, 'dimension of either encoder or decoder must be set with `dim` keyword'
- enc_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], enc_kwargs)
- enc_transformer_kwargs['emb_dropout'] = enc_kwargs.pop('emb_dropout', 0)
- enc_transformer_kwargs['num_memory_tokens'] = enc_kwargs.pop('num_memory_tokens', None)
- enc_transformer_kwargs['use_pos_emb'] = enc_kwargs.pop('use_pos_emb', True)
-
- dec_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], dec_kwargs)
- dec_transformer_kwargs['emb_dropout'] = dec_kwargs.pop('emb_dropout', 0)
- dec_transformer_kwargs['use_pos_emb'] = dec_kwargs.pop('use_pos_emb', True)
-
- self.encoder = TransformerWrapper(
- **enc_transformer_kwargs,
- attn_layers=Encoder(dim=dim, **enc_kwargs)
- )
-
- self.decoder = TransformerWrapper(
- **dec_transformer_kwargs,
- attn_layers=Decoder(dim=dim, cross_attend=True, **dec_kwargs)
- )
-
- if tie_token_emb:
- self.decoder.token_emb = self.encoder.token_emb
-
- self.decoder = AutoregressiveWrapper(self.decoder)
-
- @torch.no_grad()
- def generate(self, seq_in, seq_out_start, seq_len, src_mask=None, src_attn_mask=None, **kwargs):
- encodings = self.encoder(seq_in, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
- return self.decoder.generate(seq_out_start, seq_len, context=encodings, context_mask=src_mask, **kwargs)
-
- def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_attn_mask=None):
- enc = self.encoder(src, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
- out = self.decoder(tgt, context=enc, mask=tgt_mask, context_mask=src_mask)
- return out
diff --git a/codes/requirements.txt b/codes/requirements.txt
index c4b32d28..c79c7909 100644
--- a/codes/requirements.txt
+++ b/codes/requirements.txt
@@ -30,11 +30,13 @@ Unidecode==1.0.22
tgt == 1.4.4
pyworld == 0.2.10
audio2numpy
+SoundFile
# For text stuff
transformers
tokenizers
jiwer # calculating WER
+omegaconf
# lucidrains stuff
vector_quantize_pytorch
@@ -42,4 +44,5 @@ linear_attention_transformer
rotary-embedding-torch
axial_positional_embedding
g-mlp-pytorch
-x-clip
\ No newline at end of file
+x-clip
+x_transformers
\ No newline at end of file
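
A quick smoke test for the newly added dependencies, assuming the packages expose their usual top-level modules:

    # pip install -r codes/requirements.txt
    import soundfile        # SoundFile: libsndfile bindings for audio I/O
    import omegaconf        # OmegaConf: YAML-backed configuration objects
    import x_transformers   # lucidrains transformer building blocks, now installed from PyPI
    print(soundfile.__version__, omegaconf.__version__)
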
diff --git a/codes/trainer/eval/music_diffusion_fid.py b/codes/trainer/eval/music_diffusion_fid.py
index 08ff3fe1..c4b05356 100644
--- a/codes/trainer/eval/music_diffusion_fid.py
+++ b/codes/trainer/eval/music_diffusion_fid.py
@@ -315,14 +315,15 @@ class MusicDiffusionFid(evaluator.Evaluator):
if __name__ == '__main__':
# For multilevel SR:
+ """
diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr.yml', 'generator',
also_load_savepoint=False, strict_load=False,
- load_path='X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr\\models\\18000_generator.pth'
+ load_path='X:\\dlas\\experiments\\train_music_diffusion_multilevel_sr_archived_prev2\\models\\18000_generator.pth'
).cuda()
opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :)
#'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety.
- 'diffusion_steps': 256, # basis: 192
- 'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': False, 'clip_audio': True,
+ 'diffusion_steps': 64, # basis: 192
+ 'conditioning_free': False, 'conditioning_free_k': 1, 'use_ddim': True, 'clip_audio': True,
'diffusion_schedule': 'cosine', 'diffusion_type': 'chained_sr',
}
@@ -334,13 +335,12 @@ if __name__ == '__main__':
).cuda()
opt_eval = {'path': 'Y:\\split\\yt-music-eval', # eval music, mostly electronica. :)
#'path': 'E:\\music_eval', # this is music from the training dataset, including a lot more variety.
- 'diffusion_steps': 256, # basis: 192
- 'conditioning_free': True, 'conditioning_free_k': 1, 'use_ddim': False, 'clip_audio': True,
+ 'diffusion_steps': 64, # basis: 192
+ 'conditioning_free': False, 'conditioning_free_k': 1, 'use_ddim': True, 'clip_audio': True,
'diffusion_schedule': 'cosine', 'diffusion_type': 'from_codes_quant',
}
- """
- env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 11, 'device': 'cuda', 'opt': {}}
+ env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 12, 'device': 'cuda', 'opt': {}}
eval = MusicDiffusionFid(diffusion, opt_eval, env)
fds = []
for i in range(2):
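
For reference, the edits to opt_eval above switch the evaluator to a faster sampling configuration; a condensed
view of the toggled keys (values taken from the patch, other keys unchanged):

    opt_eval_changes = {
        'diffusion_steps': 64,       # was 256
        'use_ddim': True,            # was False
        'conditioning_free': False,  # was True
    }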