from inspect import isfunction
from math import ceil

import torch
from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange, repeat

from rotary_embedding_torch import apply_rotary_emb

# helpers

def exists(val):
    return val is not None

def uniq(arr):
    return {el: True for el in arr}.keys()

def default(val, d):
    if exists(val):
        return val
    return d() if isfunction(d) else d

def max_neg_value(t):
    return -torch.finfo(t.dtype).max

def stable_softmax(t, dim = -1, alpha = 32 ** 2):
    t = t / alpha
    t = t - torch.amax(t, dim = dim, keepdim = True).detach()
    return (t * alpha).softmax(dim = dim)
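# Illustrative check (a hypothetical helper, not part of the original module):
# stable_softmax scales logits down by `alpha` before the max-subtraction so the
# intermediate values stay small (useful under fp16), then scales back up; since
# softmax is shift-invariant, the result should match a plain softmax.
def _check_stable_softmax():
    t = torch.randn(2, 8, 16)
    assert torch.allclose(stable_softmax(t), torch.softmax(t, dim = -1), atol = 1e-6)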
def apply_pos_emb(pos_emb, qkv):
    n = qkv[0].shape[-2]
    pos_emb = pos_emb[..., :n, :]
    return tuple(map(lambda t: apply_rotary_emb(pos_emb, t), qkv))

# classes

class Attention(nn.Module):
    def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropout = 0., stable = False):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.seq_len = seq_len
        self.scale = dim_head ** -0.5

        self.stable = stable
        self.causal = causal

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask = None, rotary_pos_emb = None):
        b, n, _, h, device = *x.shape, self.heads, x.device
        softmax = torch.softmax if not self.stable else stable_softmax

        # project to queries / keys / values and split out the heads
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)

        if exists(rotary_pos_emb):
            q, k, v = apply_pos_emb(rotary_pos_emb, (q, k, v))

        q = q * self.scale

        dots = torch.einsum('b h i d, b h j d -> b h i j', q, k)
        mask_value = max_neg_value(dots)

        # key padding mask (True = keep)
        if exists(mask):
            mask = rearrange(mask, 'b j -> b () () j')
            dots.masked_fill_(~mask, mask_value)
            del mask

        # causal mask - each position attends only to itself and earlier positions
        if self.causal:
            i, j = dots.shape[-2:]
            mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool()
            dots.masked_fill_(mask, mask_value)

        attn = softmax(dots, dim = -1)

        out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.to_out(out)
        return out
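# Illustrative usage sketch (hypothetical helper and sizes, not part of the original
# module): full causal self-attention over a batch of padded token sequences.
def _demo_attention():
    attn = Attention(dim = 512, seq_len = 1024, causal = True, heads = 8, dim_head = 64)
    x = torch.randn(2, 1024, 512)            # (batch, seq_len, dim)
    mask = torch.ones(2, 1024).bool()        # key padding mask, True = keep
    out = attn(x, mask = mask)               # (2, 1024, 512)
    return out.shape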
# sparse attention with convolutional pattern, as mentioned in the blog post. customizable kernel size and dilation

class SparseConvCausalAttention(nn.Module):
    def __init__(self, dim, seq_len, image_size = 32, kernel_size = 5, dilation = 1, heads = 8, dim_head = 64, dropout = 0., stable = False, **kwargs):
        super().__init__()
        assert kernel_size % 2 == 1, 'kernel size must be odd'

        inner_dim = dim_head * heads
        self.seq_len = seq_len
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.image_size = image_size
        self.kernel_size = kernel_size
        self.dilation = dilation

        self.stable = stable

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask = None, rotary_pos_emb = None):
        b, n, _, h, img_size, kernel_size, dilation, seq_len, device = *x.shape, self.heads, self.image_size, self.kernel_size, self.dilation, self.seq_len, x.device
        softmax = torch.softmax if not self.stable else stable_softmax

        img_seq_len = img_size ** 2
        text_len = seq_len + 1 - img_seq_len

        # padding

        padding = seq_len - n + 1
        mask = default(mask, lambda: torch.ones(b, text_len, device = device).bool())

        x = F.pad(x, (0, 0, 0, padding), value = 0)
        mask = mask[:, :text_len]

        # derive query / keys / values

        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), qkv)

        if exists(rotary_pos_emb):
            q, k, v = apply_pos_emb(rotary_pos_emb, (q, k, v))

        q *= self.scale

        ((q_text, q_img), (k_text, k_img), (v_text, v_img)) = map(lambda t: (t[:, :-img_seq_len], t[:, -img_seq_len:]), (q, k, v))

        # text attention

        dots_text = einsum('b i d, b j d -> b i j', q_text, k_text)
        mask_value = max_neg_value(dots_text)

        i, j = dots_text.shape[-2:]
        text_causal_mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool()
        dots_text.masked_fill_(text_causal_mask, mask_value)

        attn_text = softmax(dots_text, dim = -1)
        out_text = einsum('b i j, b j d -> b i d', attn_text, v_text)

        # image attention

        effective_kernel_size = (kernel_size - 1) * dilation + 1
        padding = effective_kernel_size // 2

        k_img, v_img = map(lambda t: rearrange(t, 'b (h w) c -> b c h w', h = img_size), (k_img, v_img))
        k_img, v_img = map(lambda t: F.unfold(t, kernel_size, padding = padding, dilation = dilation), (k_img, v_img))
        k_img, v_img = map(lambda t: rearrange(t, 'b (d j) i -> b i j d', j = kernel_size ** 2), (k_img, v_img))

        # let image attend to all of text

        dots_image = einsum('b i d, b i j d -> b i j', q_img, k_img)
        dots_image_to_text = einsum('b i d, b j d -> b i j', q_img, k_text)

        # calculate causal attention for local convolution

        i, j = dots_image.shape[-2:]
        img_seq = torch.arange(img_seq_len, device = device)
        k_img_indices = rearrange(img_seq.float(), '(h w) -> () () h w', h = img_size)
        k_img_indices = F.pad(k_img_indices, (padding,) * 4, value = img_seq_len) # padding set to be max, so it is never attended to
        k_img_indices = F.unfold(k_img_indices, kernel_size, dilation = dilation)
        k_img_indices = rearrange(k_img_indices, 'b j i -> b i j')

        # mask image attention

        q_img_indices = rearrange(img_seq, 'i -> () i ()')
        causal_mask = q_img_indices < k_img_indices

        # concat text mask with image causal mask

        causal_mask = repeat(causal_mask, '() i j -> b i j', b = b * h)
        mask = repeat(mask, 'b j -> (b h) i j', i = i, h = h)
        mask = torch.cat((~mask, causal_mask), dim = -1)

        # image can attend to all of text

        dots = torch.cat((dots_image_to_text, dots_image), dim = -1)
        dots.masked_fill_(mask, mask_value)

        attn = softmax(dots, dim = -1)

        # aggregate

        attn_image_to_text, attn_image = attn[..., :text_len], attn[..., text_len:]

        out_image_to_image = einsum('b i j, b i j d -> b i d', attn_image, v_img)
        out_image_to_text = einsum('b i j, b j d -> b i d', attn_image_to_text, v_text)

        out_image = out_image_to_image + out_image_to_text

        # combine attended values for both text and image

        out = torch.cat((out_text, out_image), dim = 1)

        out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
        out = self.to_out(out)
        return out[:, :n]
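# Illustrative usage sketch (hypothetical helper and sizes, not part of the original
# module). The input is assumed to be text tokens followed by image_size ** 2 image
# tokens, with seq_len chosen so that text_len = seq_len + 1 - image_size ** 2.
def _demo_sparse_conv_attention():
    image_size = 8                                    # 8 x 8 = 64 image tokens
    text_len = 65
    seq_len = text_len + image_size ** 2 - 1
    attn = SparseConvCausalAttention(dim = 512, seq_len = seq_len, image_size = image_size, kernel_size = 5)
    x = torch.randn(2, seq_len, 512)
    out = attn(x)                                     # (2, seq_len, 512)
    return out.shape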
# sparse axial causal attention

class SparseAxialCausalAttention(nn.Module):
    def __init__(self, dim, seq_len, image_size = 32, axis = 0, heads = 8, dim_head = 64, dropout = 0., stable = False, **kwargs):
        super().__init__()
        assert axis in {0, 1}, 'axis must be either 0 (along height) or 1 (along width)'
        self.axis = axis

        inner_dim = dim_head * heads
        self.seq_len = seq_len
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.image_size = image_size

        self.stable = stable

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask = None, rotary_pos_emb = None):
        b, n, _, h, img_size, axis, seq_len, device = *x.shape, self.heads, self.image_size, self.axis, self.seq_len, x.device
        softmax = torch.softmax if not self.stable else stable_softmax

        img_seq_len = img_size ** 2
        text_len = seq_len + 1 - img_seq_len

        # padding

        padding = seq_len - n + 1
        mask = default(mask, lambda: torch.ones(b, text_len, device = device).bool())

        x = F.pad(x, (0, 0, 0, padding), value = 0)
        mask = mask[:, :text_len]

        # derive queries / keys / values

        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), qkv)

        if exists(rotary_pos_emb):
            q, k, v = apply_pos_emb(rotary_pos_emb, (q, k, v))

        q *= self.scale

        ((q_text, q_img), (k_text, k_img), (v_text, v_img)) = map(lambda t: (t[:, :-img_seq_len], t[:, -img_seq_len:]), (q, k, v))

        # text attention

        dots_text = einsum('b i d, b j d -> b i j', q_text, k_text)
        mask_value = max_neg_value(dots_text)

        i, j = dots_text.shape[-2:]
        text_causal_mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool()
        dots_text.masked_fill_(text_causal_mask, mask_value)

        attn_text = softmax(dots_text, dim = -1)
        out_text = einsum('b i j, b j d -> b i d', attn_text, v_text)

        # image attention

        split_axis_einops = 'b (h w) c -> b h w c' if axis == 0 else 'b (h w) c -> b w h c'
        merge_axis_einops = 'b x n d -> b (x n) d' if axis == 0 else 'b x n d -> b (n x) d'

        # split out axis

        q_img, k_img, v_img = map(lambda t: rearrange(t, split_axis_einops, h = img_size), (q_img, k_img, v_img))

        # similarity

        dots_image_to_image = einsum('b x i d, b x j d -> b x i j', q_img, k_img)
        dots_image_to_text = einsum('b x i d, b j d -> b x i j', q_img, k_text)

        dots = torch.cat((dots_image_to_text, dots_image_to_image), dim = -1)

        # mask so image has full attention to text, but causal along axis

        bh, x, i, j = dots.shape
        causal_mask = torch.ones(i, img_size, device = device).triu_(img_size - i + 1).bool()
        causal_mask = repeat(causal_mask, 'i j -> b x i j', b = bh, x = x)

        mask = repeat(mask, 'b j -> (b h) x i j', h = h, x = x, i = i)
        mask = torch.cat((~mask, causal_mask), dim = -1)

        dots.masked_fill_(mask, mask_value)

        # attention.

        attn = softmax(dots, dim = -1)

        # aggregate

        attn_image_to_text, attn_image_to_image = attn[..., :text_len], attn[..., text_len:]

        out_image_to_image = einsum('b x i j, b x j d -> b x i d', attn_image_to_image, v_img)
        out_image_to_text = einsum('b x i j, b j d -> b x i d', attn_image_to_text, v_text)

        out_image = out_image_to_image + out_image_to_text

        # merge back axis

        out_image = rearrange(out_image, merge_axis_einops, x = img_size)

        # combine attended values for both text and image

        out = torch.cat((out_text, out_image), dim = 1)

        out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
        out = self.to_out(out)
        return out[:, :n]
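# Illustrative usage sketch (hypothetical helper and sizes, not part of the original
# module): axis = 0 attends causally along image rows, axis = 1 along image columns,
# so the two layers are typically stacked to cover both directions.
def _demo_sparse_axial_attention():
    image_size = 8
    text_len = 65
    seq_len = text_len + image_size ** 2 - 1
    row_attn = SparseAxialCausalAttention(dim = 512, seq_len = seq_len, image_size = image_size, axis = 0)
    col_attn = SparseAxialCausalAttention(dim = 512, seq_len = seq_len, image_size = image_size, axis = 1)
    x = torch.randn(2, seq_len, 512)
    out = col_attn(row_attn(x))                       # (2, seq_len, 512)
    return out.shape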
# microsoft sparse attention CUDA kernel

class SparseAttention(Attention):
    def __init__(
        self,
        *args,
        block_size = 16,
        text_seq_len = 256,
        num_random_blocks = None,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        from deepspeed.ops.sparse_attention import SparseSelfAttention, VariableSparsityConfig
        self.block_size = block_size

        num_random_blocks = default(num_random_blocks, self.seq_len // block_size // 4)
        global_block_indices = list(range(ceil(text_seq_len / block_size)))

        self.attn_fn = SparseSelfAttention(
            sparsity_config = VariableSparsityConfig(
                num_heads = self.heads,
                block = self.block_size,
                num_random_blocks = num_random_blocks,
                global_block_indices = global_block_indices,
                attention = 'unidirectional' if self.causal else 'bidirectional'
            ),
            max_seq_length = self.seq_len,
            attn_mask_mode = 'add'
        )

    def forward(self, x, mask = None, rotary_pos_emb = None):
        b, n, _, h, device = *x.shape, self.heads, x.device
        remainder = n % self.block_size
        mask = default(mask, lambda: torch.ones(b, n, device = device).bool())

        if remainder > 0:
            padding = self.block_size - remainder
            x = F.pad(x, (0, 0, 0, padding), value = 0)
            mask = F.pad(mask, (0, padding), value = False)

        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)

        if exists(rotary_pos_emb):
            q, k, v = apply_pos_emb(rotary_pos_emb, (q, k, v))

        key_pad_mask = None
        if exists(mask):
            key_pad_mask = ~mask

        attn_mask = None
        if self.causal:
            i, j = q.shape[-2], k.shape[-2]
            mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool()
            attn_mask = torch.zeros(i, j, device = device).to(q)
            mask_value = max_neg_value(q) / 2
            attn_mask.masked_fill_(mask, mask_value)

        out = self.attn_fn(q, k, v, attn_mask = attn_mask, key_padding_mask = key_pad_mask)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.to_out(out)
        return out[:, :n]
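# Illustrative usage sketch (hypothetical helper and sizes, not part of the original
# module). SparseAttention defers to DeepSpeed's SparseSelfAttention, so this only
# runs with deepspeed's sparse_attention ops installed and a CUDA device available;
# seq_len should be a multiple of block_size for the block-sparse kernel.
def _demo_sparse_attention():
    attn = SparseAttention(512, 1024, causal = True, heads = 8, dim_head = 64,
                           block_size = 16, text_seq_len = 256).cuda()
    x = torch.randn(2, 1024, 512).cuda()
    out = attn(x)                                     # (2, 1024, 512)
    return out.shape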