Incorporated fast attention (F.scaled_dot_product_attention) into MultiheadAttention

This commit is contained in:
Matthew Smith 2023-03-31 11:15:36 +09:00
parent 4ae3b248ee
commit 37b64d41ce

View File

@ -84,31 +84,26 @@ class MultiheadAttention(nn.Module):
q = self.q_proj(query) q = self.q_proj(query)
k = self.k_proj(key) k = self.k_proj(key)
v = self.v_proj(value) v = self.v_proj(value)
q *= self.scaling
q = q.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2) q = q.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
k = k.view(bsz, src_len, self.num_heads, self.head_dim).transpose(1, 2) k = k.view(bsz, src_len, self.num_heads, self.head_dim).transpose(1, 2)
v = v.view(bsz, src_len, self.num_heads, self.head_dim).transpose(1, 2) v = v.view(bsz, src_len, self.num_heads, self.head_dim).transpose(1, 2)
q = q.reshape(bsz * self.num_heads, tgt_len, self.head_dim) q = q.reshape(bsz, self.num_heads, tgt_len, self.head_dim)
k = k.reshape(bsz * self.num_heads, src_len, self.head_dim) k = k.reshape(bsz, self.num_heads, src_len, self.head_dim)
v = v.reshape(bsz * self.num_heads, src_len, self.head_dim) v = v.reshape(bsz, self.num_heads, src_len, self.head_dim)
if incremental_state is not None: if incremental_state is not None:
if "prev_key" in incremental_state: if "prev_key" in incremental_state:
prev_key = incremental_state["prev_key"].view( prev_key = incremental_state["prev_key"].view(
bsz * self.num_heads, -1, self.head_dim bsz, self.num_heads, -1, self.head_dim
) )
prev_value = incremental_state["prev_value"].view( prev_value = incremental_state["prev_value"].view(
bsz * self.num_heads, -1, self.head_dim bsz, self.num_heads, -1, self.head_dim
) )
k = torch.cat([prev_key, k], dim=1) k = torch.cat([prev_key, k], dim=1)
v = torch.cat([prev_value, v], dim=1) v = torch.cat([prev_value, v], dim=1)
incremental_state["prev_key"] = k.view( incremental_state["prev_key"] = k
bsz, self.num_heads, -1, self.head_dim incremental_state["prev_value"] = v
)
incremental_state["prev_value"] = v.view(
bsz, self.num_heads, -1, self.head_dim
)
src_len = k.size(1) src_len = k.size(1)
if self.xpos is not None: if self.xpos is not None:
@ -116,31 +111,36 @@ class MultiheadAttention(nn.Module):
offset = src_len - 1 offset = src_len - 1
else: else:
offset = 0 offset = 0
k, q = map(lambda t: t.view(bsz * self.num_heads, -1, self.head_dim), (k, q))
k = self.xpos(k, offset=0, downscale=True) k = self.xpos(k, offset=0, downscale=True)
q = self.xpos(q, offset=offset, downscale=False) q = self.xpos(q, offset=offset, downscale=False)
k, q = map(lambda t: t.view(bsz, self.num_heads, -1, self.head_dim), (k, q))
attn_weights = torch.bmm(q, k.transpose(1, 2))
if attn_mask is not None: if attn_mask is not None:
attn_weights = torch.nan_to_num(attn_weights)
attn_mask = attn_mask.unsqueeze(0) attn_mask = attn_mask.unsqueeze(0)
attn_weights += attn_mask
if key_padding_mask is not None: if key_padding_mask is not None:
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) # Achieve same result with an additive mask
attn_weights = attn_weights.masked_fill( attn_mask += key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.float32) * float("-inf")
key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
float("-inf"),
)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if rel_pos is not None: if rel_pos is not None:
rel_pos = rel_pos.view(attn_weights.size()) attn_mask += rel_pos.view(attn_mask.size())
attn_weights = attn_weights + rel_pos
if hasattr(F, "scaled_dot_product_attention"):
attn = F.scaled_dot_product_attention(
q, k, v, attn_mask, self.dropout_module.p
)
attn_weights = None
else:
q *= self.scaling
q, k, v = map(lambda t: t.view(bsz * self.num_heads, -1, self.head_dim), (q, k, v))
attn_weights = torch.bmm(q, k.transpose(1, 2))
attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).type_as( attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).type_as(
attn_weights attn_weights
) )
attn_weights = attn_weights.view(
bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
attn_probs = self.dropout_module(attn_weights) attn_probs = self.dropout_module(attn_weights)
attn = torch.bmm(attn_probs, v) attn = torch.bmm(attn_probs, v)
@ -150,8 +150,4 @@ class MultiheadAttention(nn.Module):
attn = self.inner_attn_ln(attn) attn = self.inner_attn_ln(attn)
attn = self.out_proj(attn) attn = self.out_proj(attn)
attn_weights = attn_weights.view(
bsz, self.num_heads, tgt_len, src_len
).transpose(1, 0)
return attn, attn_weights return attn, attn_weights