That was actually all pointless, since SDPA always has an attention mask fed to it and does not need is_causal to implicitly generate one
This commit is contained in:
parent 4aa685e749
commit ccee5fc11c
@@ -223,9 +223,16 @@ class LlamaAttention_Adapted(LlamaAttention):
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         mode = "default" if output_attentions else self.mode
+        non_split_attention = [
+            "default",
+            torch.nn.attention.SDPBackend.MATH,
+            torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
+            torch.nn.attention.SDPBackend.FLASH_ATTENTION,
+            torch.nn.attention.SDPBackend.CUDNN_ATTENTION
+        ]

         # split per batch because other attention mechanisms do not have a conditional is_causal per-batch, only for the entire input
-        if isinstance( is_causal, list ) and mode not in ["default"]:
+        if isinstance( is_causal, list ) and mode not in non_split_attention:
             # initialize lists
             attn_hidden_states = [ None for _ in is_causal ]
             self_attn_weights = [ None for _ in is_causal ]
@@ -282,35 +289,6 @@ class LlamaAttention_Adapted(LlamaAttention):

             return attn_hidden_states, output_attentions, []

-        """
-        h_s = []
-        s_a_w = []
-        p_k_v = []
-
-        for i, state in enumerate(is_causal):
-            hidden_state, self_attn_weight, present_key_value = self.forward(
-                hidden_states=hidden_states[i].unsqueeze(0),
-                attention_mask=attention_mask[i].unsqueeze(0),
-                is_causal=state,
-                position_ids=position_ids[i].unsqueeze(0),
-                past_key_value=past_key_value,
-                output_attentions=output_attentions,
-                use_cache=False,
-                cache_position=cache_position,
-                position_embeddings=(position_embeddings[0][i].unsqueeze(0), position_embeddings[1][i].unsqueeze(0)) if position_embeddings is not None else None,
-                **kwargs,
-            )
-            h_s.append(hidden_state)
-            s_a_w.append(self_attn_weight)
-            p_k_v.append(present_key_value)
-
-        return (
-            torch.concat( h_s, dim=0 ),
-            torch.concat( s_a_w, dim=0 ) if s_a_w else None,
-            p_k_v,
-        )
-        """
-
         dropout_rate = self.attention_dropout if self.training else 0.0
         bsz, q_len, _ = hidden_states.size()

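To illustrate the claim in the commit message: PyTorch's torch.nn.functional.scaled_dot_product_attention does not need is_causal once it is handed an explicit attention mask that already encodes causality. A minimal sketch, not part of this commit, using arbitrary placeholder shapes:

import torch
import torch.nn.functional as F

# Arbitrary placeholder shapes: (batch, heads, seq_len, head_dim)
q = torch.randn(1, 4, 8, 16)
k = torch.randn(1, 4, 8, 16)
v = torch.randn(1, 4, 8, 16)

# Explicit boolean causal mask: True means "may attend".
causal_mask = torch.tril(torch.ones(8, 8, dtype=torch.bool))

# Passing the mask explicitly (is_causal left at its default of False) ...
out_masked = F.scaled_dot_product_attention(q, k, v, attn_mask=causal_mask)
# ... matches letting SDPA generate the causal mask implicitly from is_causal.
out_implicit = F.scaled_dot_product_attention(q, k, v, is_causal=True)

assert torch.allclose(out_masked, out_implicit, atol=1e-6)

Since the adapted attention layer always receives such a mask, the per-sample is_causal bookkeeping that the deleted commented-out block handled adds nothing.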
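For context on the new non_split_attention list: besides the string "default", its entries are members of torch.nn.attention.SDPBackend, the enum recent PyTorch releases use to pin scaled_dot_product_attention to a specific kernel. The sketch below is an assumption-laden illustration (the attend helper and its signature are hypothetical, not the repository's code) of how such a mode value can gate the per-batch split that the diff's comment describes:

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Mirrors the list added in the diff: modes that accept a single attention mask
# for the whole batch, so no per-sample split is needed.
non_split_attention = [
    "default",
    SDPBackend.MATH,
    SDPBackend.EFFICIENT_ATTENTION,
    SDPBackend.FLASH_ATTENTION,
    SDPBackend.CUDNN_ATTENTION,
]

def attend(q, k, v, mask, mode, is_causal):  # hypothetical helper, illustration only
    if isinstance(is_causal, list) and mode not in non_split_attention:
        # A per-sample is_causal list only matters for mechanisms outside the
        # list above; those would have to process the batch one sample at a time.
        raise NotImplementedError("per-sample fallback elided in this sketch")
    if mode == "default":
        return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
    # Pin SDPA to the requested backend for the duration of the call.
    with sdpa_kernel(mode):
        return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)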