huggingface · vasqu · Aug 22, 2025 · Aug 21, 2025 · Aug 22, 2025 · Aug 22, 2025
diff --git a/src/transformers/integrations/flex_attention.py b/src/transformers/integrations/flex_attention.py
@@ -90,7 +90,7 @@ def compile_friendly_flex_attention(
     value: torch.Tensor,
     training=False,
     **kwargs,
-) -> torch.Tensor:
+) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
     # First call initialise singleton wrapper object, second call invokes the object method to return compiled flex attention
     # Do not use compiled version if already compiling forward (it raises issues)
     flex_attention_compiled = WrappedFlexAttention(training)() if not is_torchdynamo_compiling() else flex_attention
@@ -243,7 +243,7 @@ def flex_attention_forward(
     head_mask: Optional[torch.Tensor] = None,
     s_aux: Optional[torch.Tensor] = None,
     **kwargs,
-) -> tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     if head_mask is not None:
         logger.warning_once(
             "`flex_attention` does not support `head_mask`. Please set your attention to `eager` if you want this feature."
@@ -290,7 +290,10 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
         enable_gqa = False
 
     kernel_options = kwargs.get("kernel_options")
-    attn_output, attention_weights = compile_friendly_flex_attention(
+    # On CPU we must skip returning LSE due to a runtime issue; elsewhere, follow PyTorch API and return it
+    return_lse = query.device.type != "cpu"
+
+    flex_attention_output = compile_friendly_flex_attention(
         query,
         key,
         value,
@@ -301,11 +304,16 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
         kernel_options=kernel_options,
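-        # Last time checked on PyTorch == 2.5.1: Flex Attention always computes the lse regardless.
-        # For simplification, we thus always return it as no additional computations are introduced.
-        return_lse=True,
+        # Last time checked on PyTorch == 2.5.1: Flex Attention always computes the lse regardless,
+        # so requesting it adds no extra computation. We return it everywhere except on CPU (see `return_lse`).
+        return_lse=return_lse,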
         training=module.training,
     )
     # lse is returned in float32
-    attention_weights = attention_weights.to(value.dtype)
-    attn_output = attn_output.transpose(1, 2).contiguous()
+    if return_lse:
+        attention_output, lse = flex_attention_output  # type: ignore[misc]
+        lse = lse.to(value.dtype)
+    else:
+        attention_output = flex_attention_output  # type: ignore[assignment]
+        lse = None
 
-    return attn_output, attention_weights
+    attention_output = attention_output.transpose(1, 2).contiguous()
+    return attention_output, lse