Commit ce42aa7

Update modeling_flash_attention_utils.py
1 parent d874b2d commit ce42aa7

File tree

1 file changed (+1, -1)

src/transformers/modeling_flash_attention_utils.py

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ def _upad_input(
     indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

     # With static caches, the k/v states may be larger than the mask -> we need to slice them to avoid generating garbage
-    # It's a bit of an anti-pattern, but otherwise we silently compute wrong attentions
+    # It's a bit of an anti-pattern, but otherwise we silently compute wrong attentions scores
     if key_layer.shape[1] > (seq_len := attention_mask.shape[-1]):
         key_layer, value_layer = key_layer[:, :seq_len, :, :], value_layer[:, :seq_len, :, :]
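For context, here is a minimal standalone sketch (not code from the repository) of the situation the patched comment describes: with a static cache, key/value states are pre-allocated to the cache's maximum length, so their sequence dimension can exceed the attention mask's length, and the guard slices them back down before unpadding. The tensor shapes, batch size, and head counts below are illustrative assumptions.

import torch

# Hypothetical sizes: a static cache pre-allocated to 32 positions,
# while the batch only contains 10 real tokens.
batch, max_cache_len, num_heads, head_dim = 2, 32, 4, 8
real_seq_len = 10

# Stand-ins for key_layer / value_layer with shape (batch, cache_len, heads, head_dim);
# everything past real_seq_len is "garbage" the cache has not been written with yet.
key_layer = torch.randn(batch, max_cache_len, num_heads, head_dim)
value_layer = torch.randn(batch, max_cache_len, num_heads, head_dim)
attention_mask = torch.ones(batch, real_seq_len, dtype=torch.long)

# Same guard as in the diff: when the pre-allocated k/v states are longer than
# the mask, slice them to the mask length so attention is only computed over
# valid positions.
if key_layer.shape[1] > (seq_len := attention_mask.shape[-1]):
    key_layer, value_layer = key_layer[:, :seq_len, :, :], value_layer[:, :seq_len, :, :]

print(key_layer.shape)  # torch.Size([2, 10, 4, 8])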
