Commit 102dc41

Rename flash-attn to flash-attn2 (#4514)
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
1 parent: 5de62b0

3 files changed: +8 −8 lines

docs/source/kernels_hub.md

Lines changed: 4 additions & 4 deletions

````diff
@@ -27,20 +27,20 @@ from transformers import AutoModelForCausalLM
 
 model = AutoModelForCausalLM.from_pretrained(
     "your-model-name",
-    attn_implementation="kernels-community/flash-attn"  # other options: kernels-community/vllm-flash-attn3, kernels-community/paged-attention
+    attn_implementation="kernels-community/flash-attn2"  # other options: kernels-community/vllm-flash-attn3, kernels-community/paged-attention
 )
 ```
 
 Or when running a TRL training script:
 
 ```bash
-python sft.py ... --attn_implementation kernels-community/flash-attn
+python sft.py ... --attn_implementation kernels-community/flash-attn2
 ```
 
 Or using the TRL CLI:
 
 ```bash
-trl sft ... --attn_implementation kernels-community/flash-attn
+trl sft ... --attn_implementation kernels-community/flash-attn2
 ```
 
 > [!TIP]
@@ -84,7 +84,7 @@ from trl import SFTConfig
 
 model = AutoModelForCausalLM.from_pretrained(
     "your-model-name",
-    attn_implementation="kernels-community/flash-attn"  # choose the desired FlashAttention variant
+    attn_implementation="kernels-community/flash-attn2"  # choose the desired FlashAttention variant
 )
 
 training_args = SFTConfig(
````
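The documentation change only swaps the kernel name passed through `attn_implementation`. As an illustration that is not part of this commit, the sketch below combines the renamed kernel with TRL's `SFTTrainer`; the model and dataset names are placeholders, not values from the diff.

```python
# Illustrative sketch, not from this commit: load a model with the renamed
# Hub kernel and fine-tune it with TRL. Model/dataset names are placeholders.
from datasets import load_dataset
from transformers import AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B",  # placeholder model
    attn_implementation="kernels-community/flash-attn2",  # kernel fetched from the Hub
)

trainer = SFTTrainer(
    model=model,
    args=SFTConfig(output_dir="sft-output"),
    train_dataset=load_dataset("trl-lib/Capybara", split="train"),  # placeholder dataset
)
trainer.train()
```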

trl/trainer/model_config.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -43,8 +43,8 @@ class ModelConfig:
             be set to `True` for repositories you trust and in which you have read the code, as it will execute code
             present on the Hub on your local machine.
         attn_implementation (`str`, *optional*):
-            Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in which case
-            you must install this manually by running `pip install flash-attn --no-build-isolation`.
+            Which attention implementation to use. More information in the [Kernels Hub Integrations
+            Guide](kernels_hub).
         use_peft (`bool`, *optional*, defaults to `False`):
             Whether to use PEFT for training.
         lora_r (`int`, *optional*, defaults to `16`):
```
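The docstring now points users at the Kernels Hub guide instead of a manual `pip install flash-attn`. For context only, here is a minimal sketch of how `ModelConfig.attn_implementation` typically reaches the model loader, assuming TRL's `TrlParser` and the fields already defined on `ModelConfig`; none of this code is introduced by the commit.

```python
# Sketch only, assuming TRL's TrlParser and existing ModelConfig fields;
# not code added by this commit.
from transformers import AutoModelForCausalLM
from trl import ModelConfig, TrlParser

parser = TrlParser(ModelConfig)
(model_args,) = parser.parse_args_and_config()

model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    attn_implementation=model_args.attn_implementation,  # e.g. "kernels-community/flash-attn2"
    trust_remote_code=model_args.trust_remote_code,
)
```

Run such a script with, for example, `--attn_implementation kernels-community/flash-attn2` to pick up the renamed kernel.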

trl/trainer/sft_trainer.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -72,9 +72,9 @@
 FLASH_ATTENTION_VARIANTS = {
     "flash_attention_2",
     "flash_attention_3",
-    "kernels-community/flash-attn",
-    "kernels-community/vllm-flash-attn3",
+    "kernels-community/flash-attn2",
     "kernels-community/flash-attn3",
+    "kernels-community/vllm-flash-attn3",
 }
```
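`FLASH_ATTENTION_VARIANTS` is the set the trainer consults to recognize flash-attention-style backends, so renaming the Hub kernel here keeps detection consistent with the new name. A hedged sketch of that kind of membership check follows; the helper name and the `config._attn_implementation` attribute are assumptions, not code from this commit.

```python
# Sketch only: a membership check against the renamed variant set.
# `uses_flash_attention` is a hypothetical helper, not part of this commit.
FLASH_ATTENTION_VARIANTS = {
    "flash_attention_2",
    "flash_attention_3",
    "kernels-community/flash-attn2",
    "kernels-community/flash-attn3",
    "kernels-community/vllm-flash-attn3",
}


def uses_flash_attention(model) -> bool:
    """Return True if the model was loaded with any supported flash-attention variant."""
    return getattr(model.config, "_attn_implementation", None) in FLASH_ATTENTION_VARIANTS
```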
