
Commit 7892257

FA2 can continue generation from cache (#39843)

* add fa2 support to continue generation from cache
* update q-len
1 parent 9bfbdd2 commit 7892257
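
What this change enables, as a quick illustration before the diffs: with `flash_attention_2` and no attention mask, a model can be prefilled once and then continue decoding from the returned cache by passing position ids that start at the cached length. The sketch below is not part of the commit; it mirrors the new test added in tests/test_modeling_common.py, the checkpoint name is a placeholder, and it assumes flash-attn, a CUDA device, and bfloat16 support.

# Hypothetical usage sketch (placeholder checkpoint, not from the commit).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"  # placeholder; any FA2-capable decoder-only model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
).to("cuda").eval()

input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids.to("cuda")
position_ids = torch.arange(input_ids.shape[1], device="cuda").unsqueeze(0)

# Prefill: run the prompt once and keep the cache. No attention mask is passed,
# so FA2 has to infer the varlen layout from position ids alone.
with torch.no_grad():
    out = model(input_ids, position_ids=position_ids, use_cache=True)

# Decode: feed only the newly chosen token, with position ids offset by the cached
# length. This is the path addressed here: the position ids no longer start at 0.
next_token = out.logits[:, -1, :].argmax(-1, keepdim=True)
past_len = input_ids.shape[1]
new_position_ids = torch.arange(past_len, past_len + 1, device="cuda").unsqueeze(0)
with torch.no_grad():
    out = model(
        input_ids=next_token,
        past_key_values=out.past_key_values,
        position_ids=new_position_ids,
        use_cache=True,
    )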

File tree: 3 files changed, +132 -40 lines

src/transformers/generation/utils.py

Lines changed: 0 additions & 18 deletions
@@ -677,24 +677,6 @@ def prepare_inputs_for_generation(
         if encoder_attention_mask is not None:
             model_inputs["attention_mask"] = encoder_attention_mask

-        if "flash" in self.config._attn_implementation and self._supports_attention_backend:
-            tensor_kws = {"dtype": torch.int32, "device": self.device}
-            pos = model_inputs["position_ids"][:, -1]
-
-            cu_seq_lens_k = torch.cat([torch.zeros(1, **tensor_kws), pos.cumsum(0).add(1)], 0)
-            max_length_k = int(pos.max()) + 1
-
-            bs, seq_len = input_ids.size()
-            q_len = torch.ones(bs, **tensor_kws) if seq_len == 1 else pos.to(torch.int32).add(1)
-            cu_seq_lens_q = torch.cat([torch.zeros(1, **tensor_kws), q_len.cumsum(0)], 0)
-            max_length_q = int(q_len.max())
-
-            model_inputs.update(
-                cu_seq_lens_q=cu_seq_lens_q.to(self.device),
-                cu_seq_lens_k=cu_seq_lens_k.to(self.device),
-                max_length_q=max_length_q,
-                max_length_k=max_length_k,
-            )
         # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
         for key, value in kwargs.items():
             if key not in model_inputs:
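
This removes the generation-time computation of `cu_seq_lens_q` / `cu_seq_lens_k`; the same bookkeeping now happens inside `_prepare_from_posids` (next file). For intuition, a small worked example of that math, with made-up numbers and not taken from the commit:

# Toy example of the cu_seq_lens bookkeeping during cached decoding (illustrative values).
import torch

tensor_kws = {"dtype": torch.int32}
last_pos = torch.tensor([6, 9], **tensor_kws)   # newest position id of each of 2 sequences

# Keys/values: cache length + 1 new token per sequence -> lengths 7 and 10
cu_seq_lens_k = torch.cat([torch.zeros(1, **tensor_kws), last_pos.add(1).cumsum(0).to(torch.int32)], 0)
max_length_k = int(last_pos.max()) + 1          # 10

# Queries: a single new token per sequence during decoding
q_len = torch.ones(2, **tensor_kws)
cu_seq_lens_q = torch.cat([torch.zeros(1, **tensor_kws), q_len.cumsum(0).to(torch.int32)], 0)
max_length_q = int(q_len.max())                 # 1

print(cu_seq_lens_k.tolist())  # [0, 7, 17] -> where each sequence's keys start in the flattened batch
print(cu_seq_lens_q.tolist())  # [0, 1, 2]  -> one query token per sequence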

src/transformers/modeling_flash_attention_utils.py

Lines changed: 45 additions & 22 deletions
@@ -190,7 +190,7 @@ def _upad_input(
     )


-def _prepare_from_posids(query, key, value, position_ids):
+def _prepare_from_posids(query, key, value, position_ids, query_length):
     """
     This function returns necessary arguments to call `flash_attn_varlen_func`.
     All three query, key, value states will be flattened.
@@ -205,43 +205,66 @@ def _prepare_from_posids(query, key, value, position_ids):
             Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
         position_ids (`torch.Tensor`):
             Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
+        query_length (`int`):
+            Sequence length of the input queries.
     Return:
         query (`torch.Tensor`):
             Query state without padding. Shape: (total_target_length, num_heads, head_dim).
         key (`torch.Tensor`):
             Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
         value (`torch.Tensor`):
             Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
-        indices_q (`torch.Tensor`):
-            The indices of non-masked tokens from the flattened input target sequence.
         (cu_seqlens_q, cu_seqlens_k) (`tuple[int]`):
             The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
         (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
             Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
     """
+    kv_length = key.shape[1]
     query = query.contiguous().view(-1, query.size(-2), query.size(-1))
     key = key.contiguous().view(-1, key.size(-2), key.size(-1))
     value = value.contiguous().view(-1, value.size(-2), value.size(-1))

-    position_ids = position_ids.flatten()
-    indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)
+    # If the lengths are not equal, most probably we are in decoding stage with cache
+    # In that case the position ids will not always start with `0` and we need a better way to infer
+    # cumulative seq lengths.
+    if query_length != kv_length:
+        indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)

-    cu_seq_lens = torch.cat(
-        (
-            indices_q[position_ids == 0],
-            torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
+        tensor_kws = {"dtype": torch.int32, "device": position_ids.device}
+        last_position_ids = position_ids[:, -1]
+
+        cu_seq_lens_k = torch.cat(
+            [torch.zeros(1, **tensor_kws), last_position_ids.cumsum(0).add(1).to(torch.int32)], 0
         )
-    )
-    # NOTE: With torch compile, this will cause a graph break if you don't set
-    # `TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` in the environment or call
-    # `torch._dynamo.config.capture_scalar_outputs = True` before doing the forward pass.
-    # This is a limitation of flash attention API, as the function `flash_attn_varlen_func`
-    # requires `max_length_q`, `max_length_k` to be passed as `int` and not `torch.Tensor`.
-    # https://github.com/Dao-AILab/flash-attention/blob/2dd8078adc1d9b74e315ee99718c0dea0de8eeb6/flash_attn/flash_attn_interface.py#L1423-L1424
-    # We should use cu_seq_lens instead of position_ids to get the max length since position_ids is not always increasing
-    # for some models (e.g. qwen2-vl).
-    max_length = cu_seq_lens.diff().max().item()
-    return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))
+        max_length_k = int(last_position_ids.max()) + 1
+
+        batch_size, seq_len = query.shape[:2]
+        q_len = torch.ones(batch_size, **tensor_kws) if query_length == 1 else last_position_ids.add(1)
+        cu_seq_lens_q = torch.cat([torch.zeros(1, **tensor_kws), q_len.cumsum(0).to(torch.int32)], 0)
+        max_length_q = int(q_len.max())
+    else:
+        position_ids = position_ids.flatten()
+        indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)
+
+        cu_seq_lens_q = torch.cat(
+            (
+                indices_q[position_ids == 0],
+                torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
+            )
+        )
+        cu_seq_lens_k = cu_seq_lens_q
+
+        # NOTE: With torch compile, this will cause a graph break if you don't set
+        # `TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` in the environment or call
+        # `torch._dynamo.config.capture_scalar_outputs = True` before doing the forward pass.
+        # This is a limitation of flash attention API, as the function `flash_attn_varlen_func`
+        # requires `max_length_q`, `max_length_k` to be passed as `int` and not `torch.Tensor`.
+        # https://github.com/Dao-AILab/flash-attention/blob/2dd8078adc1d9b74e315ee99718c0dea0de8eeb6/flash_attn/flash_attn_interface.py#L1423-L1424
+        # We should use cu_seq_lens instead of position_ids to get the max length since position_ids is not always increasing
+        # for some models (e.g. qwen2-vl).
+        max_length_q = cu_seq_lens_q.diff().max().item()
+        max_length_k = max_length_q
+    return (query, key, value, (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k))


 def _prepare_flash_attention_from_position_ids(query, key, value, position_ids):
@@ -430,8 +453,8 @@ def _flash_attention_forward(
             raise ValueError(
                 "Position ids should be passed if the attention mask is not passed and the cu_seq-lens are not passed."
             )
-        q, k, v, idx, (cu_q, cu_k), (mq, mk) = _prepare_from_posids(
-            query_states, key_states, value_states, position_ids
+        q, k, v, (cu_q, cu_k), (mq, mk) = _prepare_from_posids(
+            query_states, key_states, value_states, position_ids, query_length=query_length
         )
     else:
         q = query_states.reshape(-1, query_states.size(-2), query_states.size(-1))
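
The tuple returned by the reworked `_prepare_from_posids` feeds the varlen kernel referenced in the NOTE above. The wrapper below is an illustrative sketch, not transformers code: it assumes the flash-attn package is installed and calls its `flash_attn_varlen_func` with the cumulative lengths and max lengths produced by the helper.

# Illustrative wrapper (assumption); `_prepare_from_posids` is a private transformers helper.
from flash_attn import flash_attn_varlen_func  # requires the flash-attn package
from transformers.modeling_flash_attention_utils import _prepare_from_posids

def varlen_attention(query, key, value, position_ids, query_length):
    # query/key/value: (batch, seq_len, heads, head_dim); position_ids: (batch, seq_len)
    q, k, v, (cu_q, cu_k), (max_q, max_k) = _prepare_from_posids(
        query, key, value, position_ids, query_length
    )
    # cu_q / cu_k are (batch_size + 1,) int32 offsets into the flattened tokens;
    # max_q / max_k are plain Python ints, as flash_attn_varlen_func requires.
    return flash_attn_varlen_func(
        q, k, v,
        cu_seqlens_q=cu_q,
        cu_seqlens_k=cu_k,
        max_seqlen_q=max_q,
        max_seqlen_k=max_k,
        causal=True,
    )  # -> (total_query_tokens, heads, head_dim)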

tests/test_modeling_common.py

Lines changed: 87 additions & 0 deletions
@@ -4280,6 +4280,93 @@ def test_flash_attention_3_padding_matches_padding_free_with_position_ids_and_fa
             attn_implementation="flash_attention_3", fa_kwargs=True
         )

+    @require_flash_attn
+    @require_torch_gpu
+    @mark.flash_attn_test
+    def test_flash_attention_2_continue_generate_with_position_ids(self):
+        """
+        Tests that the given attention implementation can work with packed sequences and infers the mask
+        from position ids. This test requires the model to use new attention mask API which handles packing.
+        """
+
+        max_new_tokens = 2
+        for model_class in self.all_generative_model_classes:
+            if not model_class._supports_flash_attn:
+                self.skipTest(f"{model_class.__name__} does not support Flash Attention.")
+
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            if config.is_encoder_decoder:
+                self.skipTest("Model is an encoder-decoder")
+
+            if not hasattr(config.get_text_config(), "use_cache"):
+                self.skipTest(f"{model_class.__name__} doesn't support caching")
+
+            if "input_ids" not in inputs_dict or inputs_dict["input_ids"].ndim != 2:
+                self.skipTest("Model dummy inputs should contain text input ids")
+
+            # make sure that all models have enough positions for generation
+            dummy_input_ids = inputs_dict["input_ids"]
+            if hasattr(config, "max_position_embeddings"):
+                config.max_position_embeddings = max_new_tokens + dummy_input_ids.shape[1] + 1
+
+            model = model_class(config)
+            if "position_ids" not in inspect.signature(model.forward).parameters:
+                self.skipTest("Model does not support position_ids")
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model = (
+                    model_class.from_pretrained(
+                        tmpdirname,
+                        torch_dtype=torch.bfloat16,
+                        attn_implementation="flash_attention_2",
+                    )
+                    .to(torch_device)
+                    .eval()
+                )
+
+                # Drop all keys except for `input_ids`. Hard to manipulate with multimodals/head_mask/etc
+                dummy_input_ids = inputs_dict["input_ids"]
+                dummy_position_ids = torch.arange(dummy_input_ids.shape[1], device=torch_device)
+                dummy_position_ids = dummy_position_ids.unsqueeze(0).repeat(dummy_input_ids.shape[0], 1)
+
+                # Store cache for the input prompt
+                output = model(dummy_input_ids, position_ids=dummy_position_ids, use_cache=True)
+                if "past_key_values" not in output:
+                    self.skipTest("This model doesn't return `past_key_values`")
+
+                # create new input_ids and position_ids to continue generation re-using the cache
+                new_input_ids = output.logits[:, -1, :].float().argmax(-1)[:, None]
+                past_length = dummy_input_ids.shape[1]
+                position_ids = torch.arange(past_length, past_length + new_input_ids.shape[1], device=torch_device)
+                position_ids = position_ids.unsqueeze(0).repeat(new_input_ids.shape[0], 1)
+
+                output = model(
+                    input_ids=new_input_ids,
+                    past_key_values=output.past_key_values,
+                    position_ids=position_ids,
+                    use_cache=True,
+                )
+                next_token_logits = output.logits[:, -1, :].float()
+
+                generate_kwargs = {
+                    "pad_token_id": -1,
+                    "eos_token_id": -1,
+                    "forced_eos_token_id": None,
+                    "use_cache": True,
+                    "do_sample": False,
+                    "return_dict_in_generate": True,
+                    "output_logits": True,
+                    "max_new_tokens": max_new_tokens,
+                }
+                generation_out = model.generate(dummy_input_ids, **generate_kwargs)
+                next_token_logits_from_generate = generation_out.logits[-1]
+
+                # acceptable numerical instability
+                # print(next_token_logits_from_generate, next_token_logits)
+                tol = torch.finfo(torch.bfloat16).eps
+                torch.testing.assert_close(next_token_logits_from_generate, next_token_logits, rtol=tol, atol=tol)
+
     def flash_attn_from_config(self, attn_implementation: str):
         r"""
         Tests if the model can be loaded with `attn_implementation` from the config and if the
