fix(v1/kv_cache): resolve async KV transfer bug in cascade attention

ayushsatyam146 · ayushsatyam146 · commit 0d66b57bfb0b · 2025-09-02T09:23:06.000+05:30
* Adjust ref_cnt-based common prefix detection to account for requests whose KV blocks are still in transfer
* Update get_num_common_prefix_blocks() to accept a transfering_request_ids list
* Fix FullAttentionManager to compare each block's ref_cnt against the running request count plus the transferring requests holding the same block
* Prevent incorrect cascade attention when async KV offloading delays cleanup

This resolves a bug where completed requests with pending async transfers
still contributed to ref_cnt, causing incorrect cascade attention decisions.

Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
@@ -136,27 +136,29 @@ def free(self, request_id: str) -> None:
         for manager in self.single_type_managers:
             manager.free(request_id)
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> list[int]:
+    def get_num_common_prefix_blocks(
+            self, running_request_id: str, num_running_requests: int,
+            transfering_request_ids: list[str]) -> list[int]:
         """
         Get the number of common prefix blocks for all requests in the RUNNING
-        state for each kv cache group.
+        and TRANSFERRING state for each kv cache group.
 
         Args:
-            request_id: The request ID.
+            running_request_id: The request ID of the running request.
             num_running_requests: The total number of requests in the RUNNING
                 state.
+            transfering_request_ids: List of request IDs in transfer state.
 
         Returns:
             list[int]: The number of common prefix blocks for all requests in
                 the RUNNING state for each kv cache group.
         """
-        num_blocks_per_group = [
-            manager.get_num_common_prefix_blocks(request_id,
-                                                 num_running_requests)
+        return [
+            manager.get_num_common_prefix_blocks(running_request_id,
+                                                 num_running_requests,
+                                                 transfering_request_ids)
             for manager in self.single_type_managers
         ]
-        return num_blocks_per_group
 
     def remove_skipped_blocks(self, request_id: str,
                               num_computed_tokens: int) -> None:
@@ -202,8 +204,9 @@ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
                          enable_kv_cache_events)
         self.num_single_type_manager = len(self.single_type_managers)
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> list[int]:
+    def get_num_common_prefix_blocks(
+            self, running_request_id: str, num_running_requests: int,
+            transfering_request_ids: list[str]) -> list[int]:
         return [0] * self.num_single_type_manager
 
     def find_longest_cache_hit(
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
@@ -10,7 +10,7 @@
 from vllm.v1.core.kv_cache_utils import KVCacheBlock
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import PrefixCacheStats
-from vllm.v1.request import Request, RequestStatus
+from vllm.v1.request import Request
 
 logger = init_logger(__name__)
 
@@ -321,46 +321,30 @@ def reset_prefix_cache(self) -> bool:
 
     def get_num_common_prefix_blocks(
         self,
-        request: Request,
+        running_request_id: str,
         num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> list[int]:
         """Calculate the number of common prefix blocks shared by all requests
-        in the RUNNING state for each kv cache group.
-
-        The function determines this by selecting any request and iterating
-        through its blocks.  A block is considered a common prefix block if its
-        `ref_cnt` equals the total number of requests in the RUNNING state.
-
-        NOTE(woosuk): The number of requests in the RUNNING state is **greater
-        than or equal to** the number of requests scheduled in the current step.
-        This is because the RUNNING state only indicates that:
-        1. The request has not yet finished, and
-        2. The request holds its blocks unfreed.
-
-        While all scheduled requests must be in the RUNNING state, the inverse
-        is not necessarily true. There may be RUNNING requests that are not
-        scheduled in the current step.
+        in the RUNNING state for each kv cache group. A block is considered a
+        common prefix block if it is referenced by ALL currently running
+        requests.
 
-        This can result in an edge case where the number of common prefix blocks
-        is 0, even though all scheduled requests share a common prefix. This
-        occurs because there may be unscheduled RUNNING requests that do not
-        share the common prefix. Currently, this case cannot be easily detected,
-        so the function returns 0 in such cases.
+        This approach correctly handles async KV offloading scenarios where
+        completed requests may still hold block references while no longer
+        being in the RUNNING state.
 
         Args:
-            request: Any request in the RUNNING state, used to identify the
-                common prefix blocks.
+            running_request_id: The request ID of the running request.
             num_running_requests: The total number of requests in the RUNNING
-                state. This can be different from the number of scheduled
-                requests in the current step.
+                state.
+            transfering_request_ids: List of request IDs in transfer state.
 
         Returns:
-            list[int]: The number of common prefix blocks for each kv cache 
-            group.
+            list[int]: Number of common prefix blocks for each kv cache group.
         """
-        assert request.status == RequestStatus.RUNNING
         return self.coordinator.get_num_common_prefix_blocks(
-            request.request_id, num_running_requests)
+            running_request_id, num_running_requests, transfering_request_ids)
 
     def take_events(self) -> list[KVCacheEvent]:
         """Take the KV cache events from the block pool.
@@ -386,4 +370,4 @@ def cache_blocks(self, request: Request, num_computed_tokens: int) -> None:
     def create_empty_block_list(self) -> KVCacheBlocks:
         """Creates a new KVCacheBlocks instance with no blocks."""
         return KVCacheBlocks(tuple([]
-                                   for _ in range(self.num_kv_cache_groups)))
+                                   for _ in range(self.num_kv_cache_groups)))
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
@@ -542,9 +542,18 @@ def schedule(self) -> SchedulerOutput:
             self.kv_cache_config.kv_cache_groups)
         if self.running:
             any_request = self.running[0]
+            running_request_ids = {req.request_id for req in self.running}
+
+            # Include requests in KV transfer state for common prefix calc
+            transferring_request_ids = [
+                req_id for req_id, request in self.requests.items()
+                if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS and
+                any(self.kv_cache_manager.get_blocks(req_id).get_block_ids())
+            ]
             num_common_prefix_blocks = (
                 self.kv_cache_manager.get_num_common_prefix_blocks(
-                    any_request, len(self.running)))
+                    any_request.request_id, len(running_request_ids),
+                    transferring_request_ids))
 
         # Construct the scheduler output.
         new_reqs_data = [
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -168,22 +168,10 @@ def free(self, request_id: str) -> None:
         self.num_cached_block.pop(request_id, None)
 
     @abstractmethod
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
-        """
-        Get the number of common prefix blocks for all requests in the RUNNING
-        state.
-
-        Args:
-            request_id: The request ID.
-            num_running_requests: The total number of requests in the RUNNING
-                state.
-
-        Returns:
-            The number of common prefix blocks for all requests in the RUNNING
-                state.
-        """
-
+    def get_num_common_prefix_blocks(
+            self, running_request_id: str, num_running_requests: int,
+            transfering_request_ids: list[str]) -> int:
+        """Get the number of common prefix blocks for all running requests."""
         raise NotImplementedError
 
     @classmethod
@@ -281,15 +269,30 @@ def remove_skipped_blocks(self, request_id: str,
         # No need to remove blocks for full attention.
         pass
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
-        blocks = self.req_to_blocks[request_id]
+    def get_num_common_prefix_blocks(
+            self, running_request_id: str, num_running_requests: int,
+            transfering_request_ids: list[str]) -> int:
+        """Get common prefix blocks using ref_cnt with transferring requests."""
+        if running_request_id not in self.req_to_blocks:
+            return 0
+
+        running_blocks = self.req_to_blocks[running_request_id]
+        transfering_blocks = [
+            self.req_to_blocks[req_id] for req_id in transfering_request_ids
+            if req_id in self.req_to_blocks
+        ]
+
         num_common_blocks = 0
-        for block in blocks:
-            if block.ref_cnt == num_running_requests:
+        for i, block in enumerate(running_blocks):
+            num_transfering_blocks = sum(
+                1 for blocks in transfering_blocks if i < len(blocks)
+                and blocks[i].block_id == running_blocks[i].block_id)
+
+            if block.ref_cnt == num_running_requests + num_transfering_blocks:
                 num_common_blocks += 1
             else:
                 break
+
         return num_common_blocks
 
 
@@ -380,8 +383,12 @@ def remove_skipped_blocks(self, request_id: str,
             blocks[i] = self._null_block
         self.block_pool.free_blocks(removed_blocks)
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
+    def get_num_common_prefix_blocks(
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
+    ) -> int:
         """
         NOTE(Chen): The prefix blocks are null blocks for sliding window layers.
         So it's not correct to count ref_cnt like FullAttentionManager. Return 
@@ -506,8 +513,12 @@ def remove_skipped_blocks(self, request_id: str,
             blocks[i] = self._null_block
         self.block_pool.free_blocks(removed_blocks)
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
+    def get_num_common_prefix_blocks(
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
+    ) -> int:
         """
         cascade attention is not supported by chunked local attention.
         """
@@ -541,8 +552,12 @@ def remove_skipped_blocks(self, request_id: str,
         # remove blocks.
         pass
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
+    def get_num_common_prefix_blocks(
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
+    ) -> int:
         return 0
 
     def allocate_new_blocks(self, request_id: str,
@@ -568,8 +583,9 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None:
         # requests, so this method is not relevant.
         raise ValueError("Should not be called as prefix caching is disabled.")
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
+    def get_num_common_prefix_blocks(
+            self, running_request_id: str, num_running_requests: int,
+            transfering_request_ids: list[str]) -> int:
         # Cross-attention blocks contain request-specific encoder states
         # and are not shared between different requests
         return 0