
Commit 912667d

fix(v1/kv_cache): resolve async KV transfer bug in cascade attention
* Replace ref_cnt-based common prefix detection with running request tracking
* Update get_num_common_prefix_blocks() to accept running_request_ids set
* Fix FullAttentionManager to count actual references from running requests
* Prevent incorrect cascade attention when async KV offloading delays cleanup

This resolves a bug where completed requests with pending async transfers still
contributed to ref_cnt, causing incorrect cascade attention decisions.

Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
1 parent 185d8ed commit 912667d
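
For context, a minimal sketch of the failure mode the commit message describes. Everything here (the `Block` dataclass, request IDs `A`, `B`, `T`, the `req_to_blocks` dict) is a toy stand-in for illustration, not the actual vLLM `KVCacheBlock` or scheduler state: two RUNNING requests with unrelated prefixes, plus a finished request whose async KV transfer is still pending and therefore still holds block references. Under the old `ref_cnt == num_running_requests` rule, the stale references make request A's private prefix look common to all running requests; the new rule discounts them first.

```python
# Toy illustration of the bug; simplified stand-ins, not the real vLLM classes.
from dataclasses import dataclass


@dataclass
class Block:
    block_id: int
    ref_cnt: int = 0


b0, b1, c0 = Block(0), Block(1), Block(2)
req_to_blocks = {
    "A": [b0, b1],  # RUNNING
    "B": [c0],      # RUNNING, different prefix
    "T": [b0, b1],  # finished, but its async KV offload has not completed yet
}
for blocks in req_to_blocks.values():
    for blk in blocks:
        blk.ref_cnt += 1

num_running = 2  # only A and B are RUNNING

# Old rule: a block is a common prefix block if ref_cnt == num_running_requests
# (the real code also stops at the first non-common block; same result here).
old = sum(1 for blk in req_to_blocks["A"] if blk.ref_cnt == num_running)
print(old)  # 2 -> cascade attention would wrongly treat A's prefix as common

# New rule: discount references held by transferring requests before comparing.
transferring = ["T"]
new = 0
for i, blk in enumerate(req_to_blocks["A"]):
    in_transfer = sum(
        1
        for rid in transferring
        if i < len(req_to_blocks[rid])
        and req_to_blocks[rid][i].block_id == blk.block_id
    )
    if blk.ref_cnt - in_transfer == num_running:
        new += 1
    else:
        break
print(new)  # 0 -> A's prefix is not actually shared by all running requests
```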

File tree: 4 files changed (+83 −41 lines)


vllm/v1/core/kv_cache_coordinator.py

Lines changed: 16 additions & 8 deletions
@@ -149,26 +149,31 @@ def free(self, request_id: str) -> None:
         manager.free(request_id)
 
     def get_num_common_prefix_blocks(
-        self, request_id: str, num_running_requests: int
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> list[int]:
         """
         Get the number of common prefix blocks for all requests in the RUNNING
         state for each kv cache group.
 
         Args:
-            request_id: The request ID.
-            num_running_requests: The total number of requests in the RUNNING
-                state.
+            running_request_id: The request ID of the running request.
+            num_running_requests: The number of requests in the RUNNING state.
+            transfering_request_ids: List of request IDs in
+                WAITING_FOR_REMOTE_KVS state.
 
         Returns:
             list[int]: The number of common prefix blocks for all requests in
                 the RUNNING state for each kv cache group.
         """
-        num_blocks_per_group = [
-            manager.get_num_common_prefix_blocks(request_id, num_running_requests)
+        return [
+            manager.get_num_common_prefix_blocks(
+                running_request_id, num_running_requests, transfering_request_ids
+            )
             for manager in self.single_type_managers
         ]
-        return num_blocks_per_group
 
     def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
         """

@@ -227,7 +232,10 @@ def __init__(
         self.num_single_type_manager = len(self.single_type_managers)
 
     def get_num_common_prefix_blocks(
-        self, request_id: str, num_running_requests: int
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> list[int]:
         return [0] * self.num_single_type_manager
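
The coordinator's role here is just fan-out: it returns one result per KV cache group, and groups that cannot use cascade attention (sliding window, Mamba, cross-attention; see single_type_kv_cache_manager.py below) report 0. A rough sketch of that shape, with toy classes standing in for the real single-type managers:

```python
# Toy per-group fan-out; these classes are illustrative stand-ins only.
class FullAttnGroupToy:
    def get_num_common_prefix_blocks(
        self, running_request_id, num_running_requests, transfering_request_ids
    ):
        return 3  # pretend three prefix blocks are shared by all running requests


class SlidingWindowGroupToy:
    def get_num_common_prefix_blocks(
        self, running_request_id, num_running_requests, transfering_request_ids
    ):
        return 0  # cascade attention is not supported for this group


class CoordinatorToy:
    def __init__(self):
        self.single_type_managers = [FullAttnGroupToy(), SlidingWindowGroupToy()]

    def get_num_common_prefix_blocks(
        self, running_request_id, num_running_requests, transfering_request_ids
    ):
        # Mirrors the structure above: one entry per KV cache group.
        return [
            m.get_num_common_prefix_blocks(
                running_request_id, num_running_requests, transfering_request_ids
            )
            for m in self.single_type_managers
        ]


print(CoordinatorToy().get_num_common_prefix_blocks("req-0", 2, []))  # [3, 0]
```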

vllm/v1/core/kv_cache_manager.py

Lines changed: 13 additions & 9 deletions
@@ -10,7 +10,7 @@
 from vllm.v1.core.kv_cache_utils import KVCacheBlock
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import PrefixCacheStats
-from vllm.v1.request import Request, RequestStatus
+from vllm.v1.request import Request
 
 logger = init_logger(__name__)
 

@@ -346,15 +346,18 @@ def reset_prefix_cache(self) -> bool:
 
     def get_num_common_prefix_blocks(
         self,
-        request: Request,
+        running_request_id: str,
         num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> list[int]:
         """Calculate the number of common prefix blocks shared by all requests
         in the RUNNING state for each kv cache group.
 
-        The function determines this by selecting any request and iterating
-        through its blocks. A block is considered a common prefix block if its
-        `ref_cnt` equals the total number of requests in the RUNNING state.
+        The function determines this by selecting any running request and
+        iterating through its blocks. A block is considered a common prefix
+        block if it is shared by ALL currently running requests. Transferring
+        requests (those in WAITING_FOR_REMOTE_KVS state) are excluded from
+        this check, as they may not have fully loaded their KV cache yet.
 
         NOTE(woosuk): The number of requests in the RUNNING state is **greater
         than or equal to** the number of requests scheduled in the current step.

@@ -373,19 +376,20 @@ def get_num_common_prefix_blocks(
         so the function returns 0 in such cases.
 
         Args:
-            request: Any request in the RUNNING state, used to identify the
-                common prefix blocks.
+            running_request_id: The request ID of any running request, used to
+                identify the common prefix blocks.
             num_running_requests: The total number of requests in the RUNNING
                 state. This can be different from the number of scheduled
                 requests in the current step.
+            transfering_request_ids: List of request IDs in transfer state
+                (WAITING_FOR_REMOTE_KVS).
 
         Returns:
             list[int]: The number of common prefix blocks for each kv cache
                 group.
         """
-        assert request.status == RequestStatus.RUNNING
         return self.coordinator.get_num_common_prefix_blocks(
-            request.request_id, num_running_requests
+            running_request_id, num_running_requests, transfering_request_ids
         )
 
     def take_events(self) -> list[KVCacheEvent]:

vllm/v1/core/sched/scheduler.py

Lines changed: 10 additions & 1 deletion
@@ -595,9 +595,18 @@ def schedule(self) -> SchedulerOutput:
         num_common_prefix_blocks = [0] * len(self.kv_cache_config.kv_cache_groups)
         if self.running:
             any_request = self.running[0]
+            num_running_requests = len(self.running)
+
+            transferring_request_ids = [
+                req_id
+                for req_id, request in self.requests.items()
+                if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+            ]
             num_common_prefix_blocks = (
                 self.kv_cache_manager.get_num_common_prefix_blocks(
-                    any_request, len(self.running)
+                    any_request.request_id,
+                    num_running_requests,
+                    transferring_request_ids,
                 )
             )

vllm/v1/core/single_type_kv_cache_manager.py

Lines changed: 44 additions & 23 deletions
@@ -183,22 +183,12 @@ def free(self, request_id: str) -> None:
 
     @abstractmethod
     def get_num_common_prefix_blocks(
-        self, request_id: str, num_running_requests: int
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> int:
-        """
-        Get the number of common prefix blocks for all requests in the RUNNING
-        state.
-
-        Args:
-            request_id: The request ID.
-            num_running_requests: The total number of requests in the RUNNING
-                state.
-
-        Returns:
-            The number of common prefix blocks for all requests in the RUNNING
-            state.
-        """
-
+        """Get the number of common prefix blocks for all running requests."""
         raise NotImplementedError
 
     @classmethod

@@ -303,15 +293,34 @@ def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
         pass
 
     def get_num_common_prefix_blocks(
-        self, request_id: str, num_running_requests: int
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> int:
-        blocks = self.req_to_blocks[request_id]
+        """Get common prefix blocks shared by all running requests."""
+
+        reference_blocks = self.req_to_blocks[running_request_id]
+
+        transferring_blocks = [
+            self.req_to_blocks[req_id]
+            for req_id in transfering_request_ids
+            if req_id in self.req_to_blocks
+        ]
+
         num_common_blocks = 0
-        for block in blocks:
-            if block.ref_cnt == num_running_requests:
+        for i, ref_block in enumerate(reference_blocks):
+            transferring_has_block = sum(
+                1
+                for blocks in transferring_blocks
+                if i < len(blocks) and blocks[i].block_id == ref_block.block_id
+            )
+
+            if ref_block.ref_cnt - transferring_has_block == num_running_requests:
                 num_common_blocks += 1
             else:
                 break
+
         return num_common_blocks

@@ -409,7 +418,10 @@ def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
         self.block_pool.free_blocks(removed_blocks)
 
     def get_num_common_prefix_blocks(
-        self, request_id: str, num_running_requests: int
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> int:
         """
         NOTE(Chen): The prefix blocks are null blocks for sliding window layers.

@@ -545,7 +557,10 @@ def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
         self.block_pool.free_blocks(removed_blocks)
 
     def get_num_common_prefix_blocks(
-        self, request_id: str, num_running_requests: int
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> int:
         """
         cascade attention is not supported by chunked local attention.

@@ -597,7 +612,10 @@ def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
         pass
 
     def get_num_common_prefix_blocks(
-        self, request_id: str, num_running_requests: int
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> int:
         """
         cascade attention is not supported by mamba

@@ -649,7 +667,10 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None:
         raise ValueError("Should not be called as prefix caching is disabled.")
 
     def get_num_common_prefix_blocks(
-        self, request_id: str, num_running_requests: int
+        self,
+        running_request_id: str,
+        num_running_requests: int,
+        transfering_request_ids: list[str],
     ) -> int:
         # Cross-attention blocks contain request-specific encoder states
         # and are not shared between different requests
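
The subtraction in the FullAttentionManager hunk above also handles the converse case: a transferring request that does share the running requests' common prefix no longer makes the equality check fail, so cascade attention is not needlessly suppressed. A self-contained sketch of that counting rule; the `Block` dataclass and the dict bookkeeping are simplified stand-ins, not the real vLLM classes:

```python
# Standalone sketch of the new per-block counting rule from the diff above.
from dataclasses import dataclass


@dataclass
class Block:
    block_id: int
    ref_cnt: int = 0


def num_common_prefix_blocks(req_to_blocks, running_request_id,
                             num_running_requests, transfering_request_ids):
    reference_blocks = req_to_blocks[running_request_id]
    transferring_blocks = [
        req_to_blocks[rid] for rid in transfering_request_ids if rid in req_to_blocks
    ]
    num_common = 0
    for i, ref_block in enumerate(reference_blocks):
        # Discount references held by transferring requests on this exact block
        # before comparing against the number of running requests.
        in_transfer = sum(
            1 for blocks in transferring_blocks
            if i < len(blocks) and blocks[i].block_id == ref_block.block_id
        )
        if ref_block.ref_cnt - in_transfer == num_running_requests:
            num_common += 1
        else:
            break
    return num_common


# Two running requests (A, B) share blocks 0 and 1; a transferring request T
# still references them while its KV transfer completes, so each ref_cnt is 3.
b0, b1 = Block(0, ref_cnt=3), Block(1, ref_cnt=3)
req_to_blocks = {"A": [b0, b1], "B": [b0, b1], "T": [b0, b1]}

# The old rule (ref_cnt == num_running_requests) would see 3 != 2 and report 0;
# the new rule subtracts T's reference and recovers the shared prefix.
print(num_common_prefix_blocks(req_to_blocks, "A", 2, ["T"]))  # 2
```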
