vllm-project · ywang96 · Jun 6, 2025 · Jun 3, 2025 · gemini-code-assist · Jun 6, 2025
@@ -1006,6 +1006,8 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool:
         # Now that the blocks are ready, actually cache them.
         block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0]
         num_computed_tokens = len(block_ids) * self.block_size
+        # Handle the case where the request has fewer tokens than one block.
+        num_computed_tokens = min(num_computed_tokens, request.num_tokens)
-        # Handle the case where num request tokens less then one block.
-        num_computed_tokens = min(num_computed_tokens, request.num_tokens)
+        # Cap num_computed_tokens at the actual number of tokens in the request.
-        # Handle the case where num request tokens less then one block.
-        num_computed_tokens = min(num_computed_tokens, request.num_tokens)
+        # Cap num_computed_tokens at the actual number of tokens in the request.
         if num_computed_tokens == request.num_tokens:
             num_computed_tokens -= 1
         self.kv_cache_manager.single_type_manager.cache_blocks(