vllm-project · heheda12345 · Jun 13, 2025 · Jun 10, 2025
@@ -381,10 +381,11 @@ def get_block_ids(self, request_id: str) -> tuple[list[int], ...]:
             self.coordinator.get_blocks(request_id)).get_block_ids()
 
     def cache_blocks(self, request: Request, num_computed_tokens: int) -> None:
-        """Cache the blocks for the request."""
-        block_hashes = self.req_to_block_hashes[request.request_id]
-        self.coordinator.cache_blocks(request, block_hashes,
-                                      num_computed_tokens)
+        """Cache the request's blocks if caching is enabled; else no-op."""
+        if self.enable_caching:
+            block_hashes = self.req_to_block_hashes[request.request_id]
+            self.coordinator.cache_blocks(request, block_hashes,
+                                          num_computed_tokens)
 
     def create_empty_block_list(self) -> KVCacheBlocks:
         """Creates a new KVCacheBlocks instance with no blocks."""

@@ -1015,6 +1015,7 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool:
         num_computed_tokens = min(num_computed_tokens, request.num_tokens)
         if num_computed_tokens == request.num_tokens:
             num_computed_tokens -= 1
+        # This will cache the blocks iff caching is enabled.
         self.kv_cache_manager.cache_blocks(request, num_computed_tokens)
 
         # Update the request state for scheduling.