correctness!

robertgshaw2-redhat · robertgshaw2-redhat · commit 5c3fc8817b0a · 2025-05-07T03:04:35.000Z
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -4,7 +4,7 @@ set -xe
 # Models to run
 MODELS=(
     "Qwen/Qwen3-0.6B"
-    # "deepseek-ai/deepseek-vl2-tiny"
+    "deepseek-ai/deepseek-vl2-tiny"
 )
 
 # Number of prefill and decode instances to create
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
@@ -301,16 +301,20 @@ def allocate_slots(
         if not self.enable_caching:
             return KVCacheBlocks(new_blocks)
 
-        if not delay_cache_blocks:
+        if delay_cache_blocks:
+            # P/D: delay caching the blocks if we need to wait for the
+            # KVs to be recved from remote, but update num_cached_block
+            # with the prefix cache hits to avoid double caching later.
+            assert request.request_id not in self.num_cached_block
+            self.num_cached_block[request.request_id] = len(
+                new_computed_block_list)
+        else:
             self.cache_blocks(
                 request=request,
                 num_tokens=num_tokens,
                 num_computed_tokens=num_computed_tokens,
                 new_computed_block_list=new_computed_block_list,
             )
-        else:
-            self.num_cached_block[request.request_id] = len(
-                new_computed_block_list)
 
         return KVCacheBlocks(new_blocks)
 

Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@ set -xe`
`4`	`4`	`# Models to run`
`5`	`5`	`MODELS=(`
`6`	`6`	`"Qwen/Qwen3-0.6B"`
`7`		`- # "deepseek-ai/deepseek-vl2-tiny"`
	`7`	`+ "deepseek-ai/deepseek-vl2-tiny"`
`8`	`8`	`)`
`9`	`9`
`10`	`10`	`# Number of prefill and decode instances to create`