@@ -477,3 +477,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
477
477
assert expected_token_ids == actual_token_ids
478
478
479
479
assert baseline_token_ids == test_token_ids
480
+
481
+
482
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
        "model": "facebook/opt-125m",

        # skip cuda graph creation for fast test.
        "enforce_eager": True,

        # Keep the blocks small so that eviction kicks in quickly.
        "max_model_len": 48,
        "block_size": 16,
        "num_gpu_blocks_override": 3,

        # Test APC in v2 block
        "use_v2_block_manager": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "enable_prefix_caching": True,
}])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
                                                 test_llm_generator):
    """Verify block manager v2 with auto prefix caching works correctly
    even after eviction has started.

    With APC enabled, all blocks are held by the naive allocator at the
    beginning. Once the small cache fills, blocks are managed by the
    evictor instead. If a cache hit lands on a block held by the evictor,
    the block can be reused; otherwise its KV cache must be recomputed.
    The test checks that greedy outputs with APC on match the APC-off
    baseline under these eviction conditions.
    """
    output_len = 10
    # Greedy decoding so baseline and test outputs are deterministic and
    # directly comparable.
    temperature = 0.0

    # Prompts long enough (relative to max_model_len=48 / 3 GPU blocks)
    # to force block eviction during generation.
    prompts = [
        "You are a helpful assistant. Please answer truthfully and write "
        "out your thinking step by step to be sure you get the right answer. "
        "If you make a mistake, attempt to correct it. who are you?",
        "You are a helpful assistant. Please answer truthfully and write out "
        "your thinking step by step to be sure you get the right answer. You "
        "are helpful and harmless and you follow ethical guidelines. "
        "who are you?"
    ]

    sampling_params = SamplingParams(
        max_tokens=output_len,
        ignore_eos=True,
        temperature=temperature,
    )

    print('Getting token ids with APC disabled')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids with APC enabled')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

    # Compare per-prompt first for a precise failure message, then the
    # full lists as a final sanity check.
    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
                                                    test_token_ids):
        assert expected_token_ids == actual_token_ids

    assert baseline_token_ids == test_token_ids
0 commit comments