This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit 80b908f

leiwen83 and wenlei03 authored and committed
[Core][Bugfix]: fix prefix caching for blockv2 (vllm-project#5364)
Signed-off-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
1 parent d464106 commit 80b908f
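
For context on what this bugfix changes in practice: with automatic prefix caching (APC) and the v2 block manager, blocks freed after a request finishes are parked in an evictor rather than returned directly, and before this commit a block taken back out of the evictor for new content was still marked as computed, so its stale KV cache could be reused. The sketch below reproduces the configuration of the new test through the public vLLM Python API; it is illustrative only and assumes an engine build from around this commit in which the LLM constructor still accepts use_v2_block_manager and num_gpu_blocks_override, and the short prompts are my own, not taken from the test.

from vllm import LLM, SamplingParams

# Illustrative sketch (not part of the commit): same knobs as the added test.
llm = LLM(
    model="facebook/opt-125m",
    enforce_eager=True,           # skip CUDA graph capture for a quick run
    max_model_len=48,
    block_size=16,
    num_gpu_blocks_override=3,    # tiny KV-cache budget, so eviction starts early
    use_v2_block_manager=True,
    enable_prefix_caching=True,
)

prompts = [
    "You are a helpful assistant. Please answer truthfully. Who are you?",
    "You are a helpful assistant. Please answer truthfully. Who am I?",
]
sampling_params = SamplingParams(max_tokens=10, ignore_eos=True, temperature=0.0)

# With the fix, outputs here match a run with enable_prefix_caching=False;
# before it, blocks reused from the evictor kept computed=True and stale
# KV-cache contents could be served for different tokens.
for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)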

File tree

tests/core/block/e2e/test_correctness.py
vllm/core/block/prefix_caching_block.py

2 files changed: +72 -2 lines


tests/core/block/e2e/test_correctness.py

Lines changed: 67 additions & 0 deletions
@@ -482,3 +482,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
         assert expected_token_ids == actual_token_ids
 
     assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # Skip cuda graph creation for a fast test.
+        "enforce_eager": True,
+
+        # Keep the blocks small so that eviction is hit quickly.
+        "max_model_len": 48,
+        "block_size": 16,
+        "num_gpu_blocks_override": 3,
+
+        # Test APC with the v2 block manager.
+        "use_v2_block_manager": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "enable_prefix_caching": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "enable_prefix_caching": True,
+}])
+@pytest.mark.parametrize("seed", [1])
+def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
+                                                 test_llm_generator):
+    """Verify that block manager v2 with auto prefix caching keeps working
+    correctly even after eviction has started.
+    With APC enabled, all blocks are held by the naive allocator at first.
+    Blocks are then handed over to the evictor. If there is a cache hit on an
+    evicted block it can be reused; otherwise its KV cache must be recomputed.
+    """
+    output_len = 10
+    temperature = 0.0
+
+    prompts = [
+        "You are a helpful assistant. Please answer truthfully and write "
+        "out your thinking step by step to be sure you get the right answer. "
+        "If you make a mistake, attempt to correct it. who are you?",
+        "You are a helpful assistant. Please answer truthfully and write out "
+        "your thinking step by step to be sure you get the right answer. You "
+        "are helpful and harmless and you follow ethical guidelines. "
+        "who are you?"
+    ]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids with APC disabled')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids with APC enabled')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
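
Why these parameters hit eviction quickly (my reading of common_llm_kwargs above, not wording from the commit): the KV-cache budget is num_gpu_blocks_override * block_size = 48 token slots, which equals max_model_len, so a single near-full-length sequence occupies every GPU block; once it finishes, its blocks drop into the evictor and later allocations can only be served by reclaiming them, which is exactly the path patched in the next file. A quick arithmetic check:

import math

# Numbers taken from common_llm_kwargs in the new test (check is illustrative).
block_size = 16          # tokens per KV-cache block
num_gpu_blocks = 3       # num_gpu_blocks_override
max_model_len = 48       # prompt + generated tokens per sequence

total_kv_slots = block_size * num_gpu_blocks                 # 48 slots overall
blocks_per_full_seq = math.ceil(max_model_len / block_size)  # 3 blocks

assert total_kv_slots == max_model_len        # budget == one full-length sequence
assert blocks_per_full_seq == num_gpu_blocks  # a full sequence uses all 3 blocks
# So after the first request completes, every block has passed through the
# evictor and any further allocation must reuse an evicted block.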

vllm/core/block/prefix_caching_block.py

Lines changed: 5 additions & 2 deletions
@@ -176,14 +176,17 @@ def allocate_mutable(self,
 
             self._refcounter.incr(block_id)
 
-            # the block comes from evictor already contain computed result
+            # This block has just been popped from the evictor and is about to
+            # be written with new content, which will most likely differ from
+            # its original content, so the worker must be told to recompute
+            # its KV cache.
             block = self._create_block(
                 prev_block=prev_block,
                 token_ids=[],
                 block_size=self._block_size,
                 allocator=self,
                 block_id=block_id,
-                computed=True,
+                computed=False,
             )
             assert block.content_hash is None
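
To make the one-line computed flag change concrete, here is a stripped-down toy model of the reuse path. The names ToyBlock, ToyEvictor, and allocate_mutable_from_evictor are hypothetical illustrations, not the actual PrefixCachingBlockAllocator API; the point is only that a block handed back by the evictor is about to be filled with different tokens, so marking it computed would let stale KV-cache contents be served as if they matched.

from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class ToyBlock:
    """Hypothetical stand-in for a prefix-caching block (not the vLLM class)."""
    block_id: int
    token_ids: List[int] = field(default_factory=list)
    content_hash: Optional[int] = None
    computed: bool = False   # True means the worker may reuse the stored KV cache

class ToyEvictor:
    """Toy evictor: keeps freed blocks keyed by the content they used to hold."""
    def __init__(self) -> None:
        self._freed: Dict[int, int] = {}   # content_hash -> block_id

    def add(self, block: ToyBlock) -> None:
        assert block.content_hash is not None
        self._freed[block.content_hash] = block.block_id

    def evict(self) -> int:
        # Hand back the id of some freed physical block for reuse.
        _content_hash, block_id = self._freed.popitem()
        return block_id

def allocate_mutable_from_evictor(evictor: ToyEvictor) -> ToyBlock:
    block_id = evictor.evict()
    # The recycled physical block is about to be filled with new, not-yet-written
    # tokens, so it must start with computed=False; marking it True (the bug)
    # would let the old KV-cache contents be served as if they matched.
    return ToyBlock(block_id=block_id, token_ids=[], computed=False)

# Usage: free a fully written block, then reuse it for a fresh sequence.
evictor = ToyEvictor()
evictor.add(ToyBlock(block_id=7, token_ids=[1, 2, 3],
                     content_hash=hash((1, 2, 3)), computed=True))
fresh = allocate_mutable_from_evictor(evictor)
assert fresh.block_id == 7 and not fresh.computed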
