Skip to content

Commit 4a3d12c

Browse files
afeldman-nmdtrifiro
authored and committed
[Core] Cross-attention KV caching and memory-management (towards eventual encoder/decoder model support) (vllm-project#4837)
1 parent 6bdfb4f commit 4a3d12c

File tree

7 files changed

+735
-69
lines changed

7 files changed

+735
-69
lines changed

tests/core/block/test_block_manager_v2.py

Lines changed: 153 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import pytest
22

3+
from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
4+
STR_NOT_IMPL_ENC_DEC_SWA)
35
from vllm.core.block_manager_v2 import BlockSpaceManagerV2
46
from vllm.core.interfaces import AllocStatus
57
from vllm.sequence import Logprob, SequenceStatus
68
from vllm.utils import chunk_list
79

8-
from ..utils import create_seq_group
10+
from ..utils import create_seq_group, create_seq_group_encoder_decoder
911

1012

1113
@pytest.mark.parametrize("block_size", [16])
@@ -52,6 +54,156 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
5254
assert can_allocate_result == AllocStatus.LATER
5355

5456

57+
@pytest.mark.parametrize("block_size", [16])
58+
@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160])
59+
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
60+
@pytest.mark.parametrize("watermark", [0.0, 0.5])
61+
def test_can_allocate_seq_group_encoder_decoder(block_size: int,
62+
num_seqs_per_group: int,
63+
num_gpu_blocks: int,
64+
watermark: float):
65+
block_manager = BlockSpaceManagerV2(
66+
block_size=block_size,
67+
num_gpu_blocks=num_gpu_blocks,
68+
num_cpu_blocks=1024,
69+
watermark=watermark,
70+
)
71+
num_watermark_blocks = int(watermark * num_gpu_blocks)
72+
73+
num_output_blocks_per_seq = 1
74+
75+
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
76+
# the current implementation assumes all seqs are new prompts / don't have
77+
# different output lens.
78+
num_output_blocks = num_output_blocks_per_seq
79+
80+
for bdx, num_prompt_blocks in enumerate(
81+
range(1, num_gpu_blocks - num_output_blocks)):
82+
num_cross_blocks_per_seq = num_prompt_blocks
83+
84+
seq_group = create_seq_group_encoder_decoder(
85+
seq_prompt_len=block_size * num_prompt_blocks,
86+
seq_output_lens=[
87+
block_size * num_output_blocks_per_seq
88+
for _ in range(num_seqs_per_group)
89+
],
90+
request_id=str(bdx))
91+
92+
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
93+
94+
can_allocate_result = block_manager.can_allocate(seq_group)
95+
96+
num_required_blocks = num_prompt_blocks + \
97+
num_output_blocks + \
98+
num_cross_blocks_per_seq
99+
100+
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
101+
assert can_allocate_result == AllocStatus.NEVER
102+
elif num_gpu_blocks >= num_required_blocks:
103+
assert can_allocate_result == AllocStatus.OK
104+
else:
105+
assert can_allocate_result == AllocStatus.LATER
106+
107+
108+
@pytest.mark.parametrize("block_size", [16])
109+
@pytest.mark.parametrize("num_gpu_blocks", [16])
110+
@pytest.mark.parametrize("num_seqs_per_group", [1])
111+
@pytest.mark.parametrize("watermark", [0.0, 0.5])
112+
def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
113+
num_seqs_per_group: int,
114+
num_gpu_blocks: int,
115+
watermark: float):
116+
'''
117+
SWA short for Sliding Window Attention.
118+
119+
At time of writing block manager v2 does not support SWA.
120+
121+
However even when SWA is implemented for block manager v2,
122+
there will still most likely be a separate workstream required
123+
to enable SWA for encoder/decoder models.
124+
125+
Therefore this test enforces that one of the following cases
126+
hold true:
127+
1. Block manager v2 does not support SWA at all (true at time of writing)
128+
2. Block manager v2 fails with NotImplementedError when SWA is enabled
129+
AND a SequenceGroup with an encoder sequence (i.e. in support of an
130+
encoder/decoder model) is passed into can_allocate() as an argument
131+
132+
The setup for this test is a stripped-down version of
133+
test_can_allocate_seq_group_encoder_decoder()
134+
'''
135+
136+
with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
137+
block_manager = BlockSpaceManagerV2(
138+
block_size=block_size,
139+
num_gpu_blocks=num_gpu_blocks,
140+
num_cpu_blocks=1024,
141+
watermark=watermark,
142+
sliding_window=5 # SWA
143+
)
144+
145+
num_output_blocks_per_seq = 1
146+
num_prompt_blocks = 1
147+
num_output_blocks = num_output_blocks_per_seq
148+
seq_group = create_seq_group_encoder_decoder(
149+
seq_prompt_len=block_size * num_prompt_blocks,
150+
seq_output_lens=[
151+
block_size * num_output_blocks_per_seq
152+
for _ in range(num_seqs_per_group)
153+
],
154+
request_id="0")
155+
156+
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
157+
block_manager.can_allocate(seq_group)
158+
159+
# Assert that either
160+
# 1. Block manager v2 constructor fails with assertion that sliding window
161+
# is not yet supported (most likely near-term outcome at time of
162+
# writing), or
163+
# 2. can_allocate() fails with NotImplementedError due to combination of
164+
# encoder/decoder and sliding window attention
165+
if isinstance(exc_info.value, NotImplementedError):
166+
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
167+
elif isinstance(exc_info.value, AssertionError):
168+
assert str(exc_info.value) == "Sliding window not yet supported"
169+
170+
171+
@pytest.mark.parametrize("block_size", [16])
172+
@pytest.mark.parametrize("num_gpu_blocks", [16])
173+
@pytest.mark.parametrize("num_seqs_per_group", [1])
174+
@pytest.mark.parametrize("watermark", [0.0, 0.5])
175+
def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
176+
block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
177+
watermark: float):
178+
179+
block_manager = BlockSpaceManagerV2(
180+
block_size=block_size,
181+
num_gpu_blocks=num_gpu_blocks,
182+
num_cpu_blocks=1024,
183+
watermark=watermark,
184+
enable_caching=True # Prefix cache
185+
)
186+
187+
num_output_blocks_per_seq = 1
188+
num_prompt_blocks = 1
189+
num_output_blocks = num_output_blocks_per_seq
190+
seq_group = create_seq_group_encoder_decoder(
191+
seq_prompt_len=block_size * num_prompt_blocks,
192+
seq_output_lens=[
193+
block_size * num_output_blocks_per_seq
194+
for _ in range(num_seqs_per_group)
195+
],
196+
request_id="0")
197+
198+
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
199+
200+
# Assert that can_allocate() fails with NotImplementedError
201+
# due to combination of encoder/decoder and prefix cache
202+
with pytest.raises(NotImplementedError) as exc_info:
203+
block_manager.can_allocate(seq_group)
204+
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
205+
206+
55207
@pytest.mark.parametrize("block_size", [1, 8])
56208
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
57209
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])

0 commit comments

Comments
 (0)