@@ -77,6 +77,159 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
    assert baseline_token_ids == test_token_ids


+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # Skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Use a large block size to trigger more copy-on-writes.
+        "block_size": 32,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "use_v2_block_manager": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("batch_size", [10])
+@pytest.mark.parametrize("seed", [1])
+def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
+                                        test_llm_generator, batch_size):
+    """Verify beam search equality with block manager v1 and v2.
+
+    This requires copy-on-writes; if the v1 and v2 output is the same, then
+    we have some confidence CoW is working.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
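+    # Repeat the base prompts round-robin until there are batch_size prompts.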
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+        use_beam_search=True,
+        best_of=2,
+    )
+
+    print('Getting token ids from block manager v1')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids from block manager v2')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
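+    # Compare outputs per request first (this pinpoints which request
+    # mismatches), then assert equality of the whole batch.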
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # Our prompts will generate 128 tokens; since the prompts themselves
+        # are small, we don't need much KV space beyond 128.
+        "max_model_len": 160,
+
+        # Skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Lookahead scheduling is only supported in the v2 block manager.
+        "use_v2_block_manager": True,
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            "block_size": 16,
+
+            # Allow only 2 sequences of ~128 tokens in worst case.
+            # Note 8 = 128/block_size
+            "forced_num_gpu_blocks": 2 * (8 + 1),
+        },
+        {
+            "block_size": 8,
+
+            # Allow only 2 sequences of ~128 tokens in worst case.
+            # Note 16 = 128/block_size
+            "forced_num_gpu_blocks": 2 * (16 + 1),
+        }
+    ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "num_lookahead_slots": 0,
+}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [{
+        # We run one test with block_size < lookahead_slots, one test with
+        # block_size > lookahead_slots.
+        "num_lookahead_slots": 10,
+    }])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
+                                                   test_llm_generator,
+                                                   batch_size):
+    """Verify vLLM produces the same output with greedy sampling, whether or
+    not lookahead scheduling is used.
+
+    Lookahead scheduling is not expected to modify the output, as it simply
+    allocates empty slots ahead of the known token ids in a sliding fashion.
+
+    This test constrains the total number of blocks to force preemption. It
+    also varies the block size so that the lookahead size is less than and
+    greater than the block size.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids without lookahead scheduling')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids with lookahead scheduling')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
    for llm in llm_generator:
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)