Skip to content

Commit 88fcf00

Browse files
authored
Fix some speculative decode tests with tl.dot (vllm-project#17371)
Signed-off-by: Huy Do <huydhn@gmail.com>
1 parent d1f569b commit 88fcf00

File tree

1 file changed

+3
-6
lines changed

1 file changed

+3
-6
lines changed

tests/spec_decode/e2e/test_multistep_correctness.py

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
456456
@pytest.mark.parametrize(
457457
"common_llm_kwargs",
458458
[{
459-
"block_size": 8,
459+
"block_size": 16,
460460
# 2 for small prompt, 256//8 for generated.
461461
"num_gpu_blocks_override": 2 + 256 // 8,
462462
"max_model_len": (2 + 256 // 8) * 8,
@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
526526
@pytest.mark.parametrize(
527527
"per_test_common_llm_kwargs",
528528
[
529-
# As of this writing, vLLM only compiles with these 3 block sizes by
530-
# default.
531-
{
532-
"block_size": 8,
533-
},
529+
# https://github.com/triton-lang/triton/issues/2266 tl.dot
530+
# doesn't support embedding < 16
534531
{
535532
"block_size": 16,
536533
},

0 commit comments

Comments
 (0)