We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent d1f569b · commit 88fcf00 · Copy full SHA for 88fcf00
tests/spec_decode/e2e/test_multistep_correctness.py
@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
456
@pytest.mark.parametrize(
457
"common_llm_kwargs",
458
[{
459
- "block_size": 8,
+ "block_size": 16,
460
# 2 for small prompt, 256//8 for generated.
461
"num_gpu_blocks_override": 2 + 256 // 8,
462
"max_model_len": (2 + 256 // 8) * 8,
@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
526
527
"per_test_common_llm_kwargs",
528
[
529
- # As of this writing, vLLM only compiles with these 3 block sizes by
530
- # default.
531
- {
532
533
- },
+ # https://github.com/triton-lang/triton/issues/2266 tl.dot
+ # doesn't support embedding < 16
534
{
535
"block_size": 16,
536
},
0 commit comments