@@ -3,6 +3,7 @@
 
 import json
 import os
+from typing import Optional
 
 import pytest
 
@@ -20,9 +21,10 @@
 dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
 
 
-def can_initialize(model: str, extra_args: list[str]):
+def can_initialize(model: str, extra_args: Optional[list[str]] = None):
 
     # Server arguments
+    extra_args = extra_args if extra_args is not None else []
     server_args = [
         "--max-model-len",
         "2048",
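The `Optional[list[str]] = None` signature plus the explicit `None` check above is the usual way to default a list parameter in Python: a literal `[]` default would be evaluated once at function-definition time and shared across every call. A minimal sketch of the pitfall the new signature avoids (the function names here are illustrative only, not part of this diff):

```python
from typing import Optional

def bad(args: list[str] = []):  # one list object shared by all calls
    args.append("--flag")
    return args

def good(args: Optional[list[str]] = None):
    args = args if args is not None else []  # fresh list on every call
    args.append("--flag")
    return args

assert bad() == ["--flag"]
assert bad() == ["--flag", "--flag"]  # state leaked from the first call
assert good() == ["--flag"] and good() == ["--flag"]  # independent lists
```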
@@ -65,68 +67,84 @@ def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
         monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
+    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
 
 
 @pytest.mark.skip(reason="Works, but takes too long to run")
 def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
         monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
+    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
 
 
 @pytest.mark.skip(reason="Works, but takes too long to run")
 def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
+    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
 
 
 @pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
 def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
+    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
 
 
 ## DeepSeekV3 ##
 
 
 def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
-    can_initialize("deepseek-ai/DeepSeek-V3.1", [])
+    can_initialize("deepseek-ai/DeepSeek-V3.1")
+
+
+@pytest.mark.skip(reason=("Known issue: lack of kernel support. "
+                          "Expected failure: assert self.block_quant is None"))
+def test_deepseek_fp8_block_moe_flashinfer_cutlass(
+        monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
+    can_initialize("deepseek-ai/DeepSeek-V3.1")
+
+
+def test_deepseek_fp8_block_moe_flashinfer_trtllm(
+        monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
+    can_initialize("deepseek-ai/DeepSeek-V3.1")
 
 
 def test_deepseek_nvfp4_moe_flashinfer_cutlass(
         monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
 
 
 @pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
 def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
 
 
 ## GPT-OSS ##
 
 
 def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
-    can_initialize("openai/gpt-oss-20b", [])
+    can_initialize("openai/gpt-oss-20b")
 
 
 def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
         monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
-    can_initialize("openai/gpt-oss-20b", [])
+    can_initialize("openai/gpt-oss-20b")
 
 
 def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
         monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
-    can_initialize("openai/gpt-oss-20b", [])
+    can_initialize("openai/gpt-oss-20b")