
Commit f44b619

fix tests

Signed-off-by: jiang.li <jiang1.li@intel.com>
1 parent 413ef08

File tree

5 files changed: +35, -13 lines changed

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 5 additions & 0 deletions

@@ -35,11 +35,13 @@ function cpu_tests() {
   # offline inference
   docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
     set -e
+    export VLLM_USE_V1=1
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
+    export VLLM_USE_V1=1
     pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
     pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/language/generation -m cpu_model
@@ -49,6 +51,7 @@ function cpu_tests() {
   # Run compressed-tensor test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
+    export VLLM_USE_V1=1
     pytest -s -v \
       tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
       tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
@@ -62,6 +65,7 @@ function cpu_tests() {
   # Run chunked-prefill and prefix-cache test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
+    export VLLM_USE_V1=1
     pytest -s -v -k cpu_model \
       tests/basic_correctness/test_chunked_prefill.py"

@@ -70,6 +74,7 @@ function cpu_tests() {
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10
     export VLLM_CPU_OMP_THREADS_BIND=$1
+    export VLLM_USE_V1=1
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
     python3 benchmarks/benchmark_serving.py \
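
The only change in this script is exporting VLLM_USE_V1=1 before each test group so the whole CPU suite runs on the V1 engine. As a hedged illustration (not part of the commit; the prompt and token budget are arbitrary), the same switch can be exercised in a small offline run:

    import os

    # VLLM_USE_V1 must be set before vllm is imported for engine selection to see it.
    os.environ["VLLM_USE_V1"] = "1"

    from vllm import LLM, SamplingParams

    # Same small model the CI script exercises.
    llm = LLM(model="facebook/opt-125m", dtype="half")
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)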

tests/kernels/attention/test_attention_selector.py

Lines changed: 4 additions & 1 deletion

@@ -84,7 +84,10 @@ def test_env(
                        CpuPlatform()):
             backend = get_attn_backend(16, torch.float16, torch.float16,
                                        block_size, False)
-            assert backend.get_name() == "TORCH_SDPA"
+            if use_v1:
+                assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            else:
+                assert backend.get_name() == "TORCH_SDPA"

     elif device == "hip":
         with patch("vllm.attention.selector.current_platform",

vllm/compilation/wrapper.py

Lines changed: 6 additions & 1 deletion

@@ -40,11 +40,16 @@ def __init__(self,
         # compiling the forward method

         backend = vllm_config.compilation_config.init_backend(vllm_config)
+        options = None
+        if isinstance(backend, str) and backend == "inductor":
+            options = get_current_vllm_config(
+            ).compilation_config.inductor_compile_config

         compiled_callable = torch.compile(
             self.forward,
             fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-            backend=backend)
+            backend=backend,
+            options=options)

         self.compiled_callable = compiled_callable
         self.original_code_object = self.__class__.forward.__code__
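
torch.compile only honors an options dict for the inductor backend, which is why the wrapper guards on backend == "inductor" before forwarding vLLM's inductor_compile_config. A standalone sketch of the same pattern, using the option keys that vllm/platforms/cpu.py writes in this commit (their exact effect is an inductor implementation detail):

    import torch

    def toy_forward(x):
        return torch.relu(x) * 2 + 1

    # Keys mirror the inductor_compile_config set for CPU in this commit.
    inductor_options = {
        "dce": True,
        "size_asserts": False,
        "nan_asserts": False,
        "memory_planning": True,
        "epilogue_fusion": True,
    }

    compiled = torch.compile(toy_forward, backend="inductor", options=inductor_options)
    print(compiled(torch.randn(4)))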

vllm/platforms/cpu.py

Lines changed: 18 additions & 9 deletions

@@ -155,8 +155,24 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             # Note: workaround for v1 gpu_model_runner
             from vllm.config import CompilationLevel
             vllm_config.compilation_config.cudagraph_capture_sizes = []
-            vllm_config.compilation_config.level = CompilationLevel.NO_COMPILATION
-            vllm_config.compilation_config.custom_ops = []
+
+            compilation_config = vllm_config.compilation_config
+            if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE:
+                compilation_config.level = CompilationLevel.DYNAMO_ONCE
+                compilation_config.backend = "inductor"
+                compilation_config.custom_ops += ["none"]
+                compilation_config.inductor_compile_config.update({
+                    "dce": True,
+                    "size_asserts": False,
+                    "nan_asserts": False,
+                    "memory_planning": True,
+                    "epilogue_fusion": True,
+                })

         assert vllm_config.device_config.device_type == "cpu"

@@ -192,13 +208,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             # To hint IPEX uses shared memory based AllReduce
             os.environ["LOCAL_WORLD_SIZE"] = str(
                 vllm_config.parallel_config.tensor_parallel_size)
-        if sys.platform == "darwin" and \
-                envs.VLLM_WORKER_MULTIPROC_METHOD == "fork":
-            if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None:
-                logger.warning(
-                    "Default to spawn method on MacOS. If this is not desired,"
-                    " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.")
-            os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

         if vllm_config.model_config and vllm_config.model_config.use_mla:
             logger.info(
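
The rewritten branch no longer forces NO_COMPILATION; a PIECEWISE request is instead downgraded to DYNAMO_ONCE with the inductor backend, custom ops disabled via "none", and a CPU-friendly set of inductor options. A hedged sketch of that rewrite with a stand-in config class (not vLLM's CompilationConfig; enum values are illustrative):

    from dataclasses import dataclass, field
    from enum import IntEnum

    class CompilationLevel(IntEnum):  # names from the diff; values illustrative
        NO_COMPILATION = 0
        DYNAMO_ONCE = 2
        PIECEWISE = 3

    @dataclass
    class FakeCompilationConfig:  # stand-in for illustration only
        level: CompilationLevel = CompilationLevel.PIECEWISE
        backend: str = ""
        custom_ops: list = field(default_factory=list)
        inductor_compile_config: dict = field(default_factory=dict)

    cfg = FakeCompilationConfig()
    if cfg.level == CompilationLevel.PIECEWISE:
        cfg.level = CompilationLevel.DYNAMO_ONCE
        cfg.backend = "inductor"
        cfg.custom_ops += ["none"]  # "none" disables custom-op replacements
        cfg.inductor_compile_config.update({
            "dce": True,
            "size_asserts": False,
            "nan_asserts": False,
            "memory_planning": True,
            "epilogue_fusion": True,
        })
    print(cfg)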

vllm/v1/worker/cpu_model_runner.py

Lines changed: 2 additions & 2 deletions

@@ -18,7 +18,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         super().__init__(vllm_config, device)

         assert device == torch.device("cpu")
-        assert not self.use_spec_decode, "spec decode is not supported."
+        assert self.speculative_config is None, "spec decode is not supported."
         assert not self.model_config.uses_mrope, "mrope is not supported."
         assert self.lora_config is None, "lora is not supported."

@@ -58,7 +58,7 @@ def warming_up_model(self) -> None:
         logger.info("Warming up model for the compilation...")
         # Only generate graph for the generic shape
         with _set_global_compilation_settings():
-            self._dummy_run(self.max_num_tokens)
+            self._dummy_run(max(16, self.max_num_reqs))
         logger.info("Warming up done.")

     def _init_device_properties(self) -> None:
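
The warm-up now sizes the dummy run from the request limit rather than max_num_tokens, with a floor of 16. A trivial hedged sketch of the sizing rule (the floor value is taken from the diff; everything else is illustrative):

    def warmup_num_reqs(max_num_reqs: int) -> int:
        # Floor of 16, as in the diff above.
        return max(16, max_num_reqs)

    assert warmup_num_reqs(4) == 16
    assert warmup_num_reqs(256) == 256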
