@@ -155,8 +155,24 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
155
155
# Note: workaround for v1 gpu_model_runner
156
156
from vllm .config import CompilationLevel
157
157
vllm_config .compilation_config .cudagraph_capture_sizes = []
158
- vllm_config .compilation_config .level = CompilationLevel .NO_COMPILATION
159
- vllm_config .compilation_config .custom_ops = []
158
+
159
+ compilation_config = vllm_config .compilation_config
160
+ if vllm_config .compilation_config .level == CompilationLevel .PIECEWISE :
161
+ compilation_config .level = CompilationLevel .DYNAMO_ONCE
162
+ compilation_config .backend = "inductor"
163
+ compilation_config .custom_ops += ["none" ]
164
+ compilation_config .inductor_compile_config .update ({
165
+ "dce" :
166
+ True ,
167
+ "size_asserts" :
168
+ False ,
169
+ "nan_asserts" :
170
+ False ,
171
+ "memory_planning" :
172
+ True ,
173
+ "epilogue_fusion" :
174
+ True ,
175
+ })
160
176
161
177
assert vllm_config .device_config .device_type == "cpu"
162
178
@@ -192,13 +208,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
192
208
# Hint IPEX to use shared-memory-based AllReduce
193
209
os .environ ["LOCAL_WORLD_SIZE" ] = str (
194
210
vllm_config .parallel_config .tensor_parallel_size )
195
- if sys .platform == "darwin" and \
196
- envs .VLLM_WORKER_MULTIPROC_METHOD == "fork" :
197
- if os .environ .get ('VLLM_WORKER_MULTIPROC_METHOD' , None ) is None :
198
- logger .warning (
199
- "Default to spawn method on MacOS. If this is not desired,"
200
- " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly." )
201
- os .environ ['VLLM_WORKER_MULTIPROC_METHOD' ] = 'spawn'
202
211
203
212
if vllm_config .model_config and vllm_config .model_config .use_mla :
204
213
logger .info (
0 commit comments