[CPU] Refine default config for the CPU backend #19539
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
First file in the diff:

```diff
@@ -89,10 +89,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         import vllm.envs as envs
         from vllm.utils import GiB_bytes
 
         model_config = vllm_config.model_config
-        # Reminder: Please update docs/features/compatibility_matrix.md
-        # If the feature combo become valid
-        if not model_config.enforce_eager:
-            model_config.enforce_eager = True
 
         model_config.disable_cascade_attn = True
```
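This hunk removes the code that unconditionally forced `enforce_eager = True` for the CPU backend (the surrounding method suggests the platform config hook, presumably `vllm/platforms/cpu.py`), so a user's compilation preference is now respected on CPU. A minimal usage sketch, assuming a vLLM build with the CPU backend and using `facebook/opt-125m` purely as an example model:

```python
from vllm import LLM, SamplingParams

# Before this PR the CPU platform flipped enforce_eager back to True, so the
# model always ran eagerly; with the override gone, enforce_eager=False is
# honored and the torch.compile path is used on CPU as well.
llm = LLM(model="facebook/opt-125m", enforce_eager=False)

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```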
```diff
@@ -171,9 +167,21 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         compilation_config = vllm_config.compilation_config
         if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
                 == CompilationLevel.PIECEWISE):
+
+            # Note: vLLM V1 is using PIECEWISE level compilation, which will
+            # take time to compile kernels just-in-time with the inductor
+            # backend. For CPU CI tests, most of them are executed fast and
+            # compilations consume too much time, even with torch compile
+            # cache. So use VLLM_CPU_CI_ENV to indicate the CI environment,
+            # and just execute model with dynamo + eager mode to save time.
+            # VLLM_CPU_CI_ENV is only used as an internal variable.
+            if os.environ.get("VLLM_CPU_CI_ENV", "0") != "0":
+                backend = "eager"
+            else:
+                backend = "inductor"
             compilation_config.level = CompilationLevel.DYNAMO_ONCE
-            compilation_config.backend = "eager"
             compilation_config.custom_ops += ["none"]
+            compilation_config.backend = backend
             compilation_config.inductor_compile_config.update({
                 "dce":
                 True,
```

Comment on lines +178 to +181.
Second file in the diff:
```diff
@@ -60,7 +60,8 @@ def load_model(self) -> None:
     def warming_up_model(self) -> None:
         logger.info("Warming up model for the compilation...")
         # Only generate graph for the generic shape
-        self._dummy_run(max(16, self.max_num_reqs))
+        with _set_global_compilation_settings(self.vllm_config):
+            self._dummy_run(max(16, self.max_num_reqs))
         logger.info("Warming up done.")
 
     def _init_device_properties(self) -> None:
```
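Wrapping only the warm-up is enough because the dummy run compiles a single graph for the generic shape, which later batches reuse. A standalone sketch of that idea with plain `torch.compile` (illustration only, not vLLM code):

```python
import torch

# Compiling once with dynamic shapes means later calls with other batch
# sizes reuse the compiled artifact instead of triggering recompilation.
@torch.compile(dynamic=True, backend="eager")
def layer(x: torch.Tensor) -> torch.Tensor:
    return torch.nn.functional.relu(x * 2.0)

layer(torch.randn(16, 8))  # warm-up at a generic shape: compiles the graph
layer(torch.randn(3, 8))   # a smaller batch reuses the same graph
```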
```diff
@@ -71,16 +72,15 @@ def _sync_device(self) -> None:
 
 
 @contextmanager
-def _set_global_compilation_settings():
+def _set_global_compilation_settings(config: VllmConfig):
     import torch._inductor.config
 
-    # Note: The CPPGEMM backend requires freezing parameters.
-    freezing_value = torch._inductor.config.freezing
-    torch._inductor.config.freezing = True
-    # Note: workaround for "ValueError: fast mode: can't pickle cyclic objects
-    # including object type dict"
-    force_disable_caches = torch._inductor.config.force_disable_caches
-    torch._inductor.config.force_disable_caches = True
-    yield
-    torch._inductor.config.freezing = freezing_value
-    torch._inductor.config.force_disable_caches = force_disable_caches
+    inductor_config = config.compilation_config.inductor_compile_config
+    try:
+        # Note: The MKLDNN and CPPGEMM backend requires freezing parameters.
+        freezing_value = torch._inductor.config.freezing
+        if inductor_config.get("max_autotune", False):
+            torch._inductor.config.freezing = True
+        yield
+    finally:
+        torch._inductor.config.freezing = freezing_value
```
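Two things change in this helper: freezing is now enabled only when `max_autotune` was requested in the Inductor config, and the restore happens in a `finally` block so a failing warm-up still resets the process-global flag. A minimal sketch of that save/restore pattern (a standalone rewrite under those assumptions, not the exact vLLM helper):

```python
from contextlib import contextmanager

import torch._inductor.config

@contextmanager
def freezing_enabled(enable: bool):
    # Save the process-global Inductor flag, flip it only on request, and
    # restore it in `finally` so an exception during warm-up cannot leak
    # the temporary setting into later compilations.
    saved = torch._inductor.config.freezing
    try:
        if enable:
            torch._inductor.config.freezing = True
        yield
    finally:
        torch._inductor.config.freezing = saved

# Usage mirroring the diff: enable freezing only for max-autotune runs.
inductor_cfg = {"max_autotune": True}  # stand-in for inductor_compile_config
with freezing_enabled(inductor_cfg.get("max_autotune", False)):
    pass  # run the compilation warm-up here
```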
Comment on lines +75 to +86: The logic for setting …
Comment: Setting `VLLM_CPU_CI_ENV=0` here seems to override the environment variable set in the docker run command. Is this intentional?