vllm/engine/arg_utils.py: 83 changes (11 additions & 72 deletions)

@@ -1147,20 +1147,15 @@ def create_engine_config(
        else:
            envs.set_vllm_use_v1(use_v1)

-        # Set default arguments for V0 or V1 Engine.
-        if use_v1:
-            self._set_default_args_v1(usage_context, model_config)
-            # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
-            if current_platform.is_cpu(
-            ) and current_platform.get_cpu_architecture() in (
-                    CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
-                logger.info(
-                    "Chunked prefill is not supported for ARM and POWER "
-                    "and S390X CPUs; "
-                    "disabling it for V1 backend.")
-                self.enable_chunked_prefill = False
-        else:
-            self._set_default_args_v0(model_config)
+        # Set default arguments for V1 Engine.
+        self._set_default_args(usage_context, model_config)
+        # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
+        if current_platform.is_cpu() and current_platform.get_cpu_architecture(
+        ) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
+            logger.info("Chunked prefill is not supported for ARM and POWER "
+                        "and S390X CPUs; "
+                        "disabling it for V1 backend.")
+            self.enable_chunked_prefill = False
Review comment on lines +1150 to +1158 (Contributor, critical):

While this change correctly removes the V0-specific default arguments, the removal of V0 components seems incomplete across the codebase. Several platform-specific files still contain logic that depends on VLLM_USE_V1 and might attempt to use V0 components, which are presumably removed. This could lead to runtime errors if a user sets VLLM_USE_V1=0.

For example, in vllm/platforms/cuda.py and vllm/platforms/rocm.py, the code still selects vllm.worker.worker.Worker if VLLM_USE_V1 is false. If this V0 worker has been removed, this will cause a crash.

It is recommended to update these files to either always use the V1 worker or raise an error if VLLM_USE_V1=0, similar to the approach in cpu.py, xpu.py, and tpu.py which explicitly disallow VLLM_USE_V1=0.
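
As a rough sketch of the guard this comment recommends (not code from this PR), a platform's config hook could reject VLLM_USE_V1=0 up front instead of falling back to the removed V0 worker. The hook name, config attribute, and worker path below follow the existing vllm/platforms pattern, but treat the exact names here as assumptions:

# Illustrative sketch only; not part of this PR. Assumes the platform
# class exposes a check_and_update_config() hook that selects the worker
# class, as the existing vllm/platforms/* files do.
import vllm.envs as envs
from vllm.config import VllmConfig


class CudaPlatform:  # stand-in for the real class in vllm/platforms/cuda.py

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        if not envs.VLLM_USE_V1:
            # The V0 engine and vllm.worker.worker.Worker are gone, so fail
            # fast here rather than crashing later on a missing import.
            raise NotImplementedError(
                "VLLM_USE_V1=0 is no longer supported; unset VLLM_USE_V1 "
                "or set VLLM_USE_V1=1.")
        # Always select the V1 GPU worker.
        vllm_config.parallel_config.worker_cls = (
            "vllm.v1.worker.gpu_worker.Worker")

Per the comment, cpu.py, xpu.py, and tpu.py already disallow VLLM_USE_V1=0, so mirroring that check in cuda.py and rocm.py would keep the platforms consistent.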

        assert self.enable_chunked_prefill is not None

        sliding_window: Optional[int] = None
@@ -1540,64 +1535,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:

        return True

-    def _set_default_args_v0(self, model_config: ModelConfig) -> None:
-        """Set Default Arguments for V0 Engine."""
-
-        max_model_len = model_config.max_model_len
-        use_long_context = max_model_len > 32768
-        if self.enable_chunked_prefill is None:
-            # Chunked prefill not supported for Multimodal or MLA in V0.
-            if model_config.is_multimodal_model or model_config.use_mla:
-                self.enable_chunked_prefill = False
-
-            # Enable chunked prefill by default for long context (> 32K)
-            # models to avoid OOM errors in initial memory profiling phase.
-            elif use_long_context:
-                is_gpu = current_platform.is_cuda()
-                use_sliding_window = (model_config.get_sliding_window()
-                                      is not None)
-                use_spec_decode = self.speculative_config is not None
-
-                if (is_gpu and not use_sliding_window and not use_spec_decode
-                        and not self.enable_lora):
-                    self.enable_chunked_prefill = True
-                    logger.warning(
-                        "Chunked prefill is enabled by default for models "
-                        "with max_model_len > 32K. Chunked prefill might "
-                        "not work with some features or models. If you "
-                        "encounter any issues, please disable by launching "
-                        "with --enable-chunked-prefill=False.")
-
-        if self.enable_chunked_prefill is None:
-            self.enable_chunked_prefill = False
-
-        if not self.enable_chunked_prefill and use_long_context:
-            logger.warning(
-                "The model has a long context length (%s). This may cause"
-                "OOM during the initial memory profiling phase, or result "
-                "in low performance due to small KV cache size. Consider "
-                "setting --max-model-len to a smaller value.", max_model_len)
-
-        # Disable prefix caching for multimodal models for VLLM_V0.
-        if self.enable_prefix_caching and model_config.is_multimodal_model:
-            logger.warning(
-                "--enable-prefix-caching is not supported for multimodal "
-                "models in V0 and has been disabled.")
-            self.enable_prefix_caching = False
-
-        if self.enable_prompt_embeds:
-            logger.warning(
-                "--enable-prompt-embeds and --enable-prefix-caching "
-                "are not supported together in V0. Prefix caching has "
-                "been disabled.")
-            self.enable_prefix_caching = False
-
-        # Set max_num_seqs to 256 for VLLM_V0.
-        if self.max_num_seqs is None:
-            self.max_num_seqs = 256
-
-    def _set_default_args_v1(self, usage_context: UsageContext,
-                             model_config: ModelConfig) -> None:
+    def _set_default_args(self, usage_context: UsageContext,
+                          model_config: ModelConfig) -> None:
        """Set Default Arguments for V1 Engine."""

        # V1 always uses chunked prefills and prefix caching