[CPU] V1 support for the CPU backend #16441

Merged: 19 commits, Jun 4, 2025
13 changes: 5 additions & 8 deletions .buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -6,6 +6,7 @@ set -ex

# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}

export CMAKE_BUILD_PARALLEL_LEVEL=32
@@ -23,10 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE"
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

function cpu_tests() {
set -e
@@ -56,7 +55,7 @@ function cpu_tests() {
# Run AWQ test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
VLLM_USE_V1=0 pytest -s -v \
tests/quantization/test_ipex_quant.py"

# Run chunked-prefill and prefix-cache test
@@ -68,8 +67,6 @@ function cpu_tests() {
# online serving
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
@@ -89,4 +86,4 @@ function cpu_tests() {

# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
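
For reference, here is a minimal sketch of exercising the new V1 CPU path locally with the same environment variables the CI containers above set; the model name, core range, and sampling settings are illustrative placeholders, not values prescribed by this PR.

```python
import os

# Same knobs the CI containers export above; values here are placeholders.
os.environ["VLLM_CPU_KVCACHE_SPACE"] = "4"         # CPU KV cache size in GiB
os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "48-95"  # pin OpenMP threads, as in OMP_CORE_RANGE
os.environ["VLLM_USE_V1"] = "1"                    # opt into the V1 engine on CPU

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", dtype="half")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)
```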
2 changes: 2 additions & 0 deletions docs/usage/v1_guide.md
@@ -38,6 +38,8 @@ This living user guide outlines a few known **important changes and limitations**
| **NVIDIA** | <nobr>🚀 Natively Supported</nobr> |
| **AMD** | <nobr>🚧 WIP</nobr> |
| **TPU** | <nobr>🚧 WIP</nobr> |
| **CPU** | <nobr>🚧 WIP</nobr> |

#### Feature / Model

| Feature / Model | Status |
3 changes: 3 additions & 0 deletions requirements/cpu.txt
@@ -1,6 +1,9 @@
# Common dependencies
-r common.txt

numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'

# Dependencies for CPUs
packaging>=24.2
setuptools>=77.0.3,<80.0.0
5 changes: 4 additions & 1 deletion tests/kernels/attention/test_attention_selector.py
@@ -84,7 +84,10 @@ def test_env(
CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
block_size, False)
assert backend.get_name() == "TORCH_SDPA"
if use_v1:
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
else:
assert backend.get_name() == "TORCH_SDPA"

elif device == "hip":
with patch("vllm.attention.selector.current_platform",
1 change: 0 additions & 1 deletion tests/models/language/generation/test_common.py
@@ -86,7 +86,6 @@
pytest.param("bigcode/starcoder2-3b"), # starcoder2
pytest.param(
"TitanML/tiny-mixtral", # mixtral
marks=[pytest.mark.cpu_model],
)
])
@pytest.mark.parametrize("max_tokens", [32])
6 changes: 3 additions & 3 deletions vllm/attention/backends/cpu_mla.py
@@ -177,7 +177,7 @@ def build(self, seq_lens, query_lens, cuda_graph_pad_size, batch_size):
seq_lens_tensor=seq_lens_tensor,
max_query_len=max_query_len,
max_kv_len=max_kv_len,
Review comment (Collaborator): why does this file require changes?

query_start_loc=query_start_loc,
prefill_query_start_loc=query_start_loc,
kv_start_loc=kv_start_loc,
max_decode_seq_len=input_data.max_decode_seq_len,
num_prefills=input_data.num_prefills,
@@ -262,8 +262,8 @@ def _forward_prefill(
key=k,
value=v_padded,
out=output,
seqlen_q=prefill_metadata.query_start_loc,
seqlen_k=prefill_metadata.query_start_loc,
seqlen_q=prefill_metadata.prefill_query_start_loc,
seqlen_k=prefill_metadata.prefill_query_start_loc,
max_seqlen_q=prefill_metadata.max_query_len,
max_seqlen_k=prefill_metadata.max_query_len,
pdropout=0.0,
16 changes: 12 additions & 4 deletions vllm/attention/backends/torch_sdpa.py
@@ -86,10 +86,13 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
# For chunked prefill only
max_query_len: Optional[int] = None
max_kv_len: Optional[int] = None
query_start_loc: Optional[torch.Tensor] = None
Review comment (Collaborator): why does this file require changes?

Author reply (Contributor): It is a naming conflict. The V1 model runner uses query_start_loc specifically for logits indexing, and there it covers all tokens in a batch, whereas in torch_sdpa query_start_loc contains prefill tokens only, so it is renamed to prefill_query_start_loc.

Review comment (Collaborator): I would try to keep the renaming local to torch sdpa instead of making it a global change.

prefill_query_start_loc: Optional[torch.Tensor] = None
kv_start_loc: Optional[torch.Tensor] = None
prefill_block_tables: Optional[torch.Tensor] = None

# For V1 logits index only
query_start_loc: Optional[torch.Tensor] = None

# Begin encoder attn & enc/dec cross-attn fields...
# Encoder sequence lengths representation
encoder_seq_lens: Optional[List[int]] = None
@@ -374,7 +377,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
seq_lens_tensor=seq_lens_tensor,
max_query_len=max_query_len,
max_kv_len=max_kv_len,
query_start_loc=query_start_loc,
prefill_query_start_loc=query_start_loc,
kv_start_loc=kv_start_loc,
max_decode_seq_len=input_data.max_decode_seq_len,
num_prefills=input_data.num_prefills,
@@ -466,6 +469,11 @@ def forward(
Returns:
shape = [num_tokens, num_heads * head_size]
"""

# For warming-up
if attn_metadata is None:
return query

attn_type = self.attn_type
if (attn_type == AttentionType.ENCODER
and (not attn_metadata.is_all_encoder_attn_metadata_set)):
@@ -533,8 +541,8 @@ def forward(

output = torch.empty_like(query)
if prefill_meta := attn_metadata.prefill_metadata:
assert attn_metadata.seq_lens is not None
if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore
assert attn_metadata.seq_lens is not None
self._run_sdpa_forward(output,
query,
key,
@@ -551,7 +559,7 @@
query[:prefill_meta.num_prefill_tokens, :, :],
key_cache,
value_cache,
prefill_meta.query_start_loc,
prefill_meta.prefill_query_start_loc,
prefill_meta.kv_start_loc,
prefill_meta.max_query_len,
prefill_meta.max_kv_len,
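
To illustrate the naming split discussed in the review thread above, here is a small standalone sketch using plain Python lists and made-up request lengths (not the actual metadata-building code): query_start_loc spans every request in the batch for V1 logits indexing, while prefill_query_start_loc covers only the prefill requests.

```python
from itertools import accumulate

query_lens = [5, 3, 1, 1]   # two prefill requests followed by two decodes
num_prefills = 2

# V1 logits indexing: cumulative token offsets over the whole batch.
query_start_loc = [0] + list(accumulate(query_lens))
# -> [0, 5, 8, 9, 10]

# torch_sdpa prefill path: cumulative offsets over prefill requests only.
prefill_query_start_loc = [0] + list(accumulate(query_lens[:num_prefills]))
# -> [0, 5, 8]
```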
7 changes: 6 additions & 1 deletion vllm/compilation/wrapper.py
@@ -40,11 +40,16 @@ def __init__(self,
# compiling the forward method

backend = vllm_config.compilation_config.init_backend(vllm_config)
options = None
if isinstance(backend, str) and backend == "inductor":
options = get_current_vllm_config(
).compilation_config.inductor_compile_config

compiled_callable = torch.compile(
self.forward,
fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
backend=backend)
backend=backend,
options=options)

self.compiled_callable = compiled_callable
self.original_code_object = self.__class__.forward.__code__
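
The guard added above forwards the inductor options dict only when the backend is literally "inductor", since other backends may not accept compiler options. Below is a standalone sketch of the same pattern; the toy function and option values are assumptions, not taken from vLLM.

```python
import torch

def toy_forward(x: torch.Tensor) -> torch.Tensor:
    # Toy function standing in for the wrapped forward method.
    return torch.relu(x) + 1.0

backend = "inductor"  # could also be a custom backend callable
inductor_options = {"epilogue_fusion": True, "memory_planning": True}

# Only pass options through when the inductor backend is selected.
options = (inductor_options
           if isinstance(backend, str) and backend == "inductor" else None)
compiled = torch.compile(toy_forward, backend=backend, options=options)

print(compiled(torch.randn(4)))
```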
4 changes: 3 additions & 1 deletion vllm/engine/arg_utils.py
@@ -1386,6 +1386,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
"FLASHINFER",
"FLASHINFER_VLLM_V1",
"ROCM_AITER_MLA",
"TORCH_SDPA_VLLM_V1",
]
if (envs.is_set("VLLM_ATTENTION_BACKEND")
and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
@@ -1418,7 +1419,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:

# Non-[CUDA, TPU] may be supported on V1, but off by default for now.
v0_hardware = not any(
(current_platform.is_cuda(), current_platform.is_tpu()))
(current_platform.is_cuda(), current_platform.is_tpu(),
current_platform.is_cpu()))
if v0_hardware and _warn_or_fallback( # noqa: SIM103
current_platform.device_name):
return False
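
As a rough sketch of the allowlist check this oracle performs (the variable names, the truncated backend list, and the fallback handling here are illustrative, not the exact vLLM internals):

```python
import os

# Backends with a V1 implementation (truncated list); this PR adds the CPU entry.
V1_BACKENDS = ["FLASHINFER_VLLM_V1", "TORCH_SDPA_VLLM_V1"]

selected = os.environ.get("VLLM_ATTENTION_BACKEND")
# An explicitly requested backend without a V1 implementation falls back to V0.
use_v1 = selected is None or selected in V1_BACKENDS
print(f"V1 engine eligible: {use_v1}")
```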
67 changes: 57 additions & 10 deletions vllm/platforms/cpu.py
@@ -56,7 +56,10 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
logger.info("Using CPU MLA backend.")
return "vllm.attention.backends.cpu_mla.CPUMLABackend"
logger.info("Using Torch SDPA backend.")
return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
if use_v1:
return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
else:
return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"

@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
@@ -80,6 +83,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if not model_config.enforce_eager:
model_config.enforce_eager = True

model_config.disable_cascade_attn = True
Review comment (Collaborator): This is new?

Author reply (Contributor): Yes, I think cascade attention is only supported in the flash attention backend, so I disable it here. I also noticed that support_sleep_mode has become a platform attribute, so that check is removed here.

cache_config = vllm_config.cache_config

ipex_available = find_spec("intel_extension_for_pytorch") is not None
@@ -127,7 +132,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
f" {kv_cache_space}, expect a positive integer value.")

parallel_config = vllm_config.parallel_config
if (parallel_config.distributed_executor_backend is not None
if (parallel_config.world_size > 1
and parallel_config.distributed_executor_backend is not None
and parallel_config.distributed_executor_backend != "mp"):
logger.warning(("%s is not supported on CPU, fallback to mp "
"distributed executor backend."),
@@ -140,14 +146,51 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
parallel_config.sd_worker_cls = \
"vllm.worker.cpu_worker.CPUWorker"
else:
parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
if envs.VLLM_USE_V1:
parallel_config.worker_cls = \
"vllm.v1.worker.cpu_worker.CPUWorker"
else:
parallel_config.worker_cls = \
"vllm.worker.cpu_worker.CPUWorker"

# Note: workaround for v1 gpu_model_runner
from vllm.config import CompilationLevel
vllm_config.compilation_config.cudagraph_capture_sizes = []

compilation_config = vllm_config.compilation_config
if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
== CompilationLevel.PIECEWISE):
compilation_config.level = CompilationLevel.DYNAMO_ONCE
compilation_config.backend = "eager"
compilation_config.custom_ops += ["none"]
compilation_config.inductor_compile_config.update({
"dce":
True,
"size_asserts":
False,
"nan_asserts":
False,
"memory_planning":
True,
"epilogue_fusion":
True,
})

if vllm_config.lora_config is not None:
compilation_config.level = CompilationLevel.NO_COMPILATION

assert vllm_config.device_config.device_type == "cpu"

#
# Environment variables for CPU executor
#

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

# Note: to avoid the error 'nthreads cannot be larger than environment
# variable "NUMEXPR_MAX_THREADS" (64)'.
os.environ["NUMEXPR_MAX_THREADS"] = str(len(os.sched_getaffinity(0)))

# Set default threads num for OpenMP parallel
os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())

@@ -170,13 +213,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
# To hint IPEX uses shared memory based AllReduce
os.environ["LOCAL_WORLD_SIZE"] = str(
vllm_config.parallel_config.tensor_parallel_size)
if sys.platform == "darwin" and \
envs.VLLM_WORKER_MULTIPROC_METHOD == "fork":
if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None:
logger.warning(
"Default to spawn method on MacOS. If this is not desired,"
" set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.")
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

if vllm_config.model_config and vllm_config.model_config.use_mla:
logger.info(
@@ -203,3 +239,14 @@ def get_device_communicator_cls(cls) -> str:
Get device specific communicator class for distributed communication.
"""
return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator" # noqa

@classmethod
def supports_structured_output(cls) -> bool:
return True

@classmethod
def supports_v1(cls, model_config) -> bool:
"""Returns whether the current platform can support v1 for the supplied
model configuration.
"""
return True
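
For intuition on the compilation downgrade above (PIECEWISE is replaced by DYNAMO_ONCE with an eager backend on CPU), here is a rough standalone analogue with a toy function standing in for the model forward; it is an illustration under those assumptions, not vLLM's actual compilation path.

```python
import torch

def forward(x: torch.Tensor) -> torch.Tensor:
    # Toy stand-in for a model forward pass.
    return torch.nn.functional.silu(x) * x

# Roughly: let Dynamo capture the graph once, but skip inductor codegen and
# execute the captured graph with plain eager kernels.
compiled_forward = torch.compile(forward, backend="eager")
print(compiled_forward(torch.randn(8)))
```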