
[Core][Bugfix] new way for full cudagraph, add support for FA2 and FlashInfer; Two minor bugs fixed #20050


Closed · wants to merge 46 commits

Changes from all commits (46 commits)
6302a7d
full_cudagraph support for FA2
fhl2000 Jun 20, 2025
d5943f0
fix typing error: replace some list[int] with List[int]
fhl2000 Jun 20, 2025
4c6fc32
minor fix
fhl2000 Jun 20, 2025
7339260
fix the arch support in CMakeLists.txt to include 8.9
fhl2000 Jun 20, 2025
0be8df3
[CI][Neuron] Fail and exit on first error (#19622)
elaineyz Jun 20, 2025
f6f4e71
[Benchmark] Fix `Value of type "SampleRequest" is not indexable` (#18…
b8zhong Jun 20, 2025
376ce81
[Chore]: qwen3-moe-type-hints-mistake (#19860)
Xerxes-cn Jun 20, 2025
3c77ffa
[Bugfix] Enable PP with AITER+V1 (#19822)
qli88 Jun 20, 2025
e12a111
[Bugfix][Ray] Set the cuda context eagerly in the ray worker (#19583)
kouroshHakha Jun 20, 2025
5f24762
[Misc] update cuda version (#19526)
reidliu41 Jun 20, 2025
1c4333d
[Misc] refactor example - openai_transcription_client (#19851)
reidliu41 Jun 20, 2025
4c1e40c
[Kernel] correct cpu worker function parameter type (#19745)
andyxning Jun 20, 2025
aa2ce41
[Fix] import regex instead of re (#19875)
tdoublep Jun 20, 2025
7047d65
[Model] GPT2ForSequenceClassification model (#19663)
nie3e Jun 20, 2025
672ea2a
[custom_op][vllm-plugin] update custom_op class to use op_registry (#…
xuechendi Jun 20, 2025
c90f70b
Export NaNs in logits to scheduler_stats if output is corrupted (#18777)
vladmihailescu Jun 20, 2025
a56ec6f
[CPU][CI] Fallback sliding window to v0 and fix CPU pooling model tes…
bigPYJ1151 Jun 20, 2025
e4c65a3
FlashInfer full cuda graph support
fhl2000 Jun 23, 2025
35b8e96
[Kernel] mark TorchSDPABackend swap_blocks NotImplementedError (#19749)
andyxning Jun 20, 2025
af35cf5
[Misc] Clean up useless code (#19889)
wangxiyuan Jun 20, 2025
cfbaaff
Fix: Check the type of params to be a Sequence not list. (#19910)
rabinadk1 Jun 20, 2025
c0079d4
[Bugfix] Fix bnb 8bit model weights loading (#19917)
Isotr0py Jun 21, 2025
66cf208
[New model support]Support Tarsier2 (#19887)
princepride Jun 21, 2025
381d959
[doc] add contact us in community (#19922)
reidliu41 Jun 21, 2025
aff9702
[Multimodal] Optimize Qwen2/2.5-VL startup time (#19756)
WoosukKwon Jun 21, 2025
c274674
[Docs] Add GPT2ForSequenceClassification to supported models in docs …
nie3e Jun 21, 2025
4da0a1d
[Misc] add vllm_config in __init__ (#19866)
andyxning Jun 22, 2025
7a185b2
[MISC] add cpu_kvcache_space_bytes to CacheConfig (#19812)
andyxning Jun 22, 2025
b100107
[Benchmark] fix request loss if "ping" is returned (#19535)
sywangyi Jun 22, 2025
3439983
[CI/Build] Auto tag perf benchmarks related PRs (#19943)
22quinn Jun 22, 2025
9ff2bb1
[doc] use snippets for contact us (#19944)
reidliu41 Jun 22, 2025
ed3cba2
[Misc] Update model-specific PR tagging (#19949)
ywang96 Jun 22, 2025
2362d8b
[Misc] Simplify vllm bench cli subcommand implementation (#19948)
yeqcharlotte Jun 22, 2025
78a9270
[Chore] dedup logs (#19955)
aarnphm Jun 22, 2025
2c00d23
[BugFix] Add an env to disable moe chunking to work around compile in…
yeqcharlotte Jun 22, 2025
708948b
[Perf][CLI] Improve overall startup time (#19941)
aarnphm Jun 22, 2025
6dc86aa
[Core] feat: Implement Priority Scheduling in V1 Engine (#19057)
amitm02 Jun 23, 2025
cba9f52
[Misc] Configurable timeout for execute_model RPC calls via env var (…
jinqinn Jun 23, 2025
ceba56c
Fix(models/siglip): Add compatibility for Gemma models quantized by l…
Flink-ddd Jun 23, 2025
de1b605
[doc] Fold long code blocks to improve readability (#19926)
reidliu41 Jun 23, 2025
016f49d
[P/D][NixlConnector] Support `tp_size > num_kv_heads` deployments (#1…
NickLucche Jun 23, 2025
d061382
[BugFix][P/D] Fix for cases where _recving_transfers can be cleaned u…
lk-chen Jun 23, 2025
d29077a
[Doc] Update V1 status for decoder-only embedding models (#19952)
Isotr0py Jun 23, 2025
fc38dc1
[doc] use MkDocs collapsible blocks - supplement (#19973)
reidliu41 Jun 23, 2025
b9694f4
fix correctness of FlashInfer attention on piecewise cudagraph using …
fhl2000 Jun 23, 2025
de50df0
Minor adjustment of capturing size
fhl2000 Jun 23, 2025
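
The commits above add full CUDA graph support for the FlashAttention 2 and FlashInfer attention backends. As a rough orientation for reviewers, the sketch below shows how a user might exercise the feature once merged. `full_cuda_graph=True` is an assumed flag name inferred from the PR title and is not confirmed by this diff; `CompilationConfig`, `CompilationLevel.PIECEWISE`, and `cudagraph_capture_sizes` do appear in the `docs/configuration/conserving_memory.md` diff further down.

```python
# Hedged sketch only: how full CUDA graph capture might be enabled end to end.
# `full_cuda_graph` is an assumed flag name inferred from the PR title;
# CompilationConfig, CompilationLevel, and cudagraph_capture_sizes appear in
# the docs/configuration/conserving_memory.md diff below.
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        full_cuda_graph=True,  # assumption: enables the full-graph path added here
        cudagraph_capture_sizes=[1, 2, 4, 8, 16],
    ),
)
outputs = llm.generate(["The capital of France is"])
print(outputs[0].outputs[0].text)
```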
3 changes: 2 additions & 1 deletion .buildkite/scripts/hardware_ci/run-neuron-test.sh
@@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
--name "${container_name}" \
${image_name} \
/bin/bash -c "
set -e; # Exit on first error
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo 'Running test file: '$f;
echo \"Running test file: \$f\";
python3 -m pytest \$f -v --capture=tee-sys;
done
"
9 changes: 9 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -271,6 +271,15 @@ steps:
commands:
- pytest -v -s prefix_caching


- label: Platform Tests (CUDA)
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py

- label: Samplers Test # 36min
mirror_hardwares: [amdexperimental]
source_file_dependencies:
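
The new `Platform Tests (CUDA)` step above runs `tests/cuda/test_cuda_context.py`, which appears to relate to the "[Bugfix][Ray] Set the cuda context eagerly in the ray worker" commit in this branch. The sketch below is an illustration only (the actual fix lives in the Ray worker code and is not shown in this diff) of one way a worker process can eagerly establish its CUDA context:

```python
# Illustrative sketch, not the vLLM implementation: eagerly creating a CUDA
# context in a worker process so later kernels don't race lazy initialization.
import torch


def init_cuda_context(device_index: int = 0) -> None:
    if not torch.cuda.is_available():
        return
    torch.cuda.set_device(device_index)  # bind this process to the device
    torch.cuda.init()                    # force context creation up front
    assert torch.cuda.is_initialized()


if __name__ == "__main__":
    init_cuda_context(0)
```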
15 changes: 14 additions & 1 deletion .github/mergify.yml
@@ -45,6 +45,7 @@ pull_request_rules:
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
- title~=(?i)llama
actions:
label:
add:
@@ -65,6 +66,19 @@
add:
- multi-modality

- name: label-performance
description: Automatically apply performance label
conditions:
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
- files~=^tests/benchmarks/
- files~=^\.buildkite/nightly-benchmarks/
actions:
label:
add:
- performance

- name: label-qwen
description: Automatically apply qwen label
conditions:
@@ -74,7 +88,6 @@
- files~=^vllm/model_executor/models/.*qwen.*\.py
- files~=^vllm/reasoning/.*qwen.*\.py
- title~=(?i)Qwen
- body~=(?i)Qwen
actions:
label:
add:
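
The `title~=(?i)llama` condition added above (like the existing `title~=(?i)Qwen` one) is a case-insensitive regular-expression match against the pull-request title. The Python snippet below is purely illustrative of that matching behaviour and is not part of the PR:

```python
# Illustration of Mergify's `title~=(?i)llama` semantics using Python's re
# module: `(?i)` makes the match case-insensitive, and the pattern matches
# anywhere in the title.
import re

titles = [
    "[Model] Add Llama 4 support",
    "[Bugfix] fix LLAMA rope scaling",
    "[Docs] update quickstart guide",
]
pattern = re.compile(r"(?i)llama")
for title in titles:
    print(f"{title!r} -> {bool(pattern.search(title))}")
```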
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
@@ -115,6 +115,11 @@ repos:
entry: python tools/check_spdx_header.py
language: python
types: [python]
- id: check-root-lazy-imports
name: Check root lazy imports
entry: python tools/check_init_lazy_imports.py
language: python
types: [python]
- id: check-filenames
name: Check for spaces in all filenames
entry: bash
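
The new `check-root-lazy-imports` hook above calls `tools/check_init_lazy_imports.py`, whose contents are not part of this diff. Purely as a guess at the intent (keeping heavy imports out of vLLM's root `__init__.py` so CLI startup stays fast, compare the "[Perf][CLI] Improve overall startup time" commit above), a heavily simplified sketch of such a check might look like this; module names and behaviour are assumptions:

```python
# Assumed, simplified sketch of a "root lazy imports" check. The real
# tools/check_init_lazy_imports.py is not shown in this diff; module names
# and behaviour here are guesses for illustration only.
import ast
import sys

HEAVY_MODULES = {"torch", "transformers"}  # assumption: packages that should stay lazy


def eager_heavy_imports(path: str) -> list[str]:
    with open(path, encoding="utf-8") as f:
        tree = ast.parse(f.read(), filename=path)
    problems = []
    # Only module-level statements count; imports inside functions remain lazy.
    for node in tree.body:
        if isinstance(node, ast.Import):
            roots = [alias.name.split(".")[0] for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            roots = [(node.module or "").split(".")[0]]
        else:
            continue
        problems += [
            f"{path}:{node.lineno}: eager import of {root}"
            for root in roots
            if root in HEAVY_MODULES
        ]
    return problems


if __name__ == "__main__":
    issues = eager_heavy_imports("vllm/__init__.py")
    print("\n".join(issues) or "no eager heavy imports found")
    sys.exit(1 if issues else 0)
```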
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;8.9;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)

#
@@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;8.9;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)

#
2 changes: 2 additions & 0 deletions README.md
@@ -154,11 +154,13 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

## Contact Us

<!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
<!-- --8<-- [end:contact-us] -->

## Media Kit

8 changes: 7 additions & 1 deletion benchmarks/backend_request_func.py
@@ -404,8 +404,14 @@ async def async_request_openai_chat_completions(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: SSE comments (often used as pings) start with a colon.
# These are not JSON data payload and should be skipped.
if chunk_bytes.startswith(":"):
continue

chunk = chunk_bytes.removeprefix("data: ")

chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
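
The hunk above makes the chat-completions benchmark client skip Server-Sent Events comment lines, which begin with a colon and are commonly emitted as keep-alive pings, before stripping the `data: ` prefix. Below is a standalone, hedged re-creation of that filtering logic; the example chunks are fabricated for illustration:

```python
# Standalone illustration of the SSE filtering added above; the example chunks
# are fabricated, but the skip/strip logic mirrors the diff.
import json

chunks = [
    b": ping - 2025-06-22 12:00:00",  # SSE comment used as a keep-alive ping
    b'data: {"choices": [{"delta": {"content": "Hello"}}]}',
    b"data: [DONE]",
]

for chunk_bytes in chunks:
    chunk_bytes = chunk_bytes.strip()
    if not chunk_bytes:
        continue
    chunk_bytes = chunk_bytes.decode("utf-8")
    # SSE comments (often used as pings) start with a colon and carry no JSON.
    if chunk_bytes.startswith(":"):
        continue
    chunk = chunk_bytes.removeprefix("data: ")
    if chunk != "[DONE]":
        data = json.loads(chunk)
        print(data["choices"][0]["delta"]["content"])
```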
2 changes: 1 addition & 1 deletion benchmarks/benchmark_throughput.py
@@ -97,7 +97,7 @@ def run_vllm(
assert lora_requests is None, "BeamSearch API does not support LoRA"
prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
output_len = requests[0].expected_output_len
for request in requests:
assert request.expected_output_len == output_len
start = time.perf_counter()
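
The one-line change above replaces positional indexing (`requests[0][2]`) with attribute access, which is exactly the `Value of type "SampleRequest" is not indexable` error named in the commit message. A simplified sketch of why the old code failed; the field list is an assumption, only `expected_output_len` is confirmed by the diff:

```python
# Simplified reproduction of the bug fixed above. The fields other than
# expected_output_len are assumptions for illustration.
from dataclasses import dataclass


@dataclass
class SampleRequest:
    prompt: str
    prompt_len: int
    expected_output_len: int


requests = [SampleRequest(prompt="Hello", prompt_len=1, expected_output_len=128)]

print(requests[0].expected_output_len)  # works: 128

try:
    requests[0][2]  # the old code path
except TypeError as exc:
    print(f"not indexable: {exc}")
```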
6 changes: 3 additions & 3 deletions docs/ci/update_pytorch_version.md
@@ -91,7 +91,7 @@ source to unblock the update process.
### FlashInfer
Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):

```
```bash
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
export FLASHINFER_ENABLE_SM90=1
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
Expand All @@ -105,14 +105,14 @@ team if you want to get the package published there.
### xFormers
Similar to FlashInfer, here is how to build and install xFormers from source:

```
```bash
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
```

### Mamba

```
```bash
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
```

49 changes: 22 additions & 27 deletions docs/cli/README.md
@@ -16,35 +16,33 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}

Start the vLLM OpenAI Compatible API server.

Examples:
??? Examples

```bash
# Start with a model
vllm serve meta-llama/Llama-2-7b-hf
```bash
# Start with a model
vllm serve meta-llama/Llama-2-7b-hf

# Specify the port
vllm serve meta-llama/Llama-2-7b-hf --port 8100
# Specify the port
vllm serve meta-llama/Llama-2-7b-hf --port 8100

# Check with --help for more options
# To list all groups
vllm serve --help=listgroup
# Check with --help for more options
# To list all groups
vllm serve --help=listgroup

# To view a argument group
vllm serve --help=ModelConfig
# To view a argument group
vllm serve --help=ModelConfig

# To view a single argument
vllm serve --help=max-num-seqs
# To view a single argument
vllm serve --help=max-num-seqs

# To search by keyword
vllm serve --help=max
```
# To search by keyword
vllm serve --help=max
```

## chat

Generate chat completions via the running API server.

Examples:

```bash
# Directly connect to localhost API without arguments
vllm chat
@@ -60,8 +58,6 @@ vllm chat --quick "hi"

Generate text completions based on the given prompt via the running API server.

Examples:

```bash
# Directly connect to localhost API without arguments
vllm complete
@@ -73,6 +69,8 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
vllm complete --quick "The future of AI is"
```

</details>

## bench

Run benchmark tests for latency online serving throughput and offline inference throughput.
@@ -89,8 +87,6 @@ vllm bench {latency, serve, throughput}

Benchmark the latency of a single batch of requests.

Example:

```bash
vllm bench latency \
--model meta-llama/Llama-3.2-1B-Instruct \
@@ -104,8 +100,6 @@ vllm bench latency \

Benchmark the online serving throughput.

Example:

```bash
vllm bench serve \
--model meta-llama/Llama-3.2-1B-Instruct \
@@ -120,8 +114,6 @@ vllm bench serve \

Benchmark offline inference throughput.

Example:

```bash
vllm bench throughput \
--model meta-llama/Llama-3.2-1B-Instruct \
@@ -143,7 +135,8 @@ vllm collect-env

Run batch prompts and write results to file.

Examples:
<details>
<summary>Examples</summary>

```bash
# Running with a local file
@@ -159,6 +152,8 @@ vllm run-batch \
--model meta-llama/Meta-Llama-3-8B-Instruct
```

</details>

## More Help

For detailed options of any subcommand, use:
6 changes: 6 additions & 0 deletions docs/community/contact_us.md
@@ -0,0 +1,6 @@
---
title: Contact Us
---
[](){ #contactus }

--8<-- "README.md:contact-us"
58 changes: 31 additions & 27 deletions docs/configuration/conserving_memory.md
@@ -57,19 +57,21 @@ By default, we optimize model inference using CUDA graphs which take up extra me

You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:

```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel

llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE,
# By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
),
)
```
??? Code

```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel

llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE,
# By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
),
)
```

You can disable graph capturing completely via the `enforce_eager` flag:

@@ -127,18 +129,20 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory.

Here are some examples:

```python
from vllm import LLM
??? Code

# Available for Qwen2-VL series models
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_kwargs={
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
})

# Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={
"max_dynamic_patch": 4, # Default is 12
})
```
```python
from vllm import LLM

# Available for Qwen2-VL series models
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_kwargs={
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
})

# Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={
"max_dynamic_patch": 4, # Default is 12
})
```
8 changes: 5 additions & 3 deletions docs/configuration/env_vars.md
@@ -7,6 +7,8 @@ vLLM uses the following environment variables to configure the system:

All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).

```python
--8<-- "vllm/envs.py:env-vars-definition"
```
??? Code

```python
--8<-- "vllm/envs.py:env-vars-definition"
```
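
The section above warns that a Kubernetes Service named `vllm` causes Kubernetes to inject environment variables that collide with vLLM's `VLLM_` prefix. As a small illustrative aid (not part of the diff), one way to audit the current environment for such collisions:

```python
# Illustration only: list every VLLM_-prefixed environment variable so that
# Kubernetes-injected entries (e.g. VLLM_PORT or VLLM_SERVICE_HOST when a
# Service is named "vllm") can be spotted alongside genuine vLLM settings.
import os

for name in sorted(os.environ):
    if name.startswith("VLLM_"):
        print(f"{name}={os.environ[name]}")
```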