Merged

170 commits
91373a0
Fix `head_dim` not existing in all model configs (Transformers backen…
hmellor Mar 3, 2025
c41d271
[V0][Metrics] Remove unimplemented `vllm:tokens_total` (#14134)
markmc Mar 3, 2025
2dfdfed
[V0][Metrics] Deprecate some KV/prefix cache metrics (#14136)
markmc Mar 3, 2025
872db2b
[V1] Simplify stats logging (#14082)
njhill Mar 3, 2025
ae122b1
[WIP][V1][Metrics] Implement max_num_generation_tokens, request_par…
markmc Mar 3, 2025
2b04c20
[Bugfix] Allow shared_experts skip quantization for DeepSeekV2/V3 (#1…
mgoin Mar 3, 2025
19d98e0
[Kernel] Optimize moe intermediate_cache usage (#13625)
mgoin Mar 3, 2025
cd1d3c3
[Docs] Add GPTQModel (#14056)
Qubitium Mar 3, 2025
79e4937
[v1] Add comments to the new ragged paged attention Pallas kernel (#1…
vanbasten23 Mar 3, 2025
c060b71
[Model] Add support for GraniteMoeShared models (#13313)
tjohnson31415 Mar 4, 2025
bb5b640
[core] moe fp8 block quant tuning support (#14068)
divakar-amd Mar 4, 2025
989f4f4
[Misc] Remove lru_cache in NvmlCudaPlatform (#14156)
comaniac Mar 4, 2025
bf13d40
[core] Pass all driver env vars to ray workers unless excluded (#14099)
ruisearch42 Mar 4, 2025
66233af
Use math.prod instead of np.prod for trivial ops (#14142)
zhanwenchen Mar 4, 2025
f78c0be
Fix benchmark_moe.py tuning for CUDA devices (#14164)
mgoin Mar 4, 2025
ac65bc9
[platform] add debug logging during inferring the device type (#14195)
youkaichao Mar 4, 2025
71c4b40
[sleep mode] error out with expandable_segments (#14189)
youkaichao Mar 4, 2025
3610fb4
[doc] add "Failed to infer device type" to faq (#14200)
youkaichao Mar 4, 2025
6247bae
[Bugfix] Restrict MacOS CPU detection (#14210)
mgoin Mar 4, 2025
5db6b2c
[V1][BugFix] Fix remaining sync engine client shutdown errors/hangs (…
njhill Mar 4, 2025
c8525f0
[V0][Metrics] Deprecate some questionable request time metrics (#14135)
markmc Mar 4, 2025
b3cf368
[V1][Molmo] Fix get_multimodal_embeddings() in molmo.py (#14161)
lk-chen Mar 4, 2025
f89978a
add cutlass support for blackwell fp8 gemm (#13798)
kushanam Mar 4, 2025
beebf47
[TPU][Profiler] Support start_profile/stop_profile in TPU worker (#13…
lsy323 Mar 4, 2025
9badee5
Fix performance when `--generation-config` is not `None` (#14223)
hmellor Mar 4, 2025
e5b2f16
[Frontend] Do `prompt_logprobs` clamping for chat as well as completi…
hmellor Mar 4, 2025
550c7ba
[Docs] Update Dockerfile dependency image (#14215)
mgoin Mar 4, 2025
c2bd219
[v1][Metrics] Add design doc (#12745)
markmc Mar 4, 2025
288ca11
[Security] Serialize using safetensors instead of pickle in Mooncake …
KuntaiDu Mar 4, 2025
4f5b059
Clean up unused padding_idx variables across many model definitions (…
tlrmchlsmth Mar 4, 2025
3e1d223
[ROCm] Disable a few more kernel tests that are broken on ROCm (#14145)
SageMoore Mar 4, 2025
fbfc3ee
[V1][TPU] TPU multimodal model support for ragged attention (#14158)
mgoin Mar 5, 2025
eb59b5a
[misc] announce china meetup (#14248)
youkaichao Mar 5, 2025
5b143d3
Moved numba from common requirements to cuda/rocm specific requiremen…
npanpaliya Mar 5, 2025
e123aaf
Disable GPTQ AllSpark kernels for CUDA Compiler < 12.0 (#14157)
mgoin Mar 5, 2025
0df2510
[Bugfix] Fix gptq_marlin for deepseek-v3 (#13750)
rainkert Mar 5, 2025
ade3f7d
[V1][Bugfix] Do not reset prefix caching metrics (#14235)
comaniac Mar 5, 2025
0a995d5
[Model] New model support for Phi-4-multimodal-instruct (#14119)
congcongchen123 Mar 5, 2025
72c62ea
[V1] EP/TP MoE + DP Attention (#13931)
tlrmchlsmth Mar 5, 2025
6eaf930
[platforms] improve rocm debugging info (#14257)
youkaichao Mar 5, 2025
dae9ec4
Temporarily disable test_awq_gemm_opcheck (#14251)
mgoin Mar 5, 2025
32985be
[Frontend] Allow return_tokens_as_token_ids to be passed as a request…
benchislett Mar 5, 2025
ec79b67
[Misc][V1] Avoid using `envs.VLLM_USE_V1` in mm processing (#14256)
ywang96 Mar 5, 2025
8d6cd32
[Bugfix][V1] Fix allowed_token_ids for v1 Sampler (#14169)
houseroad Mar 5, 2025
961644e
[Doc] Update nginx guide: remove privileged from vllm container run a…
iacolippo Mar 5, 2025
7f89a59
[Doc] [3/N] Refer code examples for common cases in dev multimodal pr…
DarkLight1337 Mar 5, 2025
47d4a7e
Small update for external_launcher backend docs (#14288)
zhe-thoughts Mar 5, 2025
257e200
[V1][Frontend] Add Testing For V1 Runtime Parameters (#14159)
robertgshaw2-redhat Mar 5, 2025
e17e448
[LoRA] Remove linear hack outside transformers backend (#14177)
Isotr0py Mar 5, 2025
7bab4bb
[Misc] Add Qwen2MoeForCausalLM moe tuning support (#14276)
jeejeelee Mar 5, 2025
8f808cf
prefix_caching.md: Fixed typo (#14293)
DaividFrank Mar 5, 2025
f71b00a
[Bugfix] Fix broken vision language example (#14292)
Isotr0py Mar 5, 2025
ca2ca8d
[Docs] Add Meta Slides (#14297)
simon-mo Mar 5, 2025
a32c866
[V1][Minor] Remove obsolete FIXME comment (#14304)
njhill Mar 5, 2025
a4f1ee3
Deprecate `best_of` Sampling Parameter in anticipation for vLLM V1 (#…
vincent-4 Mar 5, 2025
ac60dc7
[V1][BugFix] Fix for mixed top_k batch (#14301)
njhill Mar 5, 2025
1b7624b
[misc] Add FlashMLA as a new option of VLLM_ATTENTION_BACKEND env (#1…
yangsijia-serena Mar 5, 2025
53ea6ad
[V1][Easy] Add empty allowed_token_ids in the v1 sampler test (#14308)
houseroad Mar 5, 2025
ae056e1
init
SageMoore Mar 5, 2025
1e3e76b
[Bugfix] Fix DeepSeek MTP crash when using TP1ModelRunner with CUDA g…
pyc96 Mar 5, 2025
a7ea35a
[Bugfix] Remove num_tokens_across_dp (#14302)
tlrmchlsmth Mar 5, 2025
4dacaa4
[BugFix] Fix prefix caching V0 MLA (#14255)
LucasWilkinson Mar 6, 2025
ffad943
[CI/Build] Use spawn multiprocessing mode for V1 test pipeline (#14243)
russellb Mar 6, 2025
ca100c9
Add benchmark for DeepGEMM and vLLM Block FP8 Dense GEMM (#13917)
mgoin Mar 6, 2025
71eaf89
[Build] Add UV_HTTP_TIMEOUT to avoid timeout during installation (#13…
terrytangyuan Mar 6, 2025
f6bb18f
[BugFix] MLA + V1, illegal memory access and accuracy issues (#14253)
LucasWilkinson Mar 6, 2025
abcc61e
[misc] Mention `ray list nodes` command to troubleshoot ray issues (#…
ruisearch42 Mar 6, 2025
f5f7f00
[Bugfix][Structured Output] Support outlines engine with reasoning ou…
gaocegege Mar 6, 2025
3dbd2d8
[V1] LoRA - Enable more V1 tests (#14315)
varun-sundar-rabindranath Mar 6, 2025
5ee10e9
[Bugfix][CI] ALiBi test case in xformers multi_query_kv_attention (#1…
NickLucche Mar 6, 2025
ed6ea06
[Hardware] Update the flash attn tag to support Blackwell (#14244)
pavanimajety Mar 6, 2025
1769928
[Model] Update Paligemma multimodal processing with PromptUpdate (#1…
kylehh Mar 6, 2025
5d80252
[V1][VLM][Pixtral-HF] Support Pixtral-HF on V1 (#14275)
lk-chen Mar 6, 2025
69ff99f
[Core] Optimizing cross-attention `QKVParallelLinear` computation (#1…
NickLucche Mar 6, 2025
fa82b93
[Frontend][Docs] Transcription API streaming (#13301)
NickLucche Mar 6, 2025
0ddc991
[Doc] Update reasoning with stream example to use OpenAI library (#14…
liuyanyi Mar 6, 2025
4f27044
[Doc] Correct beam_search using in generative_models.md (#14363)
upayuryeva Mar 6, 2025
6bd1dd9
[Kernel] [V1] Improved performance for V1 Triton (ROCm) backend (#14…
tdoublep Mar 6, 2025
caac5c2
[Bugfix][Core] fix abort_seq_group and memory leak when n>1 (#14326)
courage17340 Mar 6, 2025
82551ad
[Core] Don't use cache during multi-modal profiling (#14336)
DarkLight1337 Mar 6, 2025
81b2f4a
[Doc] Fix date typo in README.md (#14366)
jitseklomp Mar 6, 2025
151b08e
[RLHF] use worker_extension_cls for compatibility with V0 and V1 (#14…
youkaichao Mar 6, 2025
bf0560b
Reinstate `best_of` for V0 (#14356)
hmellor Mar 6, 2025
ada1921
Adding cpu inference with VXE ISA for s390x architecture (#12613)
dilipgb Mar 6, 2025
e642ec9
Add authors to license header. (#14371)
tdoublep Mar 6, 2025
9f1710f
Fix mla prefill context performance (#13897)
ZhongYingMatrix Mar 6, 2025
cd57935
[V1] Do not detokenize if sampling param detokenize is False (#14224)
hj-mistral Mar 6, 2025
cc2f9b3
[Distributed] Add enable_expert_parallel arg (#14305)
tlrmchlsmth Mar 6, 2025
d929278
[CI/Build] Use uv python for docker rather than ppa:deadsnakes/ppa (#…
mgoin Mar 6, 2025
8ca2b21
[CI] Disable spawn when running V1 Test (#14345)
tdoublep Mar 6, 2025
99b0915
[Kernel] Add needs_fixed_stride_order tag to most GEMMs (#14306)
tlrmchlsmth Mar 6, 2025
958adce
[Bugfix] Fix use_direct_call condition in FusedMoE layer for (#14382)
tlrmchlsmth Mar 6, 2025
6b2ef5c
[Bug] Fix Attention when ignored in by quant_method (#14313)
mgoin Mar 6, 2025
6832707
[V1][Bugfix] Standardize quantized kv cache rejection for attention b…
mgoin Mar 6, 2025
0422298
[Docs] Add nsight guide to profiling docs (#14298)
mgoin Mar 6, 2025
f1dbffb
cleanup boolean logic
SageMoore Mar 6, 2025
0578e5a
[Hardware][TPU]Enable ragged paged attention kernel and resolve recom…
yaochengji Mar 6, 2025
ad60bbb
[Doc] Fix a typo (#14385)
dyli-google Mar 7, 2025
c34eeec
[Bugfix] Correctly call `cudaProfilerStop` in benchmarks script (#14183)
b8zhong Mar 7, 2025
dae6896
[Perf] Reduce MLA CPU overheads in V1 (#14384)
LucasWilkinson Mar 7, 2025
e174450
[FP8] Refactor apply_fp8_linear and apply_fp8_linear_generic into an …
ProExpertProg Mar 7, 2025
e5e03c2
[BugFix] Illegal Memory Access in the blockwise cutlass fp8 GEMMs (#1…
LucasWilkinson Mar 7, 2025
ddd1ef6
[Bugfix] Fix JambaForCausalLM LoRA (#14370)
jeejeelee Mar 7, 2025
63137cd
[Build] Add nightly wheel fallback when latest commit wheel unavailab…
Isotr0py Mar 7, 2025
8ca7a71
OpenVINO: added CPU-like conditions (#14338)
ilya-lavrenov Mar 7, 2025
c1588a2
[GH] Auto-apply multi-modality label to relevant PRs (#14402)
DarkLight1337 Mar 7, 2025
70da0c0
correct wrong markdown syntax (#14414)
vincent-pli Mar 7, 2025
12c29a8
[Bugfix] Further clean up LoRA test (#14422)
jeejeelee Mar 7, 2025
05fb671
[Bugfix] Clean up multi-modal processors (#14417)
DarkLight1337 Mar 7, 2025
cc10281
[Misc] Set default value of seed to None (#14274)
SmartManoj Mar 7, 2025
0ca3b8e
[BUGFIX] Skip tokenization support for throughput benchmark (#12712)
maleksan85 Mar 7, 2025
f7a6bd0
Fix missing `kv_caches` and `attn_metadata` in `OpenVINOCausalLM` (#1…
hmellor Mar 7, 2025
1e3598e
Use the optimized block sizes after tuning the kernel. (#14329)
vanbasten23 Mar 7, 2025
80e9afb
[V1][Core] Support for Structured Outputs (#12388)
aarnphm Mar 7, 2025
f7ebad2
[Doc] Update prefix_caching.md to match the example image (#14420)
York-RDWang Mar 7, 2025
58abe35
[Benchmarks] Make detokenization optional in benchmark scripts (#11697)
JArnoldAMD Mar 7, 2025
8f9664d
comments
SageMoore Mar 7, 2025
d0feea3
[Kernel] optimize performance of gptq marlin kernel when n is small (…
jinzhen-lin Mar 7, 2025
952a074
[Misc] Add Phi4-MM example (#14343)
jeejeelee Mar 7, 2025
c6359e8
[v1] torch.compile integration explanation (#14437)
youkaichao Mar 7, 2025
8ed5421
[V1] Eagerly remove finished requests from the batch (#14388)
njhill Mar 7, 2025
e1f0835
[V1][Metrics] Fix traceback with preemptions+LoRA (#14220)
markmc Mar 7, 2025
66e16a0
[Bugfix] Fix torch_xla which can't handle None seed introduced in #14…
yarongmu-google Mar 7, 2025
ef64044
[V1] Prompt logprobs + APC compatibility; prompt logprobs reqs cannot…
afeldman-nm Mar 8, 2025
3336814
[Bugfix][V1] Handle MLA in kv_cache_interface (#14462)
tlrmchlsmth Mar 8, 2025
ca7a2d5
Revert "[Perf] Reduce MLA CPU overheads in V1 (#14384)" (#14471)
tlrmchlsmth Mar 8, 2025
980385f
[Bugfix][Disaggregated] Add a check in send_kv_caches_and_hidden_stat…
hasB4K Mar 8, 2025
9f3bc0f
[MISC][V1] Register process killing handler only in the main thread (…
comaniac Mar 8, 2025
4aae667
[core] add `extra_args` to `SamplingParams` (#13300)
akeshet Mar 8, 2025
3b9c6c6
[CI/Build] refactor: set timezone of container to UTC (#12888)
bufferoverflow Mar 8, 2025
47512b3
Default to `generation_config` from model (#12622)
hmellor Mar 8, 2025
7b6fd6e
[Doc]add doc for Qwen models tool calling (#14478)
WangErXiao Mar 8, 2025
c908a07
[Doc] Added QwQ-32B to the supported models list in the reasoning out…
WangErXiao Mar 8, 2025
b8b0ccb
[Bugfix] Make the deviceprofiler include LoRA memory. (#14469)
jeejeelee Mar 8, 2025
be0b399
Add training doc signposting to TRL (#14439)
hmellor Mar 8, 2025
7caff01
[Build/BugFix] Fix hopper 12.8 build (#14354)
LucasWilkinson Mar 8, 2025
cfd0ae8
Add RLHF document (#14482)
hmellor Mar 8, 2025
33f227e
[CI/Build] Use a fixed seed to avoid flaky tests (#14480)
DarkLight1337 Mar 8, 2025
cb8bdfa
[V1] TPU - Add tensor parallel support via Ray (#13618)
alexm-redhat Mar 8, 2025
03fe18a
[VLM] Add TP support for Phi-4-MM (#14453)
Isotr0py Mar 8, 2025
0b7f06b
[Misc] add `use_tqdm_on_load` to reduce logs (#14407)
aarnphm Mar 8, 2025
8d5aa46
[V1][Core] Fix memory issue with logits & sampling (#13776)
ywang96 Mar 8, 2025
9085aab
[benchmarks] Add option to use unique jsonschema for each request (#1…
russellb Mar 8, 2025
e02883c
[Misc] Don't run ruff at all on 3rd party libs (#14493)
DarkLight1337 Mar 8, 2025
206e257
Move requirements into their own directory (#12547)
hmellor Mar 8, 2025
db84f5e
[Bugfix] DeepSeek Accuracy (#14476)
LucasWilkinson Mar 8, 2025
609ef61
[Bugfix] Fix profiling OOM and decouple encoder multimodal profiling …
Isotr0py Mar 8, 2025
0d5e73d
Update CODEOWNERS for structured output (#14496)
russellb Mar 8, 2025
9513290
[Misc] Upgrade to Python 3.9 typing for additional directories (#14492)
DarkLight1337 Mar 8, 2025
eb8b5eb
[V1] Support bad_words in sampler (#13376)
22quinn Mar 8, 2025
5f0b53c
Revert "[V1][Core] Fix memory issue with logits & sampling" (#14504)
robertgshaw2-redhat Mar 9, 2025
b0d5419
[Attention] Default to FlashMLA backend for MLA (#14451)
LucasWilkinson Mar 9, 2025
10f7552
[V1][TPU] Remove unnecessary padding for running on TPU. (#14467)
vanbasten23 Mar 9, 2025
6d7f037
[Feat] Support chunked prefill for LMCache connector (#14505)
YaoJiayi Mar 9, 2025
73ae0b4
[Bugfix] Fix tqdm progress bar when SamplingParams.n > 1 (#12428)
yanyc428 Mar 9, 2025
fb16eea
[Bugfix] Revert QKVCrossParallelLinear usage in Mllama to keep BNB qu…
Isotr0py Mar 9, 2025
212007b
[Hardware][TPU] Fix the recompiling issue in logits processor after w…
yaochengji Mar 9, 2025
a21076e
[Misc] Ensure out-of-tree quantization method recognize by cli args (…
liuyanyi Mar 9, 2025
dc74613
[Bugfix] Wrong requirements path - rocm (#14527)
martinhoyer Mar 10, 2025
1253b15
[Feature] Consolidate performance benchmark datasets (#14036)
JenZhao Mar 10, 2025
460f553
[Misc] Add log information for handle_process_request. (#14130)
chaunceyjiang Mar 10, 2025
60a98b2
[Docs] Mention `model_impl` arg when explaining Transformers fallback…
hmellor Mar 10, 2025
b0746fa
[Frontend] support image embeds (#13955)
chaunceyjiang Mar 10, 2025
89cdaa8
[Kernel] Add more dtype support for GGUF kernels (#14043)
SzymonOzog Mar 10, 2025
001a9c7
[Doc] Update PaliGemma note to a warning (#14565)
DarkLight1337 Mar 10, 2025
9ef3d37
Merge remote-tracking branch 'upstream/main'
gshtras Mar 10, 2025
ff60bf3
Merge remote-tracking branch 'nm/sage/amd-deepseek' into upstream_mer…
gshtras Mar 10, 2025
1095cff
Merge pull request #471 from ROCm/upstream_merge_25_03_10
gshtras Mar 10, 2025
34dbe31
V1 rocm support (#469)
maleksan85 Mar 11, 2025
0f2300e
nightly_fixed_aiter_integration_final_20250305 README update (#470)
Mcirino1 Mar 11, 2025
@@ -426,7 +426,7 @@ main() {

 pip install -U transformers

-pip install -r requirements-dev.txt
+pip install -r requirements/dev.txt
 which genai-perf

 # check storage
7 changes: 6 additions & 1 deletion .buildkite/run-amd-test.sh
@@ -93,7 +93,12 @@ if [[ $commands == *" kernels "* ]]; then
 --ignore=kernels/test_rand.py \
 --ignore=kernels/test_sampler.py \
 --ignore=kernels/test_cascade_flash_attn.py \
---ignore=kernels/test_mamba_mixer2.py"
+--ignore=kernels/test_mamba_mixer2.py \
+--ignore=kernels/test_aqlm.py \
+--ignore=kernels/test_machete_mm.py \
+--ignore=kernels/test_mha_attn.py \
+--ignore=kernels/test_block_fp8.py \
+--ignore=kernels/test_permute_cols.py"
 fi

 #ignore certain Entrypoints tests
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
@@ -35,7 +35,7 @@ function cpu_tests() {
 # Run basic model test
 docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 set -e
-pip install -r vllm/requirements-test.txt
+pip install -r vllm/requirements/test.txt
 pytest -v -s tests/models/decoder_only/language -m cpu_model
 pytest -v -s tests/models/embedding/language -m cpu_model
 pytest -v -s tests/models/encoder_decoder/language -m cpu_model
11 changes: 8 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -35,7 +35,7 @@ steps:
 fast_check: true
 no_gpu: True
 commands:
-- pip install -r requirements-docs.txt
+- pip install -r ../../requirements/docs.txt
 - SPHINXOPTS=\"-W\" make html
 # Check API reference (if it fails, you may have missing mock imports)
 - grep \"sig sig-object py\" build/html/api/inference_params.html
@@ -78,6 +78,7 @@ steps:
 - tests/basic_correctness/test_preemption
 - tests/basic_correctness/test_cumem.py
 commands:
+- export VLLM_WORKER_MULTIPROC_METHOD=spawn
 - pytest -v -s basic_correctness/test_cumem.py
 - pytest -v -s basic_correctness/test_basic_correctness.py
 - pytest -v -s basic_correctness/test_cpu_offload.py
@@ -115,6 +116,7 @@ steps:
 - tests/entrypoints/test_chat_utils
 - tests/entrypoints/offline_mode
 commands:
+- export VLLM_WORKER_MULTIPROC_METHOD=spawn
 - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
 - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
@@ -146,8 +148,10 @@ steps:
 - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
 # TODO: create a dedicated test section for multi-GPU example tests
 # when we have multiple distributed example tests
-- python3 ../examples/offline_inference/rlhf.py
-- RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py
+- pushd ../examples/offline_inference
+- python3 rlhf.py
+- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+- popd

 - label: Metrics, Tracing Test # 10min
 num_gpus: 2
@@ -204,6 +208,7 @@ steps:
 - VLLM_USE_V1=1 pytest -v -s v1/engine
 - VLLM_USE_V1=1 pytest -v -s v1/sample
 - VLLM_USE_V1=1 pytest -v -s v1/worker
+- VLLM_USE_V1=1 pytest -v -s v1/structured_output
 - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
 - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
 # TODO: accuracy does not match, whether setting
15 changes: 15 additions & 0 deletions .github/mergify.yml
@@ -36,6 +36,21 @@ pull_request_rules:
 add:
 - frontend

+- name: label-multi-modality
+description: Automatically apply multi-modality label
+conditions:
+- or:
+- files~=^vllm/multimodal/
+- files~=^tests/multimodal/
+- files~=^tests/models/multimodal/
+- files~=^tests/models/*/audio_language/
+- files~=^tests/models/*/vision_language/
+- files=tests/models/test_vision.py
+actions:
+label:
+add:
+- multi-modality
+
 - name: label-structured-output
 description: Automatically apply structured-output label
 conditions:
2 changes: 1 addition & 1 deletion .github/workflows/scripts/build.sh
@@ -5,7 +5,7 @@ python_executable=python3

 # Update paths
 # Install requirements
-$python_executable -m pip install -r requirements-rocm.txt
+$python_executable -m pip install -r requirements/rocm.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
2 changes: 1 addition & 1 deletion .gitignore
@@ -197,7 +197,7 @@ _build/
 hip_compat.h

 # Benchmark dataset
-benchmarks/*.json
+benchmarks/**/*.json

 # Linting
 actionlint
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -44,8 +44,8 @@ repos:
 rev: 0.6.2
 hooks:
 - id: pip-compile
-args: [requirements-test.in, -o, requirements-test.txt]
-files: ^requirements-test\.(in|txt)$
+args: [requirements/test.in, -o, requirements/test.txt]
+files: ^requirements/test\.(in|txt)$
 - repo: local
 hooks:
 - id: mypy-local
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -18,4 +18,4 @@ formats: []
 # Optionally declare the Python requirements required to build your docs
 python:
 install:
-- requirements: docs/requirements-docs.txt
+- requirements: requirements/docs.txt
80 changes: 54 additions & 26 deletions CMakeLists.txt
@@ -31,7 +31,7 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")

 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@@ -312,7 +312,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # Only build Marlin kernels if we are building for at least some compatible archs.
 # Keep building Marlin for 9.0 as there are some group sizes and shapes that
 # are not supported by Machete yet.
-cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 if (MARLIN_ARCHS)
 set(MARLIN_SRCS
 "csrc/quantization/fp8/fp8_marlin.cu"
@@ -334,7 +334,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

 # Only build AllSpark kernels if we are building for at least some compatible archs.
 cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
-if (ALLSPARK_ARCHS)
+if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
 set(ALLSPARK_SRCS
 "csrc/quantization/gptq_allspark/allspark_repack.cu"
 "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
@@ -345,46 +345,74 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
 else()
 message(STATUS "Not building AllSpark kernels as no compatible archs found"
-" in CUDA target architectures")
+" in CUDA target architectures, or CUDA not >= 12.0")
 endif()

+
+set(SCALED_MM_3X_ARCHS)
 # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
-cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+# CUDA 12.0 or later
+cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
 set(SRCS
-"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
 "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
 "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
 "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
 "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
-CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+CUDA_ARCHS "${SCALED_MM_ARCHS}")
 list(APPEND VLLM_EXT_SRC "${SRCS}")
-list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
-message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
+# Let scaled_mm_c2x know it doesn't need to build these arches
+list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
 else()
-if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
 "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
 "later if you intend on running FP8 quantized models on "
 "Hopper.")
 else()
-message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
 "in CUDA target architectures")
 endif()
 endif()
+endif()

-# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
-# build any 3x kernels
-set(SCALED_MM_3X_ARCHS)
+# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
+# CUDA 12.8 or later
+cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+set(SRCS
+"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+)
+set_gencode_flags_for_srcs(
+SRCS "${SRCS}"
+CUDA_ARCHS "${SCALED_MM_ARCHS}")
+list(APPEND VLLM_EXT_SRC "${SRCS}")
+list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
+# Let scaled_mm_c2x know it doesn't need to build these arches
+list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
+else()
+if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
+"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
+"later if you intend on running FP8 quantized models on "
+"Blackwell.")
+else()
+message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
+"in CUDA target architectures")
+endif()
+endif()
 endif()

 #
 # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
 # kernels for the remaining archs that are not already built for 3x.
 cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-"7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+"7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 # subtract out the archs that are already built for 3x
 list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
 if (SCALED_MM_2X_ARCHS)
@@ -409,17 +437,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # 2:4 Sparse Kernels

 # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-# require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+# require CUDA 12.2 or later (and only work on Hopper and Blackwell).
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
 set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
-CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+CUDA_ARCHS "${SCALED_MM_ARCHS}")
 list(APPEND VLLM_EXT_SRC "${SRCS}")
 list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
-message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
 else()
-if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
 message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
 "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
 "if you intend on running FP8 sparse quantized models on Hopper.")
@@ -434,8 +462,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
 set(SRCS
 "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
-)
+"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
 CUDA_ARCHS "${FP4_ARCHS}")
@@ -534,6 +561,7 @@ define_gpu_extension_target(
 COMPILE_FLAGS ${VLLM_GPU_FLAGS}
 ARCHITECTURES ${VLLM_GPU_ARCHES}
 INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
 USE_SABI 3
 WITH_SOABI)

@@ -557,7 +585,7 @@ set_gencode_flags_for_srcs(
 CUDA_ARCHS "${CUDA_ARCHS}")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
-cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 if (MARLIN_MOE_ARCHS)
 set(MARLIN_MOE_SRC
 "csrc/moe/marlin_kernels/marlin_moe_kernel.h"