diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index fdb8ec5393b36..b2e910e1ba8a7 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on GSM for transformers. # # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 +# pip install lm-eval==0.4.4 usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index de841d959a4e4..4d32b49a4fac3 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install lm-eval==0.4.3 +# pip install lm-eval==0.4.4 usage() { echo`` diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md new file mode 100644 index 0000000000000..1e33793842bf8 --- /dev/null +++ b/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -0,0 +1,28 @@ + +## Description + +This file contains the downloading link for benchmarking results. + +- [benchmarking pipeline](artifact://nightly-pipeline.yaml) +- [benchmarking results](artifact://results.zip) +- [benchmarking code](artifact://nightly-benchmarks.zip) + +Please download the visualization scripts in the post + + +## Results reproduction + +- Find the docker we use in `benchmarking pipeline` +- Deploy the docker, and inside the docker: + - Download `nightly-benchmarks.zip`. + - In the same folder, run the following code +``` +export HF_TOKEN= +apt update +apt install -y git +unzip nightly-benchmarks.zip +VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +``` + +And the results will be inside `./benchmarks/results`. + diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index c3d3cbf473968..7dec7a0fe0b4e 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -1,45 +1,39 @@ # Nightly benchmark -The main goal of this benchmarking is two-fold: -- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload. -- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md](). - - -## Docker images - -We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images: -- vllm/vllm-openai:v0.5.0.post1 -- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 -- openmmlab/lmdeploy:v0.5.0 -- ghcr.io/huggingface/text-generation-inference:2.1 - - - - -## Hardware - -One AWS node with 8x NVIDIA A100 GPUs. - - -## Workload description - -We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - -- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed). -- Output length: the corresponding output length of these 500 prompts. -- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. 
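The reproduction steps in `nightly-annotation.md` above are listed only as bullets; a minimal end-to-end sketch of the same flow, assuming the `nightly-benchmarks.zip` artifact is already in the working directory and `hf_xxx` stands in for a real Hugging Face token, looks roughly like this:

```bash
#!/bin/bash
# Hedged reproduction sketch based on nightly-annotation.md above.
# Assumptions: nightly-benchmarks.zip has been downloaded from the Buildkite
# artifacts, and hf_xxx is a placeholder for a real Hugging Face token.
set -euo pipefail

export HF_TOKEN=hf_xxx                      # placeholder token
apt update && apt install -y git unzip      # installing unzip is an extra assumption

unzip nightly-benchmarks.zip                # unpacks .buildkite/ and benchmarks/
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh

# per-test JSON results and .commands files end up here
ls ./benchmarks/results
```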
-- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). -- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). - - - -## Plots - -In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed. - -Benchmarking results - -## Results - -{nightly_results_benchmarking_table} +This benchmark aims to: +- Provide performance clarity: show which engine (vllm, tensorrt-llm, lmdeploy or SGLang) leads in performance for which workload. +- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following the reproduction instructions. + +Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. + +Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) + + +## Setup + +- Docker images: + - vLLM: `vllm/vllm-openai:v0.6.2` + - SGLang: `lmsysorg/sglang:v0.3.2-cu121` + - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` + - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` + - *NOTE: we use r24.07 because the current implementation only works for this version. We are going to bump this up.* + - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. +- Hardware + - 8x Nvidia A100 GPUs +- Workload: + - Dataset + - ShareGPT dataset + - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens) + - Decode-heavy dataset (on average 462 input tokens, 256 output tokens) + - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. + - Models: llama-3 8B, llama-3 70B. + - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). + - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. + - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed. + - Evaluation metrics: Throughput (the higher the better), TTFT (time to the first token, the lower the better), ITL (inter-token latency, the lower the better). + +# Known issues + +- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105). +- TGI does not support `ignore-eos` flag. \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 6e399bb936fbc..199517e8b067c 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec common_container_settings: &common_container_settings command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh resources: limits: nvidia.com/gpu: 8 @@ -37,7 +37,10 @@ common_container_settings: &common_container_settings steps: - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
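Every benchmark step in the pipeline below now runs the same entrypoint, `run-nightly-benchmarks.sh`; only the container image changes per engine. Outside Buildkite, one step amounts to roughly the following (the `docker run` flags are an assumption; the real pipeline schedules a Kubernetes pod using the `podSpec` in this YAML):

```bash
#!/bin/bash
# Local approximation of a single pipeline step. Swap IMAGE for any of the
# engine images listed in the Setup section above.
set -euo pipefail

IMAGE=lmsysorg/sglang:v0.3.2-cu121

docker run --gpus all --shm-size=16g \
  -e HF_TOKEN="$HF_TOKEN" \
  -e VLLM_SOURCE_CODE_LOC=/workspace/vllm \
  -v "$PWD":/workspace/vllm \
  "$IMAGE" \
  bash /workspace/vllm/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```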
- - label: "A100 trt benchmark" + + + + - label: "A100 vllm step 10" priority: 100 agents: queue: A100 @@ -46,7 +49,21 @@ steps: podSpec: <<: *common_pod_spec containers: - - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + - image: vllm/vllm-openai:v0.6.2 + <<: *common_container_settings + + + + - label: "A100 sglang benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: lmsysorg/sglang:v0.3.2-cu121 <<: *common_container_settings - label: "A100 lmdeploy benchmark" @@ -58,11 +75,13 @@ steps: podSpec: <<: *common_pod_spec containers: - - image: openmmlab/lmdeploy:v0.5.0 + - image: openmmlab/lmdeploy:v0.6.1-cu12 <<: *common_container_settings - - - label: "A100 vllm benchmark" + + + + - label: "A100 trt llama-8B" priority: 100 agents: queue: A100 @@ -71,10 +90,25 @@ steps: podSpec: <<: *common_pod_spec containers: - - image: vllm/vllm-openai:latest + - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 <<: *common_container_settings + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: TEST_SELECTOR + value: "llama8B" - - label: "A100 tgi benchmark" + + - label: "A100 trt llama-70B" priority: 100 agents: queue: A100 @@ -83,12 +117,54 @@ steps: podSpec: <<: *common_pod_spec containers: - - image: ghcr.io/huggingface/text-generation-inference:2.1 + - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 <<: *common_container_settings + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: TEST_SELECTOR + value: "llama70B" + + + # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image + # - label: "A100 trt benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # <<: *common_pod_spec + # containers: + # - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 + # <<: *common_container_settings + + + # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. + # - label: "A100 tgi benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # <<: *common_pod_spec + # containers: + # - image: ghcr.io/huggingface/text-generation-inference:2.2.0 + # <<: *common_container_settings - wait - - label: "Plot" + - label: "Collect the results" priority: 100 agents: queue: A100 @@ -117,4 +193,4 @@ steps: name: hf-token-secret key: token - - wait \ No newline at end of file + - block: ":rocket: check the results!" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh deleted file mode 100644 index 627a3e6971578..0000000000000 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -x - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." 
- exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -check_hf_token() { - # check if HF_TOKEN is available and valid - if [[ -z "$HF_TOKEN" ]]; then - echo "Error: HF_TOKEN is not set." - exit 1 - elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then - echo "Error: HF_TOKEN does not start with 'hf_'." - exit 1 - else - echo "HF_TOKEN is set and valid." - fi -} - -main() { - - check_gpus - check_hf_token - - df -h - - (which wget && which curl) || (apt-get update && apt-get install -y wget curl) - (which jq) || (apt-get update && apt-get -y install jq) - - cd $VLLM_SOURCE_CODE_LOC/benchmarks - wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - - # run lmdeploy - if which lmdeploy >/dev/null; then - echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh" - bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh - exit 0 - fi - - # run tgi - if [ -e /tgi-entrypoint.sh ]; then - echo "tgi is available, redirect to run-tgi-nightly.sh" - bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh - exit 0 - fi - - # run trt - if which trtllm-build >/dev/null; then - echo "trtllm is available, redirect to run-trt-nightly.sh" - bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh - exit 0 - fi - - # run vllm - if [ -e /vllm-workspace ]; then - echo "vllm is available, redirect to run-vllm-nightly.sh" - bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh - exit 0 - fi - -} - -main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py new file mode 100644 index 0000000000000..6059588fe7277 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -0,0 +1,95 @@ +import argparse +import json +from pathlib import Path + +import numpy as np +import pandas as pd +from tabulate import tabulate + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description= + 'Parse command line arguments for summary-nightly-results script.') + parser.add_argument('--results-folder', + type=str, + required=True, + help='The folder where the results are stored.') + parser.add_argument('--description', + type=str, + required=True, + help='Description of the results.') + + args = parser.parse_args() + return args + + +def get_perf(df, method, model, metric): + + means = [] + + for qps in [2, 4, 8, 16, "inf"]: + target = df['Test name'].str.contains(model) + target = target & df['Engine'].str.contains(method) + target = target & df['Test name'].str.contains("qps_" + str(qps)) + filtered_df = df[target] + + if filtered_df.empty: + means.append(0.) 
+ else: + means.append(filtered_df[metric].values[0]) + + return np.array(means) + + +def get_perf_w_std(df, method, model, metric): + + if metric in ["TTFT", "ITL"]: + mean = get_perf(df, method, model, "Mean " + metric + " (ms)") + mean = mean.tolist() + std = get_perf(df, method, model, "Std " + metric + " (ms)") + if std.mean() == 0: + std = None + success = get_perf(df, method, model, "Successful req.") + if std is not None: + std = std / np.sqrt(success) + std = std.tolist() + + else: + assert metric == "Tput" + mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( + df, method, model, "Output Tput (tok/s)") + mean = mean.tolist() + std = None + + return mean, std + + +def main(args): + results_folder = Path(args.results_folder) + + results = [] + + # collect results + for test_file in results_folder.glob("*_nightly_results.json"): + with open(test_file, "r") as f: + results = results + json.loads(f.read()) + + # generate markdown table + df = pd.DataFrame.from_dict(results) + + md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) + + with open(args.description, "r") as f: + description = f.read() + + description = description.format( + nightly_results_benchmarking_table=md_table) + + with open("nightly_results.md", "w") as f: + f.write(description) + + +if __name__ == '__main__': + args = parse_arguments() + main(args) diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh new file mode 100644 index 0000000000000..e9d7d6a8d760a --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh @@ -0,0 +1,241 @@ +#!/bin/bash + +# Currently FP8 benchmark is NOT enabled. + +set -x +server_params=$1 +common_params=$2 + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +launch_trt_server() { + + model_path=$(echo "$common_params" | jq -r '.model') + model_name="${model_path#*/}" + model_type=$(echo "$server_params" | jq -r '.model_type') + model_dtype=$(echo "$server_params" | jq -r '.model_dtype') + model_tp_size=$(echo "$common_params" | jq -r '.tp') + max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') + max_input_len=$(echo "$server_params" | jq -r '.max_input_len') + max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len') + max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens') + trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') + + # create model caching directory + cd ~ + rm -rf models + mkdir -p models + cd models + models_dir=$(pwd) + trt_model_path=${models_dir}/${model_name}-trt-ckpt + trt_engine_path=${models_dir}/${model_name}-trt-engine + + # clone tensorrt backend + cd / + rm -rf tensorrtllm_backend + git clone https://github.com/triton-inference-server/tensorrtllm_backend.git + git lfs install + cd tensorrtllm_backend + git checkout $trt_llm_version + tensorrtllm_backend_dir=$(pwd) + git submodule update --init --recursive + + # build trtllm engine + cd /tensorrtllm_backend + cd ./tensorrt_llm/examples/${model_type} + python3 convert_checkpoint.py \ + --model_dir ${model_path} \ + --dtype ${model_dtype} \ + 
--tp_size ${model_tp_size} \ + --output_dir ${trt_model_path} + trtllm-build \ + --checkpoint_dir ${trt_model_path} \ + --use_fused_mlp \ + --reduce_fusion disable \ + --workers 8 \ + --gpt_attention_plugin ${model_dtype} \ + --gemm_plugin ${model_dtype} \ + --tp_size ${model_tp_size} \ + --max_batch_size ${max_batch_size} \ + --max_input_len ${max_input_len} \ + --max_seq_len ${max_seq_len} \ + --max_num_tokens ${max_num_tokens} \ + --output_dir ${trt_engine_path} + + # handle triton protobuf files and launch triton server + cd /tensorrtllm_backend + mkdir triton_model_repo + cp -r all_models/inflight_batcher_llm/* triton_model_repo/ + cd triton_model_repo + rm -rf ./tensorrt_llm/1/* + cp -r ${trt_engine_path}/* ./tensorrt_llm/1 + python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false + python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5 + python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false + python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size + python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1 + cd /tensorrtllm_backend + python3 scripts/launch_triton_server.py \ + --world_size=${model_tp_size} \ + --model_repo=/tensorrtllm_backend/triton_model_repo & + +} + +launch_tgi_server() { + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + --quantize fp8 \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." 
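# json2args (defined at the top of this file) is what expands each engine's
# *_server_parameters / *_client_parameters object from nightly-tests.json into
# CLI flags; underscores become dashes. A quick standalone check of the transform:
json2args_demo() {
  echo '{ "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }' | jq -r '
    to_entries |
    map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
    join(" ")
  '
  # prints: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
}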
+ server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + $server_args" + fi + + echo "Server command: $server_command" + eval "$server_command" & + +} + +launch_lmdeploy_server() { + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + server_args=$(json2args "$server_params") + + server_command="lmdeploy serve api_server $model \ + --tp $tp \ + --server-port $port \ + $server_args" + + # run the server + echo "Server command: $server_command" + bash -c "$server_command" & +} + +launch_sglang_server() { + + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." + model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') + server_command="python3 \ + -m sglang.launch_server \ + --tp $tp \ + --model-path $model \ + --port $port \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="python3 \ + -m sglang.launch_server \ + --tp $tp \ + --model-path $model \ + --port $port \ + $server_args" + fi + + # run the server + echo "Server command: $server_command" + eval "$server_command" & +} + +launch_vllm_server() { + + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." + model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." 
+ server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + fi + + # run the server + echo "Server command: $server_command" + eval "$server_command" & +} + +main() { + + if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then + launch_trt_server + fi + + if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then + launch_tgi_server + fi + + if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then + launch_lmdeploy_server + fi + + if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then + launch_sglang_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then + launch_vllm_server + fi +} + +main diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh deleted file mode 100644 index f8262653a6628..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash - - -server_params=$1 -common_params=$2 - - - -model_path=$(echo "$common_params" | jq -r '.model') -model_name="${model_path#*/}" -model_type=$(echo "$server_params" | jq -r '.model_type') -model_dtype=$(echo "$server_params" | jq -r '.model_dtype') -model_tp_size=$(echo "$common_params" | jq -r '.tp') -max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') -max_input_len=$(echo "$server_params" | jq -r '.max_input_len') -max_output_len=$(echo "$server_params" | jq -r '.max_output_len') -trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') - -cd ~ -rm -rf models -mkdir -p models -cd models -models_dir=$(pwd) -trt_model_path=${models_dir}/${model_name}-trt-ckpt -trt_engine_path=${models_dir}/${model_name}-trt-engine - -cd ~ -rm -rf tensorrt-demo -git clone https://github.com/neuralmagic/tensorrt-demo.git -cd tensorrt-demo -tensorrt_demo_dir=$(pwd) - -# make sure the parameter inside tensorrt_demo is consistent to envvar -sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt -sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt - - -cd / -rm -rf tensorrtllm_backend -git clone https://github.com/triton-inference-server/tensorrtllm_backend.git -git lfs install -cd tensorrtllm_backend -git checkout $trt_llm_version -tensorrtllm_backend_dir=$(pwd) -git submodule update --init --recursive -cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/ - -cd /tensorrtllm_backend -cd ./tensorrt_llm/examples/${model_type} - - -if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then - - echo "Key 'fp8' exists in common params. 
Use quantize.py instead of convert_checkpoint.py" - echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md" - python ../quantization/quantize.py \ - --model_dir ${model_path} \ - --dtype ${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} \ - --qformat fp8 \ - --kv_cache_dtype fp8 \ - --calib_size 2 - -else - - echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py" - python3 convert_checkpoint.py \ - --model_dir ${model_path} \ - --dtype ${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} - -fi - - - -trtllm-build \ ---checkpoint_dir=${trt_model_path} \ ---gpt_attention_plugin=${model_dtype} \ ---gemm_plugin=${model_dtype} \ ---remove_input_padding=enable \ ---paged_kv_cache=enable \ ---tp_size=${model_tp_size} \ ---max_batch_size=${max_batch_size} \ ---max_input_len=${max_input_len} \ ---max_output_len=${max_output_len} \ ---max_num_tokens=${max_output_len} \ ---opt_num_tokens=${max_output_len} \ ---output_dir=${trt_engine_path} - -cd /tensorrtllm_backend/triton_model_repo -rm -rf ./tensorrt_llm/1/* -cp -r ${trt_engine_path}/* ./tensorrt_llm/1 -cd /tensorrtllm_backend -python3 scripts/launch_triton_server.py \ ---world_size=${model_tp_size} \ ---model_repo=/tensorrtllm_backend/triton_model_repo & \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 1168912c6e229..c6a1bbdeb7d48 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -8,6 +8,7 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) + (which zip) || (apt-get install -y zip) if [ ! -f /workspace/buildkite-agent ]; then echo "buildkite-agent binary not found. Skip plotting the results." 
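Since figure generation has moved out of the CI pipeline (the plotting calls below are commented out), the markdown summary can be rebuilt locally from the two uploaded archives. A sketch, assuming `template.md` is a small markdown file containing the `{nightly_results_benchmarking_table}` placeholder that `generate-nightly-markdown.py` fills in:

```bash
#!/bin/bash
# Rebuild the nightly summary table from the uploaded Buildkite artifacts.
set -euo pipefail

unzip -o results.zip                # unpacks results/*_nightly_results.json
unzip -o nightly-benchmarks.zip     # unpacks .buildkite/ and benchmarks/
python3 -m pip install numpy pandas tabulate

python3 .buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
  --results-folder results/ \
  --description template.md         # template with {nightly_results_benchmarking_table}

cat nightly_results.md              # the rendered summary
```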
@@ -24,17 +25,54 @@ main() { ls ls results/ - # generate figures - python3 -m pip install tabulate pandas matplotlib - python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - --description $description \ - --results-folder results/ + # upload benchmark results + zip -r results.zip results/ + /workspace/buildkite-agent artifact upload "results.zip" + + # upload benchmarking scripts + cd $VLLM_SOURCE_CODE_LOC/ + zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ + /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" + + cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + # upload benchmarking pipeline + /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" + + cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md + + + + # The figures should be genereated by a separate process outside the CI/CD pipeline + + # # generate figures + # python3 -m pip install tabulate pandas matplotlib + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ + # --description $description \ + # --results-folder results/ + + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sharegpt + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sonnet_2048_128 + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sonnet_128_2048 - # upload results and figures - /workspace/buildkite-agent artifact upload "nightly_results.png" - /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml - /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json - /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md + # # upload results and figures + # /workspace/buildkite-agent artifact upload "nightly_results*.png" + # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml + # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json + # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md } main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py deleted file mode 100644 index e5cfcc64a9b2a..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ /dev/null @@ -1,135 +0,0 @@ -import argparse -import json -import math -from pathlib import Path - -import matplotlib.pyplot as plt -import pandas as pd -from tabulate import tabulate - - -def parse_arguments(): - parser = argparse.ArgumentParser( - description= - 'Parse command line arguments for summary-nightly-results script.') - parser.add_argument('--results-folder', - type=str, - required=True, - help='The folder where the results are stored.') - 
parser.add_argument('--description', - type=str, - required=True, - help='Description of the results.') - - args = parser.parse_args() - return args - - -def main(args): - bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00'] - results_folder = Path(args.results_folder) - - results = [] - - # collect results - for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file, "r") as f: - results = results + json.loads(f.read()) - - # generate markdown table - df = pd.DataFrame.from_dict(results) - - md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) - - with open(args.description, "r") as f: - description = f.read() - - description = description.format( - nightly_results_benchmarking_table=md_table) - - with open("nightly_results.md", "w") as f: - f.write(description) - - plt.rcParams.update({'font.size': 20}) - - # plot results - fig, axes = plt.subplots(3, 3, figsize=(16, 14)) - fig.subplots_adjust(hspace=1) - methods = ["vllm", "trt", "lmdeploy", "tgi"] - for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): - for j, metric in enumerate(["TTFT", "ITL"]): - means, stds = [], [] - for method in methods: - target = df['Test name'].str.contains(model) - target = target & df['Engine'].str.contains(method) - filtered_df = df[target] - - if filtered_df.empty: - means.append(0.) - stds.append(0.) - else: - means.append(filtered_df[f"Mean {metric} (ms)"].values[0]) - std = filtered_df[f"Std {metric} (ms)"].values[0] - success = filtered_df["Successful req."].values[0] - stds.append(std / math.sqrt(success)) - - print(model, metric) - print(means, stds) - - ax = axes[i, j + 1] - - bars = ax.bar( - ["vllm", "trt", "lmdeploy", "tgi"], - means, - yerr=stds, - capsize=10, - ) - for idx, bar in enumerate(bars): - bar.set_color(bar_colors[idx]) - ax.set_ylim(bottom=0) - - ax.set_ylabel(f"{metric} (ms)") - ax.set_title(f"{model} {metric}") - ax.grid(axis='y') - - metric = "Tput" - j = 0 - if True: - tputs = [] - for method in methods: - target = df['Test name'].str.contains(model) - target = target & df['Engine'].str.contains(method) - filtered_df = df[target] - - if filtered_df.empty: - tputs.append(0.) - else: - input_tput = filtered_df["Input Tput (tok/s)"].values[0] - output_tput = filtered_df["Output Tput (tok/s)"].values[0] - tputs.append(input_tput + output_tput) - - print(model, metric) - print(tputs) - - ax = axes[i, j] - - bars = ax.bar( - ["vllm", "trt", "lmdeploy", "tgi"], - tputs, - ) - for idx, bar in enumerate(bars): - bar.set_color(bar_colors[idx]) - - ax.set_ylim(bottom=0) - - ax.set_ylabel("Tput (token/s)") - ax.set_title(f"{model} {metric}") - ax.grid(axis='y') - - fig.tight_layout() - fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400) - - -if __name__ == '__main__': - args = parse_arguments() - main(args) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh deleted file mode 100644 index d6f112aaa42fd..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ /dev/null @@ -1,218 +0,0 @@ -#!/bin/bash - -set -o pipefail - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." 
- exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -kill_gpu_processes() { - pkill lmdeploy || true - # waiting for GPU processes to be fully killed - sleep 10 - # Print the GPU memory usage - # so that we know if all GPU processes are killed. - gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. - echo "GPU 0 Memory Usage: $gpu_memory_usage MB" -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - timeout 1200 bash -c ' - until curl -s localhost:8000/v1/completions > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -run_serving_tests() { - # run serving tests using `benchmark_serving.py` - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # append lmdeploy to the test name - test_name=lmdeploy_$test_name - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - - - - # get client and server arguments - server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') - client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') - server_args=$(json2args "$server_params") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - # prepare tokenizer - rm -rf /tokenizer_cache - mkdir /tokenizer_cache - python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$model" \ - --cachedir /tokenizer_cache - - server_command="lmdeploy serve api_server $model \ - --tp $tp \ - --server-port $port \ - $server_args" - - # run the server - echo "Running test case $test_name" - echo "Server command: $server_command" - bash -c "$server_command" & - - # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then - echo "" - echo "lmdeploy server is up and running." 
- else - echo "" - echo "lmdeploy failed to start within the timeout period." - break - fi - - # get model name - model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - client_command="python3 benchmark_serving.py \ - --backend lmdeploy \ - --tokenizer /tokenizer_cache \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --model \"$model_name\" \ - $client_args" - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "lmdeploy" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - # clean up - kill_gpu_processes - rm -rf /root/.cache/huggingface/* - done -} - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - - -main() { - - check_gpus - # enter vllm directory - cd $VLLM_SOURCE_CODE_LOC/benchmarks - - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ - - python -m pip install transformers==4.41.2 - - export CURRENT_LLM_SERVING_ENGINE=lmdeploy - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - python -m pip install tabulate pandas - python $BENCHMARK_ROOT/scripts/summary-nightly-results.py - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh new file mode 100644 index 0000000000000..dd8c15e0700eb --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -0,0 +1,357 @@ +#!/bin/bash + +set -o pipefail +set -x + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." 
+ fi +} + + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + + +get_current_llm_serving_engine() { + + if which lmdeploy >/dev/null; then + echo "Container: lmdeploy" + export CURRENT_LLM_SERVING_ENGINE=lmdeploy + return + fi + + if [ -e /tgi-entrypoint.sh ]; then + echo "Container: tgi" + export CURRENT_LLM_SERVING_ENGINE=tgi + return + fi + + if which trtllm-build >/dev/null; then + echo "Container: tensorrt-llm" + export CURRENT_LLM_SERVING_ENGINE=trt + return + fi + + if [ -e /sgl-workspace ]; then + echo "Container: sglang" + export CURRENT_LLM_SERVING_ENGINE=sglang + return + fi + + if [ -e /vllm-workspace ]; then + echo "Container: vllm" + # move to a completely irrelevant directory, to avoid import vllm from current folder + export CURRENT_LLM_SERVING_ENGINE=vllm + + return + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +kill_gpu_processes() { + pkill -f python + pkill -f python3 + pkill -f tritonserver + pkill -f pt_main_thread + pkill -f text-generation + pkill -f lmdeploy + + while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + sleep 1 + done +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl -s localhost:8000/v1/completions > /dev/null; do + sleep 1 + done' && return 0 || return 1 +} + +ensure_installed() { + # Ensure that the given command is installed by apt-get + local cmd=$1 + if ! which $cmd >/dev/null; then + apt-get update && apt-get install -y $cmd + fi +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
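    # Note: wait_for_server (defined above) always polls localhost:8000, which
    # works because the bundled test configs also use port 8000. A port-aware
    # variant, should the configs ever change, could look like:
    #   wait_for_server_on_port() {
    #     local port=$1
    #     timeout 1200 bash -c "
    #       until curl -s localhost:${port}/v1/completions > /dev/null; do
    #         sleep 1
    #       done" && return 0 || return 1
    #   }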
+ continue + fi + + # prepend the current serving engine to the test name + test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + reuse_server=$(echo "$common_params" | jq -r '.reuse_server') + + # get client and server arguments + server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") + client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + if [[ $reuse_server == "true" ]]; then + echo "Reuse previous server for test case $test_name" + else + kill_gpu_processes + bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \ + "$server_params" "$common_params" + fi + + wait_for_server + + if [ $? -eq 0 ]; then + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." + else + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." + break + fi + + # prepare tokenizer + # this is required for lmdeploy. + cd $VLLM_SOURCE_CODE_LOC/benchmarks + rm -rf /tokenizer_cache + mkdir /tokenizer_cache + python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ + --model "$model" \ + --cachedir /tokenizer_cache + cd $VLLM_SOURCE_CODE_LOC/benchmarks + + + # change model name for lmdeploy (it will not follow standard hf name) + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then + model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + backend=$CURRENT_LLM_SERVING_ENGINE + + if [[ $backend = "trt" ]]; then + backend="tensorrt-llm" + fi + + if [[ "$backend" == *"vllm"* ]]; then + backend="vllm" + fi + + if [[ "$dataset_name" = "sharegpt" ]]; then + + client_command="python3 benchmark_serving.py \ + --backend $backend \ + --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --ignore-eos \ + $client_args" + + elif [[ "$dataset_name" = "sonnet" ]]; then + + sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len') + sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') + sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') + + client_command="python3 benchmark_serving.py \ + --backend $backend \ + --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path 
$dataset_path \ + --num-prompts $num_prompts \ + --sonnet-input-len $sonnet_input_len \ + --sonnet-output-len $sonnet_output_len \ + --sonnet-prefix-len $sonnet_prefix_len \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --ignore-eos \ + $client_args" + + else + + echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." + exit 1 + + fi + + + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + server_command="None" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + done + + kill_gpu_processes +} + + +prepare_dataset() { + + # download sharegpt dataset + cd $VLLM_SOURCE_CODE_LOC/benchmarks + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + # duplicate sonnet by 4x, to allow benchmarking with input length 2048 + cd $VLLM_SOURCE_CODE_LOC/benchmarks + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + +} + +main() { + + # check if the environment variable is successfully injected from yaml + + check_gpus + check_hf_token + get_current_llm_serving_engine + + pip install -U transformers + + # check storage + df -h + + ensure_installed wget + ensure_installed curl + ensure_installed jq + + prepare_dataset + + cd $VLLM_SOURCE_CODE_LOC/benchmarks + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + + # run the test + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + + # upload benchmark results to buildkite + python3 -m pip install tabulate pandas + python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite + +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh deleted file mode 100644 index fed03654f8b77..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ /dev/null @@ -1,216 +0,0 @@ -#!/bin/bash - -set -o pipefail - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -kill_gpu_processes() { - pkill text-generation || true - # waiting for GPU processes to be fully killed - sleep 10 - # Print the GPU memory usage - # so that we know if all GPU processes are killed. - gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. 
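# For a one-off run outside the harness, the sonnet branch of run_serving_tests
# above reduces to a single benchmark_serving.py call against an already-running
# server. All concrete values below (model, lengths, QPS) are illustrative, not
# taken from nightly-tests.json:
python3 benchmark_serving.py \
  --backend vllm \
  --model meta-llama/Meta-Llama-3-8B \
  --dataset-name sonnet \
  --dataset-path sonnet_4x.txt \
  --num-prompts 500 \
  --sonnet-input-len 512 \
  --sonnet-output-len 256 \
  --sonnet-prefix-len 50 \
  --port 8000 \
  --request-rate 4 \
  --ignore-eos \
  --save-result \
  --result-dir results/ \
  --result-filename sonnet_decode_heavy_demo.json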
- echo "GPU 0 Memory Usage: $gpu_memory_usage MB" -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -wait_for_server() { - timeout 1200 bash -c ' - until curl -s localhost:8000/generate_stream > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -run_serving_tests() { - # run serving tests using `benchmark_serving.py` - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # append tgi to the test name - test_name=tgi_$test_name - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - - # get client and server arguments - server_params=$(echo "$params" | jq -r '.tgi_server_parameters') - client_params=$(echo "$params" | jq -r '.tgi_client_parameters') - server_args=$(json2args "$server_params") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then - echo "Key 'fp8' exists in common params." - server_command="/tgi-entrypoint.sh \ - --model-id $model \ - --num-shard $tp \ - --port $port \ - --quantize fp8 \ - $server_args" - else - echo "Key 'fp8' does not exist in common params." - server_command="/tgi-entrypoint.sh \ - --model-id $model \ - --num-shard $tp \ - --port $port \ - $server_args" - fi - - - - - # run the server - echo "Running test case $test_name" - echo "Server command: $server_command" - eval "$server_command" & - - # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then - echo "" - echo "tgi server is up and running." - else - echo "" - echo "tgi failed to start within the timeout period." 
- break - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - client_command="python3 benchmark_serving.py \ - --backend tgi \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - $client_args" - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "tgi" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - # clean up - kill_gpu_processes - rm -rf /root/.cache/huggingface/* - done -} - - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - -main() { - - check_gpus - # enter vllm directory - cd $VLLM_SOURCE_CODE_LOC/benchmarks - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ - - export CURRENT_LLM_SERVING_ENGINE=tgi - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - python -m pip install tabulate pandas - python $BENCHMARK_ROOT/scripts/summary-nightly-results.py - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh deleted file mode 100644 index 4a82b9ec64d71..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ /dev/null @@ -1,214 +0,0 @@ -#!/bin/bash - -set -o pipefail - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -kill_gpu_processes() { - pkill tritonserver || true - # waiting for GPU processes to be fully killed - sleep 20 - # Print the GPU memory usage - # so that we know if all GPU processes are killed. - gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. 
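# Every result written by these runners is paired with a <test>_qps_<qps>.commands
# file, which is plain JSON. To replay a benchmark by hand, pull the client
# command back out with jq (the file name below is illustrative):
#   jq -r '.client_command' results/vllm_llama8B_tp1_sharegpt_qps_4.commands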
- echo "GPU 0 Memory Usage: $gpu_memory_usage MB" -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -wait_for_server() { - timeout 1200 bash -c ' - until curl -s localhost:8000/generate_stream > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -run_serving_tests() { - # run serving tests using `benchmark_serving.py` - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # append trt to the test name - test_name=trt_$test_name - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - - # get client and server arguments - server_params=$(echo "$params" | jq -r '.trt_server_parameters') - client_params=$(echo "$params" | jq -r '.trt_client_parameters') - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - - - cd $VLLM_SOURCE_CODE_LOC/benchmarks - - - echo "Running test case $test_name" - bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params" - - # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then - echo "" - echo "trt server is up and running." - else - echo "" - echo "trt failed to start within the timeout period." 
- break - fi - - # prepare tokenizer - cd $VLLM_SOURCE_CODE_LOC/benchmarks - rm -rf /tokenizer_cache - mkdir /tokenizer_cache - python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$model" \ - --cachedir /tokenizer_cache - cd $VLLM_SOURCE_CODE_LOC/benchmarks - - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - client_command="python3 benchmark_serving.py \ - --backend tensorrt-llm \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - $client_args" - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - server_command="" - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "trt" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - # clean up - kill_gpu_processes - rm -rf /root/.cache/huggingface/* - done -} - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - - -main() { - - check_gpus - - - # enter vllm directory - cd $VLLM_SOURCE_CODE_LOC/benchmarks - - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ - - # update transformers package, to make sure mixtral tokenizer is available - python -m pip install transformers -U - - export CURRENT_LLM_SERVING_ENGINE=trt - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - python -m pip install tabulate pandas - python $BENCHMARK_ROOT/scripts/summary-nightly-results.py - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh deleted file mode 100644 index 663045b8a9122..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ /dev/null @@ -1,221 +0,0 @@ -#!/bin/bash - -set -o pipefail - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -kill_gpu_processes() { - # kill all processes on GPU. - pkill pt_main_thread - sleep 10 - - # remove vllm config file - rm -rf ~/.config/vllm - - # Print the GPU memory usage - # so that we know if all GPU processes are killed. 
- gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. - echo "GPU 0 Memory Usage: $gpu_memory_usage MB" -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - timeout 1200 bash -c ' - until curl -s localhost:8000/v1/completions > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -run_serving_tests() { - # run serving tests using `benchmark_serving.py` - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # append vllm to the test name - test_name=vllm_$test_name - - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - - # get client and server arguments - server_params=$(echo "$params" | jq -r '.vllm_server_parameters') - client_params=$(echo "$params" | jq -r '.vllm_client_parameters') - server_args=$(json2args "$server_params") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then - echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." - model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') - server_command="python3 \ - -m vllm.entrypoints.openai.api_server \ - -tp $tp \ - --model $model \ - --port $port \ - $server_args" - else - echo "Key 'fp8' does not exist in common params." - server_command="python3 \ - -m vllm.entrypoints.openai.api_server \ - -tp $tp \ - --model $model \ - --port $port \ - $server_args" - fi - - # run the server - echo "Running test case $test_name" - echo "Server command: $server_command" - eval "$server_command" & - - # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then - echo "" - echo "vllm server is up and running." - else - echo "" - echo "vllm failed to start within the timeout period." 
- break - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - client_command="python3 benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - $client_args" - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "vllm" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - # clean up - kill_gpu_processes - rm -rf /root/.cache/huggingface/* - done -} - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - -main() { - - check_gpus - # enter vllm directory - cd $VLLM_SOURCE_CODE_LOC/benchmarks - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ - - export CURRENT_LLM_SERVING_ENGINE=vllm - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - - python3 -m pip install tabulate pandas - python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 782d1ef9aab98..4e4d4cd4ca3c6 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -17,10 +17,17 @@ "request_throughput": "Tput (req/s)", "mean_ttft_ms": "Mean TTFT (ms)", "std_ttft_ms": "Std TTFT (ms)", + "median_ttft_ms": "Median TTFT (ms)", "mean_itl_ms": "Mean ITL (ms)", "std_itl_ms": "Std ITL (ms)", - "input_throughput": "Input Tput (tok/s)", + "median_itl_ms": "Median ITL (ms)", + "mean_tpot_ms": "Mean TPOT (ms)", + "std_tpot_ms": "Std TPOT (ms)", + "median_tpot_ms": "Median TPOT (ms)", + "total_token_throughput": "Total Token Tput (tok/s)", "output_throughput": "Output Tput (tok/s)", + "total_input_tokens": "Total input tokens", + "total_output_tokens": "Total output tokens", "engine": "Engine", } diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index f250833c62710..fda1a7a3ec53c 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,16 +1,18 @@ [ { - "test_name": "llama8B_tp1", - "qps_list": [4], + "test_name": "llama8B_tp1_sharegpt", + "qps_list": [4,8,16,32,"inf"], "common_parameters": { - 
"model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3-8B-Instruct", "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, - "port": 8000 + "port": 8000, + "reuse_server": false }, "lmdeploy_server_parameters": { + "dtype": "bfloat16" }, "lmdeploy_client_parameters": { }, @@ -21,34 +23,158 @@ }, "trt_server_parameters": { "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, + "model_dtype": "bfloat16", + "max_batch_size": 2048, "max_input_len": 4096, - "max_output_len": 4096, - "trt_llm_version": "r24.04" + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" }, "trt_client_parameters": { "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "" + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": 
"bfloat16" }, "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { } }, { - "test_name": "llama70B_tp4", - "qps_list": [2], + "test_name": "llama70B_tp4_sharegpt", + "qps_list": [4,8,16,32,"inf"], "common_parameters": { "model": "meta-llama/Meta-Llama-3-70B-Instruct", "tp": 4, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, - "port": 8000 + "port": 8000, + "reuse_server": false }, "lmdeploy_server_parameters": { + "dtype": "bfloat16" }, "lmdeploy_client_parameters": { }, @@ -59,34 +185,50 @@ }, "trt_server_parameters": { "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, + "model_dtype": "bfloat16", + "max_batch_size": 2048, "max_input_len": 4096, - "max_output_len": 4096, - "trt_llm_version": "r24.04" + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" }, "trt_client_parameters": { "endpoint": "/v2/models/ensemble/generate_stream" - }, + }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "" + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" }, "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { } }, { - "test_name": "mixtral8x7B_tp2", - "qps_list": [2], + "test_name": "llama70B_tp4_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], "common_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tp": 2, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", "num_prompts": 500, - "port": 8000 + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true }, "lmdeploy_server_parameters": { + "dtype": "bfloat16" }, "lmdeploy_client_parameters": { }, @@ -97,20 +239,85 @@ }, "trt_server_parameters": { "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, + "model_dtype": "bfloat16", + "max_batch_size": 2048, "max_input_len": 4096, - "max_output_len": 4096, - "trt_llm_version": "r24.04" + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" }, "trt_client_parameters": { "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + 
"trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "" + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" }, "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { } } ] \ No newline at end of file diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 49ae838cf0690..fd60f5b6afeca 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -18,7 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" \ + --ignore=tests/models/test_embedding.py \ + --ignore=tests/models/test_oot_registration.py \ + --ignore=tests/models/test_registry.py \ + --ignore=tests/models/test_jamba.py \ + --ignore=tests/models/test_mamba.py \ + --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 73ce82c5857ab..c2818c38965ea 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,16 +23,24 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator + pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language \ --ignore=tests/models/test_fp8.py \ --ignore=tests/models/decoder_only/language/test_jamba.py \ + --ignore=tests/models/decoder_only/language/test_mamba.py \ + --ignore=tests/models/decoder_only/language/test_granitemoe.py \ --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # Run compressed-tensor test +# docker exec cpu-test bash -c " +# pytest -s -v \ +# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ +# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token" + +# Run AWQ test docker exec cpu-test bash -c " pytest -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token" + tests/quantization/test_ipex_quant.py" # online inference docker exec cpu-test bash -c " diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 427dc14513d45..4c2fe41c739b1 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -77,8 +77,8 @@ steps: - vllm/ - 
tests/basic_correctness/test_chunked_prefill commands: - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py - label: Core Test # 10min mirror_hardwares: [amd] @@ -88,7 +88,11 @@ steps: - vllm/distributed - tests/core commands: - - pytest -v -s core + - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py + - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py + - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py + - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py + - pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py - label: Entrypoints Test # 40min working_dir: "/vllm-workspace/tests" @@ -98,7 +102,6 @@ steps: - vllm/ commands: - pip install -e ./plugins/vllm_add_dummy_model - - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process @@ -118,7 +121,9 @@ steps: - vllm/core/ - tests/distributed - tests/spec_decode/e2e/test_integration_dist_tp4 + - tests/compile commands: + - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py @@ -186,7 +191,8 @@ steps: - vllm/ - tests/prefix_caching commands: - - pytest -v -s prefix_caching + - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py + - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py - label: Samplers Test # 36min source_file_dependencies: @@ -210,7 +216,8 @@ steps: - tests/spec_decode commands: - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py + - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py - label: LoRA Test %N # 15min each mirror_hardwares: [amd] @@ -226,14 +233,16 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph_smoke.py + - pytest -v -s compile/test_basic_correctness.py -- label: "PyTorch Fullgraph Test" # 18min - source_file_dependencies: - - vllm/ - - tests/compile - commands: - - pytest -v -s compile/test_full_graph.py +# TODO: re-write in comparison tests, and fix symbolic shape +# for quantization ops. 
+# - label: "PyTorch Fullgraph Test" # 18min +# source_file_dependencies: +# - vllm/ +# - tests/compile +# commands: +# - pytest -v -s compile/test_full_graph.py - label: Kernels Test %N # 1h each mirror_hardwares: [amd] @@ -270,7 +279,7 @@ steps: - csrc/ - vllm/model_executor/layers/quantization - tests/quantization - command: pytest -v -s quantization + command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" @@ -278,7 +287,6 @@ steps: - csrc/ - vllm/model_executor/layers/quantization commands: - - pip install lm-eval - export VLLM_WORKER_MULTIPROC_METHOD=spawn - bash ./run-tests.sh -c configs/models-small.txt -t 1 @@ -339,10 +347,11 @@ steps: - pytest -v -s models/encoder_decoder/language - pytest -v -s models/encoder_decoder/vision_language +# This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test - #mirror_hardwares: [amd] optional: true commands: + - echo 'Testing custom models...' # PR authors can temporarily add commands below to test individual models # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* @@ -390,10 +399,10 @@ steps: - tests/distributed/ - vllm/compilation commands: - - pytest -v -s ./compile/test_full_graph_multi_gpu.py + - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus # Avoid importing model tests that cause CUDA reinitialization error - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus @@ -492,6 +501,5 @@ steps: - csrc/ - vllm/model_executor/layers/quantization commands: - - pip install lm-eval - export VLLM_WORKER_MULTIPROC_METHOD=spawn - bash ./run-tests.sh -c configs/models-large.txt -t 4 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e15f129719f8f..cd721971d01d6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,19 +1,30 @@ # See https://help.github.com/articles/about-codeowners/ # for more info about CODEOWNERS file +# This lists cover the "core" components of vLLM that require careful review +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +CMakeLists.txt @tlrmchlsmth @WoosukKwon + +# Test ownership /tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo /tests/test_inputs.py @DarkLight1337 @ywang96 
-/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo +/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 -/tests/prefix_caching @comaniac @KuntaiDu +/tests/prefix_caching @comaniac @KuntaiDu /tests/spec_decode @njhill @LiuXiaoxuanPKU -/tests/kernels @tlrmchlsmth @WoosukKwon +/tests/kernels @tlrmchlsmth @WoosukKwon /tests/quantization @mgoin @robertgshaw2-neuralmagic -/.buildkite/lm-eval-harness @mgoin @simon-mo +/.buildkite/lm-eval-harness @mgoin @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/multi_step @alexm-neuralmagic @SolitaryThinker @comaniac +/tests/multi_step @alexm-neuralmagic @comaniac /tests/weight_loading @mgoin @youkaichao /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000000..6fddca0d6e4b9 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +version: 2 +updates: + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml new file mode 100644 index 0000000000000..2a0e3239f58da --- /dev/null +++ b/.github/workflows/actionlint.yml @@ -0,0 +1,37 @@ +name: Lint GitHub Actions workflows +on: + push: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + pull_request: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + with: + fetch-depth: 0 + + - name: "Run actionlint" + run: | + tools/actionlint.sh -color diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml index cd53b764c7200..2e7c7f7f087af 100644 --- a/.github/workflows/add_label_automerge.yml +++ b/.github/workflows/add_label_automerge.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Add label - uses: actions/github-script@v5 + uses: actions/github-script@v7 with: script: | github.rest.issues.addLabels({ diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 77fb678c8acc8..341fc0665a402 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/matchers/actionlint.json b/.github/workflows/matchers/actionlint.json new file mode 100644 index 0000000000000..4613e1617bfe2 --- /dev/null +++ b/.github/workflows/matchers/actionlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "actionlint", + "pattern": [ + { + "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: 
(?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 6ebe512c5dbf6..053684bebb6f2 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -11,15 +11,15 @@ on: - habana_main jobs: - ruff: + mypy: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -32,15 +32,4 @@ jobs: pip install types-setuptools - name: Mypy run: | - mypy - mypy tests --follow-imports skip - mypy vllm/attention --follow-imports skip - mypy vllm/distributed --follow-imports skip - mypy vllm/engine --follow-imports skip - mypy vllm/executor --follow-imports skip - mypy vllm/lora --follow-imports skip - mypy vllm/model_executor --follow-imports skip - mypy vllm/prompt_adapter --follow-imports skip - mypy vllm/spec_decode --follow-imports skip - mypy vllm/worker --follow-imports skip - + tools/mypy.sh diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index aeeaf6efab043..96549b3f99181 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,16 +21,16 @@ jobs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Extract branch info shell: bash run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV" - name: Create Release id: create_release - uses: "actions/github-script@v6" + uses: "actions/github-script@v7" env: RELEASE_TAG: ${{ env.release_tag }} with: @@ -54,7 +54,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup ccache uses: hendrikmuhs/ccache-action@v1.2 @@ -68,7 +68,7 @@ jobs: bash -x .github/workflows/scripts/env.sh - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -86,10 +86,10 @@ jobs: CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size run: | bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} - wheel_name=$(ls dist/*whl | xargs -n 1 basename) + wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename) asset_name=${wheel_name//"linux"/"manylinux1"} - echo "wheel_name=${wheel_name}" >> $GITHUB_ENV - echo "asset_name=${asset_name}" >> $GITHUB_ENV + echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV" + echo "asset_name=${asset_name}" >> "$GITHUB_ENV" - name: Upload Release Asset uses: actions/upload-release-asset@v1 diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index c9ef4a36745b0..98c5570f61aff 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/yapf.yml 
b/.github/workflows/yapf.yml index b1002578610d4..68eb06dea47a3 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.gitignore b/.gitignore index 5367ece834890..1ea6e3419db2a 100644 --- a/.gitignore +++ b/.gitignore @@ -199,3 +199,6 @@ hip_compat.h # Benchmark dataset benchmarks/*.json + +# Linting +actionlint diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b24c4abc650e..3a424ad7b110f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,27 +144,32 @@ else() endif() -# -# For cuda we want to be able to control which architectures we compile for on -# a per-file basis in order to cut down on compile time. So here we extract -# the set of architectures we want to compile for and remove the from the -# CMAKE_CUDA_FLAGS so that they are not applied globally. -# if(VLLM_GPU_LANG STREQUAL "CUDA") + # + # For cuda we want to be able to control which architectures we compile for on + # a per-file basis in order to cut down on compile time. So here we extract + # the set of architectures we want to compile for and remove the from the + # CMAKE_CUDA_FLAGS so that they are not applied globally. + # clear_cuda_arches(CUDA_ARCH_FLAGS) extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}") message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") + # Filter the target architectures by the supported supported archs + # since for some files we will build for all CUDA_ARCHS. + cuda_archs_loose_intersection(CUDA_ARCHS + "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") + message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") +else() + # + # For other GPU targets override the GPU architectures detected by cmake/torch + # and filter them by the supported versions for the current language. + # The final set of arches is stored in `VLLM_GPU_ARCHES`. + # + override_gpu_arches(VLLM_GPU_ARCHES + ${VLLM_GPU_LANG} + "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") endif() -# -# Override the GPU architectures detected by cmake/torch and filter them by -# the supported versions for the current language. -# The final set of arches is stored in `VLLM_GPU_ARCHES`. -# -override_gpu_arches(VLLM_GPU_ARCHES - ${VLLM_GPU_LANG} - "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") - # # Query torch for additional GPU compilation flags for the given # `VLLM_GPU_LANG`. @@ -433,6 +438,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu" "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h" "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu" + "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h" + "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu" "csrc/moe/marlin_moe_ops.cu") set_gencode_flags_for_srcs( diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 81a8db2b268b0..5f79356bd32f7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,30 +1,23 @@ # Contributing to vLLM -Thank you for your interest in contributing to vLLM! -Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. -There are several ways you can contribute to the project: +Thank you for your interest in contributing to vLLM! 
Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: - Identify and report any issues or bugs. -- Request or add a new model. +- Request or add support for a new model. - Suggest or implement new features. +- Improve documentation or contribute a how-to guide. -However, remember that contributions aren't just about code. -We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. +We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions. -Finally, one of the most impactful ways to support us is by raising awareness about vLLM. -Talk about it in your blog posts, highlighting how it's driving your incredible projects. -Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. +Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! -## Setup for development +## Developing -### Build from source +Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. -```bash -pip install -e . # This may take several minutes. -``` -### Testing +## Testing ```bash pip install -r requirements-dev.txt @@ -36,15 +29,16 @@ mypy # Unit tests pytest tests/ ``` -**Note:** Currently, the repository does not pass the mypy tests. +**Note:** Currently, the repository does not pass the ``mypy`` tests. +## Contribution Guidelines -## Contributing Guidelines +### Issues -### Issue Reporting +If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. -If not, please file a new issue, providing as much relevant information as possible. +> [!IMPORTANT] +> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability). ### Pull Requests & Code Reviews @@ -53,4 +47,4 @@ Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE ### Thank You Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. -Your contributions make vLLM a great tool for everyone! +All of your contributions help make vLLM a great tool and community for everyone! 
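For readers following the nightly-benchmark changes earlier in this diff: each entry in `.buildkite/nightly-benchmarks/tests/nightly-tests.json` pairs a `common_parameters` block with per-engine `*_server_parameters` / `*_client_parameters` blocks, and the runner scripts flatten those JSON objects into CLI flags with the `json2args` helper (underscored keys become dashed flags; empty-string values become bare switches). Below is a minimal sketch of that translation, reusing the helper verbatim from the removed per-engine scripts and a few values from the `llama8B_tp1_sharegpt` entry. The consolidated `run-nightly-benchmarks.sh` is not shown in this diff, so the surrounding server command template is an assumption based on the removed `run-vllm-nightly.sh`.

```bash
# Sketch only: how a vllm_server_parameters block becomes server flags.
# json2args is copied from the removed runner scripts; the command template
# follows the removed run-vllm-nightly.sh and may not match the consolidated
# run-nightly-benchmarks.sh exactly. Requires jq.
json2args() {
  local json_string=$1
  echo "$json_string" | jq -r '
    to_entries |
    map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
    join(" ")
  '
}

server_params='{"disable_log_stats": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16"}'
server_args=$(json2args "$server_params")

server_command="python3 -m vllm.entrypoints.openai.api_server -tp 1 --model meta-llama/Meta-Llama-3-8B-Instruct --port 8000 $server_args"
echo "$server_command"
# -> python3 -m vllm.entrypoints.openai.api_server -tp 1 \
#    --model meta-llama/Meta-Llama-3-8B-Instruct --port 8000 \
#    --disable-log-stats  --gpu-memory-utilization 0.9 \
#    --num-scheduler-steps 10 --max-num-seqs 512 --dtype bfloat16
```

The same helper is applied to the `*_client_parameters` blocks before they are appended to the benchmark client command, which is why boolean options in the JSON are expressed as keys with empty-string values.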
diff --git a/Dockerfile b/Dockerfile index 872b1bc47054a..8405e0a88a106 100644 --- a/Dockerfile +++ b/Dockerfile @@ -144,7 +144,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace @@ -182,6 +182,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/pip \ . /etc/environment && \ python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl +COPY examples examples #################### vLLM installation IMAGE #################### diff --git a/Dockerfile.cpu b/Dockerfile.cpu index a9d97a3e0bde4..b9134d4ae41cb 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -22,11 +22,12 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl +RUN pip install intel_extension_for_pytorch==2.4.0 WORKDIR /workspace -ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ pip install --upgrade pip && \ diff --git a/README.md b/README.md index adaa3dc26783f..7768cbfa06749 100644 --- a/README.md +++ b/README.md @@ -10,22 +10,13 @@ Easy, fast, and cheap LLM serving for everyone

-| Intel® Gaudi® README | Documentation | Blog | Paper | Discord | Twitter/X | - +| Intel® Gaudi® README | Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |

---- - -**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco** - -We are excited to announce our special vLLM event in collaboration with AMD and Anyscale. -Join us to learn more about recent advancements of vLLM on MI300X. -Register [here](https://lu.ma/db5ld9n5) and be a part of the event! - ---- - *Latest News* 🔥 +- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! +- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). @@ -52,7 +43,7 @@ vLLM is fast with: - Speculative decoding - Chunked prefill -**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). +**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. vLLM is flexible and easy to use with: @@ -138,4 +129,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs * For technical questions and feature requests, please use Github issues or discussions. * For discussing with fellow users, please use Discord. * For security disclosures, please use Github's security advisory feature. -* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. \ No newline at end of file +* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. 
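On the client side of the same benchmark, the removed per-engine runner scripts build one `benchmark_serving.py` invocation per value in `qps_list`, saving one JSON result file per run; `summary-nightly-results.py` then renders those files using the column mapping extended earlier in this diff. A rough sketch of a single client run with the ShareGPT settings from `nightly-tests.json` follows; the flags mirror the removed scripts' `client_command`, and the exact set used by the consolidated runner is an assumption.

```bash
# Sketch only: one benchmark client run at a fixed request rate, mirroring the
# client_command assembled by the removed run-*-nightly.sh scripts. Assumes a
# serving engine is already listening on port 8000 and the ShareGPT file has
# been downloaded into the benchmarks/ working directory.
RESULTS_FOLDER=results/
mkdir -p "$RESULTS_FOLDER"

python3 benchmark_serving.py \
  --backend vllm \
  --model meta-llama/Meta-Llama-3-8B-Instruct \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 500 \
  --port 8000 \
  --save-result \
  --result-dir "$RESULTS_FOLDER" \
  --result-filename vllm_llama8B_tp1_sharegpt_qps_4.json \
  --request-rate 4
```

The `"inf"` entries in `qps_list` are passed through as `--request-rate inf`, i.e. requests are issued without pacing between them.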
diff --git a/SECURITY.md b/SECURITY.md index d9a392158472d..ad3f1f16ab560 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,11 +2,10 @@ ## Reporting a Vulnerability -If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. -We will investigate all legitimate reports and do our best to quickly fix the problem. +If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. -Please report security issues using https://github.com/vllm-project/vllm/security/advisories/new +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). --- -Please see PyTorch Security for more information how to securely interact with models: https://github.com/pytorch/pytorch/blob/main/SECURITY.md -This document mostly references the recommendation from PyTorch, thank you! + +Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 3def4a6d67acf..4813fde27f0bc 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -23,9 +23,9 @@ class RequestFuncInput: output_len: int model: str best_of: int = 1 - use_beam_search: bool = False logprobs: Optional[int] = None multi_modal_content: Optional[dict] = None + ignore_eos: bool = False @dataclass @@ -48,13 +48,13 @@ async def async_request_tgi( assert api_url.endswith("generate_stream") async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, "do_sample": True, "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. + # TGI does not accept ignore_eos flag. } payload = { "inputs": request_func_input.prompt, @@ -119,7 +119,6 @@ async def async_request_trt_llm( assert api_url.endswith("generate_stream") async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -129,6 +128,8 @@ async def async_request_trt_llm( "max_tokens": request_func_input.output_len, "stream": True, } + if request_func_input.ignore_eos: + payload["min_length"] = request_func_input.output_len output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -183,7 +184,6 @@ async def async_request_deepspeed_mii( ) -> RequestFuncOutput: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 - assert not request_func_input.use_beam_search payload = { "prompt": request_func_input.prompt, @@ -231,7 +231,6 @@ async def async_request_openai_completions( ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search payload = { "model": request_func_input.model, "prompt": request_func_input.prompt, @@ -240,6 +239,7 @@ async def async_request_openai_completions( "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, + "ignore_eos": request_func_input.ignore_eos, } headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" @@ -312,7 +312,6 @@ async def async_request_openai_chat_completions( ), "OpenAI Chat Completions API URL must end with 'chat/completions'." async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) @@ -327,6 +326,7 @@ async def async_request_openai_chat_completions( "temperature": 0.0, "max_tokens": request_func_input.output_len, "stream": True, + "ignore_eos": request_func_input.ignore_eos, } headers = { "Content-Type": "application/json", @@ -430,4 +430,5 @@ def get_tokenizer( "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, + "sglang": async_request_openai_completions, } diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index eadf994cacd34..79a48b2a1a845 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -51,9 +51,8 @@ def main(args: argparse.Namespace): sampling_params = SamplingParams( n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=args.use_beam_search, ignore_eos=True, max_tokens=args.output_len, ) @@ -222,7 +221,9 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument("--enable-prefix-caching", action='store_true', help="Enable automatic prefix caching") - parser.add_argument('--use-v2-block-manager', action='store_true') + parser.add_argument('--use-v2-block-manager', + action='store_true', + default=EngineArgs.use_v2_block_manager) parser.add_argument( "--ray-workers-use-nsight", action='store_true', diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 3e90fdfb78e10..f14092d347343 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -33,6 +33,7 @@ from transformers import PreTrainedTokenizerBase from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs from vllm.utils import FlexibleArgumentParser try: @@ -113,7 +114,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]], def main(args): tokenizer = get_tokenizer(args.model, trust_remote_code=True) input_length_range = tuple(map(int, args.input_length_range.split(':'))) - + random.seed(args.seed) if args.dataset_path is not None: print(f"Start to sample {args.num_prompts} prompts" "from {args.dataset_path}") @@ -177,6 +178,7 @@ def main(args): help='enable prefix caching') parser.add_argument('--use-v2-block-manager', action='store_true', + default=EngineArgs.use_v2_block_manager, help='Use BlockSpaceMangerV2') parser.add_argument('--num-prompts', type=int, @@ -194,5 +196,9 @@ def main(args): default='128:256', help='Range of input lengths for sampling prompts,' 'specified as "min:max" (e.g., "128:256").') + parser.add_argument("--seed", + type=int, + default=0, 
+ help='Random seed for reproducibility') args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 0ba29fabca59b..8843e3a927a01 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -68,7 +68,6 @@ def run_vllm( tensor_parallel_size: int, seed: int, n: int, - use_beam_search: bool, trust_remote_code: bool, dtype: str, max_model_len: Optional[int], @@ -114,9 +113,8 @@ def run_vllm( sampling_params.append( SamplingParams( n=n, - temperature=0.0 if use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=use_beam_search, ignore_eos=True, max_tokens=output_len, )) @@ -144,15 +142,16 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm( - requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, - args.trust_remote_code, args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, args.enable_chunked_prefill, - args.max_num_batched_tokens, args.gpu_memory_utilization, - args.download_dir) + elapsed_time = run_vllm(requests, args.model, args.tokenizer, + args.quantization, args.tensor_parallel_size, + args.seed, args.n, args.trust_remote_code, + args.dtype, args.max_model_len, + args.enforce_eager, args.kv_cache_dtype, + args.quantization_param_path, args.device, + args.enable_prefix_caching, + args.enable_chunked_prefill, + args.max_num_batched_tokens, + args.gpu_memory_utilization, args.download_dir) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -203,7 +202,6 @@ def main(args: argparse.Namespace): type=int, default=1, help="Number of generated sequences per prompt.") - parser.add_argument("--use-beam-search", action="store_true") parser.add_argument("--num-prompts", type=int, default=200, diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 56c37b241a359..04999518b7138 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -176,9 +176,9 @@ def sample_sonnet_requests( # Sample the rest of lines per request. 
sampled_requests: List[Tuple[str, int, int]] = [] for _ in range(num_requests): - sampled_lines = "".join( - prefix_lines + - random.sample(poem_lines, num_input_lines - num_prefix_lines)) + num_lines_needed = num_input_lines - num_prefix_lines + sampled_lines = "".join(prefix_lines + + random.choices(poem_lines, k=num_lines_needed)) prompt = f"{base_prompt}{sampled_lines}" message = [ @@ -391,12 +391,12 @@ async def benchmark( input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], best_of: int, - use_beam_search: bool, request_rate: float, disable_tqdm: bool, profile: bool, selected_percentile_metrics: List[str], selected_percentiles: List[str], + ignore_eos: bool, ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -418,8 +418,8 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, ) test_output = await request_func(request_func_input=test_input) if not test_output.success: @@ -439,7 +439,6 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, multi_modal_content=test_mm_content, ) profile_output = await request_func(request_func_input=profile_input) @@ -462,7 +461,6 @@ async def benchmark( output_len=output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, multi_modal_content=mm_content, ) tasks.append( @@ -481,7 +479,6 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -539,7 +536,7 @@ def process_one_metric( # E.g., "Time to First Token" metric_header: str, ): - # This function print and add statistics of the specified + # This function prints and adds statistics of the specified # metric. if metric_attribute_name not in selected_percentile_metrics: return @@ -677,7 +674,6 @@ def main(args: argparse.Namespace): input_requests=input_requests, logprobs=args.logprobs, best_of=args.best_of, - use_beam_search=args.use_beam_search, request_rate=args.request_rate, disable_tqdm=args.disable_tqdm, profile=args.profile, @@ -685,6 +681,7 @@ def main(args: argparse.Namespace): selected_percentiles=[ float(p) for p in args.metric_percentiles.split(",") ], + ignore_eos=args.ignore_eos, )) # Save config and results to json @@ -698,7 +695,6 @@ def main(args: argparse.Namespace): result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id result_json["best_of"] = args.best_of - result_json["use_beam_search"] = args.use_beam_search result_json["num_prompts"] = args.num_prompts # Metadata @@ -863,6 +859,11 @@ def main(args: argparse.Namespace): "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" " format.", ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." 
+ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") parser.add_argument( "--percentile-metrics", type=str, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 68b401d5bbbb7..b7bc2a6402375 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -15,6 +15,7 @@ from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators @@ -72,7 +73,6 @@ def run_vllm( tensor_parallel_size: int, seed: int, n: int, - use_beam_search: bool, trust_remote_code: bool, dtype: str, max_model_len: Optional[int], @@ -90,7 +90,6 @@ def run_vllm( download_dir: Optional[str] = None, load_format: str = EngineArgs.load_format, disable_async_output_proc: bool = False, - use_new_beam_search_impl: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -126,29 +125,32 @@ def run_vllm( sampling_params.append( SamplingParams( n=n, - temperature=0.0 if use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=use_beam_search, ignore_eos=True, max_tokens=output_len, )) - if not use_new_beam_search_impl: + use_beam_search = False + + if not use_beam_search: start = time.perf_counter() llm.generate(prompts, sampling_params, use_tqdm=True) end = time.perf_counter() else: - assert use_beam_search prompts = [prompt for prompt, _, _ in requests] # output_len should be the same for all requests. output_len = requests[0][2] for prompt, input_len, _output_len in requests: assert _output_len == output_len start = time.perf_counter() - llm.beam_search(prompts, - beam_width=n, - max_tokens=output_len, - ignore_eos=True) + llm.beam_search( + prompts, + BeamSearchParams( + beam_width=n, + max_tokens=output_len, + ignore_eos=True, + )) end = time.perf_counter() return end - start @@ -161,7 +163,6 @@ async def run_vllm_async( tensor_parallel_size: int, seed: int, n: int, - use_beam_search: bool, trust_remote_code: bool, dtype: str, max_model_len: Optional[int], @@ -220,9 +221,8 @@ async def run_vllm_async( sampling_params.append( SamplingParams( n=n, - temperature=0.0 if use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=use_beam_search, ignore_eos=True, max_tokens=output_len, )) @@ -244,11 +244,9 @@ def run_hf( model: str, tokenizer: PreTrainedTokenizerBase, n: int, - use_beam_search: bool, max_batch_size: int, trust_remote_code: bool, ) -> float: - assert not use_beam_search llm = AutoModelForCausalLM.from_pretrained( model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) if llm.config.model_type == "llama": @@ -280,7 +278,7 @@ def run_hf( padding=True).input_ids llm_outputs = llm.generate( input_ids=input_ids.cuda(), - do_sample=not use_beam_search, + do_sample=True, num_return_sequences=n, temperature=1.0, top_p=1.0, @@ -336,7 +334,7 @@ def main(args: argparse.Namespace): if args.backend == "vllm": run_args = [ requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, + args.tensor_parallel_size, args.seed, args.n, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, args.kv_cache_dtype, args.quantization_param_path, args.device, @@ -351,12 +349,11 @@ def main(args: argparse.Namespace): run_args.append(args.disable_frontend_multiprocessing) elapsed_time = 
uvloop.run(run_vllm_async(*run_args)) else: - elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl) + elapsed_time = run_vllm(*run_args) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.use_beam_search, args.hf_max_batch_size, - args.trust_remote_code) + args.hf_max_batch_size, args.trust_remote_code) elif args.backend == "mii": elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len) @@ -410,8 +407,6 @@ def main(args: argparse.Namespace): type=int, default=1, help="Number of generated sequences per prompt.") - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument("--use-new-beam-search-impl", action="store_true") parser.add_argument("--num-prompts", type=int, default=1000, @@ -478,6 +473,7 @@ def main(args: argparse.Namespace): help="Maximum number of forward steps per scheduler call.") parser.add_argument("--use-v2-block-manager", action='store_true', + default=EngineArgs.use_v2_block_manager, help="Enable block manager v2.") parser.add_argument( "--enable-prefix-caching", @@ -566,8 +562,6 @@ def main(args: argparse.Namespace): raise ValueError("dtype must be auto for MII backend.") if args.n != 1: raise ValueError("n must be 1 for MII backend.") - if args.use_beam_search: - raise ValueError("Beam search is not supported for MII backend.") if args.quantization is not None: raise ValueError("Quantization is only for vLLM backend.") if args.hf_max_batch_size is not None: diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 3c474bd58d04e..bc5f24d3f591c 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -84,7 +84,12 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") -list(APPEND LIBS dnnl numa) +list(APPEND LIBS numa) + +# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture. +if (AVX2_FOUND OR AVX512_FOUND) + list(APPEND LIBS dnnl) +endif() # # _C extension diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index 0bd3017226c94..a217401b3d7c2 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -38,6 +38,7 @@ using FragA = Vec; using FragB = Vec; using FragC = Vec; using FragS = Vec; // quantization scales +using FragZP = Vec; // Predicated asynchronous global->shared copy; used for inputs A where we apply // predication to handle batchsizes that are not multiples of 16. @@ -175,6 +176,46 @@ __device__ inline FragB dequant(int q) { return frag_b; } +template <> +__device__ inline FragB dequant(int q) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. 
+ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + + const int SUB = 0x64006400; + const int MUL = 0x2c002c00; + const int ADD = 0xd400d400; + FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +template <> +__device__ inline FragB dequant(int q) { + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + + uint32_t lo = prmt(q); + uint32_t hi = prmt(q); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400; + + FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + frag_b[1] = __hsub2(*reinterpret_cast(&hi), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + return frag_b; +} + // Multiply dequantized values by the corresponding quantization scale; used // only for grouped quantization. __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { @@ -183,11 +224,10 @@ __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { frag_b[1] = __hmul2(frag_b[1], s); } -// Given 2 floats multiply by 2 scales (halves) -__device__ inline void scale_float(float* c, FragS& s) { - __half* s_ptr = reinterpret_cast<__half*>(&s); - c[0] = __fmul_rn(c[0], __half2float(s_ptr[0])); - c[1] = __fmul_rn(c[1], __half2float(s_ptr[1])); +__device__ inline void sub_zp(FragB& frag_b, half2& frag_zp, int i) { + half2 zp = __half2half2(reinterpret_cast<__half*>(&frag_zp)[i]); + frag_b[0] = __hsub2(frag_b[0], zp); + frag_b[1] = __hsub2(frag_b[1], zp); } // Same as above, but for act_order (each K is multiplied individually) @@ -205,6 +245,13 @@ __device__ inline void scale4(FragB& frag_b, FragS& frag_s_1, FragS& frag_s_2, frag_b[1] = __hmul2(frag_b[1], s_val_3_4); } +// Given 2 floats multiply by 2 scales (halves) +__device__ inline void scale_float(float* c, FragS& s) { + __half* s_ptr = reinterpret_cast<__half*>(&s); + c[0] = __fmul_rn(c[0], __half2float(s_ptr[0])); + c[1] = __fmul_rn(c[1], __half2float(s_ptr[1])); +} + // Wait until barrier reaches `count`, then lock for current threadblock. 
__device__ inline void barrier_acquire(int* lock, int count) { if (threadIdx.x == 0) { @@ -248,10 +295,11 @@ template shared // fetch pipeline const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled const int group_blocks = -1 // number of consecutive 16x16 blocks // with a separate quantization scale > -__device__ inline void MarlinMoESingle( +__device__ void MarlinMoESingle( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn @@ -259,6 +307,8 @@ __device__ inline void MarlinMoESingle( const float* __restrict__ topk_weights, // float topk weights const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) const int* __restrict__ g_idx, // int32 group indices of shape k const int* __restrict__ expert_offsets, int num_groups, // number of scale groups per output channel @@ -400,8 +450,12 @@ __device__ inline void MarlinMoESingle( int tb_n_warps = thread_n_blocks / 4; int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; - constexpr int sorted_sh_stride = threads; - constexpr int sorted_gl_stride = threads; + // Zero-points sizes/strides + int zp_gl_stride = (prob_n / pack_factor) / 4; + constexpr int zp_sh_stride = ((16 * thread_n_blocks) / pack_factor) / 4; + constexpr int zp_tb_groups = s_tb_groups; + constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0; + int zp_gl_rd_delta = zp_gl_stride; // Global A read index of current thread. int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + @@ -442,6 +496,19 @@ __device__ inline void MarlinMoESingle( int s_sh_wr = threadIdx.x; bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + // Zero-points + int zp_gl_rd; + if constexpr (has_zp) { + if constexpr (group_blocks == -1) { + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; + } else { + zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + zp_sh_stride * slice_col + threadIdx.x; + } + } + int zp_sh_wr = threadIdx.x; + bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride; + // We use a different scale layout for grouped and column-wise quantization as // we scale a `half2` tile in column-major layout in the former and in // row-major in the latter case. @@ -453,23 +520,29 @@ __device__ inline void MarlinMoESingle( s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) % 4; + // Zero-points have the same read layout as the scales + // (without column-wise case) + constexpr int num_col_threads = 8; + constexpr int num_row_threads = 4; + constexpr int num_ints_per_thread = 8 / pack_factor; + int zp_sh_rd; + if constexpr (has_zp) { + zp_sh_rd = num_ints_per_thread * num_col_threads * + ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); + } + int sh_first_group_id = -1; int sh_num_groups = -1; constexpr int sh_max_num_groups = 32; - int shs_size; - if constexpr (has_act_order) - shs_size = sh_max_num_groups * s_sh_stride + threads; - else - shs_size = group_blocks > 0 ? stages * s_sh_stage : threads; - extern __shared__ int4 sh[]; // Shared memory storage for global fetch pipelines. 
int4* sh_a = sh; int4* sh_b = sh_a + (stages * a_sh_stage); int4* sh_g_idx = sh_b + (stages * b_sh_stage); - int4* sh_s = sh_g_idx + (stages * g_idx_stage); - int* sh_sorted = (int*)(sh_s + shs_size); + int4* sh_zp = sh_g_idx + (stages * g_idx_stage); + int4* sh_s = sh_zp + (stages * zp_sh_stage); // Precompute which thread should not read memory in which iterations; this is // needed if there are more threads than required for a certain tilesize or @@ -525,8 +598,10 @@ __device__ inline void MarlinMoESingle( FragA frag_a[2][thread_m_blocks]; I4 frag_b_quant[2][b_thread_vecs]; FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; // No act-order - FragS act_frag_s[2][4][4]; // For act-order + FragS frag_s[2][4]; // No act-order + FragS act_frag_s[2][4][4]; // For act-order + int frag_qzp[2][num_ints_per_thread]; // Zero-points + FragZP frag_zp; // Zero-points in fp16 // Zero accumulators. auto zero_accums = [&]() { @@ -633,6 +708,28 @@ __device__ inline void MarlinMoESingle( } } } + + if constexpr (has_zp && group_blocks != -1) { + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch zero-points if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]); + } + zp_gl_rd += zp_gl_rd_delta; + } + } else { + for (int i = 0; i < zp_tb_groups; i++) { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr], + &zp_ptr[zp_gl_rd]); + } + zp_gl_rd += zp_gl_rd_delta; + } + } + } } } // Insert a fence even when we are winding down the pipeline to ensure that @@ -640,15 +737,9 @@ __device__ inline void MarlinMoESingle( cp_async_fence(); }; - // TODO we are currently hitting illegal memory accesses when fetching - // sorted_ids to shared data: fix this - auto fetch_sorted_ids_to_shared = [&]() { - const int mpt = ceildiv(prob_m, threads); - for (int i = 0; i < mpt; i++) { - if ((i * sorted_gl_stride) + threadIdx.x < prob_m) { - sh_sorted[(i * sorted_sh_stride) + threadIdx.x] = - sorted_ids[(i * sorted_gl_stride) + threadIdx.x]; - } + auto fetch_zp_to_shared = [&]() { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]); } }; @@ -799,8 +890,83 @@ __device__ inline void MarlinMoESingle( } }; + auto fetch_zp_to_registers = [&](int k, int full_pipe) { + // This code does not handle group_blocks == 0, + // which signifies act_order. 
+ // has_zp implies AWQ, which doesn't have act_order, + static_assert(!has_zp || group_blocks != 0); + + if constexpr (has_zp) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks == -1) { + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = (reinterpret_cast(sh_zp))[zp_sh_rd + i]; + } + + } else if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = + (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = 0; + + // Suppress bogus and persistent divide-by-zero warning + #pragma nv_diagnostic push + #pragma nv_diag_suppress divide_by_zero + cur_group_id = k_blocks / group_blocks; + #pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + sh_zp_stage += cur_group_id * zp_sh_stride; + + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = + (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } + } + }; + // Execute the actual tensor core matmul of a sub-tile. auto matmul = [&](int k) { + if constexpr (has_zp) { + FragB frag_zp_0; + FragB frag_zp_1; + int zp_quant_0, zp_quant_1; + + if constexpr (w_type.size_bits() == 4) { + zp_quant_0 = frag_qzp[k % 2][0]; + zp_quant_1 = zp_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + zp_quant_0 = frag_qzp[k % 2][0]; + zp_quant_1 = frag_qzp[k % 2][1]; + } + + frag_zp_0 = dequant(zp_quant_0); + frag_zp_1 = dequant(zp_quant_1); + + frag_zp[0] = frag_zp_0[0]; + frag_zp[1] = frag_zp_0[1]; + frag_zp[2] = frag_zp_1[0]; + frag_zp[3] = frag_zp_1[1]; + } + // We have the m dimension as the inner loop in order to encourage overlapping // dequantization and matmul operations. #pragma unroll @@ -818,6 +984,10 @@ __device__ inline void MarlinMoESingle( FragB frag_b0 = dequant(b_quant_0); FragB frag_b1 = dequant(b_quant_1); + // Apply zero-point to frag_b0 + if constexpr (has_zp) { + sub_zp(frag_b0, frag_zp[j], 0); + } // Apply scale to frag_b0 if constexpr (has_act_order) { @@ -829,6 +999,11 @@ __device__ inline void MarlinMoESingle( } } + // Apply zero-point to frag_b1 + if constexpr (has_zp) { + sub_zp(frag_b1, frag_zp[j], 1); + } + // Apply scale to frag_b1 if constexpr (has_act_order) { scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], @@ -1062,9 +1237,6 @@ __device__ inline void MarlinMoESingle( // Start global fetch and register load pipelines. 
auto start_pipes = [&]() { - // TODO re-enable after fixing this function - // fetch_sorted_ids_to_shared(); - // __syncthreads(); #pragma unroll for (int i = 0; i < stages - 1; i++) { @@ -1075,6 +1247,12 @@ __device__ inline void MarlinMoESingle( } fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]); } + + if constexpr (has_zp && group_blocks == -1) { + if (i == 0) { + fetch_zp_to_shared(); + } + } fetch_to_shared(i, i, i < slice_iters); } @@ -1083,6 +1261,7 @@ __device__ inline void MarlinMoESingle( init_same_group(0); fetch_to_registers(0, 0); fetch_scales_to_registers(0, 0); + fetch_zp_to_registers(0, 0); a_gl_rd += a_gl_rd_delta_o * (stages - 1); slice_k_start_shared_fetch += tb_k * (stages - 1); }; @@ -1102,6 +1281,7 @@ __device__ inline void MarlinMoESingle( for (int k = 0; k < b_sh_wr_iters; k++) { fetch_to_registers(k + 1, pipe % stages); fetch_scales_to_registers(k + 1, pipe); + fetch_zp_to_registers(k + 1, pipe); if (k == b_sh_wr_iters - 2) { fetch_to_shared((pipe + stages - 1) % stages, pipe, slice_iters >= stages); @@ -1236,7 +1416,9 @@ __device__ inline void MarlinMoESingle( } else { s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; } + start_pipes(); } } @@ -1250,6 +1432,7 @@ template shared // fetch pipeline const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled const int group_blocks = -1 // number of consecutive 16x16 blocks // with a separate quantization scale > @@ -1261,6 +1444,8 @@ __global__ void MarlinMoE( const float* __restrict__ topk_weights, // float topk weights const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) const int* __restrict__ g_idx, // int32 group indices of shape k const int* __restrict__ expert_offsets, int num_groups, // number of scale groups per output channel @@ -1309,29 +1494,29 @@ __global__ void MarlinMoE( if (max_block == 1) { MarlinMoESingle( - A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + stages, has_act_order, has_zp, group_blocks>( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx, expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, current_m_block); } else if (max_block == 2) { MarlinMoESingle( - A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + stages, has_act_order, has_zp, group_blocks>( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx, expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, current_m_block); } else if (max_block == 3) { MarlinMoESingle( - A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + stages, has_act_order, has_zp, group_blocks>( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx, expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, current_m_block); } else { MarlinMoESingle( - A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + stages, has_act_order, has_zp, group_blocks>( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx, expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, current_m_block); @@ -1347,6 +1532,7 
@@ template shared // fetch pipeline const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled const int group_blocks = -1 // number of consecutive 16x16 blocks // with a separate quantization scale > @@ -1358,6 +1544,8 @@ __global__ void MarlinMoE( const float* __restrict__ topk_weights, // float topk weights const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) const int* __restrict__ g_idx, // int32 group indices of shape k const int* __restrict__ expert_offsets, int num_groups, // number of scale groups per output channel @@ -1374,7 +1562,6 @@ __global__ void MarlinMoE( int current_m_block, // current m block to start kernel computation from int max_par, // maximum parallelism int cfg_max_m_blocks // upper bound on m blocks - ) { // Marlin is not implemented yet for SM < 8.0 assert(false); @@ -1389,37 +1576,41 @@ __global__ void MarlinMoE( const int USER_THREADS = 256; // Note: This is only used with user-provided thread_k/n const int STAGES = 4; // 4 pipeline stages fit into shared memory -// const int SHARED_MEM = -// 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) static constexpr int min_thread_n = 64; static constexpr int min_thread_k = 64; #define __CALL_IF_MOE(W_TYPE, THREAD_N_BLOCKS, THREAD_K_BLOCKS, HAS_ACT_ORDER, \ - GROUP_BLOCKS, NUM_THREADS) \ + HAS_ZP, GROUP_BLOCKS, NUM_THREADS) \ else if (q_type == W_TYPE && thread_n_blocks == THREAD_N_BLOCKS && \ thread_k_blocks == THREAD_K_BLOCKS && \ - has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \ - num_threads == NUM_THREADS) { \ + has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ cudaFuncSetAttribute( \ MarlinMoE, \ + STAGES, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS>, \ cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ MarlinMoE \ + STAGES, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS> \ <<>>( \ A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \ - g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx, \ + zp_ptr, g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx, \ num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, \ replicate_input, apply_weights, m_block, max_par, \ cfg_max_m_blocks); \ } -#define GPTQ_CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ - __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ - __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ - __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ - __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) +#define GPTQ_CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) + +#define AWQ_CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 
true, 4, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) } // namespace marlin_moe diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu new file mode 100644 index 0000000000000..77bc0dd90edde --- /dev/null +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu @@ -0,0 +1,31 @@ +#include "marlin_moe_kernel_ku4.h" + +namespace marlin_moe { + +// We return bool so we can create these different kernel calls as a sequence +// of if-elseif's. +bool call_marlin_moe_kernel_ku4( + vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, + bool has_act_order, int group_blocks, int num_threads, int blocks, + int max_shared_mem, cudaStream_t stream, const int4* A_ptr, + const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, + const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, + const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, + int expert_idx, int num_experts, int topk, int prob_m, int prob_n, + int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, + int m_block, int max_par, int cfg_max_m_blocks) { + bool has_zp = true; + + if (false) { + } + AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256) + AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256) + AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128) + AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128) + else { + return false; + } + return true; +} + +} // namespace marlin_moe diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h new file mode 100644 index 0000000000000..833fadf37721f --- /dev/null +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h @@ -0,0 +1,20 @@ +#pragma once + +#include "marlin_moe_kernel.h" + +namespace marlin_moe { + +// We return bool so we can create these different kernel calls as a sequence +// of if-elseif's. 
+bool call_marlin_moe_kernel_ku4( + vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, + bool has_act_order, int group_blocks, int num_threads, int blocks, + int max_shared_mem, cudaStream_t stream, const int4* A_ptr, + const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, + const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, + const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, + int expert_idx, int num_experts, int topk, int prob_m, int prob_n, + int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, + int m_block, int max_par, int cfg_max_m_blocks); + +} // namespace marlin_moe diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu index cbafd9ffe7474..f7e57b0375945 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu @@ -9,11 +9,13 @@ bool call_marlin_moe_kernel_ku4b8( bool has_act_order, int group_blocks, int num_threads, int blocks, int max_shared_mem, cudaStream_t stream, const int4* A_ptr, const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, - const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr, - int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts, - int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks, - bool replicate_input, bool apply_weights, int m_block, int max_par, - int cfg_max_m_blocks) { + const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, + const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, + int expert_idx, int num_experts, int topk, int prob_m, int prob_n, + int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, + int m_block, int max_par, int cfg_max_m_blocks) { + bool has_zp = false; + if (false) { } GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256) diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h index 9eacb42c115f0..494da8f10e262 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h @@ -11,10 +11,10 @@ bool call_marlin_moe_kernel_ku4b8( bool has_act_order, int group_blocks, int num_threads, int blocks, int max_shared_mem, cudaStream_t stream, const int4* A_ptr, const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, - const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr, - int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts, - int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks, - bool replicate_input, bool apply_weights, int m_block, int max_par, - int cfg_max_m_blocks); + const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, + const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, + int expert_idx, int num_experts, int topk, int prob_m, int prob_n, + int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, + int m_block, int max_par, int cfg_max_m_blocks); } // namespace marlin_moe diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu index c46712474f715..a901f0b11cd78 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu @@ -9,11 +9,13 @@ bool call_marlin_moe_kernel_ku8b128( bool has_act_order, int group_blocks, int num_threads, int blocks, int max_shared_mem, cudaStream_t stream, const 
int4* A_ptr, const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, - const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr, - int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts, - int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks, - bool replicate_input, bool apply_weights, int m_block, int max_par, - int cfg_max_m_blocks) { + const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, + const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, + int expert_idx, int num_experts, int topk, int prob_m, int prob_n, + int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, + int m_block, int max_par, int cfg_max_m_blocks) { + bool has_zp = false; + if (false) { } GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256) diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h index 7cd9acafb3b80..f3018aa0c1ab7 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h @@ -9,10 +9,10 @@ bool call_marlin_moe_kernel_ku8b128( bool has_act_order, int group_blocks, int num_threads, int blocks, int max_shared_mem, cudaStream_t stream, const int4* A_ptr, const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, - const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr, - int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts, - int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks, - bool replicate_input, bool apply_weights, int m_block, int max_par, - int cfg_max_m_blocks); + const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, + const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, + int expert_idx, int num_experts, int topk, int prob_m, int prob_n, + int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, + int m_block, int max_par, int cfg_max_m_blocks); } diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu index 661490d95e791..e2db4e4196b6f 100644 --- a/csrc/moe/marlin_moe_ops.cu +++ b/csrc/moe/marlin_moe_ops.cu @@ -30,6 +30,7 @@ #include "core/registration.h" #include "marlin_kernels/marlin_moe_kernel_ku4b8.h" #include "marlin_kernels/marlin_moe_kernel_ku8b128.h" +#include "marlin_kernels/marlin_moe_kernel_ku4.h" template inline std::string str(T x) { @@ -157,6 +158,7 @@ thread_config_t small_batch_thread_configs[] = { {128, 64, 128}, // Reduce N 2X, same K {64, 256, 256}, // Reduce K 2X, increase N 2X {64, 128, 128}, // Reduce K 2X, same N + {64, 64, 128}, // Reduce both 2X }; thread_config_t large_batch_thread_configs[] = { @@ -167,6 +169,7 @@ thread_config_t large_batch_thread_configs[] = { {128, 128, 256}, // Reduce N 2X, increase K 2X {64, 128, 128}, // Reduce N 2X, same K {128, 64, 128}, // Reduce N 4X, increase K 2X + {64, 64, 128}, // Reduce N 4X, same K }; int get_scales_cache_size(thread_config_t const& th_config, int prob_m, @@ -312,27 +315,28 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, return exec_config_t{0, {-1, -1, -1}}; } -#define CALL_MOE_KERNEL_FUNCTION(KERNEL_FUNCTION) \ - else if (KERNEL_FUNCTION(q_type, thread_n_blocks, thread_k_blocks, \ - has_act_order, group_blocks, num_threads, blocks, \ - max_shared_mem, stream, A_ptr, B_ptr, C_ptr, \ - sorted_ids_ptr, topk_weights_ptr, s_ptr, g_idx_ptr, \ - expert_offsets_ptr, num_groups, expert_idx, \ - num_experts, topk, prob_m, prob_n, prob_k, tot_m, \ - locks, replicate_input, 
apply_weights, m_block, \ - max_par, exec_cfg.max_m_blocks)) { \ +#define CALL_MOE_KERNEL_FUNCTION(KERNEL_FUNCTION) \ + else if (KERNEL_FUNCTION( \ + q_type, thread_n_blocks, thread_k_blocks, has_act_order, \ + group_blocks, num_threads, blocks, max_shared_mem, stream, \ + A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \ + zp_ptr, g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx, \ + num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, \ + replicate_input, apply_weights, m_block, max_par, \ + exec_cfg.max_m_blocks)) { \ } void marlin_mm_moe(const void* A, const void* B, void* C, const void* sorted_ids, const void* topk_weights, - const void* topk_ids, const void* s, const void* g_idx, - const void* perm, void* a_tmp, void* expert_offsets, - int prob_m, int prob_n, int prob_k, void* workspace, - vllm::ScalarType const& q_type, bool has_act_order, - bool is_k_full, int num_groups, int group_size, - int num_experts, int topk, int moe_block_size, int dev, - cudaStream_t stream, int thread_k, int thread_n, int sms, - int max_par, bool replicate_input, bool apply_weights) { + const void* topk_ids, const void* s, void* zp, + const void* g_idx, const void* perm, void* a_tmp, + void* expert_offsets, int prob_m, int prob_n, int prob_k, + void* workspace, vllm::ScalarType const& q_type, + bool has_act_order, bool is_k_full, bool has_zp, + int num_groups, int group_size, int num_experts, int topk, + int moe_block_size, int dev, cudaStream_t stream, + int thread_k, int thread_n, int sms, int max_par, + bool replicate_input, bool apply_weights) { TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]"); @@ -436,6 +440,8 @@ void marlin_mm_moe(const void* A, const void* B, void* C, const float* topk_weights_ptr = (const float*)topk_weights; const int* sorted_ids_ptr = (const int*)sorted_ids; const int4* s_ptr = (const int4*)s + num_groups * prob_n / 8 * expert_idx; + const int4* zp_ptr = + (const int4*)zp + num_groups * prob_n / (pack_factor * 4) * expert_idx; const int* g_idx_ptr = (const int*)g_idx + prob_k * expert_idx; const int* perm_ptr = (const int*)perm + prob_k * expert_idx; int* locks = (int*)workspace; @@ -456,6 +462,7 @@ void marlin_mm_moe(const void* A, const void* B, void* C, } CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku4b8) CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku8b128) + CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku4) else { TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + str(prob_n) + ", " + str(prob_k) + "]" + @@ -475,13 +482,21 @@ torch::Tensor marlin_gemm_moe( const torch::Tensor& a, const torch::Tensor& b_q_weights, const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights, const torch::Tensor& topk_ids, const torch::Tensor& b_scales, - const torch::Tensor& g_idx, const torch::Tensor& perm, - torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type, - int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, - int64_t num_experts, int64_t topk, int64_t moe_block_size, - bool replicate_input, bool apply_weights) { - TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128, - "b_q_type must be uint4b8 or uint8b128. 
Got = ", b_q_type->str()); + torch::Tensor& b_zeros, const torch::Tensor& g_idx, + const torch::Tensor& perm, torch::Tensor& workspace, + vllm::ScalarTypeTorchPtr const& b_q_type, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk, + int64_t moe_block_size, bool replicate_input, bool apply_weights) { + bool has_zp = b_zeros.size(1) != 0; + if (has_zp) { + TORCH_CHECK( + *b_q_type == vllm::kU4, + "b_q_type must be u4 when has_zp = True. Got = ", b_q_type->str()); + } else { + TORCH_CHECK( + *b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128, + "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str()); + } int pack_factor = 32 / b_q_type->size_bits(); @@ -543,14 +558,27 @@ torch::Tensor marlin_gemm_moe( } } + // Verify b_zeros + if (has_zp) { + int rank = b_zeros.sizes().size(); + TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3"); + TORCH_CHECK(b_zeros.size(1) == num_groups, + "b_zeros dim 1 = ", b_zeros.size(1), + " is not num_groups = ", num_groups); + TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor, + "b_zeros dim 2 = ", b_zeros.size(2), + " is not size_n / pack_factor = ", size_n / pack_factor); + } + marlin_moe::marlin_mm_moe( a.data_ptr(), b_q_weights.data_ptr(), c.data_ptr(), sorted_ids.data_ptr(), topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(), - g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), + b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), - *b_q_type, has_act_order, is_k_full, num_groups, group_size, num_experts, - topk, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, - thread_n, sms, max_par, replicate_input, apply_weights); + *b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, + num_experts, topk, moe_block_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par, + replicate_input, apply_weights); return c; } diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index cbc8754f7a5b2..18fbc57ac7834 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -12,7 +12,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " - "g_idx, Tensor! perm, Tensor! workspace, " + "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! 
workspace, " "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, " "int size_n, int size_k, bool is_k_full, int num_experts, int topk, " "int moe_block_size, bool replicate_input, bool apply_weights)" diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 227bc19b914a0..5efe15d2b2f6b 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -2260,7 +2260,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, "b_zeros dim 0 = ", b_zeros.size(0), " is not num_groups = ", num_groups); TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor, - "b_zeros dim 1 = ", b_scales.size(1), + "b_zeros dim 1 = ", b_zeros.size(1), " is not size_n / pack_factor = ", size_n / pack_factor); } diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh index 3d574ad99efda..e8e7b14de0da1 100644 --- a/csrc/quantization/machete/machete_mainloop.cuh +++ b/csrc/quantization/machete/machete_mainloop.cuh @@ -591,24 +591,27 @@ struct MacheteCollectiveMma { tma_load_b = make_tma_copy_B( make_logical_tensor(ptr_B, make_shape(N, K, L), args.dB)); + int32_t scale_k = + (ModeHasScales) ? (K + args.group_size - 1) / args.group_size : 0; + int32_t group_size = (ModeHasScales) ? args.group_size : 0; + if constexpr (ModeHasScales) { - tma_load_scale = make_tma_copy_scale(make_logical_tensor( - args.ptr_S, make_shape(M, args.group_size, L), args.dS)); + tma_load_scale = make_tma_copy_scale( + make_logical_tensor(args.ptr_S, make_shape(M, scale_k, L), args.dS)); } if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { - tma_load_zero = make_tma_copy_zero(make_logical_tensor( - args.ptr_Z, make_shape(M, args.group_size, L), args.dS)); + tma_load_zero = make_tma_copy_zero( + make_logical_tensor(args.ptr_Z, make_shape(M, scale_k, L), args.dS)); } - if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { - return {tma_load_a, tma_load_b, tma_load_scale, tma_load_zero, 0, 0}; - } else if constexpr (ModeHasScales) { - auto scale_k = (K + args.group_size - 1) / args.group_size; - + if constexpr (KernelConversionMode == ConversionMode::DirectConvert || + KernelConversionMode == ConversionMode::ConvertAndScale || + KernelConversionMode == + ConversionMode::ConvertAndScaleWithZero) { return {tma_load_a, tma_load_b, tma_load_scale, - tma_load_zero, scale_k, args.group_size}; + tma_load_zero, scale_k, group_size}; } else { static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in to_underlying_arguments."); diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index a27f1e7c83df9..ff037756f55ab 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -89,6 +89,10 @@ torch::Tensor prepack_B(torch::Tensor const& B, TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("machete_prepack_B", &prepack_B); m.impl("machete_gemm", &gemm); +} + +// use CatchAll since supported_schedules has no tensor arguments +TORCH_LIBRARY_IMPL(TORCH_EXTENSION_NAME, CatchAll, m) { m.impl("machete_supported_schedules", &supported_schedules); } diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 80037dda20015..d58f226136918 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -4,6 +4,7 @@ sphinx-copybutton==0.5.2 myst-parser==2.0.0 
sphinx-argparse==0.4.0 msgspec +cloudpickle # packages to install to build the documentation pydantic >= 2.8 diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 81287762d3c0a..cfd2dcb3bd5d3 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -1,32 +1,53 @@ .. _debugging: +=============== Debugging Tips =============== -Debugging hang/crash issues ---------------------------- +This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. + +.. note:: + + Once you've debugged a problem, remember to turn off any debugging environment variables you have defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionality left enabled. -When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time: +Hangs downloading a model +---------------------------------------- +If the model isn't already downloaded to disk, vLLM will download it from the internet, which can take time depending on your internet connection. +It's recommended to download the model first using the `huggingface-cli `_ and then pass the local path to the model to vLLM. This way, you can isolate the issue. -- **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli `_ and then use the local path to the model. This way, you can isolate the issue. -- **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory. -- **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +Hangs loading a model from disk +---------------------------------------- +If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. +It'd be better to store the model on a local disk.
Additionally, have a look at the CPU memory usage; when the model is too large, it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. -If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue: +Model is too large +---------------------------------------- +If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. -- Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. -- Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble. -- Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. -- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. +Enable more logging +---------------------------------------- +If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: -With more logging, hopefully you can find the root cause of the issue. +- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. +- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem. +- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. +- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls, so you can inspect the log files and tell which function crashes or hangs. -If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. +Incorrect network setup +---------------------------------------- +The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log line such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``; the IP address in it should be the correct one. +If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=``. -Here are some common issues that can cause hangs: +You might also need to set ``export NCCL_SOCKET_IFNAME=`` and ``export GLOO_SOCKET_IFNAME=`` to specify the network interface for the IP address. -- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config.
You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address. -- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly. +Error near ``self.graph.replay()`` +---------------------------------------- +If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph. +To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. + +Incorrect hardware/driver +---------------------------------------- +If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. .. code-block:: python @@ -84,33 +105,29 @@ Here are some common issues that can cause hangs: dist.destroy_process_group(gloo_group) dist.destroy_process_group() -.. tip:: +If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: - Save the script as ``test.py``. - - If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use. - - If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``: - - - is the correct IP address of the master node - - is reachable from all nodes - - is set before running the script. +.. code-block:: shell - If the script runs successfully, you should see the message ``sanity check is successful!``. + NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py - Note that multi-node environment is more complicated than single-node. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: +If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: - - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. - - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``. +.. code-block:: shell + + NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py - Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup. 
The difference is that you need to execute different commands (with different ``--node-rank``) on different nodes. +If the script runs successfully, you should see the message ``sanity check is successful!``. -If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. +.. note:: -Some known issues: + A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: -- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_ . + - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. + - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``. -.. warning:: + Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes. - After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on. +Known Issues +---------------------------------------- +- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_. diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index c6db74c18629f..99c695ac4ddb1 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -1,19 +1,20 @@ .. _installation: +============ Installation ============ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. Requirements ------------- +=========================== * OS: Linux * Python: 3.8 -- 3.12 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Install released versions --------------------------- +=========================== You can install vLLM using pip: @@ -46,8 +47,11 @@ You can install vLLM using pip: Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. + +.. _install-the-latest-code: + Install the latest code ----------------------------- +========================= LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on x86 platform with cuda 12 for every commit since v0.5.3. 
You can download and install the latest one with the following command: @@ -75,18 +79,27 @@ These docker images are used for CI and testing only, and they are not intended Latest code can contain bugs and may not be stable. Please use it with caution. -Build from source (without compilation) ---------------------------------------- +.. _build_from_source: + +Build from source +================== -If you want to develop vLLM, and you only need to change the Python code, you can build vLLM without compilation. +.. _python-only-build: -The first step is to follow the previous instructions to install the latest vLLM wheel: +Python-only build (without compilation) +---------------------------------------- + +If you only need to change Python code, you can simply build vLLM without compilation. + +The first step is to install the latest vLLM wheel: .. code-block:: console - $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -After verifying that the installation is successful, we have a script for you to copy and link directories, so that you can edit the Python code directly: +You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. + +After verifying that the installation is successful, you can use `the following script `_: .. code-block:: console @@ -94,94 +107,113 @@ After verifying that the installation is successful, we have a script for you to $ cd vllm $ python python_only_dev.py -It will: +The script will: -- Find the installed vLLM in the current environment. -- Copy built files to the current directory. -- Rename the installed vLLM -- Symbolically link the current directory to the installed vLLM. +* Find the installed vLLM package in the current environment. +* Copy built files to the current directory. +* Rename the installed vLLM package. +* Symbolically link the current directory to the installed vLLM package. -This way, you can edit the Python code in the current directory, and the changes will be reflected in the installed vLLM. +Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. -.. _build_from_source: +Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev`` (or ``-q`` for short) flag: -Build from source (with compilation) ------------------------------------- +.. code-block:: console + + $ python python_only_dev.py --quit-dev + +The script with the ``--quit-dev`` flag will: + +* Remove the symbolic link from the current directory to the vLLM package. +* Restore the original vLLM package from the backup. + +If you update the vLLM wheel and want to rebuild from source and make further edits, you will need to repeat `the steps above <#python-only-build>`_ from the beginning. -If you need to touch the C++ or CUDA code, you need to build vLLM from source: +.. note:: + + Your source code may have a different commit ID than the latest vLLM wheel, which could lead to unexpected errors. + It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the above section <#install-the-latest-code>`_ for instructions on how to install a specified wheel. 
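+For a quick sanity check of which installation Python actually resolves (for example, before and after running ``python_only_dev.py``), you can inspect the package metadata. This is a minimal sketch rather than part of the script above; the exact path and version string depend on your environment:
+
+.. code-block:: python
+
+    # Check which vLLM installation is active. After python_only_dev.py, __file__
+    # should point into your source checkout; after --quit-dev, it should point
+    # back into site-packages.
+    import vllm
+
+    print(vllm.__version__)
+    print(vllm.__file__)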
+ +Full build (with compilation) +--------------------------------- + +If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: .. code-block:: console $ git clone https://github.com/vllm-project/vllm.git $ cd vllm - $ pip install -e . # This can take a long time + $ pip install -e . -.. note:: +.. tip:: - This will uninstall existing PyTorch, and install the version required by vLLM. If you want to use an existing PyTorch installation, there need to be some changes: + Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. + For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . + As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. - .. code-block:: console - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ python use_existing_torch.py - $ pip install -r requirements-build.txt - $ pip install -e . --no-build-isolation +Use an existing PyTorch installation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: - The differences are: +* Building vLLM with PyTorch nightly or a custom PyTorch build. +* Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to `install PyTorch nightly `_, and then build vLLM on top of it. - - ``python use_existing_torch.py``: This script will remove all the PyTorch versions in the requirements files, so that the existing PyTorch installation will be used. - - ``pip install -r requirements-build.txt``: You need to manually install the requirements for building vLLM. - - ``pip install -e . --no-build-isolation``: You need to disable build isolation, so that the build system can use the existing PyTorch installation. +To build vLLM using an existing PyTorch installation: - This is especially useful when the PyTorch dependency cannot be easily installed via pip, e.g.: +.. code-block:: console - - build vLLM with PyTorch nightly or a custom PyTorch build. - - build vLLM with aarch64 and cuda (GH200), where the PyTorch wheels are not available on PyPI. Currently, only PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to install PyTorch nightly, and then build vLLM on top of it. + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python use_existing_torch.py + $ pip install -r requirements-build.txt + $ pip install -e . --no-build-isolation -.. note:: - vLLM can fully run only on Linux, but you can still build it on other systems (for example, macOS). This build is only for development purposes, allowing for imports and a more convenient dev environment. The binaries will not be compiled and not work on non-Linux systems. You can create such a build with the following commands: +Troubleshooting +~~~~~~~~~~~~~~~~~ - .. 
code-block:: console +To avoid your system being overloaded, you can limit the number of compilation jobs +to be run simultaneously, via the environment variable ``MAX_JOBS``. For example: - $ export VLLM_TARGET_DEVICE=empty - $ pip install -e . +.. code-block:: console + $ export MAX_JOBS=6 + $ pip install -e . -.. tip:: +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. - Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache `_ via either ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster. +Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. -.. tip:: - To avoid your system being overloaded, you can limit the number of compilation jobs - to be run simultaneously, via the environment variable ``MAX_JOBS``. For example: +.. code-block:: console - .. code-block:: console + $ # Use `--ipc=host` to make sure the shared memory is large enough. + $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 - $ export MAX_JOBS=6 - $ pip install -e . +If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website `_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.: - This is especially useful when you are building on less powerful machines. For example, when you use WSL, it only `gives you half of the memory by default `_, and you'd better use ``export MAX_JOBS=1`` to avoid compiling multiple files simultaneously and running out of memory. The side effect is that the build process will be much slower. If you only touch the Python code, slow compilation is okay, as you are building in an editable mode: you can just change the code and run the Python script without any re-compilation or re-installation. +.. code-block:: console -.. tip:: - If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. + $ export CUDA_HOME=/usr/local/cuda + $ export PATH="${CUDA_HOME}/bin:$PATH" - .. code-block:: console +Here is a sanity check to verify that the CUDA Toolkit is correctly installed: - $ # Use `--ipc=host` to make sure the shared memory is large enough. - $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +.. code-block:: console - If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website `_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.: + $ nvcc --version # verify that nvcc is in your PATH + $ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME - .. 
code-block:: console - $ export CUDA_HOME=/usr/local/cuda - $ export PATH="${CUDA_HOME}/bin:$PATH" +Unsupported OS build +---------------------- - Here is a sanity check to verify that the CUDA Toolkit is correctly installed: +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. - .. code-block:: console +Simply set the ``VLLM_TARGET_DEVICE`` environment variable to ``empty`` before installing: + +.. code-block:: console - - $ nvcc --version # verify that nvcc is in your PATH - $ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME + $ export VLLM_TARGET_DEVICE=empty + $ pip install -e . diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index a9ed4d7fa2cd7..ec99fc013057b 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -27,6 +27,10 @@ Installation steps: .. _build_from_source_neuron: +.. note:: + + The currently supported version of PyTorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. + Build from source ----------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index b4cd28608d3f0..dc6807deb8261 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,12 +81,14 @@ Documentation serving/openai_compatible_server serving/deploying_with_docker + serving/deploying_with_k8s serving/distributed_serving serving/metrics serving/env_vars serving/usage_stats serving/integrations serving/tensorizer + serving/compatibility_matrix serving/faq .. toctree:: diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index 1f220b723cacd..ae09259c0756c 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -85,21 +85,21 @@ When it comes to the linear layers, we provide the following options to parallel * :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. * :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. * :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. -* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. +* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple :code:`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). 
This class handles the sharded weight loading logic of multiple weight matrices. * :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. -Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. +Note that all the linear layers above take :code:`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. 4. Implement the weight loading logic ------------------------------------- You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. -This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. +This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for :code:`MergedColumnParallelLinear` and :code:`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. 5. Register your model ---------------------- -Finally, register your :code:`*ForCausalLM` class to the :code:`_MODELS` in `vllm/model_executor/models/registry.py `_. +Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. 6. Out-of-Tree Model Integration -------------------------------------------- @@ -114,6 +114,18 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +If your model imports modules that initialize CUDA, consider instead lazy-importing it to avoid an error like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: + +.. code-block:: python + + from vllm import ModelRegistry + + ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") + +.. important:: + If your model is a multimodal model, make sure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + Read more about that :ref:`here `. + If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: .. code-block:: python diff --git a/docs/source/models/performance.rst b/docs/source/models/performance.rst index d8750ddc34e8e..23b5ab79a7378 100644 --- a/docs/source/models/performance.rst +++ b/docs/source/models/performance.rst @@ -22,6 +22,8 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False. +.. _chunked-prefill: + Chunked Prefill --------------- vLLM supports an experimental feature chunked prefill. 
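+If you want to try it, the feature can be switched on when constructing the engine. The following is a minimal sketch, assuming the ``enable_chunked_prefill`` and ``max_num_batched_tokens`` engine arguments; the model name and token budget are illustrative:
+
+.. code-block:: python
+
+    from vllm import LLM
+
+    # Enable chunked prefill; tune max_num_batched_tokens for your hardware.
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3-8B-Instruct",
+        enable_chunked_prefill=True,
+        max_num_batched_tokens=2048,
+    )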
Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 23f08bfa9756e..bf86a72e20b57 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -7,10 +7,12 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo The following is the list of model architectures that are currently supported by vLLM. Alongside each architecture, we include some popular models that use it. ----- +Text-only Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Text Generation +--------------- -Decoder-only Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -40,6 +42,11 @@ Decoder-only Language Models - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. - - ✅︎ + * - :code:`BartForConditionalGeneration` + - BART + - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc. + - + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. @@ -145,6 +152,11 @@ Decoder-only Language Models - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. - ✅︎ - ✅︎ + * - :code:`MambaForCausalLM` + - Mamba + - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. + - ✅︎ + - * - :code:`MiniCPMForCausalLM` - MiniCPM - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc. @@ -259,11 +271,58 @@ Decoder-only Language Models .. note:: Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -.. _supported_vlms: +Text Embedding +-------------- + +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HuggingFace Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Gemma2Model` + - Gemma2-based + - :code:`BAAI/bge-multilingual-gemma2`, etc. + - + - ✅︎ + * - :code:`MistralModel` + - Mistral-based + - :code:`intfloat/e5-mistral-7b-instruct`, etc. + - + - ✅︎ + +Reward Modeling +--------------- + +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HuggingFace Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Qwen2ForRewardModel` + - Qwen2-based + - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. + - + - ✅︎ + +.. note:: + As an interim measure, these models are supported via Embeddings API. See `this RFC `_ for upcoming changes. Multimodal Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. _supported_vlms: + +Text Generation +--------------- + .. list-table:: :widths: 25 25 25 25 5 5 :header-rows: 1 @@ -292,6 +351,12 @@ Multimodal Language Models - :code:`adept/fuyu-8b` etc. - - ✅︎ + * - :code:`ChatGLMModel` + - GLM-4V + - Image + - :code:`THUDM/glm-4v-9b` etc. + - + - ✅︎ * - :code:`InternVLChatModel` - InternVL2 - Image\ :sup:`E+` @@ -324,7 +389,7 @@ Multimodal Language Models - ✅︎ * - :code:`MiniCPMV` - MiniCPM-V - - Image\ :sup:`+` + - Image\ :sup:`E+` - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - ✅︎ - ✅︎ @@ -333,7 +398,13 @@ Multimodal Language Models - Image - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. 
- + - + * - :code:`NVLM_D_Model` + - NVLM-D 1.0 + - Image\ :sup:`E+` + - :code:`nvidia/NVLM-D-72B`, etc. - + - ✅︎ * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - Image\ :sup:`E` @@ -378,6 +449,7 @@ Multimodal Language Models For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 +---- If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 3f4f01e3ae7ac..a3ee5da044220 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -6,10 +6,9 @@ Using VLMs vLLM provides experimental support for Vision Language Models (VLMs). See the :ref:`list of supported VLMs here `. This document shows you how to run and serve these models using vLLM. -.. important:: - We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation. - - We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub `_ if you have any feedback or feature requests. +.. note:: + We are actively iterating on VLM support. See `this RFC `_ for upcoming changes, + and `open an issue on GitHub `_ if you have any feedback or feature requests. Offline Inference ----------------- @@ -23,14 +22,10 @@ The :class:`~vllm.LLM` class can be instantiated in much the same way as languag llm = LLM(model="llava-hf/llava-1.5-7b-hf") -.. note:: - We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow - the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model. - To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`: * ``prompt``: The prompt should follow the format that is documented on HuggingFace. -* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. +* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. .. code-block:: python @@ -39,7 +34,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT # Load the image using PIL.Image image = PIL.Image.open(...) - + # Single prompt inference outputs = llm.generate({ "prompt": prompt, @@ -62,18 +57,25 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT print(generated_text) # Inference with image embeddings as input with additional parameters - # Specifically, we are conducting a trial run of Qwen2VL with the new input format, as the model utilizes additional parameters for calculating positional encoding. - image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) - image_grid_thw = torch.load(...) # torch.Tensor of shape (1, 3) + # Specifically, we are conducting a trial run of Qwen2VL and MiniCPM-V with the new input format, which utilizes additional parameters. + mm_data = {} + + image_embeds = torch.load(...) 
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) + # For Qwen2VL, image_grid_thw is needed to calculate positional encoding. + mm_data['image'] = { + "image_embeds": image_embeds, + "image_grid_thw": torch.load(...) # torch.Tensor of shape (1, 3), + } + # For MiniCPM-V, image_size_list is needed to calculate details of the sliced image. mm_data['image'] = { "image_embeds": image_embeds, - "image_grid_thw": image_grid_thw, + "image_size_list": [image.size] # list of image sizes } outputs = llm.generate({ "prompt": prompt, "multi_modal_data": mm_data, }) - + for o in outputs: generated_text = o.outputs[0].text print(generated_text) @@ -121,7 +123,7 @@ Instead of passing in a single image, you can pass in a list of images. .. code-block:: python # Refer to the HuggingFace repo for the correct format to use - prompt = "<|user|>\n\n\nWhat is the content of each image?<|end|>\n<|assistant|>\n" + prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" # Load the images using PIL.Image image1 = PIL.Image.open(...) @@ -140,6 +142,33 @@ Instead of passing in a single image, you can pass in a list of images. A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. +Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL `_ as it supports videos: + +.. code-block:: python + + # Specify the maximum number of frames per video to be 4. This can be changed. + llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + + # Create the request payload. + video_frames = ... # load your video making sure it only has the number of frames specified earlier. + message = { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + ], + } + for i in range(len(video_frames)): + base64_image = encode_image(video_frames[i]) # base64 encoding. + new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} + message["content"].append(new_image) + + # Perform inference and log output. + outputs = llm.chat([message]) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + Online Inference ---------------- diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst index d7d9b21b4b949..aacd07a34ad46 100644 --- a/docs/source/quantization/fp8.rst +++ b/docs/source/quantization/fp8.rst @@ -106,7 +106,7 @@ Install ``vllm`` and ``lm-evaluation-harness``: .. code-block:: console - $ pip install vllm lm_eval==0.4.3 + $ pip install vllm lm-eval==0.4.4 Load and run the model in ``vllm``: diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst index ea587e0525a74..9bf0cdb80376d 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.rst @@ -28,7 +28,7 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✗ - ✗ - - ✗ + - ✅︎ - ✗ - ✗ * - GPTQ @@ -61,7 +61,7 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✗ - ✗ - - ✗ + - ✅︎ - ✗ - ✗ * - FP8 (W8A8) diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst new file mode 100644 index 0000000000000..cac0605ca132b --- /dev/null +++ b/docs/source/serving/compatibility_matrix.rst @@ -0,0 +1,427 @@ +.. 
_compatibility_matrix: + +Compatibility Matrix +==================== + +The tables below show mutually exclusive features and the support on some hardware. + +.. note:: + + Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. + +Feature x Feature +----------------- + + +.. raw:: html + + + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - :ref:`CP ` + - :ref:`APC ` + - :ref:`LoRA ` + - :abbr:`prmpt adptr (Prompt Adapter)` + - :ref:`SD ` + - CUDA graph + - :abbr:`enc-dec (Encoder-Decoder Models)` + - :abbr:`logP (Logprobs)` + - :abbr:`prmpt logP (Prompt Logprobs)` + - :abbr:`async output (Async Output Processing)` + - multi-step + - :abbr:`MM (Multimodal)` + - best-of + - beam-search + - :abbr:`guided dec (Guided Decoding)` + * - :ref:`CP ` + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`APC ` + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`LoRA ` + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`SD ` + - ✗ + - ✅ + - ✗ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✗ + - `✗ `__ + - ✗ + - ✗ + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✅ + - + - + - + - + - + - + * - multi-step + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - `✗ `__ + - ✅ + - + - + - + - + - + * - :abbr:`MM (Multimodal)` + - `✗ `__ + - `✗ `__ + - `✗ `__ + - ? + - ? + - ✅ + - ✗ + - ✅ + - ✅ + - ✅ + - ? + - + - + - + - + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ✅ + - + - + - + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ? + - ✅ + - + - + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ? + - ? + - ✅ + - ✅ + - ? + - ✅ + - ✅ + - ✅ + - ✗ + - ? + - ✅ + - ✅ + - + + +Feature x Hardware +^^^^^^^^^^^^^^^^^^ + +.. 
list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - Volta + - Turing + - Ampere + - Ada + - Hopper + - CPU + - AMD + * - :ref:`CP ` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`APC ` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`LoRA ` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :ref:`SD ` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✗ + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✗ + * - multi-step + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`MM (Multimodal)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ diff --git a/docs/source/serving/deploying_with_k8s.rst b/docs/source/serving/deploying_with_k8s.rst new file mode 100644 index 0000000000000..7dc076dc709df --- /dev/null +++ b/docs/source/serving/deploying_with_k8s.rst @@ -0,0 +1,175 @@ +.. _deploying_with_k8s: + +Deploying with Kubernetes +========================== + +Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. + +Prerequisites +------------- +Before you begin, ensure that you have the following: + +- A running Kubernetes cluster +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` +- Available GPU resources in your cluster + +Deployment Steps +---------------- + +1. **Create a PVC , Secret and Deployment for vLLM** + + +PVC is used to store the model cache and it is optional, you can use hostPath or other storage options + +.. code-block:: yaml + + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: mistral-7b + namespace: default + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: default + volumeMode: Filesystem + +Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models + +.. code-block:: yaml + + apiVersion: v1 + kind: Secret + metadata: + name: hf-token-secret + namespace: default + type: Opaque + data: + token: "REPLACE_WITH_TOKEN" + + +Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model: + +.. code-block:: yaml + + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. 
+ - name: shm + emptyDir: + medium: Memory + sizeLimit: "2Gi" + containers: + - name: mistral-7b + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 6G + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + - name: shm + mountPath: /dev/shm + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 + +2. **Create a Kubernetes Service for vLLM** + +Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + +.. code-block:: yaml + + apiVersion: v1 + kind: Service + metadata: + name: mistral-7b + namespace: default + spec: + ports: + - name: http-mistral-7b + port: 80 + protocol: TCP + targetPort: 8000 + # The label selector should match the deployment labels and is useful for the prefix caching feature + selector: + app: mistral-7b + sessionAffinity: None + type: ClusterIP + +3. **Deploy and Test** + +Apply the deployment and service configurations using ``kubectl apply -f ``: + +.. code-block:: console + + kubectl apply -f deployment.yaml + kubectl apply -f service.yaml + +To test the deployment, run the following ``curl`` command: + +.. code-block:: console + + curl http://mistral-7b.default.svc.cluster.local/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistralai/Mistral-7B-Instruct-v0.3", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' + +If the service is correctly deployed, you should receive a response from the vLLM model. + +Conclusion +---------- +Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. \ No newline at end of file diff --git a/docs/source/serving/deploying_with_kserve.rst b/docs/source/serving/deploying_with_kserve.rst index 7f22766e09aef..01d7ccc6e9300 100644 --- a/docs/source/serving/deploying_with_kserve.rst +++ b/docs/source/serving/deploying_with_kserve.rst @@ -5,4 +5,4 @@ Deploying with KServe vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. -Please see `this guide `_ for more details on using vLLM with KServe. +Please see `this guide `_ for more details on using vLLM with KServe. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 8bb7067faa97c..9132e12a36ba5 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -140,7 +140,7 @@ $ vllm serve SOME_MODEL --config config.yaml ``` --- **NOTE** -In case an argument is supplied using command line and the config file, the value from the commandline will take precedence. +In case an argument is supplied simultaneously via the command line and the config file, the value from the command line will take precedence. 
The order of priorities is `command line > config file values > defaults`. --- diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index ca41f32b12b31..60d894aae9692 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -18,9 +18,6 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]: temperature=0.8, top_p=0.95, frequency_penalty=0.1)), - ("It is only with the heart that one can see rightly", - SamplingParams(n=3, best_of=3, use_beam_search=True, - temperature=0.0)), ] diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index 6aa25b4689ec8..043220d979c3c 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -43,15 +43,6 @@ def create_test_prompts( max_tokens=128, stop_token_ids=[32003]), LoRARequest("sql-lora", 1, lora_path)), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - SamplingParams(n=3, - best_of=3, - use_beam_search=True, - temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 SamplingParams(temperature=0.0, @@ -60,15 +51,6 @@ def create_test_prompts( max_tokens=128, stop_token_ids=[32003]), LoRARequest("sql-lora2", 2, lora_path)), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - SamplingParams(n=3, - best_of=3, - use_beam_search=True, - temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), ] diff --git a/examples/offline_inference_fakehpu.py b/examples/offline_inference_fakehpu.py index 972d84b60b318..248b5740fa35e 100644 --- a/examples/offline_inference_fakehpu.py +++ b/examples/offline_inference_fakehpu.py @@ -21,7 +21,7 @@ "Wales" ] # Create a sampling params object. -sampling_params = SamplingParams(temperature=0, n=1, use_beam_search=False) +sampling_params = SamplingParams(temperature=0, n=1) # Create an LLM. 
llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index b94ef537d783f..8d6818e7dfd3e 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -18,7 +18,7 @@ # LLaVA-1.5 -def run_llava(question, modality): +def run_llava(question: str, modality: str): assert modality == "image" prompt = f"USER: \n{question}\nASSISTANT:" @@ -29,7 +29,7 @@ def run_llava(question, modality): # LLaVA-1.6/LLaVA-NeXT -def run_llava_next(question, modality): +def run_llava_next(question: str, modality: str): assert modality == "image" prompt = f"[INST] \n{question} [/INST]" @@ -40,7 +40,7 @@ def run_llava_next(question, modality): # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(question, modality): +def run_llava_next_video(question: str, modality: str): assert modality == "video" prompt = f"USER: