diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md new file mode 100644 index 0000000000000..4036b32a46bf7 --- /dev/null +++ b/.buildkite/nightly-benchmarks/README.md @@ -0,0 +1,103 @@ +# vLLM benchmark suite + +## Introduction + +This directory contains the performance benchmarking CI for vLLM. +The goal is to help developers understand the impact of their PRs on the performance of vLLM. + +This benchmark will be *triggered* upon: +- A PR being merged into vLLM. +- Every commit for PRs with the `perf-benchmarks` label. + +**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for more GPUs is coming later), with different models. + +**Benchmarking Duration**: about 1hr. + +**For benchmarking developers**: please try to keep the benchmarking duration under 1.5 hr so that it won't take forever to run. + + +## Configuring the workload + +The benchmarking workload contains three parts: +- Latency tests in `latency-tests.json`. +- Throughput tests in `throughput-tests.json`. +- Serving tests in `serving-tests.json`. + +See [descriptions.md](tests/descriptions.md) for detailed descriptions. + +### Latency test + +Here is an example of one test inside `latency-tests.json`: + +```json +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, +] +``` + +In this example: +- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. +- The `parameters` attribute controls the command line arguments used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-benchmarks-suite.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` + +Note that the performance numbers are highly sensitive to the parameter values, so please make sure the parameters are set correctly. + +WARNING: The benchmarking script will save json results by itself, so please do not configure the `--output-json` parameter in the json file. + + +### Throughput test +The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed to `benchmark_throughput.py`. + +The numbers reported by this test are also stable -- even a slight change in them usually indicates a real difference in performance. + +### Serving test +We test the serving throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead.
The corresponding parameters are in `serving-tests.json`, and here is an example: + +```json +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, +] +``` + +Inside this example: +- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. +- The `server_parameters` attribute includes the command line arguments for the vLLM server. +- The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`. +- The `qps_list` attribute controls the list of QPS values to test. It is used to configure the `--request-rate` parameter in `benchmark_serving.py`. + +The numbers reported by this test are less stable than those of the latency and throughput benchmarks (due to randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change (e.g. a 5% change) still indicates a real difference in performance. + +WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. + +## Visualizing the results +The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table by formatting [descriptions.md](tests/descriptions.md) with the real benchmarking results. +You can find the results presented as a table inside the `buildkite/performance-benchmark` job page. +If you do not see the table, please wait until the benchmark finishes running. +The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file. +The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job.
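As a side note on the underscore-to-dash conversion mentioned above: it is handled by the `json2args` helper in `run-benchmarks-suite.sh` (included later in this diff). The sketch below mirrors that helper so you can preview locally how a `parameters` (or `server_parameters`/`client_parameters`) object turns into command line flags; the standalone wrapper and the sample `params` value are illustrative only, not part of the suite.

```bash
#!/bin/bash
# Sketch of the underscore-to-dash conversion performed by run-benchmarks-suite.sh.
# Each key of the JSON object becomes a CLI flag (with '_' replaced by '-')
# followed by its value.
json2args() {
    local json_string=$1
    echo "$json_string" | jq -r '
        to_entries |
        map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
        join(" ")
    '
}

# Illustrative input taken from the latency example above.
params='{"model": "meta-llama/Meta-Llama-3-8B", "tensor_parallel_size": 1, "load_format": "dummy"}'
json2args "$params"
# prints: --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy
# Empty-string values (e.g. "disable_log_stats": "") simply become bare flags.
```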
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml new file mode 100644 index 0000000000000..2b25c954b5c5c --- /dev/null +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -0,0 +1,62 @@ +steps: + - label: "Wait for container to be ready" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: badouralix/curl-jq + command: + - sh + - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh + - wait + - label: "A100 Benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + # - label: "H100: NVIDIA SMI" + # agents: + # queue: H100 + # plugins: + # - docker#v5.11.0: + # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + # command: + # - bash + # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh + # mount-buildkite-agent: true + # propagate-environment: true + # propagate-uid-gid: false + # ipc: host + # gpus: all + # environment: + # - VLLM_USAGE_SOURCE + # - HF_TOKEN + diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index d3bf3b72980a6..15d411febcee1 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash +# NOTE(simon): this script runs inside a buildkite agent with CPU only access. set -euo pipefail # Install system packages @@ -23,4 +24,4 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then fi # Upload sample.yaml -buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml +buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh new file mode 100644 index 0000000000000..021473f76d0e5 --- /dev/null +++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# This script should be run inside the CI process +# This script assumes that we are already inside the vllm/ directory +# Benchmarking results will be available inside vllm/benchmarks/results/ + +# Do not set -e, as the mixtral 8x22B model tends to crash occasionally +# and we still want to see other benchmarking results even when mixtral crashes. +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." 
+ exit 1 + else + echo "HF_TOKEN is set and valid." + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced with '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl localhost:8000/v1/completions; do + sleep 1 + done' && return 0 || return 1 +} + +kill_gpu_processes() { + # kill all processes on GPU. + pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) + if [ -z "$pids" ]; then + echo "No GPU processes found." + else + for pid in $pids; do + kill -9 "$pid" + echo "Killed process with PID: $pid" + done + + echo "All GPU processes have been killed." + fi + + # waiting for GPU processes to be fully killed + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + +run_latency_tests() { + # run latency tests using `benchmark_latency.py` + # $1: a json file specifying latency test cases + + local latency_test_file + latency_test_file=$1 + + # Iterate over latency tests + jq -c '.[]' "$latency_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^latency_ ]]; then + echo "In latency-test.json, test_name must start with \"latency_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get arguments + latency_params=$(echo "$params" | jq -r '.parameters') + latency_args=$(json2args "$latency_params") + + # check if there is enough GPU to run the test + tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+ continue + fi + + latency_command="python3 benchmark_latency.py \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $latency_args" + + echo "Running test case $test_name" + echo "Latency command: $latency_command" + + # record the benchmarking command and GPU type + jq_output=$(jq -n \ + --arg latency "$latency_command" \ + --arg gpu "$gpu_type" \ + '{ + latency_command: $latency, + gpu_type: $gpu + }') + echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" + + # run the benchmark + eval "$latency_command" + + kill_gpu_processes + + done +} + + +run_throughput_tests() { + # run throughput tests using `benchmark_throughput.py` + # $1: a json file specifying throughput test cases + + local throughput_test_file + throughput_test_file=$1 + + # Iterate over throughput tests + jq -c '.[]' "$throughput_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^throughput_ ]]; then + echo "In throughput-test.json, test_name must start with \"throughput_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get arguments + throughput_params=$(echo "$params" | jq -r '.parameters') + throughput_args=$(json2args "$throughput_params") + + # check if there is enough GPU to run the test + tp=$(echo $throughput_params | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + throughput_command="python3 benchmark_throughput.py \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $throughput_args" + + echo "Running test case $test_name" + echo "Throughput command: $throughput_command" + # record the benchmarking command and GPU type + jq_output=$(jq -n \ + --arg command "$throughput_command" \ + --arg gpu "$gpu_type" \ + '{ + throughput_command: $command, + gpu_type: $gpu + }') + echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" + + # run the benchmark + eval "$throughput_command" + + kill_gpu_processes + + done +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^serving_ ]]; then + echo "In serving-test.json, test_name must start with \"serving_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name."
+ continue + fi + + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.server_parameters') + client_params=$(echo "$params" | jq -r '.client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + # check if server model and client model are aligned + server_model=$(echo "$server_params" | jq -r '.model') + client_model=$(echo "$client_params" | jq -r '.model') + if [[ $server_model != "$client_model" ]]; then + echo "Server model and client model must be the same. Skip testcase $test_name." + continue + fi + + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + eval "$server_command" & + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "vllm server is up and running." + else + echo "" + echo "vllm failed to start within the timeout period." + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + done +} + +main() { + check_gpus + check_hf_token + + # dependencies + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + + # get the current IP address, required by benchmark_serving.py + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + # turn off the reporting of the status of each request, to clean up the terminal output + export VLLM_LOG_LEVEL="WARNING" + + # prepare for benchmarking + cd benchmarks || exit 1 + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + # benchmarking + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json + run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json + + + # postprocess benchmarking results + pip install tabulate pandas + python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py + + upload_to_buildkite +} + +main "$@" diff --git 
a/.buildkite/nightly-benchmarks/sample.yaml b/.buildkite/nightly-benchmarks/sample.yaml deleted file mode 100644 index 50e6e82072186..0000000000000 --- a/.buildkite/nightly-benchmarks/sample.yaml +++ /dev/null @@ -1,39 +0,0 @@ -steps: - # NOTE(simon): You can create separate blocks for different jobs - - label: "A100: NVIDIA SMI" - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - containers: - # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT - # TODO(simon): check latest main branch or use the PR image. - - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6 - command: - - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls' - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - # TODO(simon): bring H100 online - # - label: "H100: NVIDIA SMI" - # agents: - # queue: H100 - # plugins: - # - docker#v5.11.0: - # image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6 - # command: - # - bash -c 'nvidia-smi && nvidia-smi topo -m' - # propagate-environment: true - # ipc: host - # gpus: all - diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py new file mode 100644 index 0000000000000..534ecf17930e9 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -0,0 +1,192 @@ +import json +import os +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +results_folder = Path("results/") + +# latency results and the keys that will be printed into markdown +latency_results = [] +latency_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "avg_latency": "Mean latency (ms)", + # "P10": "P10 (s)", + # "P25": "P25 (s)", + "P50": "Median latency (ms)", + # "P75": "P75 (s)", + # "P90": "P90 (s)", + "P99": "P99 latency (ms)", +} + +# throughput tests and the keys that will be printed into markdown +throughput_results = [] +throughput_results_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + # "num_requests": "# of req.", + # "total_num_tokens": "Total # of tokens", + # "elapsed_time": "Elapsed time (s)", + "requests_per_second": "Tput (req/s)", + # "tokens_per_second": "Tput (tok/s)", +} + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + # "completed": "# of req.", + "request_throughput": "Tput (req/s)", + # "input_throughput": "Input Tput (tok/s)", + # "output_throughput": "Output Tput (tok/s)", + "mean_ttft_ms": "Mean TTFT (ms)", + "median_ttft_ms": "Median TTFT (ms)", + "p99_ttft_ms": "P99 TTFT (ms)", + # "mean_tpot_ms": "Mean TPOT (ms)", + # "median_tpot_ms": "Median", + # "p99_tpot_ms": "P99", + "mean_itl_ms": "Mean ITL (ms)", + "median_itl_ms": "Median ITL (ms)", + "p99_itl_ms": "P99 ITL (ms)", +} + + +def read_markdown(file): + if os.path.exists(file): + with open(file, "r") as f: + return f.read() + "\n" + else: + return f"{file} not found.\n" + + +def results_to_json(latency, throughput, serving): + return json.dumps({ + 'latency': latency.to_dict(), + 'throughput': throughput.to_dict(), + 'serving': serving.to_dict() + }) + + +if 
__name__ == "__main__": + + # collect results + for test_file in results_folder.glob("*.json"): + + with open(test_file, "r") as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # get different percentiles + for perc in [10, 25, 50, 75, 90, 99]: + # Multiply 1000 to convert the time unit from s to ms + raw_result.update( + {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}) + raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping {test_file}") + + latency_results = pd.DataFrame.from_dict(latency_results) + serving_results = pd.DataFrame.from_dict(serving_results) + throughput_results = pd.DataFrame.from_dict(throughput_results) + + raw_results_json = results_to_json(latency_results, throughput_results, + serving_results) + + # remapping the key, for visualization purpose + if not latency_results.empty: + latency_results = latency_results[list( + latency_column_mapping.keys())].rename( + columns=latency_column_mapping) + if not serving_results.empty: + serving_results = serving_results[list( + serving_column_mapping.keys())].rename( + columns=serving_column_mapping) + if not throughput_results.empty: + throughput_results = throughput_results[list( + throughput_results_column_mapping.keys())].rename( + columns=throughput_results_column_mapping) + + processed_results_json = results_to_json(latency_results, + throughput_results, + serving_results) + + # get markdown tables + latency_md_table = tabulate(latency_results, + headers='keys', + tablefmt='pipe', + showindex=False) + serving_md_table = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) + throughput_md_table = tabulate(throughput_results, + headers='keys', + tablefmt='pipe', + showindex=False) + + # document the result + with open(results_folder / "benchmark_results.md", "w") as f: + + results = read_markdown( + "../.buildkite/nightly-benchmarks/tests/descriptions.md") + results = results.format( + latency_tests_markdown_table=latency_md_table, + throughput_tests_markdown_table=throughput_md_table, + serving_tests_markdown_table=serving_md_table, + benchmarking_results_in_json_string=processed_results_json) + f.write(results) + + # document benchmarking results in json + with 
open(results_folder / "benchmark_results.json", "w") as f: + + results = latency_results.to_dict( + orient='records') + throughput_results.to_dict( + orient='records') + serving_results.to_dict(orient='records') + f.write(json.dumps(results)) diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh new file mode 100644 index 0000000000000..c785e6a0da628 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -0,0 +1,17 @@ +#!/bin/sh +TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) +URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" + +retries=0 +while [ $retries -lt 1000 ]; do + if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then + exit 0 + fi + + echo "Waiting for image to be available..." + + retries=$((retries + 1)) + sleep 5 +done + +exit 1 \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md new file mode 100644 index 0000000000000..891e4917070d9 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/descriptions.md @@ -0,0 +1,67 @@ + +## Latency tests + +This test suite aims to test vllm's end-to-end latency under a controlled setup. + +- Input length: 32 tokens. +- Output length: 128 tokens. +- Batch size: fixed (8). +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: end-to-end latency (mean, median, p99). + +### Latency benchmarking results + +{latency_tests_markdown_table} + +## Throughput tests + +This test suite aims to test vllm's throughput. + +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm to achieve maximum throughput. +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput. + +### Throughput benchmarking results + +{throughput_tests_markdown_table} + +## Serving tests + +This test suite aims to test vllm's real serving metrics. + +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm and the arrival pattern of the requests. +- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). + +### Serving benchmarking results + +{serving_tests_markdown_table} + +## json version of the benchmarking tables + +This section contains the data of the markdown tables above in JSON format. 
+You can load the benchmarking tables into pandas dataframes as follows: + +```python +import json +import pandas as pd + +benchmarking_results_json = """The json string""" +benchmarking_results = json.loads(benchmarking_results_json) +latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"]) +throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"]) +serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) +``` + +The json string for all benchmarking tables: +```json +{benchmarking_results_in_json_string} +``` + +You can also check the raw experiment data in the Artifact tab of the Buildkite page. + diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json new file mode 100644 index 0000000000000..06488cd79110a --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + } +] \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json new file mode 100644 index 0000000000000..86a0fefa339f7 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -0,0 +1,59 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json 
b/.buildkite/nightly-benchmarks/tests/throughput-tests.json new file mode 100644 index 0000000000000..41ac135748704 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -0,0 +1,35 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] \ No newline at end of file diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml new file mode 100644 index 0000000000000..1959f9752069f --- /dev/null +++ b/.buildkite/release-pipeline.yaml @@ -0,0 +1,21 @@ +steps: + - block: "Build wheels" + + - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" + agents: + queue: cpu_queue + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host" + - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" + matrix: + setup: + cuda_version: + - "11.8.0" + - "12.1.0" + python_version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 6a86bc0ebfb66..f4fa24be1f20f 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -4,21 +4,23 @@ set -ex # Try building the docker image docker build -t cpu-test -f Dockerfile.cpu . +docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . 
# Setup cleanup -remove_docker_container() { docker rm -f cpu-test || true; } +remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } trap remove_docker_container EXIT remove_docker_container # Run the image docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test +docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 # offline inference docker exec cpu-test bash -c "python3 examples/offline_inference.py" +docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf - bash ../.buildkite/download-images.sh cd ../ - pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh new file mode 100644 index 0000000000000..22a7e76937a76 --- /dev/null +++ b/.buildkite/run-xpu-test.sh @@ -0,0 +1,14 @@ +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t xpu-test -f Dockerfile.xpu . + +# Setup cleanup +remove_docker_container() { docker rm -f xpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6a2932db9f2dc..0b87e6280f0bb 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,6 +1,6 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template.j2` to generate +# This script will be feed into Jinja template in `test-template-aws.j2` to generate # the final pipeline yaml file. 
steps: @@ -28,15 +28,20 @@ steps: - label: Distributed Comm Ops Test #mirror_hardwares: [amd] - command: pytest -v -s distributed/test_comm_ops.py working_dir: "/vllm-workspace/tests" num_gpus: 2 + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py -- label: Distributed Tests +- label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py @@ -50,12 +55,19 @@ steps: - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py -- label: Distributed Tests (Multiple Groups) +- label: Distributed Tests (4 GPUs) #mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 4 commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s distributed/test_pynccl.py + # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. + # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - label: Engine Test mirror_hardwares: [amd] @@ -96,13 +108,13 @@ steps: - label: Models Test #mirror_hardwares: [amd] commands: - - pytest -v -s models -m \"not llava\" + - pytest -v -s models -m \"not vlm\" -- label: Llava Test +- label: Vision Language Models Test mirror_hardwares: [amd] commands: - bash ../.buildkite/download-images.sh - - pytest -v -s models -m llava + - pytest -v -s models -m vlm - label: Prefix Caching Test mirror_hardwares: [amd] @@ -141,6 +153,9 @@ steps: num_gpus: 4 # This test runs llama 13B, so it is required to run on 4 GPUs. 
commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s -x lora/test_long_context.py - label: Tensorizer Test @@ -155,6 +170,15 @@ steps: #mirror_hardwares: [amd] command: pytest -v -s quantization +- label: Tracing Test + commands: + - "pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai" + - pytest -v -s tracing + - label: Benchmarks working_dir: "/vllm-workspace/.buildkite" mirror_hardwares: [amd] @@ -168,3 +192,16 @@ steps: commands: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html + +- label: Distributed Tests (A100) + gpu: a100 + num_gpus: 4 + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 09649b625c319..1a7fb44c2ecc5 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -30,6 +30,7 @@ steps: command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" env: DOCKER_BUILDKIT: "1" + priority: 100 soft_fail: true {% endif %} {% endfor %} @@ -41,13 +42,64 @@ steps: command: bash .buildkite/run-neuron-test.sh soft_fail: false - - label: "Intel Test" + - label: "Intel CPU Test" depends_on: ~ agents: - queue: intel + queue: intel-cpu command: bash .buildkite/run-cpu-test.sh + - label: "Intel GPU Test" + depends_on: ~ + agents: + queue: intel-gpu + command: bash .buildkite/run-xpu-test.sh + {% for step in steps %} + {% if step.gpu == "a100" %} + - label: "{{ step.label }}" + agents: + queue: a100-queue + soft_fail: {{ step.soft_fail or false }} + {% if step.parallelism %} + parallelism: {{ step.parallelism }} + {% endif %} + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 5 + - exit_status: -10 # Agent was lost + limit: 5 + plugins: + - kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: {{ docker_image }} + command: ["bash"] + args: + - '-c' + - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" + resources: + limits: + nvidia.com/gpu: {{ step.num_gpus or 1 }} + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + {% else %} - label: "{{ step.label }}" agents: {% if step.label == "Documentation Build" %} @@ -89,4 +141,5 @@ steps: {% endif %} volumes: - /dev/shm:/dev/shm + {% endif %} {% endfor %} diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 deleted file mode 100644 index 
4a20a462b98ec..0000000000000 --- a/.buildkite/test-template.j2 +++ /dev/null @@ -1,96 +0,0 @@ -{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %} -{% set default_num_gpu = 1 %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." - - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel Test" - depends_on: ~ - agents: - queue: intel - command: bash .buildkite/run-cpu-test.sh - - {% for step in steps %} - - label: "{{ step.label }}" - agents: - queue: kubernetes - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - kubernetes: - podSpec: - {% if step.num_gpus %} - priorityClassName: gpu-priority-cls-{{ step.num_gpus }} - {% endif %} - volumes: - - name: dshm - emptyDir: - medium: Memory - containers: - - image: "{{ docker_image }}" - command: ["bash"] - args: - - '-c' - - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" - {% if not step.no_gpu %} - resources: - requests: - nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" - limits: - nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" - {% endif %} - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - volumeMounts: - - mountPath: /dev/shm - name: dshm - {% endfor %} diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 22e6c2ef0101e..62f0dbcd93eff 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -47,5 +47,5 @@ jobs: mypy vllm/model_executor --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml mypy vllm/logging --config-file pyproject.toml - mypy vllm/model_executor --config-file pyproject.toml + mypy tests --config-file pyproject.toml diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index 89b2e6ea3074e..86201939d359a 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -35,8 +35,8 @@ jobs: test_configs: '[{"python":"3.8.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, {"python":"3.9.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, - {"python":"3.10.12","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, - 
{"python":"3.11.4","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}]' + {"python":"3.10.12","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.11.4","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}]' test_timeout: 480 benchmark_label: gcp-k8s-l4-solo diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml index b23a2f9389512..5bbd760b63e0b 100644 --- a/.github/workflows/nm-remote-push.yml +++ b/.github/workflows/nm-remote-push.yml @@ -21,8 +21,8 @@ jobs: test_configs: '[{"python":"3.8.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, {"python":"3.9.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, - {"python":"3.10.12","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, - {"python":"3.11.4","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}]' + {"python":"3.10.12","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + {"python":"3.11.4","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}]' test_timeout: 480 benchmark_label: gcp-k8s-l4-solo diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index f8181c7758dbe..60a3978f9abd7 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -9,7 +9,7 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH # Install requirements $python_executable -m pip install wheel packaging -$python_executable -m pip install -r requirements-cuda.txt -r requirements-build.txt +$python_executable -m pip install -r requirements-cuda.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 diff --git a/Dockerfile b/Dockerfile index 7445e815f424b..60fe17c4f08d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,26 @@ # docs/source/dev/dockerfile/dockerfile.rst and # docs/source/assets/dev/dockerfile-stages-dependency.png +ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### # prepare basic build environment -FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base + +ARG CUDA_VERSION=12.4.1 +ARG PYTHON_VERSION=3 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \ + && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \ + && python3 --version \ + && python3 -m pip --version RUN apt-get update -y \ && apt-get install -y python3-pip git curl sudo @@ -16,7 +33,7 @@ RUN apt-get update -y \ # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image # or future versions of triton. -RUN ldconfig /usr/local/cuda-12.4/compat/ +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ WORKDIR /workspace @@ -36,7 +53,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### -FROM dev AS build +FROM base AS build + +ARG PYTHON_VERSION=3 # install compiler cache to speed up compilation leveraging local or remote caching RUN apt-get update -y && apt-get install -y ccache @@ -59,7 +78,8 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \ #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +ARG CUDA_VERSION=12.4.1 WORKDIR /vllm-workspace RUN apt-get update -y && \ @@ -69,7 +89,7 @@ RUN apt-get update -y && \ # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image # or future versions of triton. -RUN ldconfig /usr/local/cuda-12.4/compat/ +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ # install nm-vllm wheel first, so that torch etc will be installed ARG build_type="NIGHTLY" @@ -124,7 +144,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer modelscope + pip install accelerate hf_transfer 'modelscope!=1.15.0' ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 777bb08296ed9..6e55203decc56 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -21,6 +21,10 @@ WORKDIR /workspace/vllm RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... +ARG VLLM_CPU_DISABLE_AVX512 +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install WORKDIR /workspace/ diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 954958df88fc0..6bda696859c8b 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -7,9 +7,8 @@ ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" RUN echo "Base image is $BASE_IMAGE" -# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" -# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - +ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ + ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ARG FA_GFX_ARCHS="gfx90a;gfx942" RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" @@ -42,6 +41,7 @@ RUN apt-get update && apt-get install -y \ unzip \ nvidia-cuda-toolkit \ tmux \ + ccache \ && rm -rf /var/lib/apt/lists/* ### Mount Point ### @@ -67,7 +67,7 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ + && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \ patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ && python3 setup.py install \ && cd ..; \ @@ -75,7 +75,7 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ +RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi # build triton @@ -102,13 +102,15 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so -RUN --mount=type=cache,target=/root/.cache/pip \ +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ - && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ + && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \ && python3 setup.py install \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \ + && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \ + && cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \ && cd .. diff --git a/Dockerfile.xpu b/Dockerfile.xpu new file mode 100644 index 0000000000000..c39e551672d20 --- /dev/null +++ b/Dockerfile.xpu @@ -0,0 +1,22 @@ +FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 + +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ + chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ + rm /etc/apt/sources.list.d/intel-graphics.list && \ + wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ + echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ + chmod 644 /usr/share/keyrings/intel-graphics.gpg + +RUN apt-get update -y \ +&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-xpu.txt + +RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install + +CMD ["/bin/bash"] diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 52386b8cd62b3..4350b96b04a6a 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -4,10 +4,13 @@ import time import traceback from dataclasses import dataclass, field -from typing import List, Optional +from typing import List, Optional, Union import aiohttp +import huggingface_hub.constants from tqdm.asyncio import tqdm +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -388,6 +391,30 @@ def remove_prefix(text: str, prefix: str) -> str: return text +def get_model(pretrained_model_name_or_path: str): + if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': + from modelscope import 
snapshot_download + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + return model_path + + +def get_tokenizer( + pretrained_model_name_or_path: str, trust_remote_code: bool +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path is not None and not os.path.exists( + pretrained_model_name_or_path): + pretrained_model_name_or_path = get_model( + pretrained_model_name_or_path) + return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, + trust_remote_code=trust_remote_code) + + ASYNC_REQUEST_FUNCS = { "tgi": async_request_tgi, "vllm": async_request_openai_completions, diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 17edb7515964a..a4cf0632b7790 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -10,8 +10,10 @@ from tqdm import tqdm from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptStrictInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def main(args: argparse.Namespace): @@ -19,25 +21,30 @@ def main(args: argparse.Namespace): # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. - llm = LLM(model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - distributed_executor_backend=args.distributed_executor_backend) + llm = LLM( + model=args.model, + speculative_model=args.speculative_model, + num_speculative_tokens=args.num_speculative_tokens, + tokenizer=args.tokenizer, + quantization=args.quantization, + tensor_parallel_size=args.tensor_parallel_size, + trust_remote_code=args.trust_remote_code, + dtype=args.dtype, + max_model_len=args.max_model_len, + enforce_eager=args.enforce_eager, + kv_cache_dtype=args.kv_cache_dtype, + quantization_param_path=args.quantization_param_path, + device=args.device, + ray_workers_use_nsight=args.ray_workers_use_nsight, + use_v2_block_manager=args.use_v2_block_manager, + enable_chunked_prefill=args.enable_chunked_prefill, + download_dir=args.download_dir, + block_size=args.block_size, + gpu_memory_utilization=args.gpu_memory_utilization, + load_format=args.load_format, + distributed_executor_backend=args.distributed_executor_backend, + otlp_traces_endpoint=args.otlp_traces_endpoint, + ) sampling_params = SamplingParams( n=args.n, @@ -96,7 +103,7 @@ def run_to_completion(profile_dir: Optional[str] = None): for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): latencies.append(run_to_completion(profile_dir=None)) latencies = np.array(latencies) - percentages 
= [10, 25, 50, 75, 90] + percentages = [10, 25, 50, 75, 90, 99] percentiles = np.percentile(latencies, percentages) print(f'Avg latency: {np.mean(latencies)} seconds') for percentage, percentile in zip(percentages, percentiles): @@ -114,7 +121,7 @@ def run_to_completion(profile_dir: Optional[str] = None): if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') parser.add_argument('--model', type=str, default='facebook/opt-125m') @@ -145,6 +152,12 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') + parser.add_argument( + '--max-model-len', + type=int, + default=None, + help='Maximum length of a sequence (including prompt and output). ' + 'If None, will be derived from the model.') parser.add_argument( '--dtype', type=str, @@ -189,7 +202,7 @@ def run_to_completion(profile_dir: Optional[str] = None): "--device", type=str, default="cuda", - choices=["cuda", "cpu", "tpu"], + choices=["cuda", "cpu", "tpu", "xpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument('--block-size', type=int, @@ -222,6 +235,29 @@ def run_to_completion(profile_dir: Optional[str] = None): help='the fraction of GPU memory to be used for ' 'the model executor, which can range from 0 to 1.' 'If unspecified, will use the default value of 0.9.') + parser.add_argument( + '--load-format', + type=str, + default=EngineArgs.load_format, + choices=[ + 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', + 'bitsandbytes' + ], + help='The format of the model weights to load.\n\n' + '* "auto" will try to load the weights in the safetensors format ' + 'and fall back to the pytorch bin format if safetensors format ' + 'is not available.\n' + '* "pt" will load the weights in the pytorch bin format.\n' + '* "safetensors" will load the weights in the safetensors format.\n' + '* "npcache" will load the weights in pytorch format and store ' + 'a numpy cache to speed up the loading.\n' + '* "dummy" will initialize the weights with random values, ' + 'which is mainly for profiling.\n' + '* "tensorizer" will load the weights using tensorizer from ' + 'CoreWeave. See the Tensorize vLLM Model script in the Examples' + 'section for more information.\n' + '* "bitsandbytes" will load the weights using bitsandbytes ' + 'quantization.\n') parser.add_argument( '--distributed-executor-backend', choices=['ray', 'mp'], @@ -229,5 +265,10 @@ def run_to_completion(profile_dir: Optional[str] = None): help='Backend to use for distributed serving. When more than 1 GPU ' 'is used, will be automatically set to "ray" if installed ' 'or "mp" (multiprocessing) otherwise.') + parser.add_argument( + '--otlp-traces-endpoint', + type=str, + default=None, + help='Target URL to which OpenTelemetry traces will be sent.') args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 089966986984f..395107a5ec747 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,7 +1,7 @@ -import argparse import time from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. 
You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 @@ -44,7 +44,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance with or without automatic ' 'prefix caching.') parser.add_argument('--model', diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4112a3272518e..42867fc40edd2 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -31,7 +31,7 @@ import warnings from dataclasses import dataclass from datetime import datetime -from typing import AsyncGenerator, List, Optional, Tuple +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple import numpy as np from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, @@ -39,7 +39,15 @@ from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from vllm.transformers_utils.tokenizer import get_tokenizer +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser @dataclass @@ -200,12 +208,12 @@ def calculate_metrics( dur_s: float, tokenizer: PreTrainedTokenizerBase, ) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens = [] + actual_output_lens: List[int] = [] total_input = 0 completed = 0 - itls = [] - tpots = [] - ttfts = [] + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -265,7 +273,7 @@ async def benchmark( disable_tqdm: bool, ): if backend in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS.get(backend) + request_func = ASYNC_REQUEST_FUNCS[backend] else: raise ValueError(f"Unknown backend: {backend}") @@ -292,7 +300,7 @@ async def benchmark( pbar = None if disable_tqdm else tqdm(total=len(input_requests)) benchmark_start_time = time.perf_counter() - tasks = [] + tasks: 
List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): prompt, prompt_len, output_len = request request_func_input = RequestFuncInput( @@ -310,7 +318,7 @@ async def benchmark( pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) - if not disable_tqdm: + if pbar is not None: pbar.close() benchmark_duration = time.perf_counter() - benchmark_start_time @@ -466,7 +474,7 @@ def main(args: argparse.Namespace): # Save config and results to json if args.save_result: - result_json = {} + result_json: Dict[str, Any] = {} # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") @@ -499,6 +507,8 @@ def main(args: argparse.Namespace): # Save to file base_model_id = model_id.split("/")[-1] file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa + if args.result_filename: + file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) with open(file_name, "w") as outfile: @@ -506,7 +516,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the online serving throughput.") parser.add_argument( "--backend", @@ -639,6 +649,15 @@ def main(args: argparse.Namespace): help="Specify directory to save benchmark json results." "If not specified, results are saved in the current directory.", ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 07b2f85410e3c..2c6beb4e89672 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -10,7 +10,9 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) +from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -81,6 +83,7 @@ def run_vllm( distributed_executor_backend: Optional[str], gpu_memory_utilization: float = 0.9, download_dir: Optional[str] = None, + load_format: str = EngineArgs.load_format, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -102,11 +105,12 @@ def run_vllm( enable_chunked_prefill=enable_chunked_prefill, max_num_batched_tokens=max_num_batched_tokens, distributed_executor_backend=distributed_executor_backend, + load_format=load_format, ) # Add the requests to the engine. 
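
As an aside (not part of the patch itself): the `run_vllm` helper above now threads `load_format` through to the `LLM` constructor. Below is a minimal, illustrative Python sketch of using that path directly, assuming a working vLLM install; the model name and prompt are arbitrary, and `load_format="dummy"` matches the behaviour described by the `--load-format` help text added in this patch (random weights, mainly for profiling).

```python
# Illustrative sketch only -- not part of the patch. Loads a small model with
# randomly initialized ("dummy") weights, as run_vllm() above now allows via
# the --load-format flag, so timing runs do not depend on real checkpoints.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", load_format="dummy")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=8))
print(outputs[0].outputs[0].text)  # gibberish is expected with dummy weights
```
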
- prompts = [] - sampling_params = [] + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] for prompt, _, output_len in requests: prompts.append(prompt) sampling_params.append( @@ -228,7 +232,7 @@ def main(args: argparse.Namespace): args.quantization_param_path, args.device, args.enable_prefix_caching, args.enable_chunked_prefill, args.max_num_batched_tokens, args.distributed_executor_backend, - args.gpu_memory_utilization, args.download_dir) + args.gpu_memory_utilization, args.download_dir, args.load_format) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -258,7 +262,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], @@ -346,7 +350,7 @@ def main(args: argparse.Namespace): "--device", type=str, default="cuda", - choices=["cuda", "cpu", "tpu"], + choices=["cuda", "cpu", "tpu", "xpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument( "--enable-prefix-caching", @@ -377,6 +381,29 @@ def main(args: argparse.Namespace): help='Backend to use for distributed serving. When more than 1 GPU ' 'is used, will be automatically set to "ray" if installed ' 'or "mp" (multiprocessing) otherwise.') + parser.add_argument( + '--load-format', + type=str, + default=EngineArgs.load_format, + choices=[ + 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', + 'bitsandbytes' + ], + help='The format of the model weights to load.\n\n' + '* "auto" will try to load the weights in the safetensors format ' + 'and fall back to the pytorch bin format if safetensors format ' + 'is not available.\n' + '* "pt" will load the weights in the pytorch bin format.\n' + '* "safetensors" will load the weights in the safetensors format.\n' + '* "npcache" will load the weights in pytorch format and store ' + 'a numpy cache to speed up the loading.\n' + '* "dummy" will initialize the weights with random values, ' + 'which is mainly for profiling.\n' + '* "tensorizer" will load the weights using tensorizer from ' + 'CoreWeave. 
See the Tensorize vLLM Model script in the Examples' + 'section for more information.\n' + '* "bitsandbytes" will load the weights using bitsandbytes ' + 'quantization.\n') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 182105f0b33f2..377f8683c021f 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -11,6 +11,7 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -46,7 +47,7 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int, # impl -def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, +def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, scale_b: torch.tensor, out_dtype: torch.dtype) -> torch.tensor: return torch.mm(a, b) @@ -115,14 +116,13 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, timers.append( bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, - torch.bfloat16, label, sub_label, pytorch_i8_impl, + torch.bfloat16, label, sub_label, pytorch_mm_impl, "pytorch_bf16_bf16_bf16_matmul-no-scales")) # cutlass impl timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.bfloat16, label, sub_label, cutlass_impl, - "cutlass_i8_i8_bf16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm")) return timers @@ -136,6 +136,13 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, timers = [] + # pytorch impl w. bf16 + timers.append( + bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, + torch.bfloat16, label, sub_label, pytorch_mm_impl, + "pytorch_bf16_bf16_bf16_matmul-no-scales")) + # pytorch impl: bf16 output, without fp8 fast accum timers.append( bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, @@ -160,14 +167,12 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, # cutlass impl: bf16 output timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.bfloat16, label, sub_label, cutlass_impl, - "cutlass_fp8_fp8_bf16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm")) # cutlass impl: fp16 output timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.float16, label, sub_label, cutlass_impl, - "cutlass_fp8_fp8_fp16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label, + cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm")) return timers @@ -289,7 +294,7 @@ def to_torch_dtype(dt): return torch.float8_e4m3fn raise ValueError("unsupported dtype") - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description=""" Benchmark Cutlass GEMM. 
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index 7ad4a53d376b6..25ec9d6028627 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -22,6 +22,12 @@ ([4096, 22016], 1), ([11008, 4096], 0), ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], "meta-llama/Llama-2-13b-hf": [ ([5120, 15360], 1), ([5120, 5120], 0), diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 59392947b15c8..601c4ea439aea 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,3 @@ -import argparse import os import sys from typing import Optional @@ -10,6 +9,7 @@ from vllm.model_executor.layers.quantization.aqlm import ( dequantize_weight, generic_dequantize_gemm, get_int_dtype, optimized_dequantize_gemm) +from vllm.utils import FlexibleArgumentParser os.environ['CUDA_VISIBLE_DEVICES'] = '0' @@ -86,9 +86,9 @@ def dequant_no_scale( # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against # the generic pytorch version. # Just visual comparison. -def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: +def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') @@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: def main(): - parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") + parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") # Add arguments parser.add_argument("--nbooks", @@ -204,7 +204,7 @@ def main(): sys.stdout = sys.__stdout__ -def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, +def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): # I didn't see visible improvements from increasing these, but feel free :) @@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, print('') -def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, +def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method) -> float: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b771911781574..261f5829631ee 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,4 @@ -import argparse +from typing import List import torch import torch.utils.benchmark as benchmark @@ -15,6 +15,7 @@ MarlinWorkspace, marlin_24_quantize, marlin_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -23,8 +24,9 @@ K_FULL_OPTS = [False, True] -def bench_run(results, model, act_order, is_k_full, num_bits, group_size, - size_m, size_k, size_n): +def bench_run(results: List[benchmark.Measurement], model: str, + act_order: bool, is_k_full: bool, num_bits: int, group_size: int, + size_m: int, size_k: int, size_n: int): label = "Quant Matmul" sub_label = ("{}, act={} k_full={}, b={}, g={}, " @@ -156,7 +158,7 
@@ def main(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - results = [] + results: List[benchmark.Measurement] = [] for model in args.models: for layer in WEIGHT_SHAPES[model]: @@ -209,7 +211,7 @@ def main(args): # python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 # if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark Marlin across specified models/shapes/batches") parser.add_argument( "--models", diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index be5dd32bd6f91..e00696d6d43cb 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,7 +1,7 @@ import argparse import time from datetime import datetime -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, TypedDict import ray import torch @@ -10,10 +10,20 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.utils import FlexibleArgumentParser + + +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int def benchmark_config( - config: Dict[str, int], + config: BenchmarkConfig, num_tokens: int, num_experts: int, shard_intermediate_size: int, @@ -92,7 +102,7 @@ def run(): start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - latencies = [] + latencies: List[float] = [] for i in range(num_iters): prepare(i) torch.cuda.synchronize() @@ -111,7 +121,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]: # Reduced search space for faster tuning. # TODO(woosuk): Increase the search space and use a performance model to # prune the search space. 
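
As an aside (not part of the patch), the `BenchmarkConfig` TypedDict introduced above gives the MoE tuning configs static key checking while remaining plain dictionaries at runtime. A minimal sketch, with arbitrary example values rather than tuned settings:

```python
# Minimal sketch of the TypedDict pattern used by benchmark_moe.py above.
# The keys mirror BenchmarkConfig from the patch; the values are arbitrary
# examples, not tuned settings.
from typing import List, TypedDict


class BenchmarkConfig(TypedDict):
    BLOCK_SIZE_M: int
    BLOCK_SIZE_N: int
    BLOCK_SIZE_K: int
    GROUP_SIZE_M: int
    num_warps: int
    num_stages: int


configs: List[BenchmarkConfig] = [{
    "BLOCK_SIZE_M": 64,
    "BLOCK_SIZE_N": 64,
    "BLOCK_SIZE_K": 128,
    "GROUP_SIZE_M": 8,
    "num_warps": 4,
    "num_stages": 3,
}]
print(configs[0]["num_stages"])  # type checkers flag key typos; runtime is a dict
```
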
- configs = [] + configs: List[BenchmarkConfig] = [] for num_stages in [2, 3, 4, 5]: for block_m in [16, 32, 64, 128, 256]: for block_k in [64, 128, 256]: @@ -175,8 +185,8 @@ def tune( topk: int, dtype: torch.dtype, use_fp8: bool, - search_space: List[Dict[str, int]], - ) -> Dict[str, int]: + search_space: List[BenchmarkConfig], + ) -> BenchmarkConfig: best_config = None best_time = float("inf") for config in tqdm(search_space): @@ -199,10 +209,11 @@ def tune( best_config = config now = datetime.now() print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") + assert best_config is not None return best_config -def sort_config(config: Dict[str, int]) -> Dict[str, int]: +def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: return { "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], @@ -214,7 +225,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]: def save_configs( - configs: Dict[int, Dict[str, int]], + configs: Dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, @@ -305,7 +316,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index e6f4e9e6b9716..16de60477c305 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,12 +1,12 @@ -import argparse import random import time -from typing import Optional +from typing import List, Optional import torch from vllm import _custom_ops as ops -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, + create_kv_caches_with_random) NUM_BLOCKS = 1024 PARTITION_SIZE = 512 @@ -54,14 +54,17 @@ def main( # Create the block tables. max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] + block_tables_lst: List[List[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int, device=device) + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, + dtype=torch.int, + device=device) # Create the KV cache. 
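
For reference, here is a self-contained sketch (not part of the patch) of the block-table construction used by `benchmark_paged_attention.py` above: each sequence gets `max_num_blocks_per_seq` random physical block ids, collected in a typed Python list before being converted to a single int tensor.

```python
# Illustrative, standalone version of the block-table setup above.
import random
from typing import List

import torch

NUM_BLOCKS = 1024
num_seqs, max_num_blocks_per_seq = 4, 8

block_tables_lst: List[List[int]] = [[
    random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq)
] for _ in range(num_seqs)]

block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
print(block_tables.shape)  # torch.Size([4, 8])
```
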
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, @@ -158,14 +161,14 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the paged attention kernel.") parser.add_argument("--version", type=str, choices=["v1", "v2"], default="v2") parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument("--seq_len", type=int, default=4096) + parser.add_argument("--seq-len", type=int, default=4096) parser.add_argument("--num-query-heads", type=int, default=64) parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 00e55f6060b52..78736c7a7ba6f 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,11 +1,12 @@ -import argparse from itertools import accumulate -from typing import Optional +from typing import List, Optional import nvtx import torch -from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, + get_rope) +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( @@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora( }) # non-batched RoPE takes only one scaling factor, we create multiple # instances to simulate the same behavior - non_batched_ropes = [] + non_batched_ropes: List[RotaryEmbedding] = [] for scaling_factor in scaling_factors: non_batched_ropes.append( get_rope(head_size, rotary_dim, max_position, base, is_neox_style, @@ -85,7 +86,7 @@ def benchmark_rope_kernels_multi_lora( if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the rotary embedding kernels.") parser.add_argument("--is-neox-style", type=bool, default=True) parser.add_argument("--batch-size", type=int, default=16) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index c846e47de1fcf..203699e9a8d06 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,8 +1,8 @@ -import argparse import cProfile import pstats from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?" 
@@ -47,7 +47,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance of hashing function in' 'automatic prefix caching.') parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index a644e5b6a8b21..511e443f78403 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -33,10 +33,21 @@ function (find_isa CPUINFO TARGET OUT) endif() endfunction() +function (is_avx512_disabled OUT) + set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512}) + if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true") + set(${OUT} ON PARENT_SCOPE) + else() + set(${OUT} OFF PARENT_SCOPE) + endif() +endfunction() + +is_avx512_disabled(AVX512_DISABLED) + find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) -if (AVX512_FOUND) +if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS "-mavx512f" "-mavx512vl" diff --git a/cmake/utils.cmake b/cmake/utils.cmake index f3c1286dd8498..071e16336dfa2 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -155,8 +155,11 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) # Find the intersection of the supported + detected architectures to # set the module architecture flags. # + + set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") + set(${GPU_ARCHES}) - foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES}) + foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS}) if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 86ac2e75e78ee..5ed1dc3b8f792 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -135,6 +135,12 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) { return ((T)0.5) * x * (((T)1.0) + t); } +template +__device__ __forceinline__ T gelu_quick_kernel(const T& x) { + // x * sigmoid(1.702 * x) + return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x))); +} + } // namespace vllm void gelu_new(torch::Tensor& out, // [..., d] @@ -148,3 +154,9 @@ void gelu_fast(torch::Tensor& out, // [..., d] { LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); } + +void gelu_quick(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] +{ + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel); +} diff --git a/csrc/cpu/activation.cpp b/csrc/cpu/activation.cpp index becd2ac42f17a..039b8d5c30d46 100644 --- a/csrc/cpu/activation.cpp +++ b/csrc/cpu/activation.cpp @@ -59,6 +59,13 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) { return w3 * x * (ones + t); } +FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 zeros(0.0); + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(1.702f); + return x / (ones + (zeros - w1 * x).exp()); +} + FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 ones(1.0); const vec_op::FP32Vec8 w1(M_SQRT1_2); @@ -142,3 +149,15 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) { CPU_KERNEL_GUARD_OUT(gelu_fast_impl) }); } + +void gelu_quick(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_quick_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + 
CPU_KERNEL_GUARD_OUT(gelu_quick_impl) + }); +} diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index a2bf0d49adba5..39e8cf3ed3c10 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -58,6 +58,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_fast", torch::kCPU, &gelu_fast); + // Quick GELU implementation. + ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_quick", torch::kCPU, &gelu_quick); + // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( diff --git a/csrc/ops.h b/csrc/ops.h index 9e2e977fa3c2e..6f0a7143c9169 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -49,6 +49,8 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input); void gelu_fast(torch::Tensor& out, torch::Tensor& input); +void gelu_quick(torch::Tensor& out, torch::Tensor& input); + #ifndef USE_ROCM torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, @@ -90,6 +92,8 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits); +bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); + void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales); diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h old mode 100644 new mode 100755 index 4b376261d30d2..cb6694b3036e9 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -16,14 +16,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 512) \ f(in_T, out_T, W_T, narrow, 640) \ f(in_T, out_T, W_T, narrow, 768) \ + f(in_T, out_T, W_T, narrow, 896) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1152) \ + f(in_T, out_T, W_T, narrow, 1216) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1536) \ + f(in_T, out_T, W_T, narrow, 1664) \ f(in_T, out_T, W_T, narrow, 1728) \ f(in_T, out_T, W_T, narrow, 1792) \ f(in_T, out_T, W_T, narrow, 2048) \ + f(in_T, out_T, W_T, narrow, 2240) \ f(in_T, out_T, W_T, narrow, 2304) \ + f(in_T, out_T, W_T, narrow, 2368) \ + f(in_T, out_T, W_T, narrow, 2432) \ f(in_T, out_T, W_T, narrow, 2560) \ f(in_T, out_T, W_T, narrow, 2752) \ f(in_T, out_T, W_T, narrow, 2816) \ @@ -31,32 +37,47 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 3328) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ + f(in_T, out_T, W_T, narrow, 3712) \ f(in_T, out_T, W_T, narrow, 4096) \ + f(in_T, out_T, W_T, narrow, 4480) \ f(in_T, out_T, W_T, narrow, 4608) \ + f(in_T, out_T, W_T, narrow, 4736) \ + f(in_T, out_T, W_T, narrow, 4864) \ f(in_T, out_T, W_T, narrow, 5120) \ f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ + f(in_T, out_T, W_T, narrow, 5888) \ f(in_T, out_T, W_T, narrow, 6144) \ f(in_T, out_T, W_T, narrow, 6400) \ f(in_T, out_T, W_T, narrow, 6848) \ f(in_T, out_T, W_T, narrow, 6912) \ f(in_T, out_T, W_T, narrow, 7168) \ + f(in_T, out_T, W_T, narrow, 7424) \ f(in_T, out_T, W_T, narrow, 8192) \ + f(in_T, out_T, W_T, narrow, 8960) \ f(in_T, out_T, W_T, narrow, 9216) \ + f(in_T, out_T, W_T, narrow, 9472) \ f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ + f(in_T, out_T, W_T, narrow, 11264) \ f(in_T, out_T, W_T, narrow, 
12288) \ f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ + f(in_T, out_T, W_T, narrow, 14784) \ + f(in_T, out_T, W_T, narrow, 14848) \ f(in_T, out_T, W_T, narrow, 15360) \ f(in_T, out_T, W_T, narrow, 16384) \ + f(in_T, out_T, W_T, narrow, 18944) \ f(in_T, out_T, W_T, narrow, 20480) \ f(in_T, out_T, W_T, narrow, 22016) \ + f(in_T, out_T, W_T, narrow, 22528) \ f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 27392) \ f(in_T, out_T, W_T, narrow, 27648) \ f(in_T, out_T, W_T, narrow, 28672) \ + f(in_T, out_T, W_T, narrow, 29568) \ + f(in_T, out_T, W_T, narrow, 29696) \ f(in_T, out_T, W_T, narrow, 32000) \ f(in_T, out_T, W_T, narrow, 32256) \ f(in_T, out_T, W_T, narrow, 32512) \ @@ -65,6 +86,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 36864) \ f(in_T, out_T, W_T, narrow, 43264) \ f(in_T, out_T, W_T, narrow, 49152) \ + f(in_T, out_T, W_T, narrow, 60544) \ + f(in_T, out_T, W_T, narrow, 60672) \ f(in_T, out_T, W_T, narrow, 64000) \ f(in_T, out_T, W_T, narrow, 64256) \ f(in_T, out_T, W_T, narrow, 64512) \ @@ -74,12 +97,14 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 128000) \ f(in_T, out_T, W_T, narrow, 128256) \ f(in_T, out_T, W_T, narrow, 128512) \ + + // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA // and vllm/tests/lora/test_punica.py -// Used for defining kernels going from the variety of +// Used for defining kernels going from the variety of // dim in to the narrow dim out - // Using it for the fully sharded column + // Using it for the fully sharded column // parallel LoRA A which splits the rank dim #define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \ f(in_T, out_T, W_T, 128, narrow) \ @@ -87,14 +112,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 512, narrow) \ f(in_T, out_T, W_T, 640, narrow) \ f(in_T, out_T, W_T, 768, narrow) \ + f(in_T, out_T, W_T, 896, narrow) \ f(in_T, out_T, W_T, 1024, narrow) \ f(in_T, out_T, W_T, 1152, narrow) \ + f(in_T, out_T, W_T, 1216, narrow) \ f(in_T, out_T, W_T, 1280, narrow) \ f(in_T, out_T, W_T, 1536, narrow) \ + f(in_T, out_T, W_T, 1664, narrow) \ f(in_T, out_T, W_T, 1728, narrow) \ f(in_T, out_T, W_T, 1792, narrow) \ f(in_T, out_T, W_T, 2048, narrow) \ + f(in_T, out_T, W_T, 2240, narrow) \ f(in_T, out_T, W_T, 2304, narrow) \ + f(in_T, out_T, W_T, 2368, narrow) \ + f(in_T, out_T, W_T, 2432, narrow) \ f(in_T, out_T, W_T, 2560, narrow) \ f(in_T, out_T, W_T, 2752, narrow) \ f(in_T, out_T, W_T, 2816, narrow) \ @@ -102,32 +133,47 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 3328, narrow) \ f(in_T, out_T, W_T, 3456, narrow) \ f(in_T, out_T, W_T, 3584, narrow) \ + f(in_T, out_T, W_T, 3712, narrow) \ f(in_T, out_T, W_T, 4096, narrow) \ + f(in_T, out_T, W_T, 4480, narrow) \ f(in_T, out_T, W_T, 4608, narrow) \ + f(in_T, out_T, W_T, 4736, narrow) \ + f(in_T, out_T, W_T, 4864, narrow) \ f(in_T, out_T, W_T, 5120, narrow) \ f(in_T, out_T, W_T, 5504, narrow) \ f(in_T, out_T, W_T, 5632, narrow) \ + f(in_T, out_T, W_T, 5888, narrow) \ f(in_T, out_T, W_T, 6144, narrow) \ f(in_T, out_T, W_T, 6400, narrow) \ f(in_T, out_T, W_T, 6848, narrow) \ f(in_T, out_T, W_T, 6912, narrow) \ f(in_T, out_T, W_T, 7168, narrow) \ + f(in_T, out_T, W_T, 7424, narrow) \ f(in_T, out_T, W_T, 8192, narrow) \ + f(in_T, out_T, W_T, 8960, narrow) \ f(in_T, out_T, W_T, 9216, narrow) \ + f(in_T, 
out_T, W_T, 9472, narrow) \ f(in_T, out_T, W_T, 10240, narrow) \ f(in_T, out_T, W_T, 11008, narrow) \ + f(in_T, out_T, W_T, 11264, narrow) \ f(in_T, out_T, W_T, 12288, narrow) \ f(in_T, out_T, W_T, 13696, narrow) \ f(in_T, out_T, W_T, 13824, narrow) \ f(in_T, out_T, W_T, 14336, narrow) \ + f(in_T, out_T, W_T, 14784, narrow) \ + f(in_T, out_T, W_T, 14848, narrow) \ f(in_T, out_T, W_T, 15360, narrow) \ f(in_T, out_T, W_T, 16384, narrow) \ + f(in_T, out_T, W_T, 18944, narrow) \ f(in_T, out_T, W_T, 20480, narrow) \ f(in_T, out_T, W_T, 22016, narrow) \ + f(in_T, out_T, W_T, 22528, narrow) \ f(in_T, out_T, W_T, 24576, narrow) \ f(in_T, out_T, W_T, 27392, narrow) \ f(in_T, out_T, W_T, 27648, narrow) \ f(in_T, out_T, W_T, 28672, narrow) \ + f(in_T, out_T, W_T, 29568, narrow) \ + f(in_T, out_T, W_T, 29696, narrow) \ f(in_T, out_T, W_T, 32000, narrow) \ f(in_T, out_T, W_T, 32256, narrow) \ f(in_T, out_T, W_T, 32512, narrow) \ @@ -136,6 +182,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 36864, narrow) \ f(in_T, out_T, W_T, 43264, narrow) \ f(in_T, out_T, W_T, 49152, narrow) \ + f(in_T, out_T, W_T, 60544, narrow) \ + f(in_T, out_T, W_T, 60672, narrow) \ f(in_T, out_T, W_T, 64000, narrow) \ f(in_T, out_T, W_T, 64256, narrow) \ f(in_T, out_T, W_T, 64512, narrow) \ diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp index 8f38bbf507901..877a9f5b9e5de 100644 --- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp +++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp @@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast { CUTLASS_DEVICE void begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) { - if (params.ptr_row == nullptr) { + if (!params.row_broadcast) { return; } diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp index 999b7b251ab33..bf04bb400790f 100644 --- a/csrc/quantization/cutlass_w8a8/common.hpp +++ b/csrc/quantization/cutlass_w8a8/common.hpp @@ -1,6 +1,7 @@ #pragma once #include "cutlass/cutlass.h" +#include /** * Helper function for checking CUTLASS errors @@ -10,3 +11,17 @@ TORCH_CHECK(status == cutlass::Status::kSuccess, \ cutlassGetStatusString(status)) \ } + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, + device); + return max_shared_mem_per_block_opt_in; +} + diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 7651268dc5316..38a20a1727d18 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -250,8 +250,160 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, CUTLASS_CHECK(status); } +template +void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + // In some cases, the GPU isn't able to accommodate the + // shared memory requirements of the Gemm. In such cases, use + // the FallbackGemm instead. 
+ static const int max_shared_mem_per_block_opt_in = + get_cuda_max_shared_memory_per_block_opt_in(0); + + size_t const gemm_shared_mem_size = + sizeof(typename Gemm::KernelType::SharedStorage); + size_t const fallback_gemm_shared_mem_size = + sizeof(typename FallbackGemm::KernelType::SharedStorage); + + if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { + return cutlass_gemm_caller(out, a, b, + std::forward(args)...); + } else { + TORCH_CHECK(fallback_gemm_shared_mem_size <= + max_shared_mem_per_block_opt_in); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + +template typename Epilogue> +struct sm80_config_default { + // This config is used in 2 cases, + // - M in (128, inf) + // - M in (64, 128] and N >= 8192 + // Shared Memory required by this Gemm - 81920 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M64 { + // This config is used in 2 cases, + // - M in (32, 64] + // - M in (64, 128] and N < 8192 + // Shared Memory required by this Gemm - 122880 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M32 { + // M in (16, 32] + // Shared Memory required by this Gemm - 61440 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M16 { + // M in [1, 16] + // Shared Memory required by this Gemm - 51200 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + } // namespace +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass2xGemmDefault = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128BigN = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128SmallN = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM64 = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM32 = + typename sm80_config_M32::Cutlass2xGemm; + using Cutlass2xGemmM16 = + typename sm80_config_M16::Cutlass2xGemm; + + // Due to shared memory requirements, some Gemms may fail to run on some + // GPUs. As the name indicates, the Fallback Gemm is used as an alternative + // in such cases. + // sm80_config_M16 has the least shared-memory requirement. However, + // based on some profiling, we select sm80_config_M32 as a better alternative + // performance wise. 
+ using FallbackGemm = + typename sm80_config_M32::Cutlass2xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(16), next_pow_2(m)); // next power of 2 + if (mp2 <= 16) { + // M in [1, 16] + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 32) { + // M in (16, 32] + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 64) { + // M in (32, 64] + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // M in (64, 128] + uint32_t const n = out.size(1); + bool const small_n = n < 8192; + if (small_n) { + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } + } else { + // M in (128, inf) + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -288,20 +440,13 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - if (out.dtype() == torch::kBFloat16) { - return cutlass_gemm_caller>( - out, a, b, a_scales, b_scales); + return cutlass_gemm_sm80_dispatch(out, a, b, a_scales, + b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return cutlass_gemm_caller>( + return cutlass_gemm_sm80_dispatch( out, a, b, a_scales, b_scales); } } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index f1a2b73ff962b..cfa8f80f7ea04 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -44,11 +44,6 @@ using namespace cute; namespace { -uint32_t next_pow_2(uint32_t const num) { - if (num <= 1) return num; - return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); -} - // A wrapper for the GEMM kernel that is used to guard against compilation on // architectures that will never use the kernel. The purpose of this is to // reduce the size of the compiled binary. 
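
To make the SM80 dispatch above easier to follow, here is a small Python sketch (purely illustrative, not part of the patch) of the two decisions it encodes: fall back to a lower-shared-memory config when the preferred Gemm does not fit in the device's opt-in shared-memory budget, and pick a tile config from the next power of two of M, with an N < 8192 special case in the (64, 128] bucket. The shared-memory numbers are the ones quoted in the `sm80_config_*` comments; the budget value is only an example.

```python
# Illustrative model of cutlass_gemm_sm80_dispatch / fallback_cutlass_gemm_caller.
# Names and the shared-memory budget are assumptions for the sketch; the real
# decision is made in C++ against sizeof(Gemm::KernelType::SharedStorage).
def next_pow_2(num: int) -> int:
    # Mirrors the C++ helper: identity for 0/1, else round up to a power of two.
    return num if num <= 1 else 1 << (num - 1).bit_length()


# Shared-memory requirements quoted in the sm80_config_* comments (bytes).
SMEM = {
    "sm80_config_default": 81920,
    "sm80_config_M64": 122880,
    "sm80_config_M32": 61440,
    "sm80_config_M16": 51200,
}


def select_sm80_config(m: int, n: int) -> str:
    mp2 = max(16, next_pow_2(m))
    if mp2 <= 16:
        return "sm80_config_M16"      # M in [1, 16]
    if mp2 <= 32:
        return "sm80_config_M32"      # M in (16, 32]
    if mp2 <= 64:
        return "sm80_config_M64"      # M in (32, 64]
    if mp2 <= 128:
        # M in (64, 128]: small N uses the M64 tile, large N the default tile.
        return "sm80_config_M64" if n < 8192 else "sm80_config_default"
    return "sm80_config_default"      # M in (128, inf)


def with_fallback(config: str, max_smem_opt_in: int) -> str:
    # sm80_config_M32 is used as the fallback, per the comment above.
    return config if SMEM[config] <= max_smem_opt_in else "sm80_config_M32"


print(with_fallback(select_sm80_config(m=96, n=4096), max_smem_opt_in=101376))
```
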
@@ -234,15 +229,15 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, } template typename Epilogue, int32_t M> -struct sm90_fp8_config { + template typename Epilogue> +struct sm90_fp8_config_default { + // M in (128, inf) static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = cutlass_3x_gemm; @@ -250,14 +245,14 @@ struct sm90_fp8_config { template typename Epilogue> -struct sm90_fp8_config { +struct sm90_fp8_config_M128 { + // M in (64, 128] static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; using TileShape = Shape<_64, _128, _128>; using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = cutlass_3x_gemm; @@ -265,7 +260,8 @@ struct sm90_fp8_config { template typename Epilogue> -struct sm90_fp8_config { +struct sm90_fp8_config_M64 { + // M in [1, 64] static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; @@ -278,6 +274,78 @@ struct sm90_fp8_config { KernelSchedule, EpilogueSchedule>; }; +template typename Epilogue> +struct sm90_int8_config_default { + // For M > 128 and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M128 { + // For M in (64, 128] and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M64 { + // For M in (32, 64] and any N + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NBig { + // For M in [1, 32] and N >= 8192 + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _256>; + using ClusterShape = Shape<_1, _4, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NSmall { + // For M in [1, 32] and N < 8192 + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _8, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + } // namespace template ::Cutlass3xGemm; + typename sm90_fp8_config_default::Cutlass3xGemm; using 
Cutlass3xGemmM64 = - typename sm90_fp8_config::Cutlass3xGemm; + typename sm90_fp8_config_M64::Cutlass3xGemm; using Cutlass3xGemmM128 = - typename sm90_fp8_config::Cutlass3xGemm; + typename sm90_fp8_config_M128::Cutlass3xGemm; uint32_t const m = a.size(0); uint32_t const mp2 = @@ -316,6 +385,61 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, } } +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass3xGemmDefault = + typename sm90_int8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_int8_config_M128::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_int8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM32NBig = + typename sm90_int8_config_M32_NBig::Cutlass3xGemm; + using Cutlass3xGemmM32NSmall = + typename sm90_int8_config_M32_NSmall::Cutlass3xGemm; + + uint32_t const n = out.size(1); + bool const is_small_n = n < 8192; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(32), next_pow_2(m)); // next power of 2 + + if (mp2 <= 32) { + // m in [1, 32] + if (is_small_n) { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } + } else if (mp2 <= 64) { + // m in (32, 64] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + // m in (128, inf) + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -326,22 +450,14 @@ void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a, if (a.dtype() == torch::kInt8) { TORCH_CHECK(b.dtype() == torch::kInt8); - using TileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_1, _2, _1>; - using KernelSchedule = - typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - if (out.dtype() == torch::kBFloat16) { - return cutlass_gemm_caller>(out, a, b, a_scales, b_scales); + return cutlass_gemm_sm90_int8_dispatch( + out, a, b, a_scales, b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - - return cutlass_gemm_caller< - cutlass_3x_gemm>( + return cutlass_gemm_sm90_int8_dispatch( out, a, b, a_scales, b_scales); } } else { diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 687f8efd8dc00..f4e582d780ad9 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -25,6 +25,22 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b_scales); #endif +bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { + // CUTLASS FP8 kernels need at least + // CUDA 12.0 on SM90 systems (Hopper) + // CUDA 12.4 on SM89 systems (Lovelace) + +#if defined CUDA_VERSION + if (cuda_device_capability >= 90) { + return CUDA_VERSION >= 12000; + } else if (cuda_device_capability >= 89) { + return CUDA_VERSION >= 12040; + } +#endif + + return false; +} + void 
cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 45ab67a78a1de..b26505f771c8b 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -17,9 +17,23 @@ #pragma once #include "base.h" +#include namespace marlin_24 { +// On CUDA earlier than 12.5, the ordered_metadata version of this instruction +// is not supported. On later versions of CUDA the version without ordered +// metadata results in the following warning: +// | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction +// | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially +// | reduced performance on some future architectures +#if defined CUDA_VERSION && CUDA_VERSION >= 12050 + #define MMA_SP_INST \ + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " +#else + #define MMA_SP_INST "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " +#endif + // m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32 // output/accumulation. __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, @@ -29,41 +43,38 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, const uint32_t* a1 = reinterpret_cast(&a_frag1); const uint32_t* b = reinterpret_cast(&frag_b); const uint32_t* e = reinterpret_cast(&frag_m); + float* c = reinterpret_cast(&frag_c); if (psel == 0) { - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x0;\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]), - "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), - "r"(e[0])); - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x0;\n" - : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]), - "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]), - "r"(e[0])); + asm volatile(MMA_SP_INST + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), + "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), + "f"(c[2]), "f"(c[3]), "r"(e[0])); + asm volatile(MMA_SP_INST + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), + "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), + "f"(c[6]), "f"(c[7]), "r"(e[0])); } else { - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x1;\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]), - "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), - "r"(e[0])); - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - 
"{%12,%13,%14,%15}, %16, 0x1;\n" - : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]), - "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]), - "r"(e[0])); + asm volatile(MMA_SP_INST + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), + "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), + "f"(c[2]), "f"(c[3]), "r"(e[0])); + asm volatile(MMA_SP_INST + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), + "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), + "f"(c[6]), "f"(c[7]), "r"(e[0])); } } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 867bf438937cd..227b69d79e863 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -68,6 +68,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_fast", torch::kCUDA, &gelu_fast); + // Quick GELU implementation. + ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_quick", torch::kCUDA, &gelu_quick); + // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( @@ -140,6 +144,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor b, Tensor a_scales," " Tensor b_scales) -> ()"); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); + + // Check if cutlass scaled_mm is supported for CUDA devices of the given + // capability + ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8); + ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA, + &cutlass_scaled_mm_supports_fp8); #endif // Quantized GEMM for GPTQ. diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index c8f2c16d31875..cd8e8b0f513c4 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -22,5 +22,6 @@ vLLM is a community project. Our compute resources for development and testing a - Trainy - UC Berkeley - UC San Diego +- ZhenFund We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/dev/dockerfile/dockerfile.rst index a07463392dbe8..9c17c27aa61bf 100644 --- a/docs/source/dev/dockerfile/dockerfile.rst +++ b/docs/source/dev/dockerfile/dockerfile.rst @@ -1,20 +1,20 @@ Dockerfile ==================== -See `here `_ for the main Dockerfile to construct -the image for running an OpenAI compatible server with vLLM. +See `here `__ for the main Dockerfile to construct +the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `__. -- Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: +Below is a visual representation of the multi-stage Dockerfile. 
The build graph contains the following nodes: - - All build stages - - The default build target (highlighted in grey) - - External images (with dashed borders) +- All build stages +- The default build target (highlighted in grey) +- External images (with dashed borders) - The edges of the build graph represent: - - - FROM ... dependencies (with a solid line and a full arrow head) - - COPY --from=... dependencies (with a dashed line and an empty arrow head) - - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) +The edges of the build graph represent: + +- FROM ... dependencies (with a solid line and a full arrow head) +- COPY --from=... dependencies (with a dashed line and an empty arrow head) +- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) .. figure:: ../../assets/dev/dockerfile-stages-dependency.png :alt: query diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index ff37f4e628692..a22bba1478abb 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -24,6 +24,8 @@ If you have already taken care of the above issues, but the vLLM instance still With more logging, hopefully you can find the root cause of the issue. +If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. + Here are some common issues that can cause hangs: - **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. @@ -31,15 +33,26 @@ Here are some common issues that can cause hangs: .. code-block:: python - # save it as `test.py` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py` - # adjust `--nproc-per-node` to the number of GPUs you want to use. import torch import torch.distributed as dist dist.init_process_group(backend="nccl") - data = torch.FloatTensor([1,] * 128).to(f"cuda:{dist.get_rank()}") + local_rank = dist.get_rank() % torch.cuda.device_count() + data = torch.FloatTensor([1,] * 128).to(f"cuda:{local_rank}") dist.all_reduce(data, op=dist.ReduceOp.SUM) torch.cuda.synchronize() value = data.mean().item() assert value == dist.get_world_size() +.. tip:: + + Save the script as ``test.py``. + + If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use. + + If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``: + + - is the correct IP address of the master node + - is reachable from all nodes + - is set before running the script. 
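To make the cudagraph advice in this debugging guide concrete, here is a minimal sketch of disabling CUDA graph capture from Python; the model name is only a placeholder, and `enforce_eager` is the same flag the text above refers to:

```python
from vllm import LLM, SamplingParams

# enforce_eager=True disables cudagraph capture/replay, so a crash surfaces
# at the actual CUDA operation instead of inside self.graph.replay().
llm = LLM(model="facebook/opt-125m", enforce_eager=True)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```

Once the failing operation is identified, cudagraph can be re-enabled by dropping the flag.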
+ If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 3627600e1f23a..e96aabbb63279 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -73,3 +73,21 @@ Next, build vLLM from source. This will only take a few seconds: .. code-block:: console $ VLLM_TARGET_DEVICE="tpu" python setup.py develop + + +.. tip:: + + If you encounter the following error: + + .. code-block:: console + + from torch._C import * # noqa: F403 + ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory + + + You can install OpenBLAS with the following command: + + .. code-block:: console + + $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst new file mode 100644 index 0000000000000..4f0d2da25b8e8 --- /dev/null +++ b/docs/source/getting_started/xpu-installation.rst @@ -0,0 +1,61 @@ +.. _installation_xpu: + +Installation with XPU +======================== + +vLLM initially supports basic model inferencing and serving on Intel GPU platform. + +Table of contents: + +#. :ref:`Requirements ` +#. :ref:`Quick start using Dockerfile ` +#. :ref:`Build from source ` + +.. _xpu_backend_requirements: + +Requirements +------------ + +* OS: Linux +* Supported Hardware: Intel Data Center GPU (Intel ARC GPU WIP) +* OneAPI requirements: oneAPI 2024.1 + +.. _xpu_backend_quick_start_dockerfile: + +Quick start using Dockerfile +---------------------------- + +.. code-block:: console + + $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . + $ docker run -it \ + --rm \ + --network=host \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + vllm-xpu-env + +.. _build_xpu_backend_from_source: + +Build from source +----------------- + +- First, install required driver and intel OneAPI 2024.1. + +- Second, install Python packages for vLLM XPU backend building: + +.. code-block:: console + + $ pip install --upgrade pip + $ pip install -v -r requirements-xpu.txt + +- Finally, build and install vLLM XPU backend: + +.. code-block:: console + + $ VLLM_TARGET_DEVICE=xpu python setup.py install + +.. note:: + - FP16 is the default data type in the current XPU backend. The BF16 data + type will be supported in the future. + diff --git a/docs/source/index.rst b/docs/source/index.rst index b7c0d5b880079..05133eb6d867a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,6 +66,7 @@ Documentation getting_started/cpu-installation getting_started/neuron-installation getting_started/tpu-installation + getting_started/xpu-installation getting_started/quickstart getting_started/debugging getting_started/examples/examples_index @@ -81,6 +82,7 @@ Documentation serving/env_vars serving/usage_stats serving/integrations + serving/tensorizer .. 
toctree:: :maxdepth: 1 @@ -98,6 +100,7 @@ Documentation :maxdepth: 1 :caption: Quantization + quantization/supported_hardware quantization/auto_awq quantization/fp8 quantization/fp8_e5m2_kvcache diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 5d3f55be1271f..f4673dc27092f 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -135,6 +135,10 @@ Alongside each architecture, we include some popular models that use it. - Phi-3-Small - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. - + * - :code:`Phi3VForCausalLM` + - Phi-3-Vision + - :code:`microsoft/Phi-3-vision-128k-instruct`, etc. + - * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst index 312a564595cc8..09f3136644c37 100644 --- a/docs/source/quantization/fp8.rst +++ b/docs/source/quantization/fp8.rst @@ -3,7 +3,9 @@ FP8 ================== -vLLM supports FP8 (8-bit floating point) computation using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are supported. Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. +vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. +Currently, only Hopper and Ada Lovelace GPUs are supported. +Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM `_. diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst new file mode 100644 index 0000000000000..df445e00a3958 --- /dev/null +++ b/docs/source/quantization/supported_hardware.rst @@ -0,0 +1,30 @@ +.. _supported_hardware_for_quantization: + +Supported Hardware for Quantization Kernels +=========================================== + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +FP8 ❌ ❌ ❌ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +Marlin ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== + +Notes: +^^^^^^ + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- "✅" indicates that the quantization method is supported on the specified hardware. +- "❌" indicates that the quantization method is not supported on the specified hardware. + +Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. 
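As a usage sketch for the FP8 weight and activation quantization described above, the snippet below loads a model with online dynamic FP8 quantization. It assumes the `quantization="fp8"` engine argument and a Hopper or Ada Lovelace GPU; the model name is only an example:

```python
from vllm import LLM, SamplingParams

# Assumes quantization="fp8": weights are cast to FP8 at load time and
# activations are quantized dynamically, roughly halving weight memory.
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct", quantization="fp8")

out = llm.generate(["The capital of France is"],
                   SamplingParams(temperature=0.0, max_tokens=8))
print(out[0].outputs[0].text)
```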
+ +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. \ No newline at end of file diff --git a/docs/source/serving/deploying_with_cerebrium.rst b/docs/source/serving/deploying_with_cerebrium.rst new file mode 100644 index 0000000000000..ff0ac911108c4 --- /dev/null +++ b/docs/source/serving/deploying_with_cerebrium.rst @@ -0,0 +1,109 @@ +.. _deploying_with_cerebrium: + +Deploying with Cerebrium +============================ + +.. raw:: html + +

+        <img alt="vLLM_plus_cerebrium">
+ +vLLM can be run on a cloud based GPU machine with `Cerebrium `__, a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. + +To install the Cerebrium client, run: + +.. code-block:: console + + $ pip install cerebrium + $ cerebrium login + +Next, create your Cerebrium project, run: + +.. code-block:: console + + $ cerebrium init vllm-project + +Next, to install the required packages, add the following to your cerebrium.toml: + +.. code-block:: toml + + [cerebrium.dependencies.pip] + vllm = "latest" + +Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py`: + +.. code-block:: python + + from vllm import LLM, SamplingParams + + llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") + + def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): + + sampling_params = SamplingParams(temperature=temperature, top_p=top_p) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + results = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + results.append({"prompt": prompt, "generated_text": generated_text}) + + return {"results": results} + + +Then, run the following code to deploy it to the cloud + +.. code-block:: console + + $ cerebrium deploy + +If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run) + +.. code-block:: python + + curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ + -H 'Content-Type: application/json' \ + -H 'Authorization: ' \ + --data '{ + "prompts": [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is" + ] + }' + +You should get a response like: + +.. code-block:: python + + { + "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", + "result": { + "result": [ + { + "prompt": "Hello, my name is", + "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" + }, + { + "prompt": "The president of the United States is", + "generated_text": " elected every four years. This is a democratic system.\n\n5. What" + }, + { + "prompt": "The capital of France is", + "generated_text": " Paris.\n" + }, + { + "prompt": "The future of AI is", + "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." + } + ] + }, + "run_time_ms": 152.53663063049316 + } + +You now have an autoscaling endpoint where you only pay for the compute you use! + diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index fa82bc8e3bd33..14d94b09e9b9c 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -3,9 +3,8 @@ Deploying with Docker ============================ -vLLM offers official docker image for deployment. -The image can be used to run OpenAI compatible server. -The image is available on Docker Hub as `vllm/vllm-openai `_. +vLLM offers an official Docker image for deployment. +The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai `_. .. code-block:: console @@ -25,7 +24,7 @@ The image is available on Docker Hub as `vllm/vllm-openai `_. To build vLLM: .. 
code-block:: console diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst index 83a8b5a88bd38..680ea523dfe94 100644 --- a/docs/source/serving/integrations.rst +++ b/docs/source/serving/integrations.rst @@ -8,6 +8,7 @@ Integrations deploying_with_kserve deploying_with_triton deploying_with_bentoml + deploying_with_cerebrium deploying_with_lws deploying_with_dstack serving_with_langchain diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst new file mode 100644 index 0000000000000..a44696507fb9a --- /dev/null +++ b/docs/source/serving/tensorizer.rst @@ -0,0 +1,12 @@ +.. _tensorizer: + +Loading Models with CoreWeave's Tensorizer +========================================== +vLLM supports loading models with `CoreWeave's Tensorizer `_. +vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized +at runtime extremely quickly directly to the GPU, resulting in significantly +shorter Pod startup times and CPU memory usage. Tensor encryption is also supported. + +For more information on CoreWeave's Tensorizer, please refer to +`CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see +the `vLLM example script `_. \ No newline at end of file diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index e7c17fa0362ae..40f9a21ec9e51 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,11 +1,10 @@ -import argparse - from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser def main(): - parser = argparse.ArgumentParser(description='AQLM examples') + parser = FlexibleArgumentParser(description='AQLM examples') parser.add_argument('--model', '-m', @@ -17,7 +16,7 @@ def main(): type=int, default=0, help='known good models by index, [0-4]') - parser.add_argument('--tensor_parallel_size', + parser.add_argument('--tensor-parallel-size', '-t', type=int, default=1, diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py index 1eb961a5a76e3..1dce9d7e993a0 100644 --- a/examples/fp8/extract_scales.py +++ b/examples/fp8/extract_scales.py @@ -2,7 +2,7 @@ import glob import json import os -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import numpy as np import torch @@ -19,7 +19,7 @@ def _prepare_hf_weights( quantized_model_dir: str, load_format: str = "auto", fall_back_to_pt: bool = True, -) -> Tuple[str, List[str], bool]: +) -> Tuple[List[str], bool]: if not os.path.isdir(quantized_model_dir): raise FileNotFoundError( f"The quantized model directory `{quantized_model_dir}` " @@ -94,7 +94,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, def _kv_scales_extractor( - hf_tensor_files: Iterable[str], + hf_tensor_files: List[str], use_safetensors: bool, rank_keyword: str = "rank", expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]: @@ -115,7 +115,7 @@ def _kv_scales_extractor( for char in rank_keyword: assert not char.isdecimal( ), f"Rank keyword {rank_keyword} contains a numeric character!" 
- rank_scales_map = {} + rank_scales_map: Dict[int, Dict[int, float]] = {} for tensor_file in hf_tensor_files: try: rank_idx = tensor_file.find(rank_keyword) @@ -141,7 +141,7 @@ def _kv_scales_extractor( raise if rank not in rank_scales_map: - layer_scales_map = {} + layer_scales_map: Dict[int, float] = {} rank_scales_map[rank] = layer_scales_map else: raise RuntimeError( @@ -222,7 +222,7 @@ def _metadata_extractor(quantized_model_dir: str, "does not exist.") metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json")) - result = {} + result: Dict[str, Any] = {} for file in metadata_files: with open(file) as f: try: @@ -327,7 +327,7 @@ def main(args): "--quantization-param-path ). This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument( - "--quantized_model", + "--quantized-model", help="Specify the directory containing a single quantized HF model. " "It is expected that the quantization format is FP8_E4M3, for use " "on ROCm (AMD GPU).", @@ -339,18 +339,18 @@ def main(args): choices=["auto", "safetensors", "npz", "pt"], default="auto") parser.add_argument( - "--output_dir", + "--output-dir", help="Optionally specify the output directory. By default the " "KV cache scaling factors will be saved in the model directory, " "however you can override this behavior here.", default=None) parser.add_argument( - "--output_name", + "--output-name", help="Optionally specify the output filename.", # TODO: Change this once additional scaling factors are enabled default="kv_cache_scales.json") parser.add_argument( - "--tp_size", + "--tp-size", help="Optionally specify the tensor-parallel (TP) size that the " "quantized model should correspond to. If specified, during KV " "cache scaling factor extraction the observed TP size will be " diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index a81c4b3e399c3..ca41f32b12b31 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -2,6 +2,7 @@ from typing import List, Tuple from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.utils import FlexibleArgumentParser def create_test_prompts() -> List[Tuple[str, SamplingParams]]: @@ -55,7 +56,7 @@ def main(args: argparse.Namespace): if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Demo on using the LLMEngine class directly') parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py index 1e59e89509724..677127844ccdd 100644 --- a/examples/offline_inference_distributed.py +++ b/examples/offline_inference_distributed.py @@ -5,7 +5,7 @@ Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html """ -from typing import Dict +from typing import Any, Dict, List import numpy as np import ray @@ -40,8 +40,8 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: # The output is a list of RequestOutput objects that contain the prompt, # generated text, and other information. 
outputs = self.llm.generate(batch["text"], sampling_params) - prompt = [] - generated_text = [] + prompt: List[str] = [] + generated_text: List[str] = [] for output in outputs: prompt.append(output.prompt) generated_text.append(' '.join([o.text for o in output.outputs])) @@ -71,7 +71,7 @@ def scheduling_strategy_fn(): pg, placement_group_capture_child_tasks=True)) -resources_kwarg = {} +resources_kwarg: Dict[str, Any] = {} if tensor_parallel_size == 1: # For tensor_parallel_size == 1, we simply set num_gpus=1. resources_kwarg["num_gpus"] = 1 diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference_mlpspeculator.py new file mode 100644 index 0000000000000..5448ec1f6208c --- /dev/null +++ b/examples/offline_inference_mlpspeculator.py @@ -0,0 +1,59 @@ +import gc +import time +from typing import List + +from vllm import LLM, SamplingParams + + +def time_generation(llm: LLM, prompts: List[str], + sampling_params: SamplingParams): + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + # Warmup first + llm.generate(prompts, sampling_params) + llm.generate(prompts, sampling_params) + start = time.time() + outputs = llm.generate(prompts, sampling_params) + end = time.time() + print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs])) + # Print the outputs. + for output in outputs: + generated_text = output.outputs[0].text + print(f"text: {generated_text!r}") + + +if __name__ == "__main__": + + template = ( + "Below is an instruction that describes a task. Write a response " + "that appropriately completes the request.\n\n### Instruction:\n{}" + "\n\n### Response:\n") + + # Sample prompts. + prompts = [ + "Write about the president of the United States.", + ] + prompts = [template.format(prompt) for prompt in prompts] + # Create a sampling params object. 
+ sampling_params = SamplingParams(temperature=0.0, max_tokens=200) + + # Create an LLM without spec decoding + llm = LLM(model="meta-llama/Llama-2-13b-chat-hf") + + print("Without speculation") + time_generation(llm, prompts, sampling_params) + + del llm + gc.collect() + + # Create an LLM with spec decoding + llm = LLM( + model="meta-llama/Llama-2-13b-chat-hf", + speculative_model="ibm-fms/llama-13b-accelerator", + # These are currently required for MLPSpeculator decoding + use_v2_block_manager=True, + enforce_eager=True, + ) + + print("With speculation") + time_generation(llm, prompts, sampling_params) diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py new file mode 100644 index 0000000000000..4f37c47ddca87 --- /dev/null +++ b/examples/phi3v_example.py @@ -0,0 +1,57 @@ +import os +import subprocess + +from PIL import Image + +from vllm import LLM, SamplingParams +from vllm.multimodal.image import ImagePixelData + + +def run_phi3v(): + model_path = "microsoft/Phi-3-vision-128k-instruct" + llm = LLM( + model=model_path, + trust_remote_code=True, + image_input_type="pixel_values", + image_token_id=32044, + image_input_shape="1,3,1008,1344", + image_feature_size=1921, + disable_image_processor=False, + ) + + image = Image.open("images/cherry_blossom.jpg") + + # single-image prompt + prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n" # noqa: E501 + prompt = prompt.replace("<|image_1|>", "<|image|>" * 1921 + "") + + sampling_params = SamplingParams(temperature=0, max_tokens=64) + + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": ImagePixelData(image), + }, + sampling_params=sampling_params) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" + local_directory = "images" + + # Make sure the local directory exists or create it + os.makedirs(local_directory, exist_ok=True) + + # Use AWS CLI to sync the directory, assume anonymous access + subprocess.check_call([ + "aws", + "s3", + "sync", + s3_bucket_path, + local_directory, + "--no-sign-request", + ]) + run_phi3v() diff --git a/examples/production_monitoring/Otel.md b/examples/production_monitoring/Otel.md new file mode 100644 index 0000000000000..1449442273c7a --- /dev/null +++ b/examples/production_monitoring/Otel.md @@ -0,0 +1,82 @@ +# Setup OpenTelemetry POC + +1. Install OpenTelemetry packages: + ``` + pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai + ``` + +1. Start Jaeger in a docker container: + ``` + # From: https://www.jaegertracing.io/docs/1.57/getting-started/ + docker run --rm --name jaeger \ + -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \ + -p 6831:6831/udp \ + -p 6832:6832/udp \ + -p 5778:5778 \ + -p 16686:16686 \ + -p 4317:4317 \ + -p 4318:4318 \ + -p 14250:14250 \ + -p 14268:14268 \ + -p 14269:14269 \ + -p 9411:9411 \ + jaegertracing/all-in-one:1.57 + ``` + +1. 
In a new shell, export Jaeger IP: + ``` + export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) + export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 + ``` + Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM: + ``` + export OTEL_SERVICE_NAME="vllm-server" + export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true + python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" + ``` + +1. In a new shell, send requests with trace context from a dummy client + ``` + export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) + export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 + export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true + export OTEL_SERVICE_NAME="client-service" + python dummy_client.py + ``` + +1. Open Jaeger webui: http://localhost:16686/ + + In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request. + ![Traces](https://i.imgur.com/GYHhFjo.png) + +1. Clicking on a trace will show its spans and their tags. In this demo, each trace has 2 spans. One from the dummy client containing the prompt text and one from vLLM containing metadata about the request. +![Spans details](https://i.imgur.com/OPf6CBL.png) + +## Exporter Protocol +OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter. +By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows: +``` +export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf +export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces +python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" +``` + +## Instrumentation of FastAPI +OpenTelemetry allows automatic instrumentation of FastAPI. +1. Install the instrumentation library + ``` + pip install opentelemetry-instrumentation-fastapi + ``` + +1. Run vLLM with `opentelemetry-instrument` + ``` + opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" + ``` + +1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI. 
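If you would rather confirm the traces from a script than through the web UI, the sketch below polls Jaeger's HTTP query API; the `/api/traces` endpoint and its parameters are assumptions based on the standard Jaeger all-in-one image, not something vLLM configures:

```python
import requests

# Query Jaeger (same host/port as the web UI) for recent traces reported
# under the service name exported via OTEL_SERVICE_NAME above.
resp = requests.get("http://localhost:16686/api/traces",
                    params={"service": "vllm-server", "limit": 5})
resp.raise_for_status()
traces = resp.json().get("data", [])
print(f"found {len(traces)} trace(s) for vllm-server")
```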
+ +![FastAPI Spans](https://i.imgur.com/hywvoOJ.png) \ No newline at end of file diff --git a/examples/production_monitoring/dummy_client.py b/examples/production_monitoring/dummy_client.py new file mode 100644 index 0000000000000..b1a2b3c3c4aaf --- /dev/null +++ b/examples/production_monitoring/dummy_client.py @@ -0,0 +1,35 @@ +import requests +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( + OTLPSpanExporter) +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import (BatchSpanProcessor, + ConsoleSpanExporter) +from opentelemetry.trace import SpanKind, set_tracer_provider +from opentelemetry.trace.propagation.tracecontext import ( + TraceContextTextMapPropagator) + +trace_provider = TracerProvider() +set_tracer_provider(trace_provider) + +trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) +trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) + +tracer = trace_provider.get_tracer("dummy-client") + +url = "http://localhost:8000/v1/completions" +with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span: + prompt = "San Francisco is a" + span.set_attribute("prompt", prompt) + headers = {} + TraceContextTextMapPropagator().inject(headers) + payload = { + "model": "facebook/opt-125m", + "prompt": prompt, + "max_tokens": 10, + "best_of": 20, + "n": 3, + "use_beam_search": "true", + "temperature": 0.0, + # "stream": True, + } + response = requests.post(url, headers=headers, json=payload) diff --git a/examples/save_sharded_state.py b/examples/save_sharded_state.py index c595d98ba2750..4207f8922403b 100644 --- a/examples/save_sharded_state.py +++ b/examples/save_sharded_state.py @@ -20,15 +20,15 @@ tensor_parallel_size=8, ) """ -import argparse import dataclasses import os import shutil from pathlib import Path from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser -parser = argparse.ArgumentParser() +parser = FlexibleArgumentParser() EngineArgs.add_cli_args(parser) parser.add_argument("--output", "-o", diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index f9ed5fe08988e..dd77a4ad0c6b7 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ -9,6 +9,7 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, TensorizerConfig, tensorize_vllm_model) +from vllm.utils import FlexibleArgumentParser # yapf conflicts with isort for this docstring # yapf: disable @@ -96,7 +97,7 @@ def parse_args(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="An example script that can be used to serialize and " "deserialize vLLM models. 
These models " "can be loaded using tensorizer directly to the GPU " diff --git a/format.sh b/format.sh index 784d8d565e83a..8c54b56302d5b 100755 --- a/format.sh +++ b/format.sh @@ -111,7 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/model_executor --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml mypy vllm/logging --config-file pyproject.toml -mypy vllm/model_executor --config-file pyproject.toml +mypy tests --config-file pyproject.toml # If git diff returns a file that is in the skip list, the file may be checked anyway: @@ -121,10 +121,9 @@ CODESPELL_EXCLUDES=( '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' ) - # check spelling of specified files spell_check() { - codespell "$@ ${CODESPELL_EXCLUDES[@]}" + codespell "$@" } spell_check_all(){ @@ -157,7 +156,6 @@ elif [[ "$1" == '--all' ]]; then spell_check_all else # Check spelling only of the files that changed in last commit. - echo "${CODESPELL_EXCLUDES[@]}" spell_check_changed fi echo 'vLLM codespell: Done' diff --git a/neuralmagic/lm-eval/full-small-models.yaml b/neuralmagic/lm-eval/full-small-models.yaml index 129ea4c5bf99c..adb00ba65c1f1 100644 --- a/neuralmagic/lm-eval/full-small-models.yaml +++ b/neuralmagic/lm-eval/full-small-models.yaml @@ -9,15 +9,3 @@ value: 0.74 limit: 250 num_fewshot: 5 - -# ./nm-run-lm-eval-gsm-hf-baseline -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -d cuda -l 250 -f 5 -- model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.684 - - name: "exact_match,flexible-extract" - value: 0.688 - limit: 250 - num_fewshot: 5 diff --git a/neuralmagic/lm-eval/smoke-small-models.yaml b/neuralmagic/lm-eval/smoke-small-models.yaml index c9ecd35bf793d..546a221872af8 100644 --- a/neuralmagic/lm-eval/smoke-small-models.yaml +++ b/neuralmagic/lm-eval/smoke-small-models.yaml @@ -9,15 +9,3 @@ value: 0.74 limit: 250 num_fewshot: 5 - -# ./nm-run-lm-eval-gsm-hf-baseline.sh -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -d cuda -l 250 -f 5 -- model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.684 - - name: "exact_match,flexible-extract" - value: 0.688 - limit: 250 - num_fewshot: 5 diff --git a/neuralmagic/tests/test_skip_env_vars/full.txt b/neuralmagic/tests/test_skip_env_vars/full.txt index 1ff4589556327..c39d0a6ab8ba2 100644 --- a/neuralmagic/tests/test_skip_env_vars/full.txt +++ b/neuralmagic/tests/test_skip_env_vars/full.txt @@ -16,4 +16,5 @@ TEST_SAMPLERS=ENABLE TEST_SPEC_DECODE=DISABLE TEST_TENSORIZER_LOADER=ENABLE TEST_TOKENIZATION=ENABLE +TEST_TRACING=ENABLE TEST_WORKER=ENABLE diff --git a/neuralmagic/tests/test_skip_env_vars/smoke.txt b/neuralmagic/tests/test_skip_env_vars/smoke.txt index 5c5066aaee391..e901455dfd5be 100644 --- a/neuralmagic/tests/test_skip_env_vars/smoke.txt +++ b/neuralmagic/tests/test_skip_env_vars/smoke.txt @@ -16,4 +16,5 @@ TEST_SAMPLERS=DISABLE TEST_SPEC_DECODE=DISABLE TEST_TENSORIZER_LOADER=DISABLE TEST_TOKENIZATION=ENABLE +TEST_TRACING=ENABLE TEST_WORKER=ENABLE diff --git a/pyproject.toml b/pyproject.toml index eb691c29724ce..4958aae02594a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,5 +71,5 @@ markers = [ "skip_global_cleanup", "llm: run tests for vLLM API only", "openai: run tests for OpenAI API only", - "llava: run tests for LLaVA models only", + "vlm: run tests for vision language models only", ] diff --git 
a/requirements-common.txt b/requirements-common.txt index 32e2ebe8c6159..05969cfa5d65f 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -4,6 +4,7 @@ psutil sentencepiece # Required for LLaMA tokenizer. numpy < 2.0.0 requests +tqdm py-cpuinfo transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. tokenizers >= 0.19.1 # Required for Llama 3. diff --git a/requirements-test.txt b/requirements-test.txt index f92975caa61fc..0698539da2535 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -10,12 +10,17 @@ pytest-shard awscli einops # required for MPT httpx +opentelemetry-sdk # required for test tracing +opentelemetry-api # required for test tracing +opentelemetry-exporter-otlp # required for test tracing +opentelemetry-semantic-conventions-ai # required for test tracing peft requests==2.31 # required for python 3.8 testing ray sentence-transformers # required for embedding optimum # required for hf gptq baselines auto-gptq # required for hf gptq baselines +torchvision # required for the image processor of phi3v # Benchmarking aiohttp diff --git a/requirements-xpu.txt b/requirements-xpu.txt new file mode 100644 index 0000000000000..48d899ec70eda --- /dev/null +++ b/requirements-xpu.txt @@ -0,0 +1,11 @@ +# Common dependencies +-r requirements-common.txt + +setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed. + +torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl +intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl +oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl + +triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + diff --git a/setup.py b/setup.py index 22d33898b342d..50f6f39cca488 100644 --- a/setup.py +++ b/setup.py @@ -234,6 +234,10 @@ def _is_cpu() -> bool: return VLLM_TARGET_DEVICE == "cpu" +def _is_xpu() -> bool: + return VLLM_TARGET_DEVICE == "xpu" + + def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() @@ -357,6 +361,8 @@ def get_vllm_version() -> str: version += "+tpu" elif _is_cpu(): version += "+cpu" + elif _is_xpu(): + version += "+xpu" else: raise RuntimeError("Unknown runtime environment") @@ -406,6 +412,8 @@ def _read_requirements(filename: str) -> List[str]: requirements = _read_requirements("requirements-tpu.txt") elif _is_cpu(): requirements = _read_requirements("requirements-cpu.txt") + elif _is_xpu(): + requirements = _read_requirements("requirements-xpu.txt") else: raise ValueError( "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.") diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index 93da626a2de03..581e56352064b 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -1,4 +1,5 @@ -import logging +# mypy: ignore-errors +# TODO (robertgshaw2-neuralmagic): clean this up import os from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, TypedDict @@ -63,14 +64,12 @@ class EvalTaskDefinition(EvalTaskDefinitionOpts): @pytest.mark.parametrize("eval_data", TEST_DATA) def test_lm_eval_correctness( eval_data: EvalTaskDefinition, - logger: logging.Logger, monkeypatch: 
pytest.MonkeyPatch, ): monkeypatch.setenv("TOKENIZERS_PARALLELISM", "false") monkeypatch.setenv("OPENAI_API_KEY", "dummy") model_name = eval_data["model_name"] - logger.info("building server startup args") vllm_args = { "--model": model_name, "--disable-log-requests": None, @@ -79,7 +78,6 @@ def test_lm_eval_correctness( if eval_data.get("enable_tensor_parallel") is True: tp = torch.cuda.device_count() - logger.info("Enabling tensor parallelism with %d devices", tp) vllm_args["--tensor-parallel-size"] = tp if extra_args := eval_data.get("extra_args"): @@ -91,12 +89,10 @@ def test_lm_eval_correctness( "base_url=http://localhost:8000/v1", ]) - logger.info("launching server") - with ServerContext(vllm_args, logger=logger) as _: + with ServerContext(vllm_args) as _: task_names = [task["name"] for task in eval_data["tasks"]] limit = eval_data["limit"] new_fewshot = eval_data["num_fewshot"] - logger.info("getting results for task_names=%s", task_names) results = lm_eval.simple_evaluate( model="local-completions", model_args=openai_args, @@ -106,16 +102,14 @@ def test_lm_eval_correctness( limit=limit, ) - logger.info("clearing torch cache") lm_eval.models.utils.clear_torch_cache() rtol = eval_data.get("rtol", DEFAULT_RTOL) for task in eval_data["tasks"]: - logger.info("checking metrics for task=%s", task["name"]) for metric in task["metrics"]: ground_truth = metric["value"] measured_value = results["results"][task["name"]][metric["name"]] - logger.info( + print( "%s %s:\nground_truth=%s measured_value=%s", task["name"], metric["name"], diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 1be76fdc8d868..495a123c351d7 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,5 +1,4 @@ """vllm.entrypoints.api_server with some extra logging for testing.""" -import argparse from typing import Any, Dict import uvicorn @@ -8,6 +7,7 @@ import vllm.entrypoints.api_server from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.utils import FlexibleArgumentParser app = vllm.entrypoints.api_server.app @@ -33,7 +33,7 @@ def stats() -> Response: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser = AsyncEngineArgs.add_cli_args(parser) diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 77801437e7581..49a3004886df3 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -2,9 +2,13 @@ from dataclasses import dataclass import pytest +import torch from tests.nm_utils.utils_skip import should_skip_test_group -from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm import SamplingParams +from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine + +from ..utils import wait_for_gpu_memory_to_clear if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", @@ -99,3 +103,35 @@ async def test_new_requests_event(): assert engine.get_model_config() is not None assert engine.get_tokenizer() is not None assert engine.get_decoding_config() is not None + + +def test_asyncio_run(): + wait_for_gpu_memory_to_clear( + devices=list(range(torch.cuda.device_count())), + 
threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + engine = AsyncLLMEngine.from_engine_args( + AsyncEngineArgs(model="facebook/opt-125m")) + + async def run(prompt: str): + sampling_params = SamplingParams( + temperature=0, + max_tokens=32, + ) + + async for output in engine.generate(prompt, + sampling_params, + request_id=prompt): + final_output = output + return final_output + + async def generate(): + return await asyncio.gather( + run("test0"), + run("test1"), + ) + + results = asyncio.run(generate()) + assert len(results) == 2 diff --git a/tests/conftest.py b/tests/conftest.py index 2c72094f5005a..f5ed54776dda9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,5 @@ import contextlib import gc -import logging import os from typing import Any, Dict, List, Optional, Tuple, TypeVar @@ -12,7 +11,6 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer, BatchEncoding) -from tests.nm_utils.logging import make_logger from vllm import LLM, SamplingParams from vllm.config import TokenizerPoolConfig, VisionLanguageConfig from vllm.distributed import (destroy_distributed_environment, @@ -146,6 +144,7 @@ def __init__( model_name: str, dtype: str = "half", *, + model_kwargs: Optional[Dict[str, Any]] = None, is_embedding_model: bool = False, is_vision_model: bool = False, **kwargs, @@ -169,12 +168,13 @@ def __init__( else: auto_cls = AutoModelForCausalLM + model_kwargs = model_kwargs if model_kwargs is not None else {} self.model = self.wrap_device( auto_cls.from_pretrained( model_name, torch_dtype=torch_dtype, trust_remote_code=True, - **kwargs, + **model_kwargs, )) self.tokenizer = AutoTokenizer.from_pretrained( @@ -234,11 +234,13 @@ def generate_greedy( prompts: List[str], max_tokens: int, images: Optional[List[Image.Image]] = None, + **kwargs, ) -> List[Tuple[List[int], str]]: outputs = self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, - images=images) + images=images, + **kwargs) return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] @@ -366,7 +368,7 @@ def __exit__(self, exc_type, exc_value, traceback): cleanup() -@pytest.fixture +@pytest.fixture(scope="session") def hf_runner(): return HfRunner @@ -478,7 +480,7 @@ def generate_greedy_logprobs_nm_use_tokens( input_ids_lst: List[torch.Tensor], max_tokens: int, topk_logprobs_count: int, - ) -> List[Tuple[List[int], str, List[Dict]]]: + ) -> List[Tuple[List[str], str, List[Dict[str, Any]]]]: all_logprobs = [] all_output_tokens = [] all_output_strs = [] @@ -567,6 +569,7 @@ def __init__( block_size: int = 16, enable_chunked_prefill: bool = False, swap_space: int = 4, + enforce_eager: bool = False, **kwargs, ) -> None: self.model = LLM( @@ -575,6 +578,7 @@ def __init__( trust_remote_code=True, dtype=dtype, swap_space=swap_space, + enforce_eager=enforce_eager, disable_log_stats=disable_log_stats, tensor_parallel_size=tensor_parallel_size, max_model_len=max_model_len, @@ -691,51 +695,6 @@ def vllm_runner(): return VllmRunner -# UPSTREAM SYNC: needed for nm-automation -class VllmRunnerNm(VllmRunner): - - def generate_w_logprobs( - self, - prompts: List[str], - sampling_params: SamplingParams, - ) -> List[Tuple[List[int], str]]: - assert sampling_params.logprobs is not None - - req_outputs = self.model.generate(prompts, - sampling_params=sampling_params) - outputs = [] - for req_output in req_outputs: - for sample in req_output.outputs: - output_str = sample.text - output_ids = sample.token_ids - output_logprobs = sample.logprobs - 
outputs.append((output_ids, output_str, output_logprobs)) - return outputs - - def generate_greedy_logprobs( - self, - prompts: List[str], - max_tokens: int, - num_logprobs: int, - ) -> List[Tuple[List[int], str]]: - greedy_logprobs_params = SamplingParams(temperature=0.0, - max_tokens=max_tokens, - logprobs=num_logprobs) - outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params) - - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] - - def __del__(self): - del self.model - cleanup() - - -@pytest.fixture -def vllm_runner_nm(): - return VllmRunnerNm - - def get_tokenizer_pool_config(tokenizer_group_type): if tokenizer_group_type is None: return None @@ -768,8 +727,3 @@ def num_gpus_available(): in current process.""" return cuda_device_count_stateless() - - -@pytest.fixture(scope="session") -def logger() -> logging.Logger: - return make_logger("vllm_test") diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 604aba39c560d..eb2f71258cedc 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -482,3 +482,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + "model": "facebook/opt-125m", + + # skip cuda graph creation for fast test. + "enforce_eager": True, + + # we keep the blocks small, so that hit eviction quickly + "max_model_len": 48, + "block_size": 16, + "num_gpu_blocks_override": 3, + + # Test APC in v2 block + "use_v2_block_manager": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{ + "enable_prefix_caching": False +}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "enable_prefix_caching": True, +}]) +@pytest.mark.parametrize("seed", [1]) +def test_auto_prefix_caching_after_evition_start(baseline_llm_generator, + test_llm_generator): + """Verify block manager v2 with auto prefix caching could works normal + even when eviction started. + With APC enabled, all blocks are held by native block at the beginning. + Then blocks are managed by evictor instead. If cache hit at the evitor's + block, then it could be reused, or we need to recompute its kv cache. + """ + output_len = 10 + temperature = 0.0 + + prompts = [ + "You are a helpful assistant. Please answer truthfully and write " + "out your thinking step by step to be sure you get the right answer. " + "If you make a mistake, attempt to correct it. who are you?", + "You are a helpful assistant. Please answer truthfully and write out " + "your thinking step by step to be sure you get the right answer. You " + "are helpful and harmless and you follow ethical guidelines. " + "who are you?" 
+ ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + print('Getting token ids with APC disabled') + baseline_token_ids = get_token_ids_from_llm_generator( + baseline_llm_generator, prompts, sampling_params) + + print('Getting token ids with APC enabled') + test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + prompts, sampling_params) + + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, + test_token_ids): + assert expected_token_ids == actual_token_ids + + assert baseline_token_ids == test_token_ids diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index 2a1c9945b93dc..2cbc66bf3f96b 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,3 +1,5 @@ +from typing import List + import pytest from tests.nm_utils.utils_skip import should_skip_test_group @@ -33,7 +35,7 @@ def test_allocate_naive(block_size: int, sequence_len: int): token_ids = list(range(sequence_len)) num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - block_tables = [] + block_tables: List[BlockTable] = [] for i in range(5): assert allocator.get_num_free_blocks( device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc @@ -78,7 +80,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int): num_immutable_blocks_per_alloc = len( chunked_tokens) - num_mutable_blocks_per_alloc - block_tables = [] + block_tables: List[BlockTable] = [] for alloc_i in range(1, 6): block_tables.append( @@ -273,7 +275,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int, ) block_table.allocate(token_ids=token_ids, device=Device.GPU) - appended_so_far = [] + appended_so_far: List[int] = [] for append in chunk_list(token_ids_to_append, append_size): block_table.append_token_ids(append) appended_so_far.extend(append) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index c300345dd7da6..62539af8c04d2 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -128,7 +128,7 @@ def create_chain(block_size: int, num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks = [] + blocks: List[PrefixCachingBlock] = [] num_blocks = math.ceil( len(token_ids) / block_size) + num_empty_trailing_blocks @@ -613,7 +613,7 @@ def create_immutable_chain( ) -> List[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks = [] + blocks: List[Block] = [] num_blocks = math.ceil(len(token_ids) / block_size) if num_blocks == 0: diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 5137f1644194e..5df096322affd 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -488,11 +488,11 @@ def test_chunked_prefill_preempt(): # The request should be preempted. scheduler.block_manager.can_append_slots = MagicMock() - def cannot_append_second_group(seq_group, num_lookahead_slots): + def cannot_append_second_group1(seq_group, num_lookahead_slots): return seq_group.request_id != "1" scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) + cannot_append_second_group1) # The running prefill is now preempted. 
_, out = schedule_and_update_computed_tokens(scheduler) @@ -510,11 +510,11 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert seq_group.get_num_uncomputed_tokens() == 30 # We should be able to run prefill twice as it is chunked. - def cannot_append_second_group(seq_group, num_lookahead_slots): + def cannot_append_second_group2(seq_group, num_lookahead_slots): return True scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) + cannot_append_second_group2) _, out = schedule_and_update_computed_tokens(scheduler) assert len(out.scheduled_seq_groups) == 1 assert out.num_prefill_groups == 1 @@ -535,7 +535,7 @@ def test_chunked_prefill_max_seqs(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running = [] + running: List[SequenceGroup] = [] _, seq_group = create_dummy_prompt("1", prompt_length=65) scheduler.add_seq_group(seq_group) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index b7960435a6d69..5722c9e82aa8e 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -1,6 +1,6 @@ import time from collections import deque -from typing import List +from typing import Deque, List, Set, Tuple from unittest.mock import MagicMock import pytest # noqa @@ -70,7 +70,7 @@ def test_scheduler_abort_seq_group(): # Add multiple seq groups to scheduler. num_seq_group = 4 - request_ids = set() + request_ids: Set[str] = set() for i in range(num_seq_group): _, seq_group = create_dummy_prompt(str(i), block_size) scheduler.add_seq_group(seq_group) @@ -352,7 +352,7 @@ def test_prefill_schedule_max_prompt_len(): Test prompt longer than max_prompt_len is aborted. """ scheduler = initialize_scheduler(max_model_len=30) - _, seq_group = create_dummy_prompt(0, prompt_length=60) + _, seq_group = create_dummy_prompt("0", prompt_length=60) waiting = deque([seq_group]) budget = create_token_budget() remaining_waiting, output = scheduler._schedule_prefills( @@ -369,7 +369,7 @@ def test_prefill_schedule_token_budget(): Test token budget respected. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(token_budget=0) for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -424,7 +424,7 @@ def test_prefill_schedule_max_seqs(): Test max seq respected. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(max_num_seqs=2) for i in range(3): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -458,9 +458,9 @@ def test_prefill_schedule_max_lora(): """ lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config) - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(token_budget=120) - curr_loras = set() + curr_loras: Set[int] = set() for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -504,7 +504,7 @@ def test_prefill_schedule_no_block_manager_capacity(): Test sequence cannot be scheduled due to block manager has no capacity. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget() for i in range(3): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -541,7 +541,7 @@ def test_decode_schedule_preempted(): Test decodes cannot be scheduled and preempted. 
""" scheduler = initialize_scheduler() - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None for i in range(3): @@ -582,7 +582,7 @@ def test_decode_swap_beam_search(): Test best_of > 1 swap out blocks """ scheduler = initialize_scheduler() - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None budget = create_token_budget() @@ -633,7 +633,7 @@ def test_schedule_decode_blocks_to_copy_update(): """ scheduler = initialize_scheduler() _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None scheduler._allocate_and_set_running(seq_group) @@ -661,10 +661,10 @@ def test_schedule_decode_blocks_to_copy_update(): def test_schedule_swapped_simple(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) @@ -688,10 +688,10 @@ def test_schedule_swapped_simple(): def test_schedule_swapped_max_token_budget(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -722,10 +722,10 @@ def test_schedule_swapped_max_token_budget(): def test_schedule_swapped_max_seqs(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for i in range(4): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) scheduler._allocate_and_set_running(seq_group) @@ -755,10 +755,10 @@ def test_schedule_swapped_max_seqs(): def test_schedule_swapped_max_loras(): lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config) - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") - curr_loras = set() - blocks_to_swap_out = [] + curr_loras: Set[int] = set() + blocks_to_swap_out: List[Tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -784,10 +784,10 @@ def test_schedule_swapped_max_loras(): def test_schedule_swapped_cannot_swap_in(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -811,10 +811,10 @@ def test_schedule_swapped_cannot_swap_in(): def test_infeasible_swap(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = 
PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -839,13 +839,13 @@ def test_infeasible_swap(): def test_schedule_swapped_blocks_to_copy(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] scheduler._swap_out(seq_group, blocks_to_swap_out) swapped.append(seq_group) diff --git a/tests/core/utils.py b/tests/core/utils.py index 2fbf099c5f90b..f249f4b59a2ee 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,5 +1,7 @@ import time -from typing import Iterable, Optional, Tuple +from typing import List, Optional +from typing import Sequence as GenericSequence +from typing import Tuple from vllm import SamplingParams from vllm.lora.request import LoRARequest @@ -46,7 +48,7 @@ def create_dummy_prompt_encoder_decoder( lora_request: Optional[LoRARequest] = None, use_beam_search: bool = False, best_of: int = 1, -) -> Tuple[Sequence, SequenceGroup]: +) -> Tuple[Sequence, Sequence, SequenceGroup]: if not block_size: block_size = decoder_prompt_length @@ -86,7 +88,7 @@ def create_dummy_prompt_encoder_decoder( def create_seq_group( seq_prompt_len: int = 1024, - seq_output_lens: Iterable[int] = (128, ), + seq_output_lens: GenericSequence[int] = (128, ), request_id: str = '0', seq_id_start: int = 0, sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: @@ -98,7 +100,7 @@ def create_seq_group( prompt_token_ids = [0] * seq_prompt_len - seqs = [] + seqs: List[Sequence] = [] for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, @@ -125,7 +127,7 @@ def create_seq_group( def create_seq_group_encoder_decoder( seq_prompt_len: int = 1024, - seq_output_lens: Iterable[int] = (128, ), + seq_output_lens: GenericSequence[int] = (128, ), request_id: str = '0', seq_id_start: int = 0, sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 6a99050316588..3f6590b11aa0c 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -7,13 +7,15 @@ import torch.distributed as dist from tests.nm_utils.utils_skip import should_skip_test_group -from tests.utils import (init_test_distributed_environment, - multi_process_tensor_parallel) from vllm.distributed.communication_op import ( # noqa tensor_model_parallel_all_reduce) from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, get_tp_group, graph_capture) +from ..utils import (ensure_model_parallel_initialized, + init_test_distributed_environment, + multi_process_tensor_parallel) + if should_skip_test_group(group_name="TEST_DISTRIBUTED"): pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", allow_module_level=True) @@ -31,8 +33,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) - - 
 group = get_tensor_model_parallel_group()
+    ensure_model_parallel_initialized(tp_size, pp_size)
+    group = get_tensor_model_parallel_group().device_group
 
     # A small all_reduce for warmup.
     # this is needed because device communicators might be created lazily
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index 27bcd88eadc61..9c4c5d3f43d3a 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -1,5 +1,6 @@
 import multiprocessing
 import os
+from typing import Dict, List
 
 import pytest
 import torch
@@ -22,9 +23,9 @@ def distributed_run(fn, world_size):
     number_of_processes = world_size
-    processes = []
+    processes: List[multiprocessing.Process] = []
     for i in range(number_of_processes):
-        env = {}
+        env: Dict[str, str] = {}
         env['RANK'] = str(i)
         env['LOCAL_RANK'] = str(i)
         env['WORLD_SIZE'] = str(number_of_processes)
diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
new file mode 100644
index 0000000000000..4444cc6465d31
--- /dev/null
+++ b/tests/distributed/test_shm_broadcast.py
@@ -0,0 +1,88 @@
+import multiprocessing
+import random
+import time
+
+import pytest
+import torch.distributed as dist
+
+from tests.nm_utils.utils_skip import should_skip_test_group
+from vllm.distributed.device_communicators.shm_broadcast import (
+    ShmRingBuffer, ShmRingBufferIO)
+from vllm.utils import update_environment_variables
+
+if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
+    pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group",
+                allow_module_level=True)
+
+
+def distributed_run(fn, world_size):
+    number_of_processes = world_size
+    processes = []
+    for i in range(number_of_processes):
+        env = {}
+        env['RANK'] = str(i)
+        env['LOCAL_RANK'] = str(i)
+        env['WORLD_SIZE'] = str(number_of_processes)
+        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
+        env['MASTER_ADDR'] = 'localhost'
+        env['MASTER_PORT'] = '12345'
+        p = multiprocessing.Process(target=fn, args=(env, ))
+        processes.append(p)
+        p.start()
+
+    for p in processes:
+        p.join()
+
+    for p in processes:
+        assert p.exitcode == 0
+
+
+def worker_fn_wrapper(fn):
+    # `multiprocessing.Process` cannot accept environment variables directly
+    # so we need to pass the environment variables as arguments
+    # and update the environment variables in the function
+    def wrapped_fn(env):
+        update_environment_variables(env)
+        dist.init_process_group(backend="gloo")
+        fn()
+
+    return wrapped_fn
+
+
+@worker_fn_wrapper
+def worker_fn():
+    writer_rank = 2
+    broadcaster = ShmRingBufferIO.create_from_process_group(
+        dist.group.WORLD, 1024, 2, writer_rank)
+    if dist.get_rank() == writer_rank:
+        time.sleep(random.random())
+        broadcaster.broadcast_object(0)
+        time.sleep(random.random())
+        broadcaster.broadcast_object({})
+        time.sleep(random.random())
+        broadcaster.broadcast_object([])
+    else:
+        time.sleep(random.random())
+        a = broadcaster.broadcast_object(None)
+        time.sleep(random.random())
+        b = broadcaster.broadcast_object(None)
+        time.sleep(random.random())
+        c = broadcaster.broadcast_object(None)
+        assert a == 0
+        assert b == {}
+        assert c == []
+    dist.barrier()
+
+
+def test_shm_broadcast():
+    distributed_run(worker_fn, 4)
+
+
+def test_single_process():
+    buffer = ShmRingBuffer(1, 1024, 4)
+    reader = ShmRingBufferIO(buffer, reader_rank=0)
+    writer = ShmRingBufferIO(buffer, reader_rank=-1)
+    writer.enqueue([0])
+    writer.enqueue([1])
+    assert reader.dequeue() == [0]
+    assert reader.dequeue() == [1]
diff --git a/tests/distributed/test_utils.py
b/tests/distributed/test_utils.py index 6746ffe97bafa..8083bd099d2a5 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -12,7 +12,7 @@ @ray.remote -class _CUDADeviceCountStatelessTestActor(): +class _CUDADeviceCountStatelessTestActor: def get_count(self): return cuda_device_count_stateless() @@ -28,8 +28,10 @@ def test_cuda_device_count_stateless(): """Test that cuda_device_count_stateless changes return value if CUDA_VISIBLE_DEVICES is changed.""" - actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote() - assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1" + actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore + num_gpus=2).remote() + assert sorted(ray.get( + actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] assert ray.get(actor.get_count.remote()) == 2 ray.get(actor.set_cuda_visible_devices.remote("0")) assert ray.get(actor.get_count.remote()) == 1 diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 58466e6bdd363..b55e517b43bac 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1,6 +1,7 @@ # imports for guided decoding tests import json import re +from typing import List import jsonschema import openai # use the official client for correctness check @@ -457,7 +458,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True) - chunks = [] + chunks: List[str] = [] finish_reason_count = 0 async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -503,7 +504,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): temperature=0.0, stream=True, ) - chunks = [] + chunks: List[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta @@ -658,50 +659,52 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI, [MODEL_NAME, "zephyr-lora"], ) async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary - # for official client. 
- use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" + # test both text and token IDs + for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): + # test simple list + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + ) + assert len(batch.choices) == 2 + assert batch.choices[0].text == batch.choices[1].text - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] + # test n = 2 + batch = await client.completions.create( + model=model_name, + prompt=prompts, + n=2, + max_tokens=5, + temperature=0.0, + extra_body=dict( + # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. + use_beam_search=True), + ) + assert len(batch.choices) == 4 + assert batch.choices[0].text != batch.choices[ + 1].text, "beam search should be different" + assert batch.choices[0].text == batch.choices[ + 2].text, "two copies of the same prompt should be the same" + assert batch.choices[1].text == batch.choices[ + 3].text, "two copies of the same prompt should be the same" + + # test streaming + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + stream=True, + ) + texts = [""] * 2 + async for chunk in batch: + assert len(chunk.choices) == 1 + choice = chunk.choices[0] + texts[choice.index] += choice.text + assert texts[0] == texts[1] @pytest.mark.asyncio diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 458226ce38ccd..9abef39f8d7d0 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention( block_size = value_cache.shape[3] num_seqs = query.shape[0] - block_tables = block_tables.cpu().tolist() - seq_lens = seq_lens.cpu().tolist() + block_tables_lst = block_tables.cpu().tolist() + seq_lens_lst = seq_lens.cpu().tolist() for i in range(num_seqs): q = query[i].unsqueeze(0) - block_table = block_tables[i] - seq_len = int(seq_lens[i]) + block_table = block_tables_lst[i] + seq_len = int(seq_lens_lst[i]) - keys = [] - values = [] + keys_lst: List[torch.Tensor] = [] + values_lst: List[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) - keys.append(k) + keys_lst.append(k) v = value_cache[block_number, :, :, block_offset] - values.append(v) - keys = torch.stack(keys, dim=0) - values = torch.stack(values, dim=0) + values_lst.append(v) + keys = torch.stack(keys_lst, dim=0) + values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: # Handle MQA and GQA keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) @@ -166,14 +166,15 @@ def test_paged_attention( # Create the block tables. 
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] + block_tables_lst: List[List[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int) + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, dtype=torch.int) # Create the KV caches. key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, @@ -292,7 +293,7 @@ def ref_multi_query_kv_attention( dtype: torch.dtype, ) -> torch.Tensor: num_seqs = len(cu_seq_lens) - 1 - ref_outputs = [] + ref_outputs: List[torch.Tensor] = [] for i in range(num_seqs): start_idx = cu_seq_lens[i] end_idx = cu_seq_lens[i + 1] @@ -312,8 +313,8 @@ def ref_multi_query_kv_attention( attn_mask=attn_mask, ) ref_outputs.append(ref_output) - ref_output = torch.cat(ref_outputs, dim=0) - return ref_output + + return torch.cat(ref_outputs, dim=0) # TODO(woosuk): Add tests for USE_ALIBI=True. diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 61ead03e39da5..ec275bccb5979 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -82,27 +82,27 @@ def ref_single_query_cached_kv_attention( block_size = value_cache.shape[3] num_seqs = query.shape[0] - block_tables = block_tables.cpu().tolist() - seq_lens = seq_lens.cpu().tolist() + block_tables_lst = block_tables.cpu().tolist() + seq_lens_lst = seq_lens.cpu().tolist() for i in range(num_seqs): q = query[i].unsqueeze(0) - block_table = block_tables[i] - seq_len = int(seq_lens[i]) + block_table = block_tables_lst[i] + seq_len = int(seq_lens_lst[i]) - keys = [] - values = [] + keys_lst: List[torch.Tensor] = [] + values_lst: List[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) - keys.append(k) + keys_lst.append(k) v = value_cache[block_number, :, :, block_offset] - values.append(v) - keys = torch.stack(keys, dim=0) - values = torch.stack(values, dim=0) + values_lst.append(v) + keys = torch.stack(keys_lst, dim=0) + values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: # Handle MQA and GQA keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) @@ -438,7 +438,7 @@ def test_varlen_blocksparse_attention_prefill( value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) ref_output = ref_multi_query_kv_attention( - cu_seq_lens, + cu_seq_lens.tolist(), query, key, value, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f7aec1cb5b677..23ea9cbd955c2 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,5 +1,5 @@ import random -from typing import Tuple +from typing import List, Tuple import pytest import torch @@ -72,7 +72,7 @@ def test_copy_blocks( src_blocks = random.sample(range(num_blocks), num_mappings) remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) - block_mapping = [] + block_mapping: List[Tuple[int, int]] = [] for i in range(num_mappings): src = src_blocks[i] dst1 = dst_blocks[2 * i] @@ -140,8 +140,8 @@ def test_reshape_and_cache( torch.set_default_device(device) # Create a random slot mapping. 
num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long) qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) _, key, value = qkv.unbind(dim=1) @@ -179,12 +179,12 @@ def test_reshape_and_cache( # Run the reference implementation. reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies = block_indicies.cpu().tolist() + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() + block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] @@ -236,8 +236,10 @@ def test_reshape_and_cache_flash( # Create a random slot mapping. num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device) + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, + dtype=torch.long, + device=device) qkv = torch.randn(num_tokens, 3, @@ -269,13 +271,13 @@ def test_reshape_and_cache_flash( slot_mapping, kv_cache_dtype) # Run the reference implementation. - block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') - block_indicies = block_indicies.cpu().tolist() + block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() + block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, block_offset, :, :] = key[i] cloned_value_cache[block_idx, block_offset, :, :] = value[i] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index e7368fb87b6ae..6ece178b5cbe3 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -22,13 +22,13 @@ capability = capability[0] * 10 + capability[1] -def to_fp8(tensor: torch.tensor): +def to_fp8(tensor: torch.Tensor): finfo = torch.finfo(torch.float8_e4m3fn) return torch.round(tensor.clamp( min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) -def to_int8(tensor: torch.tensor): +def to_int8(tensor: torch.Tensor): return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 4437a5ddc8d7a..7b5ffa17456e7 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -31,7 +31,7 @@ def ref_paged_attn( block_tables = block_tables.cpu().numpy() _, block_size, num_kv_heads, head_size = key_cache.shape - outputs = [] + outputs: List[torch.Tensor] = [] start_idx = 0 for i in range(num_seqs): query_len = query_lens[i] @@ -76,7 +76,7 @@ def ref_paged_attn( @pytest.mark.parametrize("dtype", 
DTYPES) @torch.inference_mode def test_flash_attn_with_paged_kv( - kv_lens: List[Tuple[int, int]], + kv_lens: List[int], num_heads: Tuple[int, int], head_size: int, dtype: torch.dtype, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 2934959f3d37e..a1de3d2e259c7 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,5 +1,5 @@ from itertools import accumulate, product -from typing import List, Optional +from typing import Dict, List, Optional import pytest import torch @@ -131,7 +131,7 @@ def test_batched_rotary_embedding( query, key, offsets=torch.zeros(batch_size * seq_len, - dtype=int, + dtype=torch.long, device=device)) # Compare the results. assert torch.allclose(out_query, @@ -219,20 +219,16 @@ def test_batched_rotary_embedding_multi_lora( def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] BASES = [10000, 1000000] - ROPE_SCALINGS = [ - None, { - "type": "linear", - "factor": (1, ) - }, { - "type": "dynamic", - "factor": 1 - } - ] - settings = [ - HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, - ROPE_SCALINGS, DTYPES - ] - rope_setting_id_map = {} + ROPE_SCALINGS = (None, { + "type": "linear", + "factor": (1, ) + }, { + "type": "dynamic", + "factor": 1 + }) + settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, + ROPE_SCALINGS, DTYPES) + rope_setting_id_map: Dict[str, int] = {} for setting in product(*settings): head_size, rotary_dim, max_position, base, \ is_neox_stype, rope_scaling, dtype = setting diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 522c635b82d9c..4eab73a71071c 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -2,6 +2,7 @@ import gc import tempfile from collections import OrderedDict +from typing import Dict, List, TypedDict from unittest.mock import MagicMock, patch import pytest @@ -24,7 +25,18 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader import get_model -LONG_LORA_INFOS = [{ + +class ContextIDInfo(TypedDict): + lora_id: int + context_length: str + + +class ContextInfo(TypedDict): + lora: str + context_length: str + + +LONG_LORA_INFOS: List[ContextIDInfo] = [{ "lora_id": 1, "context_length": "16k", }, { @@ -207,7 +219,7 @@ def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, long_context_lora_files_32k): cleanup() - infos = {} + infos: Dict[int, ContextInfo] = {} for lora_checkpoint_info in LONG_LORA_INFOS: lora_id = lora_checkpoint_info["lora_id"] if lora_id == 1: @@ -226,7 +238,7 @@ def long_context_infos(long_context_lora_files_16k_1, @pytest.fixture -def llama_2_7b_engine_extra_embeddings() -> nn.Module: +def llama_2_7b_engine_extra_embeddings(): cleanup() get_model_old = get_model @@ -244,7 +256,6 @@ def get_model_patched(*, model_config, device_config, **kwargs): @pytest.fixture -def llama_2_7b_model_extra_embeddings( - llama_2_7b_engine_extra_embeddings) -> nn.Module: +def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings): yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. 
model_runner.model) diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py index 9edde8f837ab2..7470abe56f63b 100644 --- a/tests/lora/data/long_context_test_data.py +++ b/tests/lora/data/long_context_test_data.py @@ -3,7 +3,29 @@ # codespell: ignore """This file contains a dictionary of prompts and golden responses.""" -prompts_and_responses = { +from typing import Dict, List, TypedDict + + +class DateJSON(TypedDict): + day: int + month: int + year: int + + +class AnswerJSON(TypedDict): + nationality: str + date_of_birth: DateJSON + date_of_death: DateJSON + politician: bool + sportsperson: bool + + +class PromptResponse(TypedDict): + prompt: str + golden_answer: AnswerJSON + + +prompts_and_responses: Dict[str, List[PromptResponse]] = { "16k": [{ "prompt": "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . 
the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . 
he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . 
in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . 
myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . 
like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . 
a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . 
aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . 
after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . 
he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . 
january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . 
he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. 
[/INST]", diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 825f26ad28892..06270c9ddde05 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import vllm @@ -15,7 +17,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -35,7 +37,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() @@ -67,7 +69,8 @@ def test_baichuan_lora(baichuan_lora_files): @pytest.mark.skip("Requires multiple GPUs") -def test_baichuan_tensor_parallel_equality(baichuan_lora_files): +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_baichuan_tensor_parallel_equality(baichuan_lora_files, fully_sharded): # Cannot use as it will initialize torch.cuda too early... 
# if torch.cuda.device_count() < 4: # pytest.skip(f"Not enough GPUs for tensor parallelism {4}") @@ -78,7 +81,8 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=1, - trust_remote_code=True) + trust_remote_code=True, + fully_sharded_loras=fully_sharded) output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) del llm_tp1 @@ -90,7 +94,8 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=2, - trust_remote_code=True) + trust_remote_code=True, + fully_sharded_loras=fully_sharded) output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) del llm_tp2 @@ -104,7 +109,8 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=4, - trust_remote_code=True) + trust_remote_code=True, + fully_sharded_loras=fully_sharded) output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) del llm_tp4 diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index 9cee24c90f972..1ea085c3e4bb2 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import vllm @@ -13,7 +15,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -33,7 +35,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 0c31726dc0fd0..74d15bf42a1ef 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import vllm @@ -11,7 +13,7 @@ MODEL_PATH = "google/gemma-7b" -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "Quote: Imagination is", "Quote: Be yourself;", @@ -24,7 +26,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 712f822d9bed9..bfe6e566c9cc5 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -31,7 +31,7 @@ def get_lora_model(model_id: str, target_modules: List[str], rank: int): return lora_model -def do_sample(llm, +def do_sample(llm: vllm.LLM, lora_path: Optional[str] = None, lora_id: Optional[int] = None, logprobs: int = 0, @@ -47,8 +47,8 @@ def do_sample(llm, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] - generated_logprobs = [] + generated_texts: List[str] = [] + generated_logprobs: List[List[List[int]]] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index a3d9203093fef..e97666ba6d2ca 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -13,7 +13,8 @@ from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA) + MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora, + RowParallelLinearWithShardedLoRA) # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, @@ -114,7 +115,7 @@ def populate_loras( for slot_idx, lora_id in enumerate(id_to_index): if lora_id is not None: - subloras = [] + subloras: List[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): sublora = DummyLoRAManager().init_random_lora( @@ -163,7 +164,10 @@ def create_random_inputs( low, high = input_range - inputs, index_mapping, prompt_mapping = [], [], [] + inputs: List[torch.Tensor] = [] + index_mapping: List[int] = [] + prompt_mapping: List[int] = [] + for _ in range(num_inputs): if input_type == torch.int: inputs.append( @@ -230,7 +234,7 @@ def create_random_embedding_layer(): lora_result = lora_embedding(torch.cat(inputs)) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = embedding(input_) @@ -367,7 +371,7 @@ def create_random_embedding_layer(): lora_result = lora_embedding(torch.cat(original_inputs)) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, original_input_, lora_id in zip(inputs, original_inputs, prompt_mapping): lora = 
lora_dict[lora_id] @@ -496,7 +500,7 @@ def _pretest(): logits_processor.org_vocab_size = (vocab_size + lora_config.lora_extra_vocab_size) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = logits_processor._get_logits(hidden_states=input_, @@ -615,7 +619,7 @@ def create_random_linear_parallel_layer(): lora_result = lora_linear(torch.cat(inputs))[0] - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = linear(input_)[0] @@ -701,7 +705,9 @@ def create_column_parallel_packed_layer(): bias=False, params_dtype=torch.float16) linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = QKVParallelLinearWithLora(linear) + lora_linear = QKVParallelLinearWithLora( + linear + ) if not fully_shard else QKVParallelLinearWithShardedLora(linear) @dataclass class FakeConfig: @@ -749,7 +755,7 @@ class FakeConfig: lora_result = lora_linear(torch.cat(inputs))[0] - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): result = linear(input_)[0] subloras = sublora_dict[lora_id] @@ -905,9 +911,9 @@ def test_vocab_parallel_embedding_indices(tp_size, seed): computed_added_vocab_size = 0 vocab_size_padded = -1 - all_org_tokens = [] - all_added_tokens = [] - token_ids = [] + all_org_tokens: List[int] = [] + all_added_tokens: List[int] = [] + token_ids: List[int] = [] for tp_rank in range(tp_size): with patch( diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index ff1d82ba7104f..38978f352d38c 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import ray @@ -14,7 +16,7 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf" -def do_sample(llm, lora_path: str, lora_id: int): +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 @@ -32,7 +34,7 @@ def do_sample(llm, lora_path: str, lora_id: int): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 793e34bf27e19..fe311eaa6726c 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -82,7 +82,7 @@ def evaluate_json_response(model_response, golden_response): def generate( - llm, + llm: vllm.LLM, inputs: Tuple[str, SamplingParams, Optional[LoRARequest]], ): prompts, sampling_param, lora_request = inputs @@ -165,7 +165,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): non-batched generation. 
""" # Create non batched results first to compare against batched results - non_batched_results = [] + non_batched_results: List[str] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -178,7 +178,8 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): # Create batched results # Each element of the batch must be # (prompt, prompt_sampling_params, prompt_lora_request) - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] batched_prompts.extend([ @@ -203,7 +204,8 @@ def test_self_consistency(lora_llm, long_context_infos): num_loras = len(long_context_infos) # Create results in order of long_context_infos - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] batched_prompts.extend([ @@ -252,7 +254,7 @@ def test_quality(lora_llm, long_context_infos): The test is expected to run for about 1 minute on a p4de.24xlarge instance. """ - scores = [] + scores: List[float] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] for prompt_and_response in prompts_and_responses[context_len]: @@ -286,7 +288,8 @@ def test_max_len(lora_llm, long_context_infos): generate(lora_llm, (bad_prompt, sampling_params, lora_request)) # Also test batched - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id_with_bad_inputs in long_context_infos: for lora_id, info in long_context_infos.items(): context_len = info["context_length"] diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 9c9a0fea5cb6c..bb8c3a53b5672 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,3 +1,5 @@ +from typing import List + import pytest from tests.nm_utils.utils_skip import should_skip_test_group @@ -22,7 +24,7 @@ def test_load_checkpoints( packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules = [] + expected_lora_modules: List[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 09a27c90f4768..47f0335434a33 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,5 +1,5 @@ import os -from typing import List +from typing import Dict, List import pytest import torch @@ -67,7 +67,7 @@ def test_from_lora_tensors(sql_lora_files): def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str]) -> LoRAModel: - loras = {} + loras: Dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight loras[name] = LoRALayerWeights( @@ -88,7 +88,7 @@ def create_packed_lora( empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight - loras = {} + loras: Dict[str, LoRALayerWeights] = {} for replaced_module_name in replaced_module_names: if replaced_module_name == empty_replaced_module_name: continue @@ -214,6 +214,34 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): assert 
manager.activate_lora(3) assert manager.lora_index_to_id[0] == 2 assert manager.lora_index_to_id[1] == 3 + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.activate_lora(1) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.deactivate_lora(2) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_lora(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.pin_lora(3) + assert manager.pin_lora(1) + with pytest.raises(RuntimeError): + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + with pytest.raises(RuntimeError): + assert manager.activate_lora(2) + + assert manager.deactivate_lora(3) + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.remove_lora(3) + with pytest.raises(ValueError): + assert manager.pin_lora(3) def test_lru_lora_model_manager(dist_init, dummy_model): @@ -293,6 +321,42 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert set(manager.list_loras()) == set() assert all(x is None for x in manager.lora_index_to_id) + # pinning + assert manager.add_lora(model_lora3) + assert manager.activate_lora(3) + assert manager.add_lora(model_lora4) + assert manager.activate_lora(4) + assert set(manager.list_loras()) == {3, 4} + with pytest.raises(ValueError): + assert manager.pin_lora(1) + assert manager.pin_lora(3) + # Remove manually + assert manager.remove_lora(3) + assert not manager.remove_lora(3) + + assert set(manager.list_loras()) == {4} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 4 + + assert manager.add_lora(model_lora1) + assert manager.pin_lora(1) + assert manager.add_lora(model_lora2) + assert manager.activate_lora(2) + + assert set(manager.list_loras()) == {1, 2} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + + assert manager.remove_oldest_lora() + assert set(manager.list_loras()) == {1} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] is None + + with pytest.raises(RuntimeError): + assert manager.remove_oldest_lora() + + assert set(manager.list_loras()) == {1} + def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index f7541f271fd98..f2c360a9a8100 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -12,7 +14,7 @@ MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" -def do_sample(llm, lora_path: str, lora_id: int): +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 @@ -25,7 +27,7 @@ def do_sample(llm, lora_path: str, lora_id: int): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 13636b9be5140..9871e65adffb4 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import vllm @@ -13,7 +15,7 @@ PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format( sql_prompt= @@ -42,7 +44,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: if lora_id else None, ) # Print the outputs. 
- generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index 29b4f9c411e1d..905bae3cbea59 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -54,39 +54,60 @@ def _lora_ref_impl( 128, 256, 512, + 896, 1024, 1152, + 1216, 1280, 1536, + 1664, 2048, + 2240, 2304, + 2368, + 2432, 2560, 2752, 3072, 3328, 3456, 3584, + 3712, 4096, + 4480, 4608, + 4736, + 4864, 5120, 5504, 5632, + 5888, 6144, 6400, 6848, 6912, 7168, + 7424, 8192, + 8960, 9216, + 9472, 10240, 11008, + 11264, 13824, 14336, + 14784, + 14848, 15360, + 18944, 22016, + 22528, 24576, 27392, 27648, + 29568, + 29696, 32000, 32256, 32512, @@ -95,6 +116,8 @@ def _lora_ref_impl( 36864, 43264, 49152, + 60544, + 60672, 64000, 64256, 102400, diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 278acd2dcdb89..135fba23b3685 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -30,7 +30,10 @@ class ModelWithQuantization: ] -def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256): +def do_sample(llm: vllm.LLM, + lora_path: str, + lora_id: int, + max_tokens: int = 256) -> List[str]: raw_prompts = [ "Give me an orange-ish brown color", "Give me a neon pink color", @@ -50,7 +53,7 @@ def format_prompt_tuples(prompt): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 280e0f2043e68..b73cf5bf55324 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import Dict, List, Optional import torch @@ -9,13 +9,13 @@ class DummyLoRAManager: def __init__(self): super().__init__() - self._loras = {} + self._loras: Dict[str, LoRALayerWeights] = {} def set_module_lora(self, module_name: str, lora: LoRALayerWeights): self._loras[module_name] = lora - def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]: - return self._loras.get(module_name, None) + def get_module_lora(self, module_name: str) -> LoRALayerWeights: + return self._loras[module_name] def init_random_lora(self, module_name: str, @@ -68,11 +68,11 @@ def init_packed_lora( module_name: str, input_dim: int, output_dims: List[int], - noop_lora_index: List[int] = None, - rank=8, + noop_lora_index: Optional[List[int]] = None, + rank: int = 8, ): - base_loras = [] - noop_lora_index = set(noop_lora_index or []) + base_loras: List[LoRALayerWeights] = [] + noop_lora_index_set = set(noop_lora_index or []) for i, out_dim in enumerate(output_dims): base_lora = self.init_lora( @@ -80,7 +80,7 @@ def init_packed_lora( input_dim, out_dim, rank=rank, - noop=i in noop_lora_index, + noop=i in noop_lora_index_set, ) base_loras.append(base_lora) packed_lora = PackedLoRALayerWeights.pack(base_loras) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index f2fe000a41a5a..4006d5b4ff68b 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -3,6 +3,7 @@ Note: these tests will only pass on L4 GPU. 
""" import os +from typing import List import pytest from transformers import AutoTokenizer @@ -104,7 +105,7 @@ def test_models(example_prompts, model_name, kv_cache_dtype) -> None: ] params = SamplingParams(max_tokens=20, temperature=0) - generations = [] + generations: List[str] = [] # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. for prompt in formatted_prompts: diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index eaf002a4ee79c..7cbda836c5957 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -44,6 +44,9 @@ ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"), # 8-bit, act_order==True, group_size=32 ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"), + + # 4-bit, act_order==True, group_size=128 + ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main") ] diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 9e288b8d854c0..fcedd695e9035 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -12,7 +12,7 @@ pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) -pytestmark = pytest.mark.llava +pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass HF_IMAGE_PROMPTS = [ diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index df378b0a722e2..ffcaae3242887 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -8,7 +8,7 @@ from ..conftest import IMAGE_FILES -pytestmark = pytest.mark.llava +pytestmark = pytest.mark.vlm if should_skip_test_group(group_name="TEST_MODELS"): pytest.skip("TEST_MODELS=DISABLE, skipping model test group", diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 621be698a1160..c86f238ca788d 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -48,7 +48,7 @@ @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [3]) def test_models( - vllm_runner_nm, + vllm_runner, hf_runner_nm, example_prompts, model: str, @@ -69,9 +69,7 @@ def test_models( del hf_model - vllm_model = vllm_runner_nm(model, - dtype=dtype, - max_model_len=MODEL_MAX_LEN) + vllm_model = vllm_runner(model, dtype=dtype, max_model_len=MODEL_MAX_LEN) vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, num_logprobs) diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py new file mode 100644 index 0000000000000..ad8d178c650eb --- /dev/null +++ b/tests/models/test_phi3v.py @@ -0,0 +1,131 @@ +from typing import List, Tuple + +import pytest +from transformers import AutoTokenizer + +from tests.nm_utils.utils_skip import should_skip_test_group +from vllm.config import VisionLanguageConfig +from vllm.utils import is_cpu + +from ..conftest import IMAGE_FILES + +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping models test group", + allow_module_level=True) + +pytestmark = pytest.mark.vlm + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = [ + "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 + "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", +] + +assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) + + +def iter_phi3v_configs(model_name: str): + 
image_hw_to_feature_size = { + (1008, 1344): 1921, + } + + for (h, w), f in image_hw_to_feature_size.items(): + for input_type, input_shape in [ + (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), + ]: + yield (model_name, + VisionLanguageConfig(image_input_type=input_type, + image_feature_size=f, + image_token_id=32044, + image_input_shape=input_shape, + image_processor=model_name, + image_processor_revision=None)) + + +model_and_vl_config = [ + *iter_phi3v_configs("microsoft/Phi-3-vision-128k-instruct"), +] + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str], + vlm_config: VisionLanguageConfig, model_id: str): + """Sanitize vllm output to be comparable with hf output. + The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, + x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... + It also reduces `output_str` from "bla" to "bla". + """ + input_ids, output_str = vllm_output + image_token_id = vlm_config.image_token_id + + tokenizer = AutoTokenizer.from_pretrained(model_id) + image_token_str = tokenizer.decode(image_token_id) + + hf_input_ids = [ + input_id if input_id != image_token_id else 0 + for idx, input_id in enumerate(input_ids) + ] + hf_output_str = output_str \ + .replace(image_token_str * vlm_config.image_feature_size, "") \ + .replace("", " ").replace("<|user|>", "") \ + .replace("<|end|>\n<|assistant|>", " ") + + return hf_input_ids, hf_output_str + + +target_dtype = "half" +if is_cpu(): + target_dtype = "bfloat16" + + +# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] +# Since we use _attn_implementation="eager" for hf_runner, here is +# numeric difference for longer context and test can't pass +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models(hf_runner, vllm_runner, hf_images, vllm_images, + model_and_config, dtype: str, max_tokens: int) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + model_id, vlm_config = model_and_config + + # use eager mode for hf runner, since phi3_v didn't work with flash_attn + hf_model_kwargs = {"_attn_implementation": "eager"} + with hf_runner(model_id, dtype=dtype, + model_kwargs=hf_model_kwargs) as hf_model: + hf_outputs = hf_model.generate_greedy( + HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images, + eos_token_id=hf_model.processor.tokenizer.eos_token_id) + + vllm_image_prompts = [ + p.replace("<|image_1|>", + "<|image|>" * vlm_config.image_feature_size + "") + for p in HF_IMAGE_PROMPTS + ] + + with vllm_runner(model_id, + max_model_len=2048, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) + + for i in range(len(HF_IMAGE_PROMPTS)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_to_hf_output( + vllm_outputs[i], vlm_config, model_id) + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") diff --git a/tests/models_core/test_llm_logprobs.py b/tests/models_core/test_llm_logprobs.py index 65186aafabf48..e14fb663aa15b 100644 --- a/tests/models_core/test_llm_logprobs.py +++ b/tests/models_core/test_llm_logprobs.py @@ -31,7 +31,7 @@ @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) def test_models( - vllm_runner_nm, + vllm_runner, hf_runner_nm, example_prompts, model: str, @@ -44,7 +44,7 @@ def test_models( del hf_model - vllm_model = vllm_runner_nm(model, max_model_len=MODEL_MAX_LEN) + vllm_model = vllm_runner(model, max_model_len=MODEL_MAX_LEN) vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, num_logprobs) diff --git a/tests/models_core/test_server_logprobs.py b/tests/models_core/test_server_logprobs.py index 023d63e3b93ae..055f4179c1511 100644 --- a/tests/models_core/test_server_logprobs.py +++ b/tests/models_core/test_server_logprobs.py @@ -1,3 +1,5 @@ +# mypy: ignore-errors +# TODO (robertgshaw2-neuralmagic): clean this up import asyncio import gc import os @@ -13,7 +15,6 @@ from tests.conftest import HfRunnerNM from tests.models.compare_utils import check_logprobs_close -from tests.nm_utils.logging import make_logger from tests.nm_utils.server import ServerContext from tests.nm_utils.utils_skip import should_skip_test_group @@ -96,15 +97,12 @@ def test_models_on_server( :param num_logprobs: the total number of logprobs checked for "close enough" :param tensor_parallel_size: passed to the vllm Server launch """ - logger = make_logger("vllm_test") - # Check that we have enough GPUs to run the test. if tensor_parallel_size > 1 and tensor_parallel_size > GPU_COUNT: pytest.skip(f"gpu count {GPU_COUNT} is insufficient for " f"tensor_parallel_size = {tensor_parallel_size}") # Load dataset. 
- logger.info("Loading dataset and converting to chat format.") ds = load_dataset("nm-testing/qa-chat-prompts", split="train_sft").select(range(NUM_SAMPLES_TO_RUN)) messages_list = [row["messages"][:NUM_CHAT_TURNS] for row in ds] @@ -129,7 +127,6 @@ def test_models_on_server( for messages in messages_list ] - logger.info("Generating chat responses from HF transformers.") hf_model = hf_runner_nm(model) hf_outputs = hf_model.generate_greedy_logprobs_nm_use_tokens( input_ids_lst, max_tokens, num_logprobs) @@ -139,7 +136,6 @@ def test_models_on_server( gc.collect() time.sleep(1.0) - logger.info("Generating chat responses from vLLM server.") api_server_args = { "--model": model, "--max-model-len": 4096, @@ -152,7 +148,7 @@ def test_models_on_server( # TODO: Update this to work like the benchmark script. asyncio_event_loop = asyncio.get_event_loop() - with ServerContext(api_server_args, logger=logger) as _: + with ServerContext(api_server_args) as _: chats = [] for messages in messages_list: chats.append( @@ -160,7 +156,6 @@ def test_models_on_server( # Gather results. results = asyncio_event_loop.run_until_complete(asyncio.gather(*chats)) - logger.info("Processing raw data from vLLM server.") vllm_outputs = [] # See https://platform.openai.com/docs/api-reference/chat/create @@ -187,7 +182,6 @@ def test_models_on_server( output_logprobs.append(top_logprobs) vllm_outputs.append((output_tokens, output_str, output_logprobs)) - logger.info("Comparing results.") check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, diff --git a/tests/nm_utils/server.py b/tests/nm_utils/server.py index 989e9c053740a..c395e37702d6f 100644 --- a/tests/nm_utils/server.py +++ b/tests/nm_utils/server.py @@ -1,9 +1,10 @@ -import logging +# mypy: ignore-errors +# TODO (robertgshaw2-neuralmagic): clean this up import os import subprocess import sys import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List import ray import requests @@ -15,10 +16,7 @@ @ray.remote(num_gpus=torch.cuda.device_count()) class ServerRunner: - def __init__(self, - args: List[str], - *, - logger: Optional[logging.Logger] = None): + def __init__(self, args: List[str]): env = os.environ.copy() env["PYTHONUNBUFFERED"] = "1" self.startup_command = [ @@ -69,16 +67,13 @@ class ServerContext: Context manager for the lifecycle of a vLLM server, wrapping `ServerRunner`. 
""" - def __init__(self, args: Dict[str, str], *, - logger: logging.Logger) -> None: + def __init__(self, args: Dict[str, str]) -> None: """Initialize a vLLM server - :param args: dictionary of flags/values to pass to the server command - :param logger: logging.Logger instance to use for logging + :param args: dictionary of flags/values to pass to the server command :param port: port the server is running on """ self._args = self._args_to_list(args) - self._logger = logger self.server_runner = None def __enter__(self): @@ -86,8 +81,7 @@ def __enter__(self): ray.init(ignore_reinit_error=True) try: - self.server_runner = ServerRunner.remote(self._args, - logger=self._logger) + self.server_runner = ServerRunner.remote(self._args, ) ray.get(self.server_runner.ready.remote()) return self.server_runner except Exception as e: diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py index cca1b85d87049..6d9e6594ed9a2 100644 --- a/tests/nm_utils/utils_skip.py +++ b/tests/nm_utils/utils_skip.py @@ -102,6 +102,11 @@ def should_skip_tokenization_test_group(): return TEST_TOKENIZATION == "DISABLE" +def should_skip_tracing_test_group(): + TEST_TRACING = os.getenv("TEST_TRACING", "ENABLE") + return TEST_TRACING == "DISABLE" + + def should_skip_worker_test_group(): TEST_WORKER = os.getenv("TEST_WORKER", "ENABLE") return TEST_WORKER == "DISABLE" @@ -126,6 +131,7 @@ def should_skip_worker_test_group(): "TEST_SPEC_DECODE": should_skip_spec_decode_test_group, "TEST_TENSORIZER_LOADER": should_skip_tensorizer_loader_test_group, "TEST_TOKENIZATION": should_skip_tokenization_test_group, + "TEST_TRACING": should_skip_tracing_test_group, "TEST_WORKER": should_skip_worker_test_group, } diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 7c3be3a1367b2..8b554ebd1b7a9 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -2,9 +2,12 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
""" +from typing import List + import pytest from tests.nm_utils.utils_skip import should_skip_test_group +from vllm.block import PhysicalTokenBlock from vllm.core.block_manager_v1 import CachedBlockAllocator from vllm.utils import Device @@ -49,7 +52,7 @@ def test_block_allocator( def test_eviction(num_blocks: int, ): block_size = 16 block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks) - blocks = [] + blocks: List[PhysicalTokenBlock] = [] for i in range(num_blocks): # use i as the block_hash diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 936f2f0f7830e..9c96d16c81c88 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -10,15 +10,20 @@ from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW4A16, - CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) + CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, + CompressedTensorsW8A8StaticTensor) if should_skip_test_group(group_name="TEST_QUANTIZATION"): pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", allow_module_level=True) -def test_compressed_tensors_w8a8_static_setup(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" +@pytest.mark.parametrize("model_args", [ + ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor"), + ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel"), +]) +def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): + model_path, strategy = model_args with vllm_runner(model_path, enforce_eager=True) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -37,27 +42,32 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner): assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) + assert qkv_proj.scheme.strategy == strategy assert qkv_proj.weight.dtype is torch.int8 assert o_proj.weight.dtype is torch.int8 assert gate_up_proj.weight.dtype is torch.int8 - assert qkv_proj.weight_scale.shard_splitter is not None - assert qkv_proj.weight_scale.logical_widths is not None + if qkv_proj.scheme.strategy == "tensor": + assert qkv_proj.weight_scale.shard_splitter is not None + assert qkv_proj.weight_scale.logical_widths is not None assert qkv_proj.input_scale.dtype is torch.float32 def test_compressed_tensors_no_enforce_eager(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" + model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" with vllm_runner(model_path) as llm: sampling_params = SamplingParams() output = llm.generate("Hello world!", sampling_params=sampling_params) assert output -def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2" - with vllm_runner(model_path, enforce_eager=True, - dtype=torch.float16) as llm: +@pytest.mark.parametrize("model_args", [ + ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), + ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"), +]) +def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): + model_path, strategy = model_args + with vllm_runner(model_path, dtype=torch.float16) as llm: model = 
llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -65,6 +75,7 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) + assert qkv_proj.scheme.strategy == strategy assert qkv_proj.weight.dtype is torch.int8 @@ -88,3 +99,20 @@ def test_compressed_tensors_w4a16(vllm_runner, w4a16_args): assert qkv_proj.weight_packed.dtype is torch.int32 assert qkv_proj.weight_scale.dtype is torch.float16 assert qkv_proj.weight_packed.pack_factor == 8 + + +def test_compressed_tensors_w4a16_marlin24(vllm_runner): + model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" + with vllm_runner(model_path) as llm: + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) + assert qkv_proj.weight_packed.dtype is torch.int32 + + sampling_params = SamplingParams() + output = llm.generate("Hello world!", sampling_params=sampling_params) + assert output diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index db35679b03acb..1ec89258a2ae6 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -4,6 +4,7 @@ """ from dataclasses import dataclass +from typing import Tuple import pytest import torch @@ -59,7 +60,7 @@ class ModelPair: @pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), reason="skip for T4s, requires compute capability 8.0") @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) -def test_auto_gptq(model_arg_exptype: str) -> None: +def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: model_path, quantization_arg, expected_type = model_arg_exptype try: diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 0c92d565d0ddd..29085916afb4d 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -10,5 +10,5 @@ def is_quant_method_supported(quant_method: str) -> bool: capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] - return (capability < + return (capability >= QUANTIZATION_METHODS[quant_method].get_min_capability()) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 9dc0d6dfa7bbd..00e87af0b945f 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -67,21 +69,22 @@ def test_get_prompt_logprobs( for logprobs in result.outputs[0].logprobs: assert len(logprobs) == num_top_logprobs output_text = result.outputs[0].text - output_string_from_most_likely_tokens = [] + output_string_from_most_likely_tokens_lst: List[str] = [] for top_logprobs in result.outputs[0].logprobs: top_logprob = next(iter(top_logprobs.values())) - output_string_from_most_likely_tokens.append( + output_string_from_most_likely_tokens_lst.append( top_logprob.decoded_token) if detokenize: output_string_from_most_likely_tokens = "".join( - output_string_from_most_likely_tokens) + output_string_from_most_likely_tokens_lst) assert output_text == output_string_from_most_likely_tokens, ( "The output text from the top logprob for each token position " "should 
be the same as the output text in the result.") else: assert output_text == '' - assert output_string_from_most_likely_tokens == [None] * max_tokens + assert output_string_from_most_likely_tokens_lst == ([None] * + max_tokens) # The first prompt logprob is always None assert result.prompt_logprobs[0] is None diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index f7ce4d1d0c694..234953f850e22 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -258,8 +258,8 @@ def test_rejection_sampling_approximates_target_distribution( draft_and_target_probs_equal) sample_sizes = [10, 100, 1_000, 10_000, 100_000] - distance_wrt_reference = [] - distance_wrt_target = [] + distance_wrt_reference: List[float] = [] + distance_wrt_target: List[float] = [] for num_samples in sample_sizes: (reference_vs_rejsample_dist, diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 03708e173ea33..c72ec11b5cb0d 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,7 +1,7 @@ # UPSTREAM SYNC: devices need to be passed around to pass multi-gpu automation import itertools import random -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple from unittest.mock import patch import pytest @@ -55,8 +55,8 @@ def _do_sample( sampling_params: SamplingParams, device: str, ): - seq_group_metadata_list = [] - seq_lens = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -218,7 +218,7 @@ def generate_test_case(): batch_size = random.randint(1, 128) expected_penalization = [] - sequence_metadata_list = [] + sequence_metadata_list: List[SequenceGroupMetadata] = [] # 20% chance to generate seq group metadata list with all prompts is_prompt = random.random() < 0.2 while batch_size > 0: @@ -238,8 +238,8 @@ def generate_test_case(): eos_token_id=eos_token_id, stop_token_ids=stop_token_ids) - seq_data = {} - seq_group_penalization = [] + seq_data: Dict[int, SequenceData] = {} + seq_group_penalization: List[bool] = [] for _ in range(num_seqs): num_input = random.randint(1, 100) num_generated = 0 if is_prompt else random.randint(1, 100) @@ -398,17 +398,16 @@ def generate_test_case(): else: test_cases = [generate_test_case()] - def run_test_case(*, - expected_penalization=None, - seq_group_metadata_list=None): + def run_test_case(*, expected_penalization: List[bool], + seq_group_metadata_list: List[SequenceGroupMetadata]): assert expected_penalization, \ "Invalid test case, need expected_penalization" assert seq_group_metadata_list, \ "Invalid test case, need seq_group_metadata_list" batch_size = 0 - seq_lens = [] - sampling_params_per_row = [] + seq_lens: List[int] = [] + sampling_params_per_row: List[SamplingParams] = [] for sgm in seq_group_metadata_list: sampling_params = sgm.sampling_params @@ -478,15 +477,15 @@ def test_sampler_mixed(seed: int, device: str): batch_size = random.randint(1, 256) input_tensor, fake_logits, sampler = _prepare_test(batch_size) - seq_group_metadata_list = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] expected_tokens: List[Optional[List[int]]] = [] - seq_lens = [] + seq_lens: List[int] = [] for i in range(batch_size): expected: Optional[List[int]] = None sampling_type = random.randint(0, 3) if sampling_type == 0: sampling_params = SamplingParams(temperature=0) - expected = 
[torch.argmax(fake_logits[i], dim=-1).item()] + expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] elif sampling_type in (1, 2): n = random.randint(1, 10) sampling_params = SamplingParams( @@ -542,15 +541,18 @@ def test_sampling(): ] continue + expected_tokens_item = expected_tokens[i] + assert expected_tokens_item is not None + for n, nth_output in enumerate(sequence_output.samples): if (metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None): # Ensure exact matches for greedy or random with seed - assert nth_output.output_token == expected_tokens[i][n] + assert nth_output.output_token == expected_tokens_item[n] else: # For non-seeded random check that one of the high-logit # tokens were chosen - assert nth_output.output_token in expected_tokens[i] + assert nth_output.output_token in expected_tokens_item # Test batch test_sampling() @@ -594,8 +596,8 @@ def test_sampler_top_k_top_p(seed: int, device: str): warpers = generation_model._get_logits_warper(generation_config) assert len(warpers) == 2 # top_p and top_k - seq_group_metadata_list = [] - seq_lens = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -628,7 +630,79 @@ def mock_sample(probs, *args, **kwargs): with patch("vllm.model_executor.layers.sampler._sample", mock_sample): sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + assert sample_probs is not None + hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) assert torch.allclose(hf_probs, sample_probs, atol=1e-5) assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_repetition_penalty_mixed(device: str): + + vocab_size = 8 + + def test_sampling_params(sampling_params: List[SamplingParams]): + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(2): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData([1, 2, 3])}, + sampling_params=sampling_params[i], + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + + fake_logits = torch.full((2, vocab_size), + 1e-2, + device=device, + dtype=torch.float16) + + fake_logits[:, 5] = 1.1e-2 + fake_logits[:, 1] = 1.2e-2 + + sampler = MockLogitsSampler(fake_logits) + + sampler_output = sampler(logits=fake_logits, + sampling_metadata=sampling_metadata) + + generated_tokens = [] + for output in sampler_output: + generated_tokens.append(output.samples[0].output_token) + + return generated_tokens + + # one configuration is greedy with repetition_penalty + sampling_params_rep = SamplingParams( + temperature=0.0, + repetition_penalty=2.0, + ) + + # other configuration is sampling w/o repetition_penalty + sampling_params_sample = SamplingParams( + temperature=1.0, + top_k=1, + seed=42, + ) + + tokens1 = test_sampling_params( + [sampling_params_rep, sampling_params_sample]) + + tokens2 = test_sampling_params( + [sampling_params_sample, sampling_params_rep]) + + assert tokens1[0] == tokens2[1] + assert tokens1[1] == tokens2[0] diff --git 
a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py new file mode 100644 index 0000000000000..93cff2c3c9f7d --- /dev/null +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -0,0 +1,469 @@ +"""Tests for the typical acceptance sampler.""" + +import pytest +import torch + +from tests.nm_utils.utils_skip import should_skip_test_group +from vllm.model_executor.layers.typical_acceptance_sampler import ( + TypicalAcceptanceSampler) +from vllm.model_executor.utils import set_random_seed + +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", + allow_module_level=True) + +CUDA_DEVICES = [f"cuda:{i}" for i in range(1)] + + +def get_zero_temperature_prob_dist(batch_size, k, vocab_size): + """ + Generates a fake temperature zero probability distribution. + Returns: + 1. A fake temperature zero probability distribution of shape + [batch_size, k, vocab_size] + 2. Tensor of shape [batch_size, k] containing the token ids + of the probability 1.0 tokens at each position. + """ + # Simulate temperature 0 probability distribution for target probabilities + # and create target probabilities such that only 1 token id has + # probability 1.0 + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + probs = torch.rand(batch_size, k, vocab_size) + _, zero_temperature_token_ids = torch.max(probs, dim=-1) + # set the probability of the tokens with ids in zero_temperature_token_ids + # to 1 and the rest to 0. + target_probs = torch.zeros_like(probs).scatter_( + -1, zero_temperature_token_ids.unsqueeze(-1), 1.0) + return target_probs, zero_temperature_token_ids + + +def get_draft_token_ids(batch_size: int, k: int, vocab_size: int, + token_ids_to_exclude: torch.Tensor): + """ + Returns a tensor of shape [batch_size, k] of fake draft token ids + drawn randomly from a vocab of size vocab_size. We however ensure + that token_ids from token_ids_to_exclude are excluded at the + corresponding positions. + """ + draft_token_ids = torch.empty(batch_size, k, dtype=torch.long) + for i in range(batch_size): + for j in range(k): + # Generate a random token ID excluding token_ids_to_exclude[i, j] + while True: + token_id = torch.randint(0, vocab_size, (1, )).item() + if token_id != token_ids_to_exclude[i, j]: + draft_token_ids[i, j] = token_id + break + return draft_token_ids + + +@pytest.mark.parametrize("k", list(range(1, 6))) +@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) +@pytest.mark.parametrize("batch_size", list(range(1, 32))) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, + device: str): + """ + Tests that the TypicalAcceptanceSampler forward succeeds for + different combinations of k, vocab_size, batch_size and num devices. + """ + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler() + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + # Verify that sampling succeeds for all cases.
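+ # Shapes: target_probs is [batch_size, k, vocab_size], bonus_token_ids is [batch_size, 1] and draft_token_ids is [batch_size, k].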
+ typical_acceptance_sampler(target_probs, bonus_token_ids, draft_token_ids) + + +@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) +@pytest.mark.parametrize("which_token_ids", + ["bonus_token_ids", "draft_token_ids"]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_raises_when_vocab_oob(above_or_below_vocab_range: str, + which_token_ids: str, device: str): + """ + Tests that we throw an exception if the token ids fall outside + the bounds of the provided vocabulary. + """ + k = 3 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler(strict_mode=True) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + # Verify that appropriate exceptions are thrown for out + # of bound vocabs. + oob_token_ids = None + if which_token_ids == "bonus_token_ids": + oob_token_ids = bonus_token_ids + elif which_token_ids == "draft_token_ids": + oob_token_ids = draft_token_ids + else: + raise AssertionError() + + if above_or_below_vocab_range == "above": + rogue_token_id = vocab_size + 1 + elif above_or_below_vocab_range == "below": + rogue_token_id = -1 + else: + raise AssertionError() + + oob_token_ids[0][0] = rogue_token_id + + with pytest.raises(AssertionError): + typical_acceptance_sampler(target_probs, bonus_token_ids, + draft_token_ids) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_uniform_target_distribution_accepts_all_tokens( + seed: int, disable_bonus_tokens: bool, device: str): + """ + Test the TypicalAcceptanceSampler with a uniform target probability + distribution. + + This test verifies that when provided with a uniform target probability + distribution, the TypicalAcceptanceSampler accepts all draft tokens. The + entropy of the uniform target distribution being high should lead to all + draft tokens being accepted. The test also ensures that the behavior + regarding bonus tokens is consistent with the `disable_bonus_tokens` + flag. + """ + set_random_seed(seed) + k = 3 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + # We are using a uniform target probability distribution. + # For a uniform distribution the entropy is very high and it + # should lead to all draft tokens being accepted. Verify that.
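+ # The sampler returns a [batch_size, k + 1] tensor: here the first k columns should equal the draft tokens, and the last column holds the bonus token, or -1 when bonus tokens are disabled.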
+ assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + if disable_bonus_tokens: + assert torch.all(output_token_ids[:, -1] == -1) + else: + assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze()) + + assert torch.all(output_token_ids[:, :k] == draft_token_ids) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_temperature_zero_target_distribution(seed: int, + disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler with a zero-temperature target + probability distribution. + + This test verifies that when using a zero-temperature target probability + distribution, where only one token has a probability of 1.0, the + TypicalAcceptanceSampler correctly rejects all draft tokens that do not + match this probability. Additionally, it ensures that when all draft + tokens are rejected, the sampler falls back to greedy sampling to select a + single token from the target distribution. + """ + set_random_seed(seed) + k = 3 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # Simulate temperature 0 probability distribution for target probabilities + # and create target probabilities such that only 1 token id has + # probability 1.0 + target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist( + batch_size, k, vocab_size) + # Populate draft_token_ids such that they exclude the token_ids + # with probability = 1.0 + draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, + zero_temperature_token_ids) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + # The target probability distribution is a temperature zero distribution + # with zero entropy. Since our draft token ids don't match the probability + # 1.0 tokens in the target distribution we will reject all of them and + # fall back to greedy sampling for selecting 1 token for each sequence. + # Verify the same. + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, -1] == -1) + assert torch.all(output_token_ids[:, 0] == zero_temperature_token_ids[:, + 0]) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler with a mixed target probability + distribution. + + This test ensures that the TypicalAcceptanceSampler handles a mixed + target probability distribution correctly. Specifically, it uses a + zero-temperature distribution for some sequences and a uniform + distribution for others. The test verifies that: + + - For sequences with a zero-temperature distribution, only the token + with a probability of 1.0 is accepted, and all other tokens are rejected. + - For sequences with a uniform distribution, all draft tokens are + accepted.
+ - When `disable_bonus_tokens` is False, the bonus tokens are also accepted + for sequences with a uniform distribution. + """ + set_random_seed(seed) + k = 3 + batch_size = 4 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # For sequences 0 and 2 set the distribution to a temperature + # zero distribution. For sequences 1 and 3 set it to a uniform + # distribution. + target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( + batch_size, k, vocab_size)) + draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, + zero_temperature_token_ids) + uniform_probs = torch.rand(2, k, vocab_size, dtype=torch.float32) + target_probs[[1, 3]] = uniform_probs + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + # verify the shape of output_token_ids + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + # For sequences 0 and 2 verify that only 1 token is accepted + # which is the token with probability 1.0 in the target distribution + # at position 0. + assert torch.all(output_token_ids[[0, 2], 1:] == -1) + assert (torch.all(output_token_ids[[0, 2], + 0] == zero_temperature_token_ids[[0, 2], + 0])) + # For sequences 1 and 3 verify that all tokens are accepted since the + # target probability distribution is uniform. In addition verify that + # if disable_bonus_tokens is false then we also accept the bonus tokens. + assert torch.all( + output_token_ids[[1, 3], :-1] == draft_token_ids[[1, 3], :]) + if disable_bonus_tokens: + assert torch.all(output_token_ids[[1, 3], -1] == -1) + else: + assert torch.all(output_token_ids[[1, 3], -1] != -1) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler's behavior when only a subset of draft + tokens should be accepted. + + This test verifies that the TypicalAcceptanceSampler correctly accepts or + rejects draft tokens based on a zero-temperature target probability + distribution. Specifically, it ensures that: + + - When all draft tokens match tokens with a probability of 1.0 in the + target distribution, all draft tokens are accepted. + - When only some draft tokens match tokens with a probability of 1.0 in + the target distribution, only those matching tokens are accepted, and the + rest are rejected. + """ + set_random_seed(seed) + k = 5 + batch_size = 1 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # Create a temperature zero target probability distribution and ensure + # all draft token ids correspond to the tokens with 1.0 probability. + # Verify that all of them are accepted. 
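+ # get_zero_temperature_prob_dist returns both the one-hot target distribution and the ids of its probability-1.0 tokens, which are reused directly as the draft tokens for this first check.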
+ target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( + batch_size, k, vocab_size)) + draft_token_ids = zero_temperature_token_ids + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) + if disable_bonus_tokens: + assert torch.all(output_token_ids[:, -1] == -1) + else: + assert torch.all(output_token_ids[:, -1] == bonus_token_ids) + # Next only keep the first 2 draft tokens same as the zero temperature + # tokens. For the remaining 3 choose some other tokens. In the + # response we will expect the first 2 tokens to be the same as the + # draft tokens and the rest as -1 + draft_token_ids_to_replace = get_draft_token_ids( + batch_size, k, vocab_size, zero_temperature_token_ids) + draft_token_ids = torch.cat( + (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, :2] == draft_token_ids[:, :2]) + assert torch.all(output_token_ids[:, -3:] == -1) + + +@pytest.mark.parametrize("seed", list(range(1))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_accept_tokens_set_non_default_posteriors(seed: int, + disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler with custom posterior thresholds and + alpha values. This test verifies that by modifying the posterior + thresholds and alpha values we can change the acceptance behavior of the + sampler. + """ + set_random_seed(seed) + k = 5 + batch_size = 1 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # Simulate temperature 0 probability distribution for target + # probabilities and create target probabilities such that only 1 token + # id has probability 1.0 and others have a very low probability of + # 0.00001. Populate draft_token_ids such that they exclude the token_ids + # with probability = 1.0. Without any changes to the posterior thresholds + # none of the draft tokens are accepted. + target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( + batch_size, k, vocab_size)) + target_probs[target_probs == 0] = 0.00001 + draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, + zero_temperature_token_ids) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, 1:-1] == -1) + + # Change the posterior threshold values to 0.0 so that we will + # now accept even draft tokens with very low probability in the + # target distribution. Simulate and verify the same. 
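+ # The thresholds are constructor arguments, so a new sampler instance is created with posterior_threshold=0.0 and posterior_alpha=0.0.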
+ typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, + disable_bonus_tokens=disable_bonus_tokens, + posterior_threshold=0.0, + posterior_alpha=0.0) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) + if disable_bonus_tokens: + assert torch.all(output_token_ids[:, -1] == -1) + else: + assert torch.all(output_token_ids[:, -1] == bonus_token_ids) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_replacement_token_ids(seed: int, disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler's method for generating + replacement token IDs. + + This test verifies that the `_replacement_token_ids` method of the + TypicalAcceptanceSampler correctly identifies the token IDs to be used + as replacements based on the target probability distribution. + Specifically, it ensures that the method correctly identifies the + tokens with the highest probability for each sequence in the batch. + """ + set_random_seed(seed) + k = 10 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + expected_replacement_tokens = -torch.ones( + (batch_size, k), dtype=torch.long) + expected_replacement_tokens[:, 0] = torch.argmax(target_probs[:, 0, :], + dim=1) + actual_replacement_tokens = ( + typical_acceptance_sampler._replacement_token_ids(target_probs)) + assert torch.all(expected_replacement_tokens == actual_replacement_tokens) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index f8a6de54653c1..60dfe33f2918b 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,5 +1,4 @@ import asyncio -import time from itertools import cycle from typing import Dict, List, Optional, Tuple, Union @@ -7,12 +6,6 @@ import ray import torch -from vllm.utils import is_hip - -if (not is_hip()): - from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, - nvmlInit) - from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -26,6 +19,7 @@ from vllm.utils import Counter, random_uuid from ...conftest import cleanup +from ...utils import wait_for_gpu_memory_to_clear class AsyncLLM: @@ -118,16 +112,17 @@ def generate( raise ValueError("The lengths of prompts and " "sampling_params must be the same.") - async def get_output(prompt, sampling_param) -> str: + async def get_output(prompt, sampling_param) -> RequestOutput: request_id = random_uuid() results_generator = self.llm_engine.generate( prompt, sampling_param, request_id) final_output = None async for request_output in results_generator: final_output = request_output + assert final_output is not None return final_output - outputs = [] + outputs: List[RequestOutput] = [] try: for i in range(num_requests): prompt = prompts[i] if prompts is not None else None @@ -208,8 +203,8 @@ def maybe_assert_ngram_worker(llm): 
def get_output_from_llm_generator( llm_generator, prompts, sampling_params) -> Tuple[List[str], List[List[int]]]: - tokens = [] - token_ids = [] + tokens: List[str] = [] + token_ids: List[List[int]] = [] for llm in llm_generator(): maybe_assert_ngram_worker(llm) @@ -290,38 +285,3 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, print(f'{i=} {baseline_token_ids=}') print(f'{i=} {spec_token_ids=}') assert baseline_token_ids == spec_token_ids - - -def wait_for_gpu_memory_to_clear(devices: List[int], - threshold_bytes: int, - timeout_s: float = 120) -> None: - # Use nvml instead of pytorch to reduce measurement error from torch cuda - # context. - nvmlInit() - start_time = time.time() - while True: - output = {} - output_raw = {} - for device in devices: - dev_handle = nvmlDeviceGetHandleByIndex(device) - mem_info = nvmlDeviceGetMemoryInfo(dev_handle) - gb_used = mem_info.used / 2**30 - output_raw[device] = gb_used - output[device] = f'{gb_used:.02f}' - - print('gpu memory used (GB): ', end='') - for k, v in output.items(): - print(f'{k}={v}; ', end='') - print('') - - dur_s = time.time() - start_time - if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()): - print(f'Done waiting for free GPU memory on devices {devices=} ' - f'({threshold_bytes/2**30=}) {dur_s=:.02f}') - break - - if dur_s >= timeout_s: - raise ValueError(f'Memory of devices {devices=} not free after ' - f'{dur_s=:.02f} ({threshold_bytes/2**30=})') - - time.sleep(5) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 0b9ebe4e63556..0346e9e333e10 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -43,14 +45,14 @@ def test_get_token_ids_to_score(k: int): device='cuda', ) - expected_output = [ + expected_output: List[List[int]] = [ [], ] for i in range(proposal_token_ids.shape[0]): expected_output.append(proposal_token_ids[:i + 1].tolist()) scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access + actual_output = scorer._get_token_ids_to_score(proposal_token_ids.tolist()) # pylint: disable=protected-access actual_output = [ x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 358aecca926d4..d83bf75e078d8 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,4 +1,5 @@ import random +from typing import Dict, List from unittest.mock import MagicMock import pytest @@ -6,7 +7,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, Logprob, SamplerOutput from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker @@ -215,7 +216,7 @@ def test_same_output_for_multi_step(): # Run single-step repeatedly. 
zero_kv_cache(worker.cache_engine) - single_step_output = [] + single_step_output: List[SamplerOutput] = [] continuations = [[1] for _ in prompts] set_random_seed(seed) @@ -237,11 +238,15 @@ def test_same_output_for_multi_step(): continuations[i].append(seq_group_output.samples[0].output_token) # Get token ids and logprobs for comparison. - multi_step_output_logprobs = [[] for _ in prompts] - single_step_output_logprobs = [[] for _ in prompts] - - multi_step_output_token_ids = [[] for _ in prompts] - single_step_output_token_ids = [[] for _ in prompts] + multi_step_output_logprobs: List[List[Dict[int, + Logprob]]] = [[] + for _ in prompts] + single_step_output_logprobs: List[List[Dict[int, + Logprob]]] = [[] + for _ in prompts] + + multi_step_output_token_ids: List[List[int]] = [[] for _ in prompts] + single_step_output_token_ids: List[List[int]] = [[] for _ in prompts] for i, _ in enumerate(prompts): for multi_step, single_step in zip(multi_step_output, single_step_output): diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 4c098246ab1a4..c896712fddde0 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,5 +1,6 @@ import random from types import SimpleNamespace +from typing import Dict, List from unittest.mock import MagicMock import pytest @@ -8,7 +9,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) @@ -108,7 +109,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k)) - seen_contexts = [] + seen_contexts: List[List[int]] = [] call_args_list = target_worker.execute_model.call_args_list assert len(call_args_list) == 1 @@ -121,7 +122,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): for seq_data in seq_group_metadata.seq_data.values(): seen_contexts.append(seq_data.get_token_ids()) - expected_seen_contexts = [] + expected_seen_contexts: List[List[int]] = [] for prompt, prev_generated, draft_tokens in zip( prompts, prev_output_tokens, proposal_token_ids.tolist()): @@ -315,8 +316,14 @@ def test_correctly_formats_output(k: int, batch_size: int): next(iter(seq_group_metadata.seq_data.keys())) for seq_group_metadata in seq_group_metadata_list ] - actual_output_by_seq = {seq_id: [] for seq_id in seq_ids} - expected_output_by_seq = {seq_id: [] for seq_id in seq_ids} + actual_output_by_seq: Dict[int, List[SequenceOutput]] = { + seq_id: [] + for seq_id in seq_ids + } + expected_output_by_seq: Dict[int, List[SequenceOutput]] = { + seq_id: [] + for seq_id in seq_ids + } for step in output: for seq_group in step: @@ -454,7 +461,9 @@ def test_k_equals_zero(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + sampler_output = MagicMock(spec=SamplerOutput) + sampler_output.hidden_states = None + target_worker.execute_model.return_value = [sampler_output] draft_worker.device = 'cuda' target_worker.device = 
'cuda' @@ -495,7 +504,9 @@ def test_empty_input_batch(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + sampler_output = MagicMock(spec=SamplerOutput) + sampler_output.hidden_states = None + target_worker.execute_model.return_value = [sampler_output] draft_worker.device = 'cuda' target_worker.device = 'cuda' diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index bdc72346ab011..118d789aa9a0b 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -3,8 +3,8 @@ import pytest from tests.nm_utils.utils_skip import should_skip_test_group -from vllm.sequence import SequenceGroupMetadata -from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len +from vllm.sequence import SequenceGroupMetadata, get_all_seq_ids +from vllm.spec_decode.util import split_batch_by_proposal_len if should_skip_test_group(group_name="TEST_SPEC_DECODE"): pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index d52b22c30bd43..ce5b347832c30 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,5 +1,7 @@ from itertools import count -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import TypeVar, Union from unittest.mock import MagicMock import torch @@ -14,6 +16,8 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker +T = TypeVar("T", bound=Worker) + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size @@ -56,13 +60,13 @@ def zero_kv_cache(cache_engine: CacheEngine): value_blocks.zero_() -def create_worker(cls: type, +def create_worker(cls: Callable[..., T], model_name: str, block_size: int, num_gpu_blocks: int, seed: int, is_driver_worker: bool = True, - enforce_eager: bool = True): + enforce_eager: bool = True) -> T: engine_args = EngineArgs( model=model_name, seed=seed, @@ -159,8 +163,8 @@ def assert_logprobs_dict_allclose( def create_sampler_output_list( token_ids: torch.Tensor, - probs: Iterable[Optional[torch.Tensor]], - logprobs: Iterable[Optional[torch.Tensor]], + probs: GenericSequence[Optional[torch.Tensor]], + logprobs: GenericSequence[Optional[torch.Tensor]], seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: num_steps, batch_size = token_ids.shape token_ids_by_step = token_ids.tolist() diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 0fbe3dae1ff08..fe413d1228021 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -51,7 +51,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, max_input_length=None, ) - hashes = [] + hashes: List[List[List[int]]] = [] for prefix in prefixes: for lora_int_id in concurrent_lora_int_ids: diff --git a/tests/test_logger.py b/tests/test_logger.py index 74f1125fb37c9..52aa73761fd68 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -47,6 +47,7 @@ def test_default_vllm_root_logger_configuration(): assert not logger.propagate handler = logger.handlers[0] + assert isinstance(handler, logging.StreamHandler) assert handler.stream == sys.stdout assert handler.level == logging.INFO diff --git 
a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index a48cfe6fed01f..4ded04e4d96ad 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -158,8 +158,8 @@ def test_decode_sequence_logprobs(complete_sequence: str, # Run sequentially. seq = create_sequence() dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids) - sequential_logprobs_text_chosen_token = [] - sequential_logprobs_text_other_token = [] + sequential_logprobs_text_chosen_token: List[str] = [] + sequential_logprobs_text_other_token: List[str] = [] for new_token, logprobs in zip(complete_sequence_token_ids, dummy_logprobs): seq.append_token_id(new_token, logprobs) diff --git a/tests/tracing/__init__.py b/tests/tracing/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py new file mode 100644 index 0000000000000..d66fe87d8c4f9 --- /dev/null +++ b/tests/tracing/test_tracing.py @@ -0,0 +1,124 @@ +import os +import threading +from concurrent import futures +from typing import Callable, Dict, Iterable, Literal + +import grpc +import pytest +from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ( + ExportTraceServiceResponse) +from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import ( + TraceServiceServicer, add_TraceServiceServicer_to_server) +from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue +from opentelemetry.sdk.environment_variables import ( + OTEL_EXPORTER_OTLP_TRACES_INSECURE) + +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_TRACING"): + pytest.skip("TEST_TRACING=DISABLE, skipping tracing test group", + allow_module_level=True) + +from vllm import LLM, SamplingParams +from vllm.tracing import SpanAttributes + +FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" + +FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value', + 'array_value'] + + +def decode_value(value: AnyValue): + field_decoders: Dict[FieldName, Callable] = { + "bool_value": (lambda v: v.bool_value), + "string_value": (lambda v: v.string_value), + "int_value": (lambda v: v.int_value), + "double_value": (lambda v: v.double_value), + "array_value": + (lambda v: [decode_value(item) for item in v.array_value.values]), + } + for field, decoder in field_decoders.items(): + if value.HasField(field): + return decoder(value) + raise ValueError(f"Couldn't decode value: {value}") + + +def decode_attributes(attributes: Iterable[KeyValue]): + return {kv.key: decode_value(kv.value) for kv in attributes} + + +class FakeTraceService(TraceServiceServicer): + + def __init__(self): + self.request = None + self.evt = threading.Event() + + def Export(self, request, context): + self.request = request + self.evt.set() + return ExportTraceServiceResponse() + + +@pytest.fixture +def trace_service(): + """Fixture to set up a fake gRPC trace service""" + server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + service = FakeTraceService() + add_TraceServiceServicer_to_server(service, server) + server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS) + server.start() + + yield service + + server.stop(None) + + +@pytest.mark.skip( + reason="NM AUTOMATION: this fails in python 3.8. 
Work to re-enable.") +def test_traces(trace_service): + os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" + + sampling_params = SamplingParams(temperature=0.01, + top_p=0.1, + max_tokens=256) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) + + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") + + attributes = decode_attributes(trace_service.request.resource_spans[0]. + scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + assert attributes.get( + SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + assert attributes.get( + SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get( + SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of + assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time diff --git a/tests/utils.py b/tests/utils.py index c84364d20fc63..bc30515c83100 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,7 @@ import time import warnings from contextlib import contextmanager -from typing import List +from typing import Dict, List import openai import ray @@ -13,7 +13,11 @@ from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.utils import get_open_port +from vllm.utils import get_open_port, is_hip + +if (not is_hip()): + from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, + nvmlInit) # Path to root of repository so that utilities can be imported by ray workers VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) @@ -79,7 +83,7 @@ def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: self.host = str(args.host or 'localhost') self.port = int(args.port) - self._runner = self._RemoteRunner.remote( + self._runner = self._RemoteRunner.remote( # type: ignore cli_args, wait_url=self.url_for("health"), wait_timeout=self.MAX_SERVER_START_WAIT_S) @@ -154,3 +158,38 @@ def error_on_warning(): warnings.simplefilter("error") yield + + +def wait_for_gpu_memory_to_clear(devices: List[int], + threshold_bytes: int, + timeout_s: float = 120) -> None: + # Use nvml instead of pytorch to reduce measurement error from torch cuda + # context. 
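+ # Poll per-device memory usage via NVML until every device drops to or below threshold_bytes, or raise once timeout_s has elapsed.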
+ nvmlInit() + start_time = time.time() + while True: + output: Dict[int, str] = {} + output_raw: Dict[int, float] = {} + for device in devices: + dev_handle = nvmlDeviceGetHandleByIndex(device) + mem_info = nvmlDeviceGetMemoryInfo(dev_handle) + gb_used = mem_info.used / 2**30 + output_raw[device] = gb_used + output[device] = f'{gb_used:.02f}' + + print('gpu memory used (GB): ', end='') + for k, v in output.items(): + print(f'{k}={v}; ', end='') + print('') + + dur_s = time.time() - start_time + if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()): + print(f'Done waiting for free GPU memory on devices {devices=} ' + f'({threshold_bytes/2**30=}) {dur_s=:.02f}') + break + + if dur_s >= timeout_s: + raise ValueError(f'Memory of devices {devices=} not free after ' + f'{dur_s=:.02f} ({threshold_bytes/2**30=})') + + time.sleep(5) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index b048d9f7e1d9d..62c0bf5fbc518 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -40,8 +42,8 @@ def test_prepare_prompt(batch_size): enable_chunked_prefill=False, ) - seq_lens = [] - seq_group_metadata_list = [] + seq_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] block_tables = {0: [1]} for i in range(batch_size): # make sure all tokens fit into one block @@ -156,15 +158,14 @@ def test_prepare_decode_cuda_graph(batch_size): enable_chunked_prefill=False, ) - context_lens = [] - seq_group_metadata_list = [] + context_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] # Assume each seq group finishes prefill. for i in range(batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 context_lens.append(context_len) - seq_data = list(range(context_len)) - seq_data = SequenceData(seq_data) + seq_data = SequenceData(list(range(context_len))) seq_data.update_num_computed_tokens(context_len) # Append one token ID since prefill is finished. seq_data.append_token_id(1, 0) @@ -262,7 +263,7 @@ def test_empty_seq_group(): dtype="float16", enforce_eager=False, ) - seq_group_metadata_list = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input(seq_group_metadata_list) input_tokens, input_positions, attn_metadata, slot_mapping = ( model_input.input_tokens, @@ -315,10 +316,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): ) # Add prefill requests. 
- seq_lens = [] - seq_group_metadata_list = [] - prefill_metadata_list = [] - decode_metadata_list = [] + seq_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + prefill_metadata_list: List[SequenceGroupMetadata] = [] + decode_metadata_list: List[SequenceGroupMetadata] = [] block_tables = {0: [1]} prefill_batch_size = batch_size // 2 decode_batch_size = batch_size - prefill_batch_size diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 2f84b8bde6b57..e050c1172acb5 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -66,6 +66,10 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: torch.ops._C.gelu_new(out, x) +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + torch.ops._C.gelu_quick(out, x) + + # page attention ops def paged_attention_v1( out: torch.Tensor, @@ -212,6 +216,10 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # cutlass +def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool: + return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability) + + def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, out_dtype: Type[torch.dtype]) -> torch.Tensor: @@ -373,7 +381,8 @@ def reshape_and_cache_flash( kv_cache_dtype) -def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor, +def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py new file mode 100644 index 0000000000000..99a875c9b3fb7 --- /dev/null +++ b/vllm/_ipex_ops.py @@ -0,0 +1,244 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +try: + import intel_extension_for_pytorch as ipex +except ImportError as e: + logger.warning("Import error msg: %s", e.msg) + + +class ipex_ops: + + @staticmethod + def _reshape_activation_tensor( + x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + num = x.size(0) + d = x.size(1) // 2 + x = x.reshape(num, 2, d) + x1, x2 = torch.chunk(x, chunks=2, dim=1) + x1 = x1.reshape(num, d) + x2 = x2.reshape(num, d) + return x1, x2 + + def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops._reshape_activation_tensor(x) + ipex.llm.functional.silu_mul(x1, x2, out) + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops._reshape_activation_tensor(x) + ipex.llm.functional.gelu_mul(x1, x2, out, "none") + + def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops._reshape_activation_tensor(x) + ipex.llm.functional.gelu_mul(x1, x2, out, "tanh") + + def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + out.copy_(torch.nn.functional.gelu(x)) + + def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + out.copy_(torch.nn.functional.gelu(x)) + + # TODO add implementation of gelu_quick here + # def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + + def paged_attention_v1( + out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + 
blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + head_mapping = torch.arange( + 0, + num_kv_heads, + device=query.device, + dtype=torch.int32, + ).view(num_kv_heads, + 1).repeat_interleave(num_queries_per_tokens).flatten() + # todo: ipex will refactor namespace + torch.xpu.paged_attention_v1(out, query.contiguous(), + key_cache.view_as(value_cache), + value_cache, head_mapping, scale, + block_tables, context_lens, block_size, + max_context_len, alibi_slopes) + + def paged_attention_v2( + out: torch.Tensor, + exp_sum: torch.Tensor, + max_logits: torch.Tensor, + tmp_out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + head_mapping = torch.arange( + 0, + num_kv_heads, + dtype=torch.int32, + device=query.device, + ).view(num_kv_heads, + 1).repeat_interleave(num_queries_per_tokens).flatten() + # todo: ipex will refactor namespace + torch.xpu.paged_attention_v2(out, exp_sum, max_logits, tmp_out, + query.contiguous(), + key_cache.view_as(value_cache), + value_cache, head_mapping, block_tables, + context_lens, scale, block_size, + max_context_len, alibi_slopes) + + def rotary_embedding( + positions: torch.Tensor, # [batch_size, seq_len] + query: torch.Tensor, # [batch_size, seq_len, num_heads*head_size] + key: torch.Tensor, # [batch_size, seq_len, num_kv_heads*head_size] + head_size: int, + cos_sin_cache: torch.Tensor, # [cos_sin_dim, rot_dim] + is_neox: bool, + ) -> None: + if positions.dim() == 1: + positions = positions.unsqueeze(0) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + + rotary_dim = cos_sin_cache.size(1) + query = query.view(*query.shape[:-1], -1, head_size) + key = key.view(*key.shape[:-1], -1, head_size) + + query_rot = query[..., :rotary_dim] + key_rot = key[..., :rotary_dim] + + cos_sin = cos_sin_cache[positions.long()] + cos, sin = cos_sin.chunk(2, dim=-1) + + if is_neox: + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos, + rotary_dim, is_neox, positions) + + def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, + key: torch.Tensor, head_size: int, + cos_sin_cache: torch.Tensor, is_neox: bool, + rot_dim: int, + cos_sin_cache_offsets: torch.Tensor) -> None: + if positions.dim() == 1: + positions = positions.unsqueeze(0) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + cos_sin_cache_offsets = cos_sin_cache_offsets.view_as(positions) + rotary_dim = cos_sin_cache.size(1) + query = query.view(*query.shape[:-1], -1, head_size) + key = key.view(*key.shape[:-1], -1, head_size) + + query_rot = query[..., :rotary_dim] + key_rot = key[..., :rotary_dim] + + cos_sin = 
cos_sin_cache[torch.add(positions, + cos_sin_cache_offsets).long()] + cos, sin = cos_sin.chunk(2, dim=-1) + + if is_neox: + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + + ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos, + rotary_dim, is_neox, positions) + + def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, + epsilon: float) -> None: + tmp = ipex.llm.functional.rms_norm(input, weight, epsilon) + out.copy_(tmp) + + def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, + weight: torch.Tensor, epsilon: float) -> None: + tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None, + epsilon, True) + input.copy_(tmp) + + def varlen_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + seqlen_q: torch.Tensor, + seqlen_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + pdropout: float, + softmax_scale: float, + zero_tensors: bool, + is_causal: bool, + return_softmax: bool, + gen_: torch.Generator, + ) -> None: + ipex.llm.functional.varlen_attention(query, key, value, out, seqlen_q, + seqlen_k, max_seqlen_q, + max_seqlen_k, pdropout, + softmax_scale, zero_tensors, + is_causal, return_softmax, gen_) + + def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + kv_scale: float, + ) -> None: + assert kv_cache_dtype == "auto" + ipex.llm.modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, slot_mapping) + + @staticmethod + def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], + block_mapping: torch.Tensor) -> None: + torch.xpu.copy_blocks(key_caches, value_caches, block_mapping) + + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, + block_mapping: torch.Tensor) -> None: + torch.xpu.swap_blocks(src, dst, block_mapping) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 300bab72877b8..1c48e2a0bb33d 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -83,7 +83,7 @@ class FlashAttentionMetadata(AttentionMetadata): # |---------------- N iteration ---------------------| # |- tokenA -|......................|-- newTokens ---| # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| + # |-------------------- seq_len ---------------------| # |-- query_len ---| # Maximum query length in the batch. None for decoding. 
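The `_ipex_ops.py` wrappers above delegate most kernels to `ipex.llm.functional`. As a point of reference for what `ipex_ops.rms_norm` is expected to compute, here is a minimal pure-PyTorch sketch of the standard RMSNorm formulation; it is illustrative only and not part of this patch, and the helper name `rms_norm_ref` and the tolerances in the commented usage are assumptions.

```python
import torch


def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor,
                 epsilon: float) -> torch.Tensor:
    # Standard RMSNorm: scale each row by the reciprocal of its
    # root-mean-square, then apply the learned per-channel weight.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + epsilon) * weight


# Hypothetical comparison against the wrapper (requires an XPU/IPEX setup):
# out = torch.empty_like(x)
# ipex_ops.rms_norm(out, x, weight, 1e-6)
# torch.testing.assert_close(out, rms_norm_ref(x, weight, 1e-6),
#                            atol=1e-3, rtol=1e-3)
```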
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py new file mode 100644 index 0000000000000..f09b24f2a0304 --- /dev/null +++ b/vllm/attention/backends/ipex_attn.py @@ -0,0 +1,355 @@ +""" Attention layer with torch scaled_dot_product_attention + and PagedAttention.""" +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch + +from vllm._ipex_ops import ipex_ops +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata) +from vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) + +_PARTITION_SIZE = 512 + + +class IpexAttnBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + return "ipex-attn" + + @staticmethod + def get_impl_cls() -> Type["IpexAttnBackendImpl"]: + return IpexAttnBackendImpl + + @staticmethod + def make_metadata(*args, **kwargs) -> "IpexAttnMetadata": + return IpexAttnMetadata(*args, **kwargs) + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return PagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + PagedAttention.copy_blocks(kv_caches, src_to_dists) + + +@dataclass +class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): + """Metadata for IpexAttnBackend. + """ + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + is_prompt: bool + slot_mapping: torch.Tensor + seq_lens: Optional[List[int]] + seqlen_q: Optional[torch.Tensor] + max_seqlen: Optional[int] + + def __post_init__(self): + # Set during the execution of the first attention op. + # It is a list because it is needed to set per prompt + # when alibi slopes is used. It is because of the limitation + # from xformer API. 
+ # will not appear in the __repr__ and __init__ + self.attn_bias: Optional[List[torch.Tensor]] = None + + @property + def prefill_metadata(self) -> Optional["IpexAttnMetadata"]: + # Currently chunked prefill is not supported + if self.num_decode_tokens == 0: + assert self.num_prefills > 0 + return self + + return None + + @property + def decode_metadata(self) -> Optional["IpexAttnMetadata"]: + # Currently chunked prefill is not supported + if self.num_prefills > 0: + assert self.num_decode_tokens == 0 + return None + + return self + + +class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + ) -> None: + assert blocksparse_params is None, ValueError( + "Torch SPDA does not support block-sparse attention.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.need_mask = (self.alibi_slopes is not None + or self.sliding_window is not None) + + supported_head_sizes = PagedAttention.get_supported_head_sizes() + if head_size not in supported_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {supported_head_sizes}.") + if kv_cache_dtype != "auto": + raise NotImplementedError( + "IPEX backend does not support FP8 KV cache. " + "Please use xFormers backend instead.") + + def split_kv_cache( + self, + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = 1 + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: Optional[torch.Tensor], + attn_metadata: IpexAttnMetadata, # type: ignore + kv_scale: float = 1.0, + ) -> torch.Tensor: + """Forward pass with IPEX varlen_attention and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + assert kv_scale == 1.0 + num_tokens, hidden_size = query.shape + # Reshape the query, key, and value tensors. 
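+ # The views below reshape query to [num_tokens, num_heads, head_size] and key/value to [num_tokens, num_kv_heads, head_size], matching the docstring above.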
+ query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + if kv_cache is not None: + key_cache, value_cache = self.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + ipex_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping.flatten(), + self.kv_cache_dtype, + kv_scale, + ) + + if attn_metadata.is_prompt: + assert attn_metadata.seq_lens is not None + if (kv_cache is None or attn_metadata.block_tables.numel() == 0): + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=1) + value = value.repeat_interleave(self.num_queries_per_kv, + dim=1) + + if attn_metadata.attn_bias is None: + if self.alibi_slopes is not None: + att_masks = _make_alibi_bias( + self.alibi_slopes, query.dtype, + attn_metadata.seq_lens) # type: ignore + elif self.sliding_window is not None: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore + else: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, None, dtype=query.dtype) + attn_metadata.attn_bias = att_masks + + output = torch.empty( + (num_tokens, self.num_heads, self.head_size), + dtype=query.dtype, + device=query.device) + ipex_ops.varlen_attention(query, + key, + value, + output, + attn_metadata.seqlen_q, + attn_metadata.seqlen_q, + attn_metadata.max_seqlen, + attn_metadata.max_seqlen, + pdropout=0.0, + softmax_scale=self.scale, + zero_tensors=False, + is_causal=True, + return_softmax=False, + gen_=None) + else: + # prefix-enabled attention + raise RuntimeError( + "IPEX backend doesn't support prefix decoding.") + + else: + # Decoding run. + max_seq_len = attn_metadata.max_decode_seq_len + output = torch.empty_like(query) + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + # TODO(woosuk): Tune this heuristic. + # For context len > 8192, use V2 kernel to avoid shared memory + # shortage. + use_v1 = (max_seq_len <= 8192 and + (max_num_partitions == 1 or num_seqs * num_heads > 512)) + if use_v1: + # Run PagedAttention V1. + ipex_ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + self.num_kv_heads, + self.scale, + attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, + block_size, + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, + kv_scale, + ) + else: + # Run PagedAttention V2. 
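+ # V2 evaluates attention over partitions of _PARTITION_SIZE tokens into tmp_output, then reduces across partitions using the exp_sums and max_logits buffers allocated below.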
+ assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ipex_ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + self.num_kv_heads, + self.scale, + attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, + block_size, + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, + kv_scale, + ) + + # Reshape the output tensor. + return output.view(-1, self.num_heads * self.head_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + dtype: torch.dtype, + seq_lens: List[int], +) -> List[torch.Tensor]: + attn_biases = [] + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + bias = bias[None, :] - bias[:, None] + + num_heads = alibi_slopes.shape[0] + bias = bias[None, :].repeat((num_heads, 1, 1)) + bias.mul_(alibi_slopes[:, None, None]) + inf_mask = torch.empty( + (1, seq_len, seq_len), + dtype=bias.dtype, + device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1) + attn_biases.append((bias + inf_mask).to(dtype)) + + return attn_biases + + +def _make_sliding_window_bias( + seq_lens: List[int], + window_size: Optional[int], + dtype: torch.dtype, +) -> List[torch.Tensor]: + attn_biases = [] + for seq_len in seq_lens: + tensor = torch.full( + (1, seq_len, seq_len), + dtype=dtype, + fill_value=1, + ) + shift = 0 + mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore + if window_size is not None: + mask = torch.triu(mask, diagonal=shift - window_size + 1) + mask = torch.log(mask) + attn_biases.append(mask.to(dtype)) + + return attn_biases diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 75f2465264ad3..b203c5ec54c92 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -110,7 +110,7 @@ def __init__( raise NotImplementedError("TPU version must be 4 or higher.") self.megacore_mode = None - tpu_type = torch_xla.tpu.get_tp_groupu_env()["TYPE"].lower() + tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower() if not tpu_type.endswith("lite"): if self.num_kv_heads % 2 == 0: self.megacore_mode = "kv_head" diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 4b08cce99afb0..c01e0a0a3a19c 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -245,7 +245,7 @@ def _make_alibi_bias( dtype: torch.dtype, seq_lens: List[int], ) -> List[torch.Tensor]: - attn_biases = [] + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses @@ -271,7 +271,7 @@ def _make_sliding_window_bias( window_size: Optional[int], dtype: torch.dtype, ) -> List[torch.Tensor]: - attn_biases = [] + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: tensor = torch.full( (1, seq_len, seq_len), diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 99a3e88bc07b6..0fecd9f6e610c 100644 --- a/vllm/attention/backends/xformers.py +++ 
b/vllm/attention/backends/xformers.py @@ -431,8 +431,8 @@ def _make_alibi_bias( num_kv_heads: int, dtype: torch.dtype, seq_lens: List[int], -) -> LowerTriangularMaskWithTensorBias: - attn_biases = [] +) -> List[AttentionBias]: + attn_biases: List[AttentionBias] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 8b07fb2d768f5..1d56d87ccd119 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -7,7 +7,7 @@ import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger -from vllm.utils import is_cpu, is_hip, is_tpu +from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu logger = init_logger(__name__) @@ -19,6 +19,7 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() FLASHINFER = enum.auto() PALLAS = enum.auto() + IPEX = enum.auto() @lru_cache(maxsize=None) @@ -58,12 +59,17 @@ def get_attn_backend( ROCmFlashAttentionBackend) return ROCmFlashAttentionBackend elif backend == _Backend.TORCH_SDPA: - # TODO: make XPU backend available here. assert is_cpu(), RuntimeError( "Torch SDPA backend is only used for the CPU device.") logger.info("Using Torch SDPA backend.") from vllm.attention.backends.torch_sdpa import TorchSDPABackend return TorchSDPABackend + elif backend == _Backend.IPEX: + assert is_xpu(), RuntimeError( + "IPEX attention backend is only used for the XPU device.") + logger.info("Using IPEX attention backend.") + from vllm.attention.backends.ipex_attn import IpexAttnBackend + return IpexAttnBackend elif backend == _Backend.FLASHINFER: logger.info("Using Flashinfer backend.") logger.warning("Eager mode is required for the Flashinfer backend. " @@ -107,6 +113,11 @@ def which_attn_to_use( logger.info("Cannot use %s backend on CPU.", selected_backend) return _Backend.TORCH_SDPA + if is_xpu(): + if selected_backend != _Backend.IPEX: + logger.info("Cannot use %s backend on XPU.", selected_backend) + return _Backend.IPEX + if is_tpu(): if selected_backend != _Backend.PALLAS: logger.info("Cannot use %s backend on TPU.", selected_backend) diff --git a/vllm/block.py b/vllm/block.py index 2cc6b947f2255..bd00c07adc0d7 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -1,5 +1,7 @@ """Token blocks.""" -from typing import List +import weakref +from collections import defaultdict +from typing import Dict, List from vllm.utils import Device @@ -7,6 +9,35 @@ DEFAULT_LAST_ACCESSED_TIME = -1 +TokensBlock = List[int] + + +class BlockPool: + """A pool of logical blocks. + When requests come, we create a lot of logical blocks; + when requests are done, we destroy a lot of logical blocks. + It turns out that creating and destroying logical blocks can be expensive, + especially for the `token_ids` field, which is a list of integers. + To avoid this overhead, we use a pool to manage the logical blocks. + When an old request is done and a new request comes, we can reuse the + logical blocks from the old request to feed the new request. 
+ """ + + def __init__(self) -> None: + # block size to list of token blocks + self.pool: Dict[int, List[TokensBlock]] = defaultdict(list) + + def alloc_block(self, block_size: int) -> TokensBlock: + if block_size in self.pool and self.pool[block_size]: + return self.pool[block_size].pop() + return [_BLANK_TOKEN_ID] * block_size + + def del_block(self, block: TokensBlock) -> None: + self.pool[len(block)].append(block) + + +_BLOCK_POOL = BlockPool() + class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. @@ -23,7 +54,13 @@ def __init__( self.block_number = block_number self.block_size = block_size - self.token_ids = [_BLANK_TOKEN_ID] * block_size + self.token_ids = _BLOCK_POOL.alloc_block(block_size) + # this finalizer is used to return the block to the pool when the object is deleted # noqa + # NOTE: don't use __del__ because it cannot guarantee the order of finalization, # noqa + # i.e. `self.token_ids` may be deleted before `self`, and we lose + # the opportunity to return the block to the pool + self._finalizer = weakref.finalize(self, _BLOCK_POOL.del_block, + self.token_ids) self.num_tokens = 0 def is_empty(self) -> bool: diff --git a/vllm/config.py b/vllm/config.py index 403959cb79d22..fb2cbe93ed740 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -10,9 +10,10 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry +from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_tpu) + is_hip, is_neuron, is_tpu, is_xpu) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -261,7 +262,8 @@ def verify_with_parallel_config( self, parallel_config: "ParallelConfig", ) -> None: - total_num_attention_heads = self.hf_text_config.num_attention_heads + total_num_attention_heads = getattr(self.hf_text_config, + "num_attention_heads", 0) tensor_parallel_size = parallel_config.tensor_parallel_size if total_num_attention_heads % tensor_parallel_size != 0: raise ValueError( @@ -269,7 +271,8 @@ def verify_with_parallel_config( " must be divisible by tensor parallel size " f"({tensor_parallel_size}).") - total_num_hidden_layers = self.hf_text_config.num_hidden_layers + total_num_hidden_layers = getattr(self.hf_text_config, + "num_hidden_layers", 0) pipeline_parallel_size = parallel_config.pipeline_parallel_size if total_num_hidden_layers % pipeline_parallel_size != 0: raise ValueError( @@ -334,7 +337,11 @@ def get_total_num_kv_heads(self) -> int: return 1 # For DBRX and MPT - if self.hf_config.model_type in ["dbrx", "mpt"]: + if self.hf_config.model_type == "mpt": + if "kv_n_heads" in self.hf_config.attn_config: + return self.hf_config.attn_config["kv_n_heads"] + return self.hf_config.num_attention_heads + if self.hf_config.model_type == "dbrx": return getattr(self.hf_config.attn_config, "kv_n_heads", self.hf_config.num_attention_heads) @@ -368,8 +375,8 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: def get_num_attention_heads(self, parallel_config: "ParallelConfig") -> int: - return self.hf_text_config.num_attention_heads // \ - parallel_config.tensor_parallel_size + num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) + return num_heads // parallel_config.tensor_parallel_size def get_num_layers(self, parallel_config: "ParallelConfig") -> 
int: total_num_hidden_layers = self.hf_text_config.num_hidden_layers @@ -648,9 +655,14 @@ def __init__( "required for multi-node inference") backend = "ray" elif ray_found: - from ray.util import get_current_placement_group - if self.placement_group or get_current_placement_group(): + if self.placement_group: backend = "ray" + else: + from ray import is_initialized as ray_is_initialized + if ray_is_initialized(): + from ray.util import get_current_placement_group + if get_current_placement_group(): + backend = "ray" self.distributed_executor_backend = backend logger.info("Defaulting to use %s for distributed inference", backend) @@ -784,6 +796,8 @@ def __init__(self, device: str = "auto") -> None: self.device_type = "tpu" elif is_cpu(): self.device_type = "cpu" + elif is_xpu(): + self.device_type = "xpu" else: # We don't call torch.cuda.is_available() here to # avoid initializing CUDA before workers are forked @@ -838,7 +852,8 @@ def maybe_create_spec_config( speculative_model (Optional[str]): The name of the speculative model, if provided. num_speculative_tokens (Optional[int]): The number of speculative - tokens, if provided. + tokens, if provided. Will default to the number in the draft + model config if present, otherwise is required. speculative_max_model_len (Optional[int]): The maximum model len of the speculative model. Used when testing the ability to skip speculation for some sequences. @@ -861,24 +876,18 @@ def maybe_create_spec_config( the necessary conditions are met, else None. """ - if speculative_model is None and num_speculative_tokens is None: + if speculative_model is None: + if num_speculative_tokens is not None: + raise ValueError("num_speculative_tokens was provided without " + "speculative_model.") return None - if speculative_model is not None and num_speculative_tokens is None: - raise ValueError( - "Expected both speculative_model and " - "num_speculative_tokens to be provided, but found " - f"{speculative_model=} and {num_speculative_tokens=}.") - if (speculative_disable_by_batch_size is not None and speculative_disable_by_batch_size < 2): raise ValueError("Expect the batch size threshold of disabling " "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - assert (speculative_model is not None - and num_speculative_tokens is not None) - if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " @@ -932,6 +941,27 @@ def maybe_create_spec_config( max_logprobs=target_model_config.max_logprobs, ) + if (draft_model_config.hf_config.model_type == "mlp_speculator" + and target_parallel_config.world_size != 1): + # MLPSpeculator TP support will be added very soon + raise ValueError( + "Speculative decoding with mlp_speculator models does not " + "yet support distributed inferencing (TP > 1).") + + n_predict = getattr(draft_model_config.hf_config, "n_predict", + None) + if n_predict is not None: + if num_speculative_tokens is None: + # Default to max value defined in draft model config. + num_speculative_tokens = n_predict + elif num_speculative_tokens > n_predict: + # Verify provided value doesn't exceed the maximum + # supported by the draft model. 
+ raise ValueError( + "Expected both speculative_model and " + "num_speculative_tokens to be provided, but found " + f"{speculative_model=} and {num_speculative_tokens=}.") + draft_model_config.max_model_len = ( SpeculativeConfig._maybe_override_draft_max_model_len( speculative_max_model_len, @@ -943,6 +973,12 @@ def maybe_create_spec_config( SpeculativeConfig.create_draft_parallel_config( target_parallel_config)) + if num_speculative_tokens is None: + raise ValueError( + "num_speculative_tokens must be provided with " + "speculative_model unless the draft model config contains an " + "n_predict parameter.") + return SpeculativeConfig( draft_model_config, draft_parallel_config, @@ -1124,6 +1160,8 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " "LoRA is enabled.") + if scheduler_config.chunked_prefill_enabled: + raise ValueError("LoRA is not supported with chunked prefill yet.") @dataclass @@ -1312,7 +1350,10 @@ def _get_and_verify_max_len( derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) - if rope_scaling is not None and rope_scaling["type"] != "su": + # The correct one should be "longrope", kept "su" here + # to be backward compatible + if rope_scaling is not None and rope_scaling["type"] != "su" \ + and rope_scaling["type"] != "longrope": if disable_sliding_window: # TODO(robertgshaw): Find a model that supports rope_scaling # with sliding window to see if this case should be allowed. @@ -1387,6 +1428,17 @@ def __post_init__(self): f"must be one of {valid_guided_backends}") +@dataclass +class ObservabilityConfig: + """Configuration for observability.""" + otlp_traces_endpoint: Optional[str] = None + + def __post_init__(self): + if not is_otel_installed() and self.otlp_traces_endpoint is not None: + raise ValueError("OpenTelemetry packages must be installed before " + "configuring 'otlp_traces_endpoint'") + + @dataclass(frozen=True) class EngineConfig: """Dataclass which contains all engine-related configuration. This @@ -1403,6 +1455,7 @@ class EngineConfig: vision_language_config: Optional[VisionLanguageConfig] speculative_config: Optional[SpeculativeConfig] decoding_config: Optional[DecodingConfig] + observability_config: Optional[ObservabilityConfig] def __post_init__(self): """Verify configs are valid & consistent with each other. diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 26f378ba24b76..d705f3d91a074 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -252,7 +252,7 @@ def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block], token_ids: List[int], device: Device) -> List[Block]: - blocks = [] + blocks: List[Block] = [] for block_token_ids in chunk_list(token_ids, self._block_size): if len(block_token_ids) == self._block_size: # If the block is full, create an immutable block. 
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index d033787122d7a..50f27bab33776 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -111,7 +111,7 @@ def fork(self, last_block: Block) -> List[Block]: """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 405e9705659df..2df7d74e4ff19 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -176,14 +176,17 @@ def allocate_mutable(self, self._refcounter.incr(block_id) - # the block comes from evictor already contain computed result + # Now this block is pop from evictor and ready to write + # with new content which most probably different with + # original content. So need to tell worker to recompute + # its kvcache block = self._create_block( prev_block=prev_block, token_ids=[], block_size=self._block_size, allocator=self, block_id=block_id, - computed=True, + computed=False, ) assert block.content_hash is None @@ -268,7 +271,7 @@ def fork(self, last_block: Block) -> List[Block]: """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: refcount = self._refcounter.incr(block.block_id) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 121092cf189bd..309775237a715 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -260,7 +260,7 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): # at max extend. if self.enable_caching: block_table = self.block_tables[seq.seq_id] - block_ids = [] + block_ids: List[Optional[int]] = [] for block_id in block_table.physical_block_ids: block_ids.append(block_id) self.block_allocator.mark_blocks_as_accessed( diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py new file mode 100644 index 0000000000000..24308235c4a48 --- /dev/null +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -0,0 +1,146 @@ +"""This file is a pure Python wrapper for the cudart library. +It avoids the need to compile a separate shared library, and is +convenient for use when we just need to call a few functions. 
+""" + +import ctypes +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +# this line makes it possible to directly load `libcudart.so` using `ctypes` +import torch # noqa + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +# === export types and functions from cudart to Python === +# for the original cudart definition, please check +# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +cudaError_t = ctypes.c_int +cudaMemcpyKind = ctypes.c_int + + +class cudaIpcMemHandle_t(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + + +@dataclass +class Function: + name: str + restype: Any + argtypes: List[Any] + + +class CudaRTLibrary: + exported_functions = [ + # ​cudaError_t cudaSetDevice ( int device ) + Function("cudaSetDevice", cudaError_t, [ctypes.c_int]), + # cudaError_t cudaDeviceSynchronize ( void ) + Function("cudaDeviceSynchronize", cudaError_t, []), + # ​cudaError_t cudaDeviceReset ( void ) + Function("cudaDeviceReset", cudaError_t, []), + + # const char* cudaGetErrorString ( cudaError_t error ) + Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]), + + # ​cudaError_t cudaMalloc ( void** devPtr, size_t size ) + Function("cudaMalloc", cudaError_t, + [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]), + # ​cudaError_t cudaFree ( void* devPtr ) + Function("cudaFree", cudaError_t, [ctypes.c_void_p]), + # ​cudaError_t cudaMemset ( void* devPtr, int value, size_t count ) + Function("cudaMemset", cudaError_t, + [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]), + # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa + Function("cudaMemcpy", cudaError_t, [ + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind + ]), + + # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa + Function("cudaIpcGetMemHandle", cudaError_t, + [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]), + # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags ) # noqa + Function("cudaIpcOpenMemHandle", cudaError_t, [ + ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint + ]), + ] + + # class attribute to store the mapping from the path to the library + # to avoid loading the same library multiple times + path_to_library_cache: Dict[str, Any] = {} + + # class attribute to store the mapping from library path + # to the corresponding dictionary + path_to_dict_mapping: Dict[str, Dict[str, Any]] = {} + + def __init__(self, so_file: Optional[str] = None): + if so_file is None: + assert torch.version.cuda is not None + major_version = torch.version.cuda.split(".")[0] + so_file = f"libcudart.so.{major_version}" + if so_file not in CudaRTLibrary.path_to_library_cache: + lib = ctypes.CDLL(so_file) + CudaRTLibrary.path_to_library_cache[so_file] = lib + self.lib = CudaRTLibrary.path_to_library_cache[so_file] + + if so_file not in CudaRTLibrary.path_to_dict_mapping: + _funcs = {} + for func in CudaRTLibrary.exported_functions: + f = getattr(self.lib, func.name) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs + self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file] + + def CUDART_CHECK(self, result: cudaError_t) -> None: + if result != 0: + error_str = self.cudaGetErrorString(result) + raise RuntimeError(f"CUDART error: {error_str}") + + def cudaGetErrorString(self, error: cudaError_t) -> str: + return 
self.funcs["cudaGetErrorString"](error).decode("utf-8") + + def cudaSetDevice(self, device: int) -> None: + self.CUDART_CHECK(self.funcs["cudaSetDevice"](device)) + + def cudaDeviceSynchronize(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]()) + + def cudaDeviceReset(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceReset"]()) + + def cudaMalloc(self, size: int) -> ctypes.c_void_p: + devPtr = ctypes.c_void_p() + self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size)) + return devPtr + + def cudaFree(self, devPtr: ctypes.c_void_p) -> None: + self.CUDART_CHECK(self.funcs["cudaFree"](devPtr)) + + def cudaMemset(self, devPtr: ctypes.c_void_p, value: int, + count: int) -> None: + self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count)) + + def cudaMemcpy(self, dst: ctypes.c_void_p, src: ctypes.c_void_p, + count: int) -> None: + cudaMemcpyDefault = 4 + kind = cudaMemcpyDefault + self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind)) + + def cudaIpcGetMemHandle(self, + devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t: + handle = cudaIpcMemHandle_t() + self.CUDART_CHECK(self.funcs["cudaIpcGetMemHandle"]( + ctypes.byref(handle), devPtr)) + return handle + + def cudaIpcOpenMemHandle(self, + handle: cudaIpcMemHandle_t) -> ctypes.c_void_p: + cudaIpcMemLazyEnablePeerAccess = 1 + devPtr = ctypes.c_void_p() + self.CUDART_CHECK(self.funcs["cudaIpcOpenMemHandle"]( + ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess)) + return devPtr diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index c9573edb08f33..d3e41fa710676 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -1,87 +1,102 @@ +import ctypes import json import os +import pickle +import subprocess import sys -import tempfile -import time -from contextlib import contextmanager -from typing import Callable, Dict, List, Optional +from itertools import product +from typing import Dict, List, Optional, Sequence -import torch import torch.distributed as dist import torch.multiprocessing as mp import vllm.envs as envs +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.logger import init_logger from vllm.utils import cuda_device_count_stateless logger = init_logger(__name__) -@contextmanager -def mute_output(): - with open(os.devnull, "w") as f: - sys.stderr = f - sys.stdout = f - yield - - -def producer(i: int, - init_method: str, +def producer(batch_src: Sequence[int], + producer_queue, + consumer_queue, + result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices - with mute_output(): - dist.init_process_group( - backend="gloo", - init_method=init_method, - world_size=2, - rank=0, - ) - # produce a tensor in GPU i - data = torch.zeros((128, ), device=f"cuda:{i}") - # get the information to reconstruct the shared tensor - func, args = torch.multiprocessing.reductions.reduce_tensor(data) - args = list(args) - dist.broadcast_object_list([(func, args)], src=0) - dist.barrier() - torch.cuda.synchronize() - assert torch.all(data == 1).item() - - -def consumer(j: int, - init_method: str, + + lib = CudaRTLibrary() + for i in batch_src: + lib.cudaSetDevice(i) + pointer = lib.cudaMalloc(1024) + lib.cudaMemset(pointer, 1, 1024) + lib.cudaDeviceSynchronize() + handle = 
lib.cudaIpcGetMemHandle(pointer) + producer_queue.put(handle) + open_success = consumer_queue.get() + if open_success: + # use two queues to simulate barrier + producer_queue.put(0) + consumer_queue.get() + # check if the memory is modified + host_data = (ctypes.c_char * 1024)() + lib.cudaMemcpy(host_data, pointer, 1024) # type: ignore + for i in range(1024): + if ord(host_data[i]) != 2: + open_success = False + break + result_queue.put(open_success) + lib.cudaDeviceReset() + + +def consumer(batch_tgt: Sequence[int], + producer_queue, + consumer_queue, + result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices - with mute_output(): - dist.init_process_group( - backend="gloo", - init_method=init_method, - world_size=2, - rank=1, - ) - torch.cuda.set_device(j) - recv = [None] - dist.broadcast_object_list(recv, src=0) - func: Callable - args: List - func, args = recv[0] # type: ignore - # `args[6]` is the device id - # by default pytorch will use `i` from the producer - # here we need to set it to `j` to test P2P access - args[6] = j - data = func(*args) - data += 1 - dist.barrier() - torch.cuda.synchronize() - assert torch.all(data == 1).item() - - -def can_actually_p2p(i, j): + + lib = CudaRTLibrary() + for j in batch_tgt: + lib.cudaSetDevice(j) + handle = producer_queue.get() + open_success = False + try: + pointer = lib.cudaIpcOpenMemHandle(handle) # type: ignore + open_success = True + except RuntimeError: + # cannot error out here, because the producer process + # is still waiting for the response. + pass + consumer_queue.put(open_success) + if open_success: + # modify the memory + lib.cudaMemset(pointer, 2, 1024) + lib.cudaDeviceSynchronize() + # use two queues to simulate barrier + producer_queue.get() + consumer_queue.put(0) + # check if the memory is modified + host_data = (ctypes.c_char * 1024)() + lib.cudaMemcpy(host_data, pointer, 1024) # type: ignore + for i in range(1024): + if ord(host_data[i]) != 2: + open_success = False + break + result_queue.put(open_success) + lib.cudaDeviceReset() + + +def can_actually_p2p( + batch_src: Sequence[int], + batch_tgt: Sequence[int], +) -> Sequence[bool]: """ Usually, checking if P2P access is enabled can be done by - `torch.cuda.can_device_access_peer(i, j)`. However, sometimes - the driver might be broken, and `torch.cuda.can_device_access_peer(i, j)` + `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes + the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)` returns `True` even if P2P access is not actually possible. See https://github.com/vllm-project/vllm/issues/2728 and https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10 @@ -90,41 +105,55 @@ def can_actually_p2p(i, j): Note on p2p and cuda IPC: Usually, one process uses one GPU: - GPU i --> cuda context i --> tensor i --> process i + GPU src --> cuda context src --> tensor src --> process src We need to combine p2p and cuda IPC, so that: - GPU i --> cuda context i --> tensor i --> process i - |shared| - GPU j --> cuda context j --> tensor j --> process j - That is to say, process i creates a tensor in GPU i, passes IPC handle to - process j, and process j accesses the tensor in GPU j. 
Any operation on the - tensor in process j will be reflected in the tensor in process i, because + GPU src --> cuda context src --> tensor src --> process src + |shared| + GPU tgt --> cuda context tgt --> tensor tgt --> process tgt + That is to say, process src creates a tensor in GPU src, passes IPC handle to + process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the + tensor in process tgt will be reflected in the tensor in process src, because they are the same memory segment. - It is important to note that process j accesses the tensor in GPU j, not - GPU i. That's why we need p2p access. # noqa - """ + It is important to note that process tgt accesses the tensor in GPU tgt, not + GPU src. That's why we need p2p access. + + The most time-consuming part is the process creation. To avoid creating + processes for every pair of GPUs, we use batched testing. We create two + processes for testing all pairs of GPUs in batch. The trick is to reset + the device after each test (which is not available in PyTorch). + """ # noqa cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) # pass the CUDA_VISIBLE_DEVICES to the child process # to make sure they see the same set of GPUs - # make sure the temp file is not the same across different calls - temp_path = tempfile.mktemp() + str(time.time()) - # create an empty file - with open(temp_path, "w"): - pass - init_method = f"file://{temp_path}" - # make sure the processes are spawned smp = mp.get_context("spawn") - pi = smp.Process(target=producer, - args=(i, init_method, cuda_visible_devices)) - pj = smp.Process(target=consumer, - args=(j, init_method, cuda_visible_devices)) - pi.start() - pj.start() - pi.join() - pj.join() - return pi.exitcode == 0 and pj.exitcode == 0 + producer_queue = smp.Queue() + consumer_queue = smp.Queue() + result_queue = smp.Queue() + p_src = smp.Process(target=producer, + args=(batch_src, producer_queue, consumer_queue, + result_queue, cuda_visible_devices)) + p_tgt = smp.Process(target=consumer, + args=(batch_tgt, producer_queue, consumer_queue, + result_queue, cuda_visible_devices)) + p_src.start() + p_tgt.start() + p_src.join() + p_tgt.join() + result: List[bool] = [] + for src, tgt in zip(batch_src, batch_tgt): + a = result_queue.get() + b = result_queue.get() + if a != b: + logger.warning( + "Two processes do not agree on the P2P access" + " status on %d -> %d, treat as disabled.", src, tgt) + result.append(False) + else: + result.append(a) + return result # why do we need this cache? 
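To make the batched `can_actually_p2p` flow above easier to follow, here is a CPU-only sketch of its process/queue choreography: one producer and one consumer walk the whole batch, hand items across a queue, simulate a barrier with a pair of queues, and both report a per-item verdict that the parent only trusts when the two sides agree. The CUDA IPC calls are replaced by plain integers, so this is illustrative rather than a drop-in test.

```python
import multiprocessing as mp
from typing import Sequence


def producer(batch: Sequence[int], to_consumer, to_producer, results) -> None:
    for item in batch:
        to_consumer.put(item)      # stands in for sending the IPC handle
        ok = to_producer.get()     # did the consumer manage to "open" it?
        if ok:
            to_consumer.put(0)     # two queues simulate a barrier
            to_producer.get()
        results.put(ok)


def consumer(batch: Sequence[int], to_consumer, to_producer, results) -> None:
    for item in batch:
        ok = to_consumer.get() == item  # stands in for cudaIpcOpenMemHandle
        to_producer.put(ok)
        if ok:
            to_consumer.get()      # barrier
            to_producer.put(0)
        results.put(ok)


if __name__ == "__main__":
    ctx = mp.get_context("spawn")
    to_consumer, to_producer, results = ctx.Queue(), ctx.Queue(), ctx.Queue()
    batch = [0, 1, 2]
    procs = [
        ctx.Process(target=producer,
                    args=(batch, to_consumer, to_producer, results)),
        ctx.Process(target=consumer,
                    args=(batch, to_consumer, to_producer, results)),
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    for item in batch:
        a, b = results.get(), results.get()
        # mirror the caller: only trust the answer when both sides agree
        print(f"{item}: {'ok' if a == b and a else 'disabled'}")
```
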
@@ -142,14 +171,14 @@ def can_actually_p2p(i, j): _gpu_p2p_access_cache: Optional[Dict[str, bool]] = None -def gpu_p2p_access_check(i: int, j: int) -> bool: - """Check if GPU i can access GPU j.""" +def gpu_p2p_access_check(src: int, tgt: int) -> bool: + """Check if GPU src can access GPU tgt.""" # if the cache variable is already calculated, # read from the cache instead of checking it again global _gpu_p2p_access_cache if _gpu_p2p_access_cache is not None: - return _gpu_p2p_access_cache[f"{i}->{j}"] + return _gpu_p2p_access_cache[f"{src}->{tgt}"] is_distributed = dist.is_initialized() @@ -168,10 +197,31 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: # only the local master process (with local_rank == 0) can # enter this block to calculate the cache logger.info("generating GPU P2P access cache in %s", path) - cache = {} - for _i in range(num_dev): - for _j in range(num_dev): - cache[f"{_i}->{_j}"] = can_actually_p2p(_i, _j) + cache: Dict[str, bool] = {} + ids = list(range(num_dev)) + # batch of all pairs of GPUs + batch_src, batch_tgt = zip(*list(product(ids, ids))) + # NOTE: we use `subprocess` rather than `multiprocessing` here + # because the caller might not have `if __name__ == "__main__":`, + # in that case we cannot use spawn method in multiprocessing. + # However, `can_actually_p2p` requires spawn method. + # The fix is, we use `subprocess` to call the function, + # where we have `if __name__ == "__main__":` in this file. + input_bytes = pickle.dumps((batch_src, batch_tgt)) + returned = subprocess.run([sys.executable, __file__], + input=input_bytes, + capture_output=True) + # check if the subprocess is successful + try: + returned.check_returncode() + except Exception as e: + # wrap raised exception to provide more information + raise RuntimeError( + f"Error happened when batch testing " + f"peer-to-peer access from {batch_src} to {batch_tgt}") from e + result = pickle.loads(returned.stdout) + for _i, _j, r in zip(batch_src, batch_tgt, result): + cache[f"{_i}->{_j}"] = r with open(path, "w") as f: json.dump(cache, f, indent=4) if is_distributed: @@ -180,7 +230,12 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: with open(path, "r") as f: cache = json.load(f) _gpu_p2p_access_cache = cache - return _gpu_p2p_access_cache[f"{i}->{j}"] + return _gpu_p2p_access_cache[f"{src}->{tgt}"] __all__ = ["gpu_p2p_access_check"] + +if __name__ == "__main__": + batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read()) + result = can_actually_p2p(batch_src, batch_tgt) + sys.stdout.buffer.write(pickle.dumps(result)) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 50d6719fbfe62..7619c98f22148 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -205,7 +205,7 @@ def __init__(self, so_file: Optional[str] = None): raise e if so_file not in NCCLLibrary.path_to_dict_mapping: - _funcs = {} + _funcs: Dict[str, Any] = {} for func in NCCLLibrary.exported_functions: f = getattr(self.lib, func.name) f.restype = func.restype diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py new file mode 100644 index 0000000000000..c44bd2f11ee8b --- /dev/null +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -0,0 +1,281 @@ +import pickle +import time +from contextlib import contextmanager +from multiprocessing import shared_memory +from typing import Optional +from 
unittest.mock import patch + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +import vllm.envs as envs +from vllm.logger import init_logger + +VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL + +logger = init_logger(__name__) + + +class ShmRingBuffer: + + def __init__(self, + n_reader: int, + max_chunk_bytes: int, + max_chunks: int, + name: Optional[str] = None): + """ + A shared memory ring buffer implementation for broadcast communication. + Essentially, it is a queue where only one will `enqueue` and multiple + will `dequeue`. The max size of each item, together with the max number + of items that can be stored in the buffer are known in advance. + In this case, we don't need to synchronize the access to + the buffer. + + Buffer memory layout: + data metadata + | | + | (current_idx) | (current_idx) + v v + +-------------------------------+----------------------------------------+ + | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata | + +-------------------------------+----------------------------------------+ + | max_chunks x max_chunk_bytes | max_chunks x (1 + n_reader) bytes | + + metadata memory layout: each byte is a flag, the first byte is the written + flag, and the rest are reader flags. The flags are set to 0 by default. + +--------------+--------------+--------------+-----+--------------+ + | written_flag | reader0_flag | reader1_flag | ... | readerN_flag | + +--------------+--------------+--------------+-----+--------------+ + + The state of metadata is as follows: + + (case 1) 0???...???: the block is not written yet, cannot read, can write + (case 2) 1000...000: the block is just written, can read, cannot write + (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write + (case 4) 1111...111: the block is written and read by all readers, cannot read, can write + + State transition for readers: + + When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read. + Only after the caller finishes reading the block, the reader can mark the block as read. + Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0). + + State transition for writer: + + When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case + to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer + can reset the reader flags to 0, and mark the block as written (from 0 to 1). + NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct. + + During creation, `name` is None and the buffer is created. We can pass the + created object to other processes by pickling it. The other processes will + get the name of the shared memory and open it, so that they can access the + same shared memory buffer. 
+ """# noqa + self.n_reader = n_reader + self.metadata_size = 1 + n_reader + self.max_chunk_bytes = max_chunk_bytes + self.max_chunks = max_chunks + self.total_bytes_of_buffer = (self.max_chunk_bytes + + self.metadata_size) * self.max_chunks + self.data_offset = 0 + self.metadata_offset = self.max_chunk_bytes * self.max_chunks + + if name is None: + # we are creating a buffer + self.is_creator = True + self.shared_memory = shared_memory.SharedMemory( + create=True, size=self.total_bytes_of_buffer) + # initialize the metadata section to 0 + with memoryview(self.shared_memory.buf[self.metadata_offset:] + ) as metadata_buffer: + torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0) + else: + # we are opening an existing buffer + self.is_creator = False + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. + with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + self.shared_memory = shared_memory.SharedMemory(name=name) + assert self.shared_memory.size == self.total_bytes_of_buffer + + def __reduce__(self): + return ( + self.__class__, + (self.n_reader, self.max_chunk_bytes, self.max_chunks, + self.shared_memory.name), + ) + + def __del__(self): + self.shared_memory.close() + if self.is_creator: + self.shared_memory.unlink() + + @contextmanager + def get_data(self, current_idx: int): + start = self.data_offset + current_idx * self.max_chunk_bytes + end = start + self.max_chunk_bytes + with memoryview(self.shared_memory.buf[start:end]) as buf: + yield buf + + @contextmanager + def get_metadata(self, current_idx: int): + start = self.metadata_offset + current_idx * self.metadata_size + end = start + self.metadata_size + with memoryview(self.shared_memory.buf[start:end]) as buf: + yield buf + + +class ShmRingBufferIO: + + def __init__(self, buffer: ShmRingBuffer, reader_rank: int): + self.buffer = buffer + self.reader_rank = reader_rank + self._is_writer = self.reader_rank == -1 + self._is_reader = not self._is_writer + if self._is_reader: + assert 0 <= self.reader_rank < buffer.n_reader, \ + (f"Invalid reader rank {self.reader_rank} for buffer" + f" created with {buffer.n_reader} readers") + self.current_idx = 0 + + @contextmanager + def acquire_write(self): + assert self._is_writer, "Only writers can acquire write" + start_index = self.current_idx + start_time = time.time() + n_warning = 1 + while True: + with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + read_count = sum(metadata_buffer[1:]) + written_flag = metadata_buffer[0] + if written_flag and read_count != self.buffer.n_reader: + # this block is written and not read by all readers + # try to write to the next block + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks + if self.current_idx == start_index: + # no empty block found + if time.time( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. 
", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + # wait for a while (0.1 us) + time.sleep(1e-7) + continue + # found a block that is either + # (1) not written + # (2) read by all readers + + # mark the block as not written + metadata_buffer[0] = 0 + # let caller write to the buffer + with self.buffer.get_data(self.current_idx) as buf: + yield buf + + # caller has written to the buffer + # NOTE: order is important here + # first set the read flags to 0 + # then set the written flag to 1 + # otherwise, the readers may think they already read the block + for i in range(1, self.buffer.n_reader + 1): + # set read flag to 0, meaning it is not read yet + metadata_buffer[i] = 0 + # mark the block as written + metadata_buffer[0] = 1 + break + + @contextmanager + def acquire_read(self): + assert self._is_reader, "Only readers can acquire read" + start_index = self.current_idx + start_time = time.time() + n_warning = 1 + while True: + with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + read_flag = metadata_buffer[self.reader_rank + 1] + written_flag = metadata_buffer[0] + if not written_flag or read_flag: + # this block is either + # (1) not written + # (2) already read by this reader + # try to read the next block + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks + if self.current_idx == start_index: + # no block found + if time.time( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. ", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + # wait for a while (0.1 us) + time.sleep(1e-7) + continue + # found a block that is not read by this reader + # let caller read from the buffer + with self.buffer.get_data(self.current_idx) as buf: + yield buf + + # caller has read from the buffer + # set the read flag + metadata_buffer[self.reader_rank + 1] = 1 + break + + def enqueue(self, obj): + assert self._is_writer, "Only writers can enqueue" + serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL) + if len(serialized_obj) > self.buffer.max_chunk_bytes: + raise RuntimeError( + f"{len(serialized_obj)=} larger than the allowed value " + f"{self.buffer.max_chunk_bytes}," + "Please increase the max_chunk_bytes parameter.") + with self.acquire_write() as buf: + buf[:len(serialized_obj)] = serialized_obj + + def dequeue(self): + assert self._is_reader, "Only readers can dequeue" + with self.acquire_read() as buf: + # no need to know the size of serialized object + # pickle format itself contains the size information internally + # see https://docs.python.org/3/library/pickle.html + obj = pickle.loads(buf) + return obj + + def broadcast_object(self, obj=None): + if self._is_writer: + self.enqueue(obj) + return obj + else: + return self.dequeue() + + def create_from_process_group(pg: ProcessGroup, + max_chunk_bytes, + max_chunks, + writer_rank=0) -> "ShmRingBufferIO": + group_rank = dist.get_rank(pg) + group_world_size = dist.get_world_size(pg) + ranks_inside_group = list(range(group_world_size)) + global_ranks = dist.get_process_group_ranks(pg) + n_reader = group_world_size - 1 + buffer: ShmRingBuffer + if group_rank == writer_rank: + buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks) + dist.broadcast_object_list([buffer], + src=global_ranks[writer_rank], + group=pg) + return ShmRingBufferIO(buffer, -1) + else: + recv = [None] + dist.broadcast_object_list(recv, + src=global_ranks[writer_rank], + group=pg) + buffer = recv[0] # type: ignore + rest_ranks = [r for 
r in ranks_inside_group if r != writer_rank] + return ShmRingBufferIO(buffer, rest_ranks.index(group_rank)) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f6a2fc9b05a84..5188fadbb92a5 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -23,8 +23,9 @@ from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass -from multiprocessing import resource_tracker, shared_memory +from multiprocessing import shared_memory from typing import Any, Dict, List, Optional, Tuple, Union +from unittest.mock import patch import torch from torch.distributed import Backend, ProcessGroup @@ -57,7 +58,7 @@ def _split_tensor_dict( # because it contains not only the device type but also the device # index (e.g. "cuda:0"). We only need the device type. # receiving side will set the device index. - device = "cpu" if value.is_cpu else "cuda" + device = value.device.type metadata_list.append( (key, TensorMetadata(device, value.dtype, value.size()))) tensor_list.append(value) @@ -97,6 +98,7 @@ class GroupCoordinator: # communicators are only created for world size > 1 pynccl_comm: Optional[Any] # PyNccl communicator ca_comm: Optional[Any] # Custom allreduce communicator + shm_broadcaster: Optional[Any] # shared memory broadcaster def __init__( self, @@ -161,6 +163,13 @@ def __init__( else: self.ca_comm = None + from vllm.distributed.device_communicators.shm_broadcast import ( + ShmRingBufferIO) + self.shm_broadcaster: Optional[ShmRingBufferIO] = None + if self.world_size > 1 and is_in_the_same_node(self.cpu_group): + self.shm_broadcaster = ShmRingBufferIO.create_from_process_group( + self.cpu_group, 1 << 20, 6) + @property def first_rank(self): """Return the global rank of the first process in the group""" @@ -323,6 +332,30 @@ def broadcast(self, input_: torch.Tensor, src: int = 0): group=self.device_group) return input_ + def broadcast_object(self, obj: Optional[Any] = None, src: int = 0): + """Broadcast the input object. + NOTE: `src` is the local rank of the source rank. + """ + assert src < self.world_size, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + if self.world_size == 1: + return obj + if self.shm_broadcaster is not None: + assert src == 0, "Shared memory broadcaster only supports src=0" + return self.shm_broadcaster.broadcast_object(obj) + if self.rank_in_group == src: + torch.distributed.broadcast_object_list([obj], + src=self.ranks[src], + group=self.cpu_group) + return obj + else: + recv = [None] + torch.distributed.broadcast_object_list(recv, + src=self.ranks[src], + group=self.cpu_group) + return recv[0] + def broadcast_object_list(self, obj_list: List[Any], src: int = 0, @@ -370,9 +403,7 @@ def broadcast_tensor_dict( # `metadata_list` lives in CPU memory. # `broadcast_object_list` has serialization & deserialization, # all happening on CPU. Therefore, we can use the CPU group. 
- torch.distributed.broadcast_object_list([metadata_list], - src=src, - group=metadata_group) + self.broadcast_object(metadata_list, src=src) async_handles = [] for tensor in tensor_list: if tensor.numel() == 0: @@ -395,14 +426,10 @@ def broadcast_tensor_dict( async_handle.wait() else: - recv_metadata_list = [None] - torch.distributed.broadcast_object_list(recv_metadata_list, - src=src, - group=metadata_group) - assert recv_metadata_list[0] is not None + metadata_list = self.broadcast_object(None, src=src) tensor_dict = {} async_handles = [] - for key, value in recv_metadata_list[0]: + for key, value in metadata_list: if isinstance(value, TensorMetadata): tensor = torch.empty(value.size, dtype=value.dtype, @@ -744,7 +771,12 @@ def is_in_the_same_node(pg: ProcessGroup): src=ranks[0], group=pg) name = recv[0] - shm = shared_memory.SharedMemory(name=name) + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. + with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + shm = shared_memory.SharedMemory(name=name) if shm.buf[:len(magic_message)] == magic_message: is_in_the_same_node[rank] = 1 except Exception as e: @@ -757,14 +789,8 @@ def is_in_the_same_node(pg: ProcessGroup): # clean up the shared memory segment with contextlib.suppress(OSError): - if rank == 0: - if shm: - shm.unlink() - else: - if shm: - # fix to https://stackoverflow.com/q/62748654/9191338 - resource_tracker.unregister( - shm._name, "shared_memory") # type: ignore[attr-defined] + if rank == 0 and shm: + shm.unlink() torch.distributed.all_reduce(is_in_the_same_node, group=pg) return is_in_the_same_node.sum().item() == world_size diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 67356d11f14a6..d12fe84d29406 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -9,10 +9,11 @@ from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - TokenizerPoolConfig, VisionLanguageConfig) + ObservabilityConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, TokenizerPoolConfig, + VisionLanguageConfig) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import str_to_int_tuple +from vllm.utils import FlexibleArgumentParser, str_to_int_tuple def nullable_str(val: str): @@ -105,13 +106,15 @@ class EngineArgs: qlora_adapter_name_or_path: Optional[str] = None + otlp_traces_endpoint: Optional[str] = None + def __post_init__(self): if self.tokenizer is None: self.tokenizer = self.model @staticmethod def add_cli_args_for_vlm( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--image-input-type', type=nullable_str, default=None, @@ -157,8 +160,7 @@ def add_cli_args_for_vlm( return parser @staticmethod - def add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Shared CLI arguments for vLLM engine.""" # Model arguments @@ -234,7 +236,7 @@ def add_cli_args( '* "dummy" will initialize the weights with random values, ' 'which is mainly for profiling.\n' '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' + 'CoreWeave. 
See the Tensorize vLLM Model script in the Examples ' 'section for more information.\n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n') @@ -516,11 +518,12 @@ def add_cli_args( 'Enabling this will use the fully sharded layers. ' 'At high sequence length, max rank or ' 'tensor parallel size, this is likely faster.')) - parser.add_argument("--device", - type=str, - default=EngineArgs.device, - choices=["auto", "cuda", "neuron", "cpu", "tpu"], - help='Device type for vLLM execution.') + parser.add_argument( + "--device", + type=str, + default=EngineArgs.device, + choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu"], + help='Device type for vLLM execution.') # Related to Vision-language models such as llava parser = EngineArgs.add_cli_args_for_vlm(parser) @@ -588,7 +591,7 @@ def add_cli_args( 'This should be a JSON string that will be ' 'parsed into a dictionary.') parser.add_argument( - '--preemption_mode', + '--preemption-mode', type=str, default=None, help='If \'recompute\', the engine performs preemption by block ' @@ -613,6 +616,13 @@ def add_cli_args( type=str, default=None, help='Name or path of the QLoRA adapter.') + + parser.add_argument( + '--otlp-traces-endpoint', + type=str, + default=None, + help='Target URL to which OpenTelemetry traces will be sent.') + return parser @classmethod @@ -772,6 +782,9 @@ def create_engine_config(self, ) -> EngineConfig: decoding_config = DecodingConfig( guided_decoding_backend=self.guided_decoding_backend) + observability_config = ObservabilityConfig( + otlp_traces_endpoint=self.otlp_traces_endpoint) + if (model_config.get_sliding_window() is not None and scheduler_config.chunked_prefill_enabled and not scheduler_config.use_v2_block_manager): @@ -779,16 +792,19 @@ def create_engine_config(self, ) -> EngineConfig: "Chunked prefill is not supported with sliding window. 
" "Set --disable-sliding-window to disable sliding window.") - return EngineConfig(model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - lora_config=lora_config, - vision_language_config=vision_language_config, - speculative_config=speculative_config, - load_config=load_config, - decoding_config=decoding_config) + return EngineConfig( + model_config=model_config, + cache_config=cache_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + lora_config=lora_config, + vision_language_config=vision_language_config, + speculative_config=speculative_config, + load_config=load_config, + decoding_config=decoding_config, + observability_config=observability_config, + ) @dataclass @@ -799,8 +815,8 @@ class AsyncEngineArgs(EngineArgs): max_log_len: Optional[int] = None @staticmethod - def add_cli_args(parser: argparse.ArgumentParser, - async_args_only: bool = False) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser, + async_args_only: bool = False) -> FlexibleArgumentParser: if not async_args_only: parser = EngineArgs.add_cli_args(parser) parser.add_argument('--engine-use-ray', @@ -821,13 +837,13 @@ def add_cli_args(parser: argparse.ArgumentParser, # These functions are used by sphinx to build the documentation def _engine_args_parser(): - return EngineArgs.add_cli_args(argparse.ArgumentParser()) + return EngineArgs.add_cli_args(FlexibleArgumentParser()) def _async_engine_args_parser(): - return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(), + return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(), async_args_only=True) def _vlm_engine_args_parser(): - return EngineArgs.add_cli_args_for_vlm(argparse.ArgumentParser()) + return EngineArgs.add_cli_args_for_vlm(FlexibleArgumentParser()) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 943402c865bd2..df25eb111e87f 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -10,6 +10,7 @@ from vllm.config import DecodingConfig, ModelConfig from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_timeout import asyncio_timeout from vllm.engine.llm_engine import LLMEngine from vllm.executor.ray_utils import initialize_ray_cluster, ray from vllm.inputs import LLMInputs, PromptInputs @@ -244,6 +245,9 @@ async def step_async( # Log stats. self.do_log_stats(scheduler_outputs, output) + # Tracing + self.do_tracing(scheduler_outputs) + if not request_outputs: # Stop the execute model loop in parallel workers until there are # more requests to process. 
This avoids waiting indefinitely in @@ -285,6 +289,7 @@ async def add_request_async( params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -301,6 +306,7 @@ async def add_request_async( params=params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) async def check_health_async(self) -> None: @@ -383,6 +389,17 @@ def from_engine_args( "Distributed execution is not supported with the CPU backend.") from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync + elif engine_config.device_config.device_type == "xpu": + if distributed_executor_backend is None: + from vllm.executor.xpu_executor import XPUExecutorAsync + executor_class = XPUExecutorAsync + elif distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_xpu_executor import RayXPUExecutorAsync + executor_class = RayXPUExecutorAsync + else: + raise RuntimeError( + "Not supported distributed execution model on XPU device.") elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync @@ -529,8 +546,8 @@ async def run_engine_loop(self): # Abort if iteration takes too long due to unrecoverable errors # (eg. NCCL timeouts). try: - has_requests_in_progress = await asyncio.wait_for( - self.engine_step(), ENGINE_ITERATION_TIMEOUT_S) + async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S): + has_requests_in_progress = await self.engine_step() except asyncio.TimeoutError as exc: logger.error( "Engine iteration timed out. This should never happen!") @@ -545,6 +562,7 @@ async def add_request( params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncStream: if self.log_requests: if isinstance(inputs, str): @@ -580,24 +598,13 @@ async def add_request( if arrival_time is None: arrival_time = time.time() - if self.engine_use_ray: - processed_inputs = await self.engine.process_model_inputs_async \ - .remote( # type: ignore - request_id=request_id, - inputs=inputs, - lora_request=lora_request) - else: - processed_inputs = await self.engine.process_model_inputs_async( - request_id=request_id, - inputs=inputs, - lora_request=lora_request) - stream = self._request_tracker.add_request( request_id, - inputs=processed_inputs, + inputs=inputs, params=params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) return stream @@ -608,6 +615,7 @@ async def generate( sampling_params: SamplingParams, request_id: str, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. @@ -622,6 +630,7 @@ async def generate( sampling_params: The sampling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. + trace_headers: OpenTelemetry trace headers. 
Yields: The output `RequestOutput` objects from the LLMEngine @@ -675,6 +684,7 @@ async def generate( inputs, sampling_params, lora_request=lora_request, + trace_headers=trace_headers, ): yield LLMEngine.validate_output(output, RequestOutput) @@ -684,6 +694,7 @@ async def encode( pooling_params: PoolingParams, request_id: str, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncIterator[EmbeddingRequestOutput]: """Generate outputs for a request from an embedding model. @@ -698,6 +709,7 @@ async def encode( pooling_params: The pooling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. + trace_headers: OpenTelemetry trace headers. Yields: The output `EmbeddingRequestOutput` objects from the LLMEngine @@ -749,6 +761,7 @@ async def encode( inputs, pooling_params, lora_request=lora_request, + trace_headers=trace_headers, ): yield LLMEngine.validate_output(output, EmbeddingRequestOutput) @@ -759,6 +772,7 @@ async def _process_request( params: Union[SamplingParams, PoolingParams], *, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]: """Common logic to process requests with SamplingParams or PoolingParams.""" @@ -770,6 +784,7 @@ async def _process_request( params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) try: @@ -849,3 +864,10 @@ async def check_health(self) -> None: else: await self.engine.check_health_async() logger.debug("Health check took %fs", time.perf_counter() - t) + + async def is_tracing_enabled(self) -> bool: + if self.engine_use_ray: + return await self.engine.is_tracing_enabled.remote( # type: ignore + ) + else: + return self.engine.is_tracing_enabled() diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py new file mode 100644 index 0000000000000..4b18426252127 --- /dev/null +++ b/vllm/engine/async_timeout.py @@ -0,0 +1,189 @@ +# Workaround for https://github.com/python/cpython/issues/86296 +# +# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py +# Licensed under the Apache License (Apache-2.0) + +import asyncio +import enum +import sys +import warnings +from types import TracebackType +from typing import Any, Optional, Type + +if sys.version_info[:2] >= (3, 11): + from asyncio import timeout as asyncio_timeout +else: + + def asyncio_timeout(delay: Optional[float]) -> "Timeout": + """timeout context manager. + Useful in cases when you want to apply timeout logic around block + of code or in cases when asyncio.wait_for is not suitable. For example: + >>> async with timeout(0.001): + ... async with aiohttp.get('https://github.com') as r: + ... await r.text() + delay - value in seconds or None to disable timeout logic + """ + loop = asyncio.get_running_loop() + deadline = loop.time() + delay if delay is not None else None + return Timeout(deadline, loop) + + class _State(enum.Enum): + INIT = "INIT" + ENTER = "ENTER" + TIMEOUT = "TIMEOUT" + EXIT = "EXIT" + + class Timeout: + # Internal class, please don't instantiate it directly + # Use timeout() and timeout_at() public factories instead. + # + # Implementation note: `async with timeout()` is preferred + # over `with timeout()`. 
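Aside for readers of this hunk: the vendored `asyncio_timeout` above is what `run_engine_loop` now uses instead of `asyncio.wait_for`, cancelling the awaited engine step in place rather than wrapping it in an extra task. A minimal, self-contained usage sketch, assuming only the module path added by this diff (the timeout value and coroutine are made up):

```python
import asyncio

from vllm.engine.async_timeout import asyncio_timeout


async def slow_engine_step() -> bool:
    # Stand-in for AsyncLLMEngine.engine_step(); deliberately overruns the deadline.
    await asyncio.sleep(10)
    return True


async def main() -> None:
    try:
        async with asyncio_timeout(0.1):  # deadline in seconds
            await slow_engine_step()
    except asyncio.TimeoutError:
        print("engine iteration timed out")


asyncio.run(main())
```

On Python 3.11+ the import resolves to the standard `asyncio.timeout`, so the behavior matches the stdlib context manager; the vendored class below is only used on older interpreters.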
+ # While technically the Timeout class implementation + # doesn't need to be async at all, + # the `async with` statement explicitly points that + # the context manager should be used from async function context. + # + # This design allows to avoid many silly misusages. + # + # TimeoutError is raised immediately when scheduled + # if the deadline is passed. + # The purpose is to time out as soon as possible + # without waiting for the next await expression. + + __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler") + + def __init__(self, deadline: Optional[float], + loop: asyncio.AbstractEventLoop) -> None: + self._loop = loop + self._state = _State.INIT + + self._timeout_handler = None # type: Optional[asyncio.Handle] + if deadline is None: + self._deadline = None # type: Optional[float] + else: + self.update(deadline) + + def __enter__(self) -> "Timeout": + warnings.warn( + "with timeout() is deprecated, use async with timeout()", + DeprecationWarning, + stacklevel=2, + ) + self._do_enter() + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> Optional[bool]: + self._do_exit(exc_type) + return None + + async def __aenter__(self) -> "Timeout": + self._do_enter() + return self + + async def __aexit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> Optional[bool]: + self._do_exit(exc_type) + return None + + @property + def expired(self) -> bool: + """Is timeout expired during execution?""" + return self._state == _State.TIMEOUT + + @property + def deadline(self) -> Optional[float]: + return self._deadline + + def reject(self) -> None: + """Reject scheduled timeout if any.""" + # cancel is maybe better name but + # task.cancel() raises CancelledError in asyncio world. + if self._state not in (_State.INIT, _State.ENTER): + raise RuntimeError(f"invalid state {self._state.value}") + self._reject() + + def _reject(self) -> None: + if self._timeout_handler is not None: + self._timeout_handler.cancel() + self._timeout_handler = None + + def shift(self, delay: float) -> None: + """Advance timeout on delay seconds. + The delay can be negative. + Raise RuntimeError if shift is called when deadline is not scheduled + """ + deadline = self._deadline + if deadline is None: + raise RuntimeError( + "cannot shift timeout if deadline is not scheduled") + self.update(deadline + delay) + + def update(self, deadline: float) -> None: + """Set deadline to absolute value. + deadline argument points on the time in the same clock system + as loop.time(). + If new deadline is in the past the timeout is raised immediately. + Please note: it is not POSIX time but a time with + undefined starting base, e.g. the time of the system power on. 
+ """ + if self._state == _State.EXIT: + raise RuntimeError( + "cannot reschedule after exit from context manager") + if self._state == _State.TIMEOUT: + raise RuntimeError("cannot reschedule expired timeout") + if self._timeout_handler is not None: + self._timeout_handler.cancel() + self._deadline = deadline + if self._state != _State.INIT: + self._reschedule() + + def _reschedule(self) -> None: + assert self._state == _State.ENTER + deadline = self._deadline + if deadline is None: + return + + now = self._loop.time() + if self._timeout_handler is not None: + self._timeout_handler.cancel() + + task = asyncio.current_task() + if deadline <= now: + self._timeout_handler = self._loop.call_soon( + self._on_timeout, task) + else: + self._timeout_handler = self._loop.call_at( + deadline, self._on_timeout, task) + + def _do_enter(self) -> None: + if self._state != _State.INIT: + raise RuntimeError(f"invalid state {self._state.value}") + self._state = _State.ENTER + self._reschedule() + + def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: + if exc_type is asyncio.CancelledError and \ + self._state == _State.TIMEOUT: + self._timeout_handler = None + raise asyncio.TimeoutError + # timeout has not expired + self._state = _State.EXIT + self._reject() + return None + + def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None: + if task: + task.cancel() + self._state = _State.TIMEOUT + # drop the reference early + self._timeout_handler = None diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9a86d41de969e..cc0ce4a186a88 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,14 +1,14 @@ import time from contextlib import contextmanager -from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional +from typing import TYPE_CHECKING, ClassVar, Dict, Iterable, List, Optional from typing import Sequence as GenericSequence -from typing import Type, TypeVar, Union +from typing import Set, Type, TypeVar, Union from transformers import GenerationConfig, PreTrainedTokenizer from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig, - LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig, SpeculativeConfig, + LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler, SchedulerOutputs) @@ -31,6 +31,8 @@ PoolerOutput, SamplerOutput, Sequence, SequenceGroup, SequenceGroupMetadata, SequenceStatus) +from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, + init_tracer) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -154,6 +156,7 @@ def __init__( vision_language_config: Optional[VisionLanguageConfig], speculative_config: Optional[SpeculativeConfig], decoding_config: Optional[DecodingConfig], + observability_config: Optional[ObservabilityConfig], executor_class: Type[ExecutorBase], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, @@ -168,7 +171,8 @@ def __init__( "disable_custom_all_reduce=%s, quantization=%s, sparsity=%s, " "enforce_eager=%s, kv_cache_dtype=%s, " "quantization_param_path=%s, device_config=%s, " - "decoding_config=%r, seed=%d, served_model_name=%s)", + "decoding_config=%r, observability_config=%r, " + "seed=%d, served_model_name=%s)", VLLM_VERSION, model_config.model, speculative_config, @@ -194,6 +198,7 @@ def 
__init__( model_config.quantization_param_path, device_config.device, decoding_config, + observability_config, model_config.seed, model_config.served_model_name, ) @@ -209,6 +214,8 @@ def __init__( self.speculative_config = speculative_config self.load_config = load_config self.decoding_config = decoding_config or DecodingConfig() + self.observability_config = observability_config or ObservabilityConfig( + ) self.log_stats = log_stats if not self.model_config.skip_tokenizer_init: @@ -290,6 +297,12 @@ def __init__( max_model_len=self.model_config.max_model_len) self.stat_logger.info("cache_config", self.cache_config) + self.tracer = None + if self.observability_config.otlp_traces_endpoint: + self.tracer = init_tracer( + "vllm.llm_engine", + self.observability_config.otlp_traces_endpoint) + # Create sequence output processor, e.g. for beam search or # speculative decoding. self.output_processor = ( @@ -349,6 +362,14 @@ def from_engine_args( elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor + elif engine_config.device_config.device_type == "xpu": + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_xpu_executor import RayXPUExecutor + executor_class = RayXPUExecutor + else: + from vllm.executor.xpu_executor import XPUExecutor + executor_class = XPUExecutor elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutor @@ -438,6 +459,7 @@ def _add_processed_request( params: Union[SamplingParams, PoolingParams], arrival_time: float, lora_request: Optional[LoRARequest], + trace_headers: Optional[Dict[str, str]] = None, ) -> None: # Create the sequences. block_size = self.cache_config.block_size @@ -455,6 +477,7 @@ def _add_processed_request( params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) elif isinstance(params, PoolingParams): seq_group = self._create_sequence_group_with_pooling( @@ -501,6 +524,7 @@ def add_request( params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> None: """Add a request to the engine's request pool. @@ -518,6 +542,7 @@ def add_request( :class:`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. + trace_headers: OpenTelemetry trace headers. Details: - Set arrival_time to the current time if it is None. @@ -559,6 +584,7 @@ def add_request( params=params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) def _create_sequence_group_with_sampling( @@ -568,6 +594,7 @@ def _create_sequence_group_with_sampling( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest], + trace_headers: Optional[Dict[str, str]] = None, ) -> SequenceGroup: """Creates a SequenceGroup with SamplingParams.""" max_logprobs = self.get_model_config().max_logprobs @@ -589,11 +616,14 @@ def _create_sequence_group_with_sampling( self.generation_config_fields) # Create the sequence group. 
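For orientation, the tracing hook added to `LLMEngine.__init__` above only activates when an OTLP endpoint is configured. A hand-wired sketch of the same wiring, assuming `ObservabilityConfig` accepts `otlp_traces_endpoint` as a keyword argument (the attribute used throughout this diff); the collector URL is made up:

```python
from vllm.config import ObservabilityConfig
from vllm.tracing import init_tracer

# Tracing stays disabled unless an endpoint is provided (made-up URL).
observability_config = ObservabilityConfig(
    otlp_traces_endpoint="http://localhost:4317")

tracer = None
if observability_config.otlp_traces_endpoint:
    # Mirrors the call added to LLMEngine.__init__ in this hunk.
    tracer = init_tracer("vllm.llm_engine",
                         observability_config.otlp_traces_endpoint)

print("tracing enabled:", tracer is not None)
```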
- seq_group = SequenceGroup(request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - sampling_params=sampling_params, - lora_request=lora_request) + seq_group = SequenceGroup( + request_id=request_id, + seqs=[seq], + arrival_time=arrival_time, + sampling_params=sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + ) return seq_group @@ -787,6 +817,9 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # Log stats. self.do_log_stats(scheduler_outputs, output) + # Tracing + self.do_tracing(scheduler_outputs) + if not request_outputs: # Stop the execute model loop in parallel workers until there are # more requests to process. This avoids waiting indefinitely in @@ -975,8 +1008,70 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_executor.remove_lora(lora_id) - def list_loras(self) -> List[int]: + def list_loras(self) -> Set[int]: return self.model_executor.list_loras() + def pin_lora(self, lora_id: int) -> bool: + return self.model_executor.pin_lora(lora_id) + def check_health(self) -> None: self.model_executor.check_health() + + def is_tracing_enabled(self) -> bool: + return self.tracer is not None + + def do_tracing(self, scheduler_outputs: SchedulerOutputs) -> None: + if self.tracer is None: + return + + for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: + seq_group = scheduled_seq_group.seq_group + if seq_group.is_finished(): + self.create_trace_span(seq_group) + + def create_trace_span(self, seq_group: SequenceGroup) -> None: + if self.tracer is None or seq_group.sampling_params is None: + return + arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9) + + trace_context = extract_trace_context(seq_group.trace_headers) + + with self.tracer.start_as_current_span( + "llm_request", + kind=SpanKind.SERVER, + context=trace_context, + start_time=arrival_time_nano_seconds) as seq_span: + metrics = seq_group.metrics + ttft = metrics.first_token_time - metrics.arrival_time + e2e_time = metrics.finished_time - metrics.arrival_time + # attribute names are based on + # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md + seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, + self.model_config.model) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID, + seq_group.request_id) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE, + seq_group.sampling_params.temperature) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P, + seq_group.sampling_params.top_p) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS, + seq_group.sampling_params.max_tokens) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_BEST_OF, + seq_group.sampling_params.best_of) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N, + seq_group.sampling_params.n) + seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES, + seq_group.num_seqs()) + seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS, + len(seq_group.prompt_token_ids)) + seq_span.set_attribute( + SpanAttributes.LLM_USAGE_COMPLETION_TOKENS, + sum([ + seq.get_output_len() + for seq in seq_group.get_finished_seqs() + ])) + seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE, + metrics.time_in_queue) + seq_span.set_attribute( + SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft) + seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 
ae7ae144bc04f..027f5c7e73c2b 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -144,7 +144,7 @@ def __init__(self, labelnames: List[str], max_model_len: int): # end-metrics-definitions -def build_1_2_5_buckets(max_value: int): +def build_1_2_5_buckets(max_value: int) -> List[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by mantissa values (1, 2, 5) until the value exceeds the specified maximum. @@ -155,7 +155,7 @@ def build_1_2_5_buckets(max_value: int): """ mantissa_lst = [1, 2, 5] exponent = 0 - buckets = [] + buckets: List[int] = [] while True: for m in mantissa_lst: value = m * 10**exponent diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index cad44f476f06e..07a68c65a6dd8 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler @@ -146,8 +146,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Beam search case # Select the child sequences to keep in the sequence group. - selected_child_seqs = [] - unselected_child_seqs = [] + selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] + unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] beam_width = seq_group.sampling_params.best_of length_penalty = seq_group.sampling_params.length_penalty diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 075de0b4efb2d..feb904c5a13c9 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -6,7 +6,6 @@ change `vllm/entrypoints/openai/api_server.py` instead. """ -import argparse import json import ssl from typing import AsyncGenerator @@ -19,7 +18,7 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext -from vllm.utils import random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid TIMEOUT_KEEP_ALIVE = 5 # seconds. 
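Returning briefly to the metrics hunk above: the newly annotated `build_1_2_5_buckets` is easiest to read with a worked example. The sketch below re-implements it for illustration only; the loop's termination check is assumed from the docstring, since the patch context cuts off mid-loop.

```python
from typing import List


def build_1_2_5_buckets(max_value: int) -> List[int]:
    # 1, 2, 5 mantissas scaled by growing powers of 10, up to max_value.
    mantissa_lst = [1, 2, 5]
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


print(build_1_2_5_buckets(100))
# [1, 2, 5, 10, 20, 50, 100]
```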
app = FastAPI() @@ -80,7 +79,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8000) parser.add_argument("--ssl-keyfile", type=str, default=None) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 4c0cb1e4f3e49..59ad73bf097c8 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -10,6 +10,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.openai.serving_engine import LoRAModulePath +from vllm.utils import FlexibleArgumentParser class LoRAParserAction(argparse.Action): @@ -23,7 +24,7 @@ def __call__(self, parser, namespace, values, option_string=None): def make_arg_parser(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") parser.add_argument("--host", type=nullable_str, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3b56ad63f375d..b57d79859aec5 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -672,6 +672,17 @@ class BatchRequestInput(OpenAIBaseModel): body: Union[ChatCompletionRequest, ] +class BatchResponseData(OpenAIBaseModel): + # HTTP status code of the response. + status_code: int = 200 + + # An unique identifier for the API request. + request_id: str + + # The body of the response. + body: Union[ChatCompletionResponse, ] + + class BatchRequestOutput(OpenAIBaseModel): """ The per-line object of the batch output and error files @@ -683,7 +694,7 @@ class BatchRequestOutput(OpenAIBaseModel): # inputs. custom_id: str - response: Optional[ChatCompletionResponse] + response: Optional[BatchResponseData] # For requests that failed with a non-HTTP error, this will contain more # information on the cause of the failure. 
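Before the `run_batch.py` changes below, it may help to see the per-line output shape implied by the new `BatchResponseData` wrapper above: the chat completion now sits under `response.body`, next to an HTTP-style `status_code` and a per-request id. A hedged illustration using plain dictionaries (the ids and the body placeholder are made up):

```python
import json

# One line of the batch output file, per the BatchRequestOutput /
# BatchResponseData models defined in the protocol.py hunk above.
output_line = {
    "id": "vllm-<random-uuid>",
    "custom_id": "request-1",
    "response": {
        "status_code": 200,
        "request_id": "vllm-batch-<random-uuid>",
        "body": {"...": "ChatCompletionResponse fields go here"},
    },
    "error": None,
}
print(json.dumps(output_line, indent=2))
```

Failed requests follow the same shape, but with a non-200 `status_code` and the details carried in `error` instead of `response.body`.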
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 7a6819c35a92d..dac6c2b4cd48f 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,7 +1,6 @@ -import argparse import asyncio -import sys from io import StringIO +from typing import Awaitable, List import aiohttp @@ -9,18 +8,20 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchRequestOutput, - ChatCompletionResponse) + BatchResponseData, + ChatCompletionResponse, + ErrorResponse) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) def parse_args(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible batch runner.") parser.add_argument( "-i", @@ -55,7 +56,7 @@ async def read_file(path_or_url: str) -> str: session.get(path_or_url) as resp: return await resp.text() else: - with open(path_or_url, "r") as f: + with open(path_or_url, "r", encoding="utf-8") as f: return f.read() @@ -68,7 +69,7 @@ async def write_file(path_or_url: str, data: str) -> None: # We should make this async, but as long as this is always run as a # standalone program, blocking the event loop won't effect performance # in this particular case. - with open(path_or_url, "w") as f: + with open(path_or_url, "w", encoding="utf-8") as f: f.write(data) @@ -76,20 +77,27 @@ async def run_request(chat_serving: OpenAIServingChat, request: BatchRequestInput) -> BatchRequestOutput: chat_request = request.body chat_response = await chat_serving.create_chat_completion(chat_request) + if isinstance(chat_response, ChatCompletionResponse): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, - response=chat_response, + response=BatchResponseData( + body=chat_response, request_id=f"vllm-batch-{random_uuid()}"), error=None, ) - else: + elif isinstance(chat_response, ErrorResponse): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, - response=None, + response=BatchResponseData( + status_code=chat_response.code, + request_id=f"vllm-batch-{random_uuid()}"), error=chat_response, ) + else: + raise ValueError("Request must not be sent in stream mode") + return batch_output @@ -114,7 +122,7 @@ async def main(args): ) # Submit all requests in the file to the engine "concurrently". 
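The comment above describes the fan-out pattern used in `main()`: each input line becomes one awaitable and all of them are gathered concurrently. A self-contained sketch of just that pattern, with a dummy `run_request` standing in for the chat-completion call (the names mirror the surrounding code, but nothing here is the real implementation):

```python
import asyncio
from typing import Awaitable, List


async def run_request(line: str) -> str:
    # Stand-in for submitting one BatchRequestInput to the engine.
    await asyncio.sleep(0)
    return f"processed: {line}"


async def main() -> None:
    lines = ["req-1", "req-2", "req-3"]
    response_futures: List[Awaitable[str]] = [
        run_request(line) for line in lines
    ]
    responses = await asyncio.gather(*response_futures)
    print(responses)


asyncio.run(main())
```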
- response_futures = [] + response_futures: List[Awaitable[BatchRequestOutput]] = [] for request_json in (await read_file(args.input_file)).strip().split("\n"): request = BatchRequestInput.model_validate_json(request_json) response_futures.append(run_request(openai_serving_chat, request)) @@ -128,9 +136,6 @@ async def main(args): output_buffer.seek(0) await write_file(args.output_file, output_buffer.read().strip()) - # Temporary workaround for https://github.com/vllm-project/vllm/issues/4789 - sys.exit(0) - if __name__ == "__main__": args = parse_args() diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7cd434fe0d272..744e1d94511b3 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -31,6 +31,8 @@ get_full_image_text_prompt) from vllm.outputs import RequestOutput from vllm.sequence import Logprob +from vllm.tracing import (contains_trace_headers, extract_trace_headers, + log_tracing_disabled_warning) from vllm.utils import random_uuid logger = init_logger(__name__) @@ -267,11 +269,20 @@ async def create_chat_completion( if image_data is not None: inputs["multi_modal_data"] = image_data + is_tracing_enabled = await self.engine.is_tracing_enabled() + trace_headers = None + if is_tracing_enabled and raw_request: + trace_headers = extract_trace_headers(raw_request.headers) + if not is_tracing_enabled and raw_request and contains_trace_headers( + raw_request.headers): + log_tracing_disabled_warning() + result_generator = self.engine.generate( inputs, sampling_params, request_id, lora_request, + trace_headers=trace_headers, ) # Streaming response if request.stream: @@ -487,7 +498,7 @@ async def chat_completion_full_generator( final_res = res assert final_res is not None - choices = [] + choices: List[ChatCompletionResponseChoice] = [] role = self.get_chat_request_role(request) for output in final_res.outputs: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 64671e21a724d..c775fa6daa739 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -24,6 +24,8 @@ get_guided_decoding_logits_processor) from vllm.outputs import RequestOutput from vllm.sequence import Logprob +from vllm.tracing import (contains_trace_headers, extract_trace_headers, + log_tracing_disabled_warning) from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -125,6 +127,14 @@ async def create_completion(self, request: CompletionRequest, truncate_prompt_tokens) prompt_ids, prompt_text = prompt_formats + is_tracing_enabled = await self.engine.is_tracing_enabled() + trace_headers = None + if is_tracing_enabled: + trace_headers = extract_trace_headers(raw_request.headers) + if not is_tracing_enabled and contains_trace_headers( + raw_request.headers): + log_tracing_disabled_warning() + generator = self.engine.generate( { "prompt": prompt_text, @@ -133,6 +143,7 @@ async def create_completion(self, request: CompletionRequest, sampling_params, f"{request_id}-{i}", lora_request=lora_request, + trace_headers=trace_headers, ) generators.append(generator) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 5a3448de3d7a4..cbf09f173fb66 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -25,7 +25,7 @@ def request_output_to_embedding_response( created_time: int, model_name: 
str, ) -> EmbeddingResponse: - data = [] + data: List[EmbeddingResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): assert final_res is not None diff --git a/vllm/envs.py b/vllm/envs.py index 1e772cea44953..89dffc5c14f06 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -5,6 +5,7 @@ VLLM_HOST_IP: str = "" VLLM_PORT: Optional[int] = None VLLM_USE_MODELSCOPE: bool = False + VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 VLLM_INSTANCE_ID: Optional[str] = None VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None @@ -29,7 +30,7 @@ VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/" VLLM_USE_RAY_COMPILED_DAG: bool = False - VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" + VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None @@ -114,6 +115,10 @@ "VLLM_INSTANCE_ID": lambda: os.environ.get("VLLM_INSTANCE_ID", None), + # Interval in seconds to log a warning message when the ring buffer is full + "VLLM_RINGBUFFER_WARNING_INTERVAL": + lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")), + # path to cudatoolkit home directory, under which should be bin, include, # and lib directories. "CUDA_HOME": @@ -212,7 +217,7 @@ # Use dedicated multiprocess context for workers. # Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": - lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"), + lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"), # Timeout for fetching images when serving multimodal models # Default is 5 seconds diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index a2212459f034e..6137cecd881d0 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -84,6 +84,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.driver_worker.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self.driver_worker.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py index f7c608af1ad39..235b5bc47021d 100644 --- a/vllm/executor/distributed_gpu_executor.py +++ b/vllm/executor/distributed_gpu_executor.py @@ -100,6 +100,13 @@ def remove_lora(self, lora_id: int) -> bool: lora_id=lora_id, ) + def pin_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "pin_lora", + lora_id=lora_id, + ) + def list_loras(self) -> Set[int]: return self._run_workers("list_loras") diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 4d01939c2e38b..7c2520b5a64f5 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -86,6 +86,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError + @abstractmethod + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError # type: ignore + @abstractmethod def list_loras(self) -> Set[int]: raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 3ad201f4757ec..0a654200ed796 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -99,6 +99,10 @@ def remove_lora(self, lora_id: int) -> bool: assert lora_id > 0, "lora_id must be greater than 0." 
return self.driver_worker.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 8385e56f88b39..e63e5a3a027fa 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (cuda_device_count_stateless, - get_distributed_init_method, get_ip, get_open_port, + get_distributed_init_method, get_open_port, get_vllm_instance_id, make_async) logger = init_logger(__name__) @@ -37,8 +37,11 @@ def _init_executor(self) -> None: assert world_size <= cuda_device_count_stateless(), ( "please set tensor_parallel_size to less than max local gpu count") + # Multiprocessing-based executor does not support multi-node setting. + # Since it only works for single node, we can use the loopback address + # 127.0.0.1 for communication. distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) + "127.0.0.1", get_open_port()) if world_size == 1: self.workers = [] diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index e7f0e887921b7..c5e2fb0f67736 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -65,6 +65,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.driver_worker.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self.driver_worker.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 89d1c4ac7cbc2..fc83c552888a6 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -137,6 +137,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. + # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] node_gpus[node_id].extend(gpu_ids) for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) @@ -155,6 +161,16 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", self._run_workers("update_environment_variables", all_args=all_args_to_update_environment_variables) + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. 
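The GPU-id normalization above is easiest to justify with a concrete case: once a node has ten or more GPUs, lexicographic sorting of string ids puts them in the wrong order, which is exactly the failure mode the referenced issue describes. A quick illustration (the id values are made up):

```python
# String ids sort lexicographically, so "10" lands before "2" ...
gpu_ids_as_strings = ["0", "10", "2", "1"]
print(sorted(gpu_ids_as_strings))   # ['0', '1', '10', '2']

# ... while converting to integers first, as the hunk above does,
# yields the intended numeric order.
gpu_ids = [int(x) for x in gpu_ids_as_strings]
print(sorted(gpu_ids))              # [0, 1, 2, 10]
```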
+ driver_ip = "127.0.0.1" distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4704f5f1b1a10..495fddd175dd4 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.utils import get_ip, is_hip +from vllm.utils import get_ip, is_hip, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -71,7 +71,7 @@ def initialize_ray_cluster( "serving.") # Connect to a ray cluster. - if is_hip(): + if is_hip() or is_xpu(): ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py new file mode 100644 index 0000000000000..dd7c82289341e --- /dev/null +++ b/vllm/executor/ray_xpu_executor.py @@ -0,0 +1,401 @@ +import asyncio +import os +import pickle +from collections import defaultdict +from itertools import islice, repeat +from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set, + Tuple, Union) + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.executor.distributed_gpu_executor import ( # yapf: disable + DistributedGPUExecutor, DistributedGPUExecutorAsync) +from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayXPUExecutor(DistributedGPUExecutor): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + speculative_config: Optional[SpeculativeConfig], + ) -> None: + assert device_config.device_type == "xpu" + assert (not speculative_config + ), "Speculative decoding not yet supported for XPU backend" + + self.model_config = model_config + self.cache_config = cache_config + self.load_config = load_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.vision_language_config = vision_language_config + + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. 
+ self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + + def _init_executor(self) -> None: + pass + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks. + + This invokes `determine_num_available_blocks` on each worker and takes + the min of the results, guaranteeing that the selected cache sizes are + compatible with all workers. + + Returns: + - Tuple[num_gpu_blocks, num_cpu_blocks] + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers("determine_num_available_blocks", ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + + return num_gpu_blocks, num_cpu_blocks + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Create the workers. + driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote( + worker_module_name="vllm.worker.xpu_worker", + worker_class_name="XPUWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name="vllm.worker.xpu_worker", + worker_class_name="XPUWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) + else: + # Else, added to the list of workers. + self.workers.append(worker) + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. 
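A quick numeric aside on `determine_num_available_blocks` above: because one centralized scheduler drives every worker, the chosen cache size has to fit the most constrained worker, hence the element-wise minimum. With made-up per-worker reports:

```python
# (num_gpu_blocks, num_cpu_blocks) as reported by three hypothetical workers.
num_blocks = [(8000, 2048), (7600, 2048), (7900, 1024)]

num_gpu_blocks = min(b[0] for b in num_blocks)
num_cpu_blocks = min(b[1] for b in num_blocks)

print(num_gpu_blocks, num_cpu_blocks)  # 7600 1024
```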
+ worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + # TODO: add env var for xpu + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + def collect_arg_helper_func(**kwargs): + # avoid writing `{"name": value}` manually + return kwargs + + init_worker_all_kwargs = [] + + # Initialize the actual workers inside worker wrapper. + for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ): + local_rank = node_workers[node_id].index(rank) + init_worker_all_kwargs.append( + collect_arg_helper_func( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + load_config=self.load_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=rank == 0, + )) + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + + self._run_workers("init_device") + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Initialize the KV cache in all workers. + """ + + # NOTE: We log here to avoid multiple logs when number of workers is + # greater than one. We could log in the engine, but not all executors + # have GPUs. + logger.info("# GPU blocks: %d, " + "# CPU blocks: %d", num_gpu_blocks, num_cpu_blocks) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> Set[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers. 
Can be used in the following + ways: + + - args/kwargs: All workers share the same args/kwargs + - args/kwargs and driver_args/driver_kwargs: Driver worker has + different args + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + assert self.forward_dag is not None + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, + **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ] + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. + for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import InputNode, MultiOutputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote. + bind( # type: ignore[attr-defined] + input_data) for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. 
") + + +class RayXPUExecutorAsync(RayXPUExecutor, DistributedGPUExecutorAsync): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_method = make_async(self.driver_worker.execute_method) + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py index 7061ad85f88c0..5ed00e1374100 100644 --- a/vllm/executor/tpu_executor.py +++ b/vllm/executor/tpu_executor.py @@ -82,6 +82,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for TPU backend.") + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for TPU backend.") + def list_loras(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for TPU backend.") diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py new file mode 100644 index 0000000000000..d37200bd02de3 --- /dev/null +++ b/vllm/executor/xpu_executor.py @@ -0,0 +1,98 @@ +from typing import List, Optional + +import torch + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.executor.executor_base import ExecutorAsyncBase +from vllm.executor.gpu_executor import GPUExecutor +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import make_async +from vllm.worker.worker_base import WorkerWrapperBase + +logger = init_logger(__name__) + + +class XPUExecutor(GPUExecutor): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + speculative_config: Optional[SpeculativeConfig], + ) -> None: + assert device_config.device_type == "xpu" + assert (not speculative_config + ), "Speculative decoding not yet supported for XPU backend" + + model_config = _verify_and_get_model_config(model_config) + + self.model_config = model_config + self.cache_config = cache_config + self.load_config = load_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.vision_language_config = vision_language_config + self.speculative_config = None + + # Instantiate the worker and load the model to GPU. 
+ self._init_executor() + + def _create_worker(self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None): + if self.speculative_config is None: + worker_module_name = "vllm.worker.xpu_worker" + worker_class_name = "XPUWorker" + else: + raise NotImplementedError( + "XPU does not support speculative decoding") + + wrapper = WorkerWrapperBase( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + ) + wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, + distributed_init_method)) + return wrapper.worker + + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + output = self.driver_worker.execute_model(execute_model_req) + return output + + +class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + output = await make_async(self.driver_worker.execute_model + )(execute_model_req=execute_model_req) + return output + + +def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: + if config.dtype == torch.bfloat16: + logger.warning( + "bfloat16 is not fully supported on XPU, casting to float16.") + config.dtype = torch.float16 + if not config.enforce_eager: + logger.warning( + "CUDA graph is not supported on XPU, fallback to the eager " + "mode.") + config.enforce_eager = True + return config diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index ffdc32b7339af..d27171f720832 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -12,6 +12,7 @@ from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, MergedQKVParallelLinearWithLora, + QKVParallelLinearWithLora, RowParallelLinearWithLoRA) from vllm.lora.punica import bgmv, dispatch_bgmv_low_level @@ -90,11 +91,11 @@ def can_replace_layer(cls, source_layer: nn.Module, def _mcp_apply(x, bias, layer): """ MergedColumnParallelLinearWithShardedLoRA and - QKVParallelLinearWithShardedLora share the same + MergedQKVParallelLinearWithShardedLora share the same LoRa weight application method. The main difference is the step by shard_size for lora_b which can - vary for QKVParallelLinearWithShardedLora but is constant for + vary for MergedQKVParallelLinearWithShardedLora but is constant for MergedColumnParallelLinearWithShardedLoRA. """ # expecting 2 for column parallel and 3 for qkv @@ -167,7 +168,7 @@ def can_replace_layer(cls, source_layer: nn.Module, ) -class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): +class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora): """ Differs from QKVParallelLinearWithLora by slicing the LoRA A's also. @@ -175,6 +176,57 @@ class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): Based on S-LoRA, slicing happens along the rank dim. 
""" + def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.lora_a_stacked.shape[2] + start_idx = tp_rank * shard_size + lora_a = lora_a[:, start_idx:start_idx + shard_size] + return lora_a + + def apply(self, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, + output.shape[-1]), output.shape + buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), + dtype=torch.float32, + device=x.device) + + bgmv(buffer, x, self.lora_a_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) + buffer = tensor_model_parallel_all_gather(buffer) + bgmv(output, buffer, self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) + # now have column partitioned output + + output = output.view(*out_orig_shape) + return output + + @classmethod + @_fully_sharded_can_replace + def can_replace_layer(cls, source_layer: nn.Module, + lora_config: LoRAConfig, packed_modules_list: List, + model_config: Optional[PretrainedConfig]) -> bool: + # specifying kwargs so they can be easily accessed in decorator + return super().can_replace_layer( + source_layer=source_layer, + lora_config=lora_config, + packed_modules_list=packed_modules_list, + model_config=model_config, + decorate=False, + ) + + +class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): + """ + Differs from MergedQKVParallelLinearWithLora by slicing the + LoRA A's also. + + Based on S-LoRA, slicing happens along the rank dim. + """ + def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e3ab1708c3fdf..e4a23273f7282 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -641,6 +641,24 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * self.base_layer.head_size) + def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: + tp_rank = get_tensor_model_parallel_rank() + self.q_shard_id = tp_rank + self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas + lora_b_q = lora_b[:, self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + k_offset = self.q_proj_total_size + lora_b_k = lora_b[:, k_offset + + self.kv_proj_shard_size * self.kv_shard_id:k_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + v_offset = k_offset + self.kv_proj_total_size + lora_b_v = lora_b[:, v_offset + + self.kv_proj_shard_size * self.kv_shard_id:v_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) + return lora_b + def set_lora( self, index: int, @@ -650,21 +668,8 @@ def set_lora( ): self.reset_lora(index) if self.tp_size > 1: - tp_rank = get_tensor_model_parallel_rank() - self.q_shard_id = tp_rank - self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas - lora_b_q = lora_b[:, self.q_proj_shard_size * - self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] - k_offset = self.q_proj_total_size - lora_b_k = lora_b[:, k_offset + self.kv_proj_shard_size * - self.kv_shard_id:k_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - v_offset = k_offset + self.kv_proj_total_size - lora_b_v = lora_b[:, v_offset + self.kv_proj_shard_size * - self.kv_shard_id:v_offset + - 
self.kv_proj_shard_size * (self.kv_shard_id + 1)] - lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -674,6 +679,7 @@ def set_lora( lora_b.T, non_blocking=True) @classmethod + @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: List, model_config: Optional[PretrainedConfig]) -> bool: diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index d7794aa7cd35c..8f3c7f76932af 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,4 +1,5 @@ from typing import List, Optional +from typing import Sequence as GenericSequence import torch @@ -120,7 +121,7 @@ def __init__( @classmethod def pack( - cls, loras: List[Optional["LoRALayerWeights"]] + cls, loras: GenericSequence[Optional["LoRALayerWeights"]] ) -> "PackedLoRALayerWeights": """Pack a list of LoRAs into a single LoRA. diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 3e82856866d85..afb9ba4550671 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -525,6 +525,12 @@ def remove_lora(self, lora_id: int) -> bool: self.long_lora_context.offsets_by_lora_id.pop(lora_id, None) return bool(self._registered_loras.pop(lora_id, None)) + def pin_lora(self, lora_id: int) -> bool: + """Pin a LoRAModel in the manager cache.""" + raise NotImplementedError( + "Pinning is not supported in LoRAModelManager." + "Use LRUCacheLoRAModelManager for pinning") # type: ignore + # TODO see if this can be vectorized def _set_lora_mapping(self, mapping: LoRAMapping) -> None: (base_indices, sampler_indices, sampler_indices_padded, @@ -777,6 +783,26 @@ def remove_oldest_lora(self) -> bool: return True return False + def pin_lora(self, lora_id: int) -> bool: + """Pin a LoRAModel in the manager cache.""" + self._pin_lora_in_cpu_cache(lora_id) + self._pin_lora_in_gpu_cache(lora_id) + return True + + def _pin_lora_in_cpu_cache(self, lora_id: int): + try: + self._registered_loras.pin(lora_id) + except ValueError as err: + raise ValueError("Pinning failed. 
" + f"LoRA {lora_id} is not registered.") from err + + def _pin_lora_in_gpu_cache(self, lora_id: int): + if lora_id not in self._active_loras: + # move lora to gpu if not already active + self.activate_lora(lora_id) + + self._active_loras.pin(lora_id) + def create_lora_manager( model: nn.Module, diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 4a86c16cf64db..ab3b99eee6fc1 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -8,7 +8,8 @@ from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA) + MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora, + RowParallelLinearWithShardedLoRA) # being imported for _all_lora_classes below # yapf conflicts with isort for this block # yapf: disable @@ -35,6 +36,7 @@ RowParallelLinearWithLoRA, LogitsProcessorWithLoRA, ColumnParallelLinearWithShardedLoRA, + QKVParallelLinearWithShardedLora, MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA, diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 4657757bd484b..ca4903c23bcaa 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -165,7 +165,7 @@ def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: model = self._lora_manager.model supported_lora_modules = model.supported_lora_modules packed_modules_mapping = model.packed_modules_mapping - expected_lora_modules = [] + expected_lora_modules: List[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend( @@ -221,6 +221,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self._lora_manager.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self._lora_manager.pin_lora(lora_id) + def remove_all_loras(self): self._lora_manager.remove_all_loras() diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 56aa629ae3455..0db72d8d95f24 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,6 +1,6 @@ import torch.nn as nn -from vllm.utils import is_cpu, is_hip, is_tpu +from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu class CustomOp(nn.Module): @@ -29,9 +29,7 @@ def forward_hip(self, *args, **kwargs): return self.forward_cuda(*args, **kwargs) def forward_xpu(self, *args, **kwargs): - # By default, we assume that XPU ops are compatible with CUDA ops. - # NOTE(woosuk): This is a placeholder for future extensions. - return self.forward_cuda(*args, **kwargs) + raise NotImplementedError def forward_cpu(self, *args, **kwargs): # By default, we assume that CPU ops are compatible with CUDA ops. 
@@ -58,5 +56,7 @@ def dispatch_forward(self): return self.forward_cpu elif is_tpu(): return self.forward_tpu + elif is_xpu(): + return self.forward_xpu else: return self.forward_cuda diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 4d076421f9d2a..5bfdba67b443d 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,6 +37,15 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.silu_and_mul(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + class GeluAndMul(CustomOp): """An activation function for GeGLU. @@ -71,6 +80,18 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_tanh_and_mul(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + if self.approximate == "none": + ops.gelu_and_mul(out, x) + elif self.approximate == "tanh": + ops.gelu_tanh_and_mul(out, x) + return out + def extra_repr(self) -> str: return f'approximate={repr(self.approximate)}' @@ -90,6 +111,13 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_new(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + class FastGELU(CustomOp): @@ -105,6 +133,31 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_fast(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class QuickGELU(CustomOp): + + # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90 + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + return x * torch.sigmoid(1.702 * x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + from vllm import _custom_ops as ops + + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out + + # TODO implement forward_xpu for QuickGELU + # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
@@ -154,6 +207,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): "gelu_new": NewGELU(), "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), "relu": nn.ReLU(), + "quick_gelu": QuickGELU(), } diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 4533adf8f83aa..14f5e2378a421 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -67,6 +67,30 @@ def forward_cuda( ) return out + def forward_xpu( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm._ipex_ops import ipex_ops as ops + + if residual is not None: + ops.fused_add_rms_norm( + x, + residual, + self.weight.data, + self.variance_epsilon, + ) + return x, residual + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + self.weight.data, + self.variance_epsilon, + ) + return out + def extra_repr(self) -> str: s = f"hidden_size={self.weight.data.size(0)}" s += f", eps={self.variance_epsilon}" diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 78b306bbe6b3c..a18ea5601ba94 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -401,7 +401,7 @@ def weight_loader(self, param_data.copy_(loaded_weight) return current_shard_offset = 0 - shard_offsets = [] + shard_offsets: List[Tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size @@ -476,13 +476,6 @@ def weight_loader(self, "MergedColumnParallelLinear, assume the weight is " "the same for all partitions.") - if fp8_scales_shard_indexer is None: - if len(param_data.shape) == 0: - param_data = param_data.reshape(1) - - if len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) - # UPSTREAM SYNC: needed for LazyCompressedParameter self.loaded_shards.add(loaded_shard_id) assert param_data.shape == loaded_weight.shape @@ -707,12 +700,6 @@ def weight_loader(self, "QKVParallelLinear, assume the weight is the same " "for all partitions.") - if len(param_data.shape) == 0: - param_data = param_data.reshape(1) - - if len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index c7f04784591b2..44dd024afe74d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -8,16 +8,20 @@ QuantizationConfig) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme, CompressedTensorsW4A16, - CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) + CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, + CompressedTensorsW8A8StaticTensor) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match) + CompressionFormat, QuantizationArgs, QuantizationStrategy, + find_first_name_or_class_match) class CompressedTensorsConfig(QuantizationConfig): - def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str]): + def 
__init__(self, layer_quant_details: Dict[str, Any], ignore: List[str], + quant_format: str): self.ignore = ignore self.layer_quant_details = layer_quant_details + self.quant_format = quant_format def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) @@ -26,7 +30,7 @@ def get_scaled_act_names(self) -> List[str]: return [] def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.float16] + return [torch.float16, torch.bfloat16] # Need to figure it out def get_min_capability(self) -> int: @@ -46,6 +50,7 @@ def get_quant_method( def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": layer_quant_details: Dict[str, Any] = dict() ignore: List[str] = config.get("ignore", None) + quant_format: str = config.get("format", None) # The quant_config has multiple config_groups, each containing # an input_activations key with details about how the activations are @@ -69,7 +74,9 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": except Exception: layer_quant_details[target]["input_activations"] = None - return cls(layer_quant_details=layer_quant_details, ignore=ignore) + return cls(layer_quant_details=layer_quant_details, + ignore=ignore, + quant_format=quant_format) @classmethod def get_config_filenames(cls) -> List[str]: @@ -78,8 +85,11 @@ def get_config_filenames(cls) -> List[str]: def _is_static_tensor_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - is_tensor = (weight_quant.strategy == input_quant.strategy == - QuantizationStrategy.TENSOR.value) + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_tensor = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TENSOR.value) is_symmetric = weight_quant.symmetric and input_quant.symmetric is_static = not weight_quant.dynamic and not input_quant.dynamic @@ -88,14 +98,15 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel, def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - is_token_tensor = (weight_quant.strategy - == QuantizationStrategy.TENSOR.value) and ( - input_quant.strategy - == QuantizationStrategy.TOKEN.value) + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_token = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TOKEN.value) is_symmetric = weight_quant.symmetric and input_quant.symmetric is_dynamic = not weight_quant.dynamic and input_quant.dynamic - return is_8_bits and is_token_tensor and is_symmetric and is_dynamic + return is_8_bits and is_token and is_symmetric and is_dynamic def _is_w4a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -110,17 +121,28 @@ def _get_schema(self, weight_quant: BaseModel, input_quant: BaseModel) -> "CompressedTensorsScheme": if self._is_w4a16(weight_quant, input_quant): - return CompressedTensorsW4A16(num_bits=weight_quant.num_bits, - strategy=weight_quant.strategy, - group_size=weight_quant.group_size) - - if self._is_static_tensor_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8StaticTensor() - - if self._is_dynamic_token_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8DynamicToken() - - raise 
NotImplementedError("Scheme not supported.") + if self.quant_format == CompressionFormat.marlin_24.value: + return CompressedTensorsW4A16Sparse24( + strategy=weight_quant.strategy, + num_bits=weight_quant.num_bits, + group_size=weight_quant.group_size) + if self.quant_format == CompressionFormat.pack_quantized.value: + return CompressedTensorsW4A16( + num_bits=weight_quant.num_bits, + strategy=weight_quant.strategy, + group_size=weight_quant.group_size) + + if self.quant_format == CompressionFormat.int_quantized.value: + if self._is_static_tensor_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8StaticTensor( + strategy=weight_quant.strategy) + + if self._is_dynamic_token_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8DynamicToken( + strategy=weight_quant.strategy) + + raise NotImplementedError( + "No compressed-tensors compatible scheme was found.") def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme": @@ -165,9 +187,9 @@ def create_weights(self, layer: torch.nn.Module, scheme = self.quantization_config.get_scheme(layer=layer) scheme.create_weights( layer=layer, + input_size=input_size, input_size_per_partition=input_size_per_partition, output_partition_sizes=output_partition_sizes, - input_size=input_size, output_size=output_size, params_dtype=params_dtype, weight_loader=weight_loader) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index dc84d000803f9..3c95aa11fc76c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -2,6 +2,8 @@ from .compressed_tensors_unquantized import ( # noqa: F401 CompressedTensorsUnquantized) from .compressed_tensors_w4a16 import CompressedTensorsW4A16 # noqa: F401 +from .compressed_tensors_w4a16_24 import ( # noqa: F401 + CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501 CompressedTensorsW8A8DynamicToken) from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py index 90446a5ffae01..373458cfffe04 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py @@ -64,10 +64,9 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, "input_dim": 1, "output_dim": 0, "packed_dim": 1, - "pack_factor": pack_factor + "pack_factor": pack_factor, + "weight_loader": weight_loader }) - set_weight_attrs(weight, {"weight_loader": weight_loader}) - layer.register_parameter("weight_packed", weight) weight_scale = Parameter( @@ -79,11 +78,12 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, requires_grad=False, ) - set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) - set_weight_attrs(weight_scale, { - "input_dim": weight_scale_dim, - "output_dim": 0 - }) + set_weight_attrs( + weight_scale, { + "weight_loader": weight_loader, + "input_dim": weight_scale_dim, + "output_dim": 0 + }) layer.register_parameter("weight_scale", weight_scale) # A 2D array defining the original shape of the weights @@ -92,7 +92,10 @@ def 
create_weights(self, layer: torch.nn.Module, input_size: int, requires_grad=False) layer.register_parameter("weight_shape", weight_shape) - set_weight_attrs(weight_shape, {"weight_loader": weight_loader}) + set_weight_attrs(weight_shape, { + "weight_loader": weight_loader, + "ignore_warning": True, + }) layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py new file mode 100644 index 0000000000000..d7e04ddb8d94a --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -0,0 +1,134 @@ +from typing import Callable, List, Optional + +import torch +from torch.nn import Parameter + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( + GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW4A16Sparse24"] + + +class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme): + + def __init__(self, + strategy: str, + num_bits: int, + group_size: Optional[int] = None): + self.strategy = strategy + self.group_size = group_size + self.num_bits = num_bits + self.tile_size = 16 + + if self.strategy == "group" and self.group_size is None: + raise ValueError( + "group_size must be given when using strategy group") + + def create_weights(self, layer: torch.nn.Module, input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + pack_factor = 32 // self.num_bits + output_size_per_partition = sum(output_partition_sizes) + + qweight = Parameter( + torch.empty( + input_size_per_partition // self.tile_size // 2, + output_size_per_partition * self.tile_size // pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qweight, + { + "input_dim": 0, + "output_dim": 1, + "packed_dim": 1, + "pack_factor": pack_factor, + "marlin_tile_size": self.tile_size, + "weight_loader": weight_loader + }, + ) + + layer.register_parameter("weight_packed", qweight) + + input_groups = (1 if self.group_size is None else + input_size_per_partition // self.group_size) + + scales = Parameter( + torch.empty( + input_groups, + output_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs( + scales, + { + "output_dim": 1, + "input_dim": None if input_groups == 1 else 0, + "weight_loader": weight_loader + }, + ) + layer.register_parameter("scale_packed", scales) + + weight_shape = Parameter(torch.empty(2, dtype=torch.int64), + requires_grad=False) + + layer.register_parameter("weight_shape", weight_shape) + set_weight_attrs(weight_shape, {"weight_loader": weight_loader}) + + meta = Parameter( + torch.empty( + input_size_per_partition // 8 // 2 // 2, + output_size_per_partition * 2, + dtype=torch.int16, + ), + requires_grad=False, + ) + set_weight_attrs( + meta, + { + "input_dim": 0, + "packed_dim": 1, + "pack_factor": 1, + "output_dim": 1, + "marlin_tile_size": 2, + "weight_loader": weight_loader + }, + ) + layer.register_parameter("meta", meta) + + max_workspace_size = ( + output_size_per_partition 
// + GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL + workspace = Parameter(torch.zeros(max_workspace_size, dtype=torch.int), + requires_grad=False) + layer.workspace = workspace + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): + qweight = layer.weight_packed + meta = layer.meta + scales = layer.scale_packed + workspace = layer.workspace + + x_2d = x.view(-1, x.shape[-1]) + + size_m = x_2d.shape[0] + size_k = x_2d.shape[1] + size_n = scales.shape[1] + + output_2d = ops.gptq_marlin_24_gemm(x_2d, qweight, meta, scales, + workspace, self.num_bits, size_m, + size_n, size_k) + + output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) + return output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py new file mode 100644 index 0000000000000..efed79ec7a11c --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py @@ -0,0 +1,84 @@ +from typing import Callable, List, Tuple, Union + +import torch +from torch.nn import Parameter + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) +from vllm.model_executor.utils import set_weight_attrs + + +class CompressedTensorsW8A8(CompressedTensorsScheme): + + def __init__(self, strategy: str): + self.strategy = strategy + + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + assert isinstance(shard_id, str) + qkv_idxs = {"q": 0, "k": 1, "v": 2} + assert shard_id in qkv_idxs + return qkv_idxs[shard_id] + + def scales_shard_splitter( + self, param: torch.Tensor, loaded_weight: torch.Tensor, + shard_id: Union[str, int], + logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shard_id = self._shard_id_as_int(shard_id) + offset = sum(logical_widths[:shard_id]) + size = logical_widths[shard_id] + # update loaded weight with copies for broadcast. 
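+        # e.g. with logical_widths = [4096, 1024, 1024] (q, k, v) and
+        # shard_id "k": offset = 4096 and size = 1024, so the loaded
+        # per-tensor scale is repeated 1024 times and written into
+        # param[4096:5120].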
+ loaded_weight = loaded_weight.repeat(size) + return param[offset:offset + size], loaded_weight + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + is_tensor_partitioned = len(output_partition_sizes) != 1 + weight_scale_dim = sum(output_partition_sizes) if ( + is_tensor_partitioned + or self.strategy == QuantizationStrategy.CHANNEL) else 1 + + shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, ) + if self.strategy == QuantizationStrategy.CHANNEL: + shape = (weight_scale_dim, 1) + + weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32), + requires_grad=False) + + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) + + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + requires_grad=False) + + layer.register_parameter("weight", weight) + set_weight_attrs( + weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + "logical_widths": output_partition_sizes + }) + + # Don't need a shard_splitter for channel-wise quantization + # Use the default loading method + if self.strategy == QuantizationStrategy.CHANNEL: + set_weight_attrs(weight_scale, { + "output_dim": 0, + }) + else: + set_weight_attrs( + weight_scale, { + "logical_widths": output_partition_sizes, + "shard_splitter": self.scales_shard_splitter, + }) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py index 9bb7bf4470872..5fc05b8e682d6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py @@ -1,37 +1,15 @@ -from typing import Callable, List, Tuple, Union +from typing import Callable, List import torch -from torch.nn import Parameter from vllm import _custom_ops as custom_ops -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import ( # noqa: E501 + CompressedTensorsW8A8) __all__ = ["CompressedTensorsW8A8DynamicToken"] -class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): - - def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: - if isinstance(shard_id, int): - return shard_id - - assert isinstance(shard_id, str) - qkv_idxs = {"q": 0, "k": 1, "v": 2} - assert shard_id in qkv_idxs - return qkv_idxs[shard_id] - - def scales_shard_splitter( - self, param: torch.Tensor, loaded_weight: torch.Tensor, - shard_id: Union[str, int], - logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - shard_id = self._shard_id_as_int(shard_id) - offset = sum(logical_widths[:shard_id]) - size = logical_widths[shard_id] - # update loaded weight with copies for broadcast. 
- loaded_weight = loaded_weight.repeat(size) - return param[offset:offset + size], loaded_weight +class CompressedTensorsW8A8DynamicToken(CompressedTensorsW8A8): def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], @@ -39,42 +17,12 @@ def create_weights(self, layer: torch.nn.Module, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - # When the scales have a single value, it is required that they be - # on the CPU for performance and CUDA Graphs compatibility. Please - # refer to the comment in - # CompressedTensorsW8A8StaticTensor::create_weights for further - # information. - is_tensor_partitioned = len(output_partition_sizes) != 1 - weight_scale_dim = sum( - output_partition_sizes) if is_tensor_partitioned else 1 - - weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8), - requires_grad=False) - - weight_scale = Parameter(torch.empty(weight_scale_dim, - dtype=torch.float32), - requires_grad=False) - - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.int8), - requires_grad=False) - - layer.register_parameter("weight", weight) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - set_weight_attrs(weight, {"weight_loader": weight_loader}) - set_weight_attrs(weight, {"logical_widths": output_partition_sizes}) - - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) - set_weight_attrs( - weight_scale, { - "shard_splitter": self.scales_shard_splitter, - "logical_widths": output_partition_sizes - }) - - layer.register_parameter("weight_zero_point", weight_zero_point) - set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader}) + super().create_weights( + layer=layer, + output_partition_sizes=output_partition_sizes, + input_size_per_partition=input_size_per_partition, + params_dtype=params_dtype, + weight_loader=weight_loader) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py index 88c15c5c26a11..79f5358a365ed 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -1,37 +1,17 @@ -from typing import Callable, List, Tuple, Union +from typing import Callable, List import torch from torch.nn import Parameter from vllm import _custom_ops as custom_ops -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import ( # noqa: E501 + CompressedTensorsW8A8) from vllm.model_executor.utils import set_weight_attrs __all__ = ["CompressedTensorsW8A8StaticTensor"] -class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): - - def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: - if isinstance(shard_id, int): - return shard_id - - assert isinstance(shard_id, str) - qkv_idxs = {"q": 0, "k": 1, "v": 2} - assert shard_id in qkv_idxs - return qkv_idxs[shard_id] - - def scales_shard_splitter( - self, param: torch.Tensor, loaded_weight: torch.Tensor, - shard_id: Union[str, int], - logical_widths: torch.Tensor) -> 
Tuple[torch.Tensor, torch.Tensor]: - shard_id = self._shard_id_as_int(shard_id) - offset = sum(logical_widths[:shard_id]) - size = logical_widths[shard_id] - # update loaded weight with copies for broadcast. - loaded_weight = loaded_weight.repeat(size) - return param[offset:offset + size], loaded_weight +class CompressedTensorsW8A8StaticTensor(CompressedTensorsW8A8): def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], @@ -39,57 +19,21 @@ def create_weights(self, layer: torch.nn.Module, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - # TODO: remove zero_point parameters once the configs given remove them - - is_tensor_partitioned = len(output_partition_sizes) != 1 - weight_scale_dim = sum( - output_partition_sizes) if is_tensor_partitioned else 1 + super().create_weights( + layer=layer, + output_partition_sizes=output_partition_sizes, + input_size_per_partition=input_size_per_partition, + params_dtype=params_dtype, + weight_loader=weight_loader) input_scale = Parameter(torch.empty(1, dtype=torch.float32), requires_grad=False) - input_zero_point = Parameter(torch.empty(1, dtype=torch.int8), - requires_grad=False) - weight_scale = Parameter(torch.empty(weight_scale_dim, - dtype=torch.float32), - requires_grad=False) - weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8), - requires_grad=False) - - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.int8), - requires_grad=False) - - layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "weight_loader": weight_loader, - "input_dim": 1, - "output_dim": 0, - }) layer.register_parameter("input_scale", input_scale) set_weight_attrs(input_scale, { "weight_loader": weight_loader, "ignore_warning": True, }) - layer.register_parameter("input_zero_point", input_zero_point) - set_weight_attrs(input_zero_point, { - "weight_loader": weight_loader, - "ignore_warning": True, - }) - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs( - weight_scale, { - "weight_loader": weight_loader, - "shard_splitter": self.scales_shard_splitter, - "logical_widths": output_partition_sizes, - "ignore_warning": True, - }) - layer.register_parameter("weight_zero_point", weight_zero_point) - set_weight_attrs(weight_zero_point, { - "weight_loader": weight_loader, - "ignore_warning": True - }) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index fcc6649101845..b2bec9b603d1a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -6,6 +6,14 @@ from torch.nn import Module +class CompressionFormat(Enum): + dense = "dense" + sparse_bitmask = "sparse-bitmask" + int_quantized = "int-quantized" + pack_quantized = "pack-quantized" + marlin_24 = "marlin-24" + + class QuantizationType(str, Enum): """ Enum storing quantization type options diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index bc08bfcc32b3a..bbf3cde54782d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -20,19 +20,8 @@ def cutlass_fp8_supported() -> bool: capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] - major, 
minor = torch.version.cuda.split(".") - version = int(major) * 10 + int(minor) - # CUTLASS FP8 kernels need at least - # CUDA 12.0 on SM90 systems (Hopper) - # CUDA 12.4 on SM89 systems (Lovelace) - gpu_is_supported = False - if capability >= 90: - gpu_is_supported = version > 120 - elif capability >= 89: - gpu_is_supported = version > 124 - - return gpu_is_supported + return ops.cutlass_scaled_mm_supports_fp8(capability) class Fp8Config(QuantizationConfig): @@ -257,9 +246,7 @@ def apply(self, # If dynamic, layer.input_scale is None and x_scale computed from x. # If static, layer.input_scale is scalar and x_scale is input_scale. - # Temporarily disable CUTLASS kernels due to an illegal memory access - #if bias is None and self.cutlass_fp8_supported: - if False: + if bias is None and self.cutlass_fp8_supported: qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale) # Fused GEMM_DQ diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ae440743fdf8e..599070f1550ca 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -25,24 +25,25 @@ # Permutations for Marlin scale shuffling -def get_scale_perms(num_bits): - scale_perm = [] +def get_scale_perms(num_bits: int): + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) return scale_perm, scale_perm_single -def get_pack_factor(num_bits): +def get_pack_factor(num_bits: int): assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS ), f"Unsupported num_bits = {num_bits}" return 32 // num_bits -def marlin_permute_scales(s, size_k, size_n, group_size, num_bits): +def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int, + group_size: int, num_bits: int): scale_perm, scale_perm_single = get_scale_perms(num_bits) if group_size < size_k and group_size != -1: s = s.reshape((-1, len(scale_perm)))[:, scale_perm] diff --git a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py index 12e77cb710687..93f65a20d4e4a 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py @@ -1,4 +1,6 @@ """This file is used for /tests and /benchmarks""" +from typing import Dict, List + import numpy import torch @@ -11,10 +13,10 @@ # # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 # (without the need to use ldmatrix instructions) # noqa: E501 -def get_perms_24(num_bits): - perm_list = [] +def get_perms_24(num_bits: int): + perm_list: List[int] = [] for i in range(32): - perm1 = [] + perm1: List[int] = [] col = i // 4 col_o = col // 2 for block in [0, 1]: @@ -39,18 +41,18 @@ def get_perms_24(num_bits): perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() perm = torch.from_numpy(perm) - scale_perm = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(8): scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]]) return perm, scale_perm, scale_perm_single -marlin_24_perm = {} -marlin_24_scale_perm = {} 
-marlin_24_scale_perm_single = {} +marlin_24_perm: Dict[int, torch.Tensor] = {} +marlin_24_scale_perm: Dict[int, List[int]] = {} +marlin_24_scale_perm_single: Dict[int, List[int]] = {} for num_bits in [4, 8]: perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits) marlin_24_perm[num_bits] = perm_24 diff --git a/vllm/model_executor/layers/quantization/utils/marlin_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_perms.py index 76bd2ff7c724e..db5e6857a8846 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_perms.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_perms.py @@ -1,4 +1,6 @@ """This file is used for /tests and /benchmarks""" +from typing import Dict, List + import numpy import torch @@ -11,10 +13,10 @@ # # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 # (without the need to use ldmatrix instructions) # noqa: E501 -def get_perms(num_bits): - perm_list = [] +def get_perms(num_bits: int): + perm_list: List[int] = [] for i in range(32): - perm1 = [] + perm1: List[int] = [] col = i // 4 for block in [0, 1]: for row in [ @@ -38,19 +40,19 @@ def get_perms(num_bits): perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() perm = torch.from_numpy(perm) - scale_perm = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) return perm, scale_perm, scale_perm_single -marlin_perm = {} -marlin_scale_perm = {} -marlin_scale_perm_single = {} +marlin_perm: Dict[int, torch.Tensor] = {} +marlin_scale_perm: Dict[int, List[int]] = {} +marlin_scale_perm_single: Dict[int, List[int]] = {} for num_bits in [4, 8]: perm, scale_perm, scale_perm_single = get_perms(num_bits) marlin_perm[num_bits] = perm diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index a80703155c0b6..fe9b2fac1117e 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -1,12 +1,15 @@ from functools import cached_property -from typing import Optional, Tuple +from typing import Tuple import torch import torch.jit import torch.nn as nn +from vllm.model_executor.layers.spec_decode_base_sampler import ( + SpecDecodeBaseSampler) -class RejectionSampler(nn.Module): + +class RejectionSampler(SpecDecodeBaseSampler, nn.Module): """Apply modified rejection sampling as described in "Accelerating Large Language Model Decoding with Speculative Sampling" https://arxiv.org/pdf/2302.01318.pdf. @@ -22,39 +25,11 @@ def __init__(self, Require when bonus tokens will cause corrupt KV cache for proposal methods that require KV cache. strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. + during sampling. This catches correctness issues but adds + nontrivial latency. """ - super().__init__() - self._disable_bonus_tokens = disable_bonus_tokens - self._strict_mode = strict_mode - - # NOTE: A "bonus token" is accepted iff all proposal tokens are - # accepted. There is always only one possible bonus token. We store this - # value in a variable for readability. 
- self._num_bonus_tokens = 1 - - self.num_accepted_tokens: Optional[torch.Tensor] = None - self.num_emitted_tokens: Optional[torch.Tensor] = None - self.num_draft_tokens: int = 0 - - def init_gpu_tensors(self, rank: int) -> None: - assert self.num_accepted_tokens is None - device = f"cuda:{rank}" - self.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - self.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - - @property - def probs_dtype(self): - return torch.float32 - - @property - def token_id_dtype(self): - return torch.int64 + SpecDecodeBaseSampler.__init__(self, disable_bonus_tokens, strict_mode) + nn.Module.__init__(self) def forward( self, @@ -100,15 +75,8 @@ def forward( # Only perform shape/dtype/device checking in strict mode, as it adds # overhead. if self._strict_mode: - self._raise_if_incorrect_shape(target_probs, bonus_token_ids, - draft_probs, draft_token_ids) - self._raise_if_incorrect_dtype(target_probs, bonus_token_ids, + self._raise_if_incorrect_input(target_probs, bonus_token_ids, draft_probs, draft_token_ids) - self._raise_if_inconsistent_device(target_probs, bonus_token_ids, - draft_probs, draft_token_ids) - self._raise_if_out_of_bounds_vocab(target_probs.shape[-1], - bonus_token_ids, - draft_token_ids) accepted, recovered_token_ids = self._batch_modified_rejection_sampling( target_probs, @@ -272,128 +240,6 @@ def _smallest_positive_value(self) -> float: """ return torch.finfo(self.probs_dtype).tiny - def _create_output( - self, - accepted: torch.Tensor, # [batch_size, k] - recovered_token_ids: torch.Tensor, # [batch_size, k] - draft_token_ids: torch.Tensor, # [batch_size, k] - bonus_token_ids: torch.Tensor, # [batch_size] - ) -> torch.Tensor: - """Format output. Returns a matrix of token ids. When - a token is rejected via rejection sampling, all subsequent - token ids are set to -1 for the sequence. - - shape = [batch_size, k + num_bonus_tokens] - """ - bonus_token_ids = bonus_token_ids.squeeze() - batch_size, k = recovered_token_ids.shape - - # Determine the index of the first False value for each row. - limits = (accepted == 0).max(1).indices - limits[~(accepted == 0).any(1)] = k - - # Create masks using the indices. - indices = torch.arange(k, device=accepted.device).unsqueeze(0) - accepted_mask = indices < limits.unsqueeze(1) - after_false_mask = indices == limits.unsqueeze(1) - - # Create an extended output tensor - output_with_bonus_tokens = -torch.ones( - (batch_size, k + self._num_bonus_tokens), - dtype=self.token_id_dtype, - device=accepted.device) - output = output_with_bonus_tokens[:, :k] - - # Fill in the first k columns of the output tensor using masks and data - # tensors. - torch.where(accepted_mask, - draft_token_ids, - -torch.ones_like(draft_token_ids), - out=output) - - # Fill the last column. - # We check output directly as accepted may have True values inconsistent - # with causal acceptance. - output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, - bonus_token_ids, -1) - - # We disable bonus tokens because it causes corrupt KV cache for - # proposal methods that require KV cache. We can fix it by "prefilling" - # the bonus token in the proposer. The following issue tracks the fix. - # https://github.com/vllm-project/vllm/issues/4212 - if self._disable_bonus_tokens: - output_with_bonus_tokens[:, -1] = -1 - - # Fill the recovered token ids. 
- output.mul_(~after_false_mask).add_( - recovered_token_ids.mul(after_false_mask)) - - self.num_accepted_tokens += accepted.sum() - self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() - self.num_draft_tokens += batch_size * k - - return output_with_bonus_tokens - - def _raise_if_incorrect_shape( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - (target_batch_size, num_target_probs, - target_vocab_size) = target_probs.shape - bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape - draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape - draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape - - assert draft_batch_size == target_batch_size - assert num_draft_probs == num_target_probs - assert (draft_vocab_size == target_vocab_size - ), f"{draft_vocab_size=} {target_vocab_size=}" - - assert draft_token_ids_batch_size == draft_batch_size - assert num_draft_token_ids == num_draft_probs - - assert bonus_batch_size == target_batch_size - assert num_bonus_tokens == self._num_bonus_tokens - - def _raise_if_incorrect_dtype( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - assert all(probs.dtype == self.probs_dtype - for probs in [target_probs, draft_probs]) - assert all(token_ids.dtype == self.token_id_dtype - for token_ids in [bonus_token_ids, draft_token_ids]) - - def _raise_if_inconsistent_device( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - devices = [ - t.device for t in - [target_probs, bonus_token_ids, draft_probs, draft_token_ids] - ] - assert all([devices[0] == device for device in devices]) - - def _raise_if_out_of_bounds_vocab( - self, - vocab_size: int, - bonus_token_ids: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - assert torch.all(bonus_token_ids < vocab_size) - assert torch.all(bonus_token_ids >= 0) - assert torch.all(draft_token_ids < vocab_size) - assert torch.all(draft_token_ids >= 0) - # torch.multinomial forces a GPU<->CPU sync. # Therefore, we use an optimized implementation instead that skips the sync. diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 792c4729355a7..a0b19046b7491 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -221,6 +221,29 @@ def forward_cuda( self.cos_sin_cache, self.is_neox_style) return query, key + def forward_xpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + from vllm._ipex_ops import ipex_ops as ops + + self.cos_sin_cache = self.cos_sin_cache.to(positions.device, + dtype=query.dtype) + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. 
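+        # When per-token offsets are provided, the batched kernel applies
+        # them; otherwise the plain kernel is used. The updated tensors are
+        # still returned so callers keep the usual (query, key) interface.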
+ if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + def forward_tpu( self, positions: torch.Tensor, @@ -467,7 +490,7 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: return cache -class Phi3SuScaledRotaryEmbedding(nn.Module): +class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): """Phi3 family of models scaled rotary embedding. Based on the original RotaryEmbedding implementation. @@ -484,18 +507,19 @@ def __init__( dtype: torch.dtype, short_factor: List[float], long_factor: List[float], - short_mscale: float = 1.1, - long_mscale: float = 1.225, + short_mscale: float = 1.0, + long_mscale: float = 1.0, ): super().__init__() if rotary_dim != head_size: raise ValueError( - f"`Phi3SuScaledRotaryEmbedding` does not support rotary_dim != \ - head_size ({rotary_dim}!={head_size}).") + f"`Phi3LongRoPEScaledRotaryEmbedding` does not support \ + rotary_dim != head_size ({rotary_dim}!={head_size}).") if is_neox_style is False: raise ValueError( - "`Phi3SuScaledRotaryEmbedding` only supports neox_style.") + "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style." + ) self.head_size = head_size self.max_position_embeddings = max_position_embeddings @@ -506,6 +530,16 @@ def __init__( self.short_mscale = short_mscale self.long_mscale = long_mscale + scale = (self.max_position_embeddings / + self.original_max_position_embeddings) + + if scale <= 1.0: + self.scaling_factor = 1.0 + else: + self.scaling_factor = math.sqrt( + 1 + math.log(scale) / + math.log(self.original_max_position_embeddings)) + short_cache = self._compute_cos_sin_cache( original_max_position_embeddings, short_factor, short_mscale) short_cache = short_cache.to(dtype) @@ -541,8 +575,8 @@ def _compute_cos_sin_cache( inv_freq = self._compute_inv_freq(rescale_factors) t = torch.arange(max_position_embeddings, dtype=torch.float) freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() * mscale - sin = freqs.sin() * mscale + cos = freqs.cos() * mscale * self.scaling_factor + sin = freqs.sin() * mscale * self.scaling_factor cache = torch.cat((cos, sin), dim=-1) return cache @@ -608,7 +642,9 @@ def get_rope( is_neox_style, dtype) else: scaling_type = rope_scaling["type"] - if scaling_type != "su": + # The correct one should be "longrope" but keep "su" here + # for backward compatible + if scaling_type != "su" and scaling_type != "longrope": scaling_factor = rope_scaling["factor"] if scaling_type == "linear": rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, @@ -633,7 +669,9 @@ def get_rope( base, is_neox_style, scaling_factor, dtype, **extra_kwargs) - elif scaling_type == "su": + # The correct one should be "longrope" but keep "su" here + # for backward compatible + elif scaling_type == "su" or scaling_type == "longrope": short_factor = rope_scaling["short_factor"] long_factor = rope_scaling["long_factor"] original_max_position = rope_scaling[ @@ -643,7 +681,7 @@ def get_rope( for k, v in rope_scaling.items() if k in ("short_mscale", "long_mscale") } - rotary_emb = Phi3SuScaledRotaryEmbedding( + rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( head_size, rotary_dim, max_position, original_max_position, base, is_neox_style, dtype, short_factor, long_factor, **extra_kwargs) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 
a84f562909d50..e07360a2fd682 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -174,7 +174,7 @@ def _apply_min_tokens_penalty( min_tokens = sampling_params.min_tokens token_ids_to_penalize = sampling_params.all_stop_token_ids if min_tokens > 0 and token_ids_to_penalize: - seqs_to_penalize = [] + seqs_to_penalize: List[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] if len(seq_data.output_token_ids) < min_tokens: @@ -285,7 +285,7 @@ def _greedy_sample( same as the length of selected_seq_groups. If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ - samples = samples.tolist() + samples_lst = samples.tolist() sample_idx = 0 results: SampleResultType = [] for seq_group in selected_seq_groups: @@ -298,7 +298,7 @@ def _greedy_sample( assert num_parent_seqs == 1, ( "Greedy sampling should have only one seq.") parent_ids = list(range(num_parent_seqs)) - next_token_ids = [samples[sample_idx]] + next_token_ids = [samples_lst[sample_idx]] results.append((next_token_ids, parent_ids)) sample_idx += num_parent_seqs return results @@ -394,7 +394,7 @@ def _beam_search_sample( next_token_ids = next_token_ids.tolist() else: # Generation phase. - cumulative_logprobs: List[int] = [ + cumulative_logprobs: List[float] = [ seq_group.seq_data[seq_id].cumulative_logprob for seq_id in seq_ids ] @@ -466,8 +466,9 @@ def _sample_with_torch( categorized_seq_group_ids[sampling_type].append(i) sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} - multinomial_samples = {} + sample_metadata: Dict[SamplingType, + Tuple[List[int], List[SequenceGroupToSample]]] = {} + multinomial_samples: Dict[SamplingType, torch.Tensor] = {} # Create output tensor for sampled token ids. if include_gpu_probs_tensor: @@ -494,7 +495,7 @@ def _sample_with_torch( greedy_samples = torch.argmax(logprobs[long_sample_indices], dim=-1) - if include_gpu_probs_tensor: + if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. sampled_token_ids_tensor[ long_sample_indices] = greedy_samples.unsqueeze(-1) @@ -522,7 +523,7 @@ def _sample_with_torch( probs[long_sample_indices], max_best_of_in_batch, **seeded_args) - if include_gpu_probs_tensor: + if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. sampled_token_ids_tensor[ long_sample_indices] = multinomial_samples[sampling_type] @@ -571,7 +572,9 @@ def _sample_with_triton_kernel( categorized_seq_group_ids[sampling_type].append(i) sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} + sample_metadata: Dict[SamplingType, + Tuple[List[int], List[SequenceGroupToSample], + torch.Tensor, torch.Tensor]] = {} max_best_of_in_batch = 1 # Counterintiutively, having two loops here is actually faster. @@ -1008,14 +1011,14 @@ def _build_sampler_output( speculative decoding rejection sampling. 
""" - sampler_output = [] + sampler_output: List[CompletionSequenceGroupOutput] = [] for (seq_group, sample_result, group_prompt_logprobs, group_sample_logprobs) in zip(sampling_metadata.seq_groups, sample_results, prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result - seq_outputs = [] + seq_outputs: List[SequenceOutput] = [] for parent_id, next_token_id, logprobs in zip(parent_ids, next_token_ids, group_sample_logprobs): diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py new file mode 100644 index 0000000000000..9856a7e7ddea0 --- /dev/null +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -0,0 +1,206 @@ +from typing import Optional + +import torch + + +class SpecDecodeBaseSampler(): + """Base class for samplers used for Speculative Decoding verification + step. + """ + + def __init__(self, + disable_bonus_tokens: bool = True, + strict_mode: bool = False): + """Base class constructor. + Args: + disable_bonus_tokens: Whether or not to disable the bonus token. + Require when bonus tokens will cause corrupt KV cache for + proposal methods that require KV cache. + strict_mode: Whether or not to perform shape/device/dtype checks + during sampling. This catches correctness issues but adds + nontrivial latency. + """ + super().__init__() + self._disable_bonus_tokens = disable_bonus_tokens + self._strict_mode = strict_mode + + # NOTE: A "bonus token" is accepted iff all proposal tokens are + # accepted. There is always only one possible bonus token. We store this + # value in a variable for readability. + self._num_bonus_tokens = 1 + + self.num_accepted_tokens: Optional[torch.Tensor] = None + self.num_emitted_tokens: Optional[torch.Tensor] = None + self.num_draft_tokens: int = 0 + + def init_gpu_tensors(self, rank: int) -> None: + assert self.num_accepted_tokens is None + device = f"cuda:{rank}" + self.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + self.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + + @property + def probs_dtype(self): + return torch.float32 + + @property + def token_id_dtype(self): + return torch.int64 + + def _create_output( + self, + accepted: torch.Tensor, # [batch_size, k] + substitute_token_ids: torch.Tensor, # [batch_size, k] + draft_token_ids: torch.Tensor, # [batch_size, k] + bonus_token_ids: torch.Tensor, # [batch_size] + ) -> torch.Tensor: + """Format output. Returns a matrix of token ids. When + a token is rejected via sampling, all subsequent token ids are + set to -1 for the sequence. + + Args: + accepted: A boolean tensor indicating if the corresponding + draft token in draft_token_ids should be accepted or not. + substitute_token_ids: A tensor of token_ids that can be used + as substitutes for the draft token ids if the proposed token + is rejected. + draft_token_ids: A tensor of token ids speculated by the + draft model. + bonus_token_ids: Token ids to use as the bonus token if + all the draft tokens are accepted. + Returns: + A tensor containing the accepted token ids. The shape of the + tensor is [batch_size, k + num_bonus_tokens] + """ + batch_size, k = substitute_token_ids.shape + bonus_token_ids = bonus_token_ids.squeeze() + # Determine the index of the first False value for each row. + limits = (accepted == 0).max(1).indices + limits[~(accepted == 0).any(1)] = k + + # Create masks using the indices. 
+ indices = torch.arange(k, device=accepted.device).unsqueeze(0) + accepted_mask = indices < limits.unsqueeze(1) + after_false_mask = indices == limits.unsqueeze(1) + + # Create an extended output tensor + output_with_bonus_tokens = -torch.ones( + (batch_size, k + self._num_bonus_tokens), + dtype=self.token_id_dtype, + device=accepted.device) + output = output_with_bonus_tokens[:, :k] + + # Fill in the first k columns of the output tensor using masks and data + # tensors. + output[:, :k] = torch.where(accepted_mask, draft_token_ids, + -torch.ones_like(draft_token_ids)) + + # Fill the last column. + # We check output directly as accepted may have True values inconsistent + # with causal acceptance. + output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, + bonus_token_ids, -1) + + # We disable bonus tokens because it causes corrupt KV cache for + # proposal methods that require KV cache. We can fix it by "prefilling" + # the bonus token in the proposer. The following issue tracks the fix. + # https://github.com/vllm-project/vllm/issues/4212 + if self._disable_bonus_tokens: + output_with_bonus_tokens[:, -1] = -1 + + # Fill the recovered token ids. + output.mul_(~after_false_mask).add_( + substitute_token_ids.mul(after_false_mask)) + + self.num_accepted_tokens += accepted.sum() + self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() + self.num_draft_tokens += batch_size * k + + return output_with_bonus_tokens + + def _raise_if_incorrect_input( + self, + target_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: Optional[torch.Tensor] = None, + ) -> None: + self._raise_if_incorrect_shape(target_probs, draft_token_ids, + bonus_token_ids, draft_probs) + self._raise_if_incorrect_dtype(target_probs, draft_token_ids, + bonus_token_ids, draft_probs) + self._raise_if_inconsistent_device(target_probs, draft_token_ids, + bonus_token_ids, draft_probs) + self._raise_if_out_of_bounds_vocab(target_probs.shape[-1], + draft_token_ids, bonus_token_ids) + + def _raise_if_incorrect_shape( + self, + target_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: Optional[torch.Tensor] = None, + ) -> None: + (target_batch_size, num_target_probs, + target_vocab_size) = target_probs.shape + + # validate the shape of draft token ids. 
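For reference, the shape and dtype contract enforced by these `_raise_if_*` helpers can be summarized with dummy tensors (sizes invented for illustration):

```python
# Dummy tensors matching the documented shapes for the verification step.
import torch

batch_size, k, vocab_size = 2, 3, 8

target_probs = torch.rand(batch_size, k, vocab_size)              # float32
draft_token_ids = torch.randint(0, vocab_size, (batch_size, k))   # int64
bonus_token_ids = torch.randint(0, vocab_size, (batch_size, 1))   # int64

assert target_probs.shape == (batch_size, k, vocab_size)
assert draft_token_ids.shape == (batch_size, k)
assert bonus_token_ids.shape == (batch_size, 1)  # exactly one bonus token
```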
+ draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape + assert draft_token_ids_batch_size == target_batch_size + assert num_draft_token_ids == num_target_probs + + # validate the shape of bonus token ids + bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape + assert bonus_batch_size == target_batch_size + assert num_bonus_tokens == self._num_bonus_tokens + + # validate the shape of draft probs if it is set + if draft_probs is not None: + (draft_batch_size, num_draft_probs, + draft_vocab_size) = draft_probs.shape + assert draft_batch_size == target_batch_size + assert num_draft_probs == num_target_probs + assert (draft_vocab_size == target_vocab_size + ), f"{draft_vocab_size=} {target_vocab_size=}" + + def _raise_if_incorrect_dtype( + self, + target_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: Optional[torch.Tensor] = None, + ) -> None: + assert target_probs.dtype == self.probs_dtype + assert draft_token_ids.dtype == self.token_id_dtype + assert bonus_token_ids.dtype == self.token_id_dtype + if draft_probs is not None: + assert draft_probs.dtype == self.probs_dtype + + def _raise_if_inconsistent_device( + self, + target_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: Optional[torch.Tensor] = None, + ) -> None: + devices = [ + t.device for t in + [target_probs, bonus_token_ids, draft_probs, draft_token_ids] + if t is not None + ] + assert all([devices[0] == device for device in devices]) + + def _raise_if_out_of_bounds_vocab( + self, + vocab_size: int, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + ) -> None: + assert torch.all(bonus_token_ids < vocab_size) + assert torch.all(bonus_token_ids >= 0) + assert torch.all(draft_token_ids < vocab_size) + assert torch.all(draft_token_ids >= 0) diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py new file mode 100644 index 0000000000000..f12d6a03b4d16 --- /dev/null +++ b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -0,0 +1,186 @@ +import torch +import torch.jit +import torch.nn as nn + +from vllm.model_executor.layers.spec_decode_base_sampler import ( + SpecDecodeBaseSampler) + + +class TypicalAcceptanceSampler(SpecDecodeBaseSampler, nn.Module): + """Apply typical acceptance sampling as described in section 3.3.1 in + "MEDUSA: Simple LLM Inference Acceleration Framework with + Multiple Decoding Heads" + https://arxiv.org/pdf/2401.10774 + """ + + def __init__( + self, + disable_bonus_tokens: bool = False, + strict_mode: bool = False, + posterior_threshold: float = 0.09, + posterior_alpha: float = 0.3, + ): + """Create a Typical Acceptance Sampler. + + Args: + disable_bonus_tokens: Whether or not to disable the bonus token. + Require when bonus tokens will cause corrupt KV cache for + proposal methods that require KV cache. + strict_mode: Whether or not to perform shape/device/dtype checks + during sampling. This catches correctness issues but adds + nontrivial latency. + posterior_threshold : A threshold value that sets a lower bound + on the posterior probability of a token in target model for it + to be accepted. Default is 0.09 + posterior_alpha : A scaling factor for the entropy-based + threshold in typical acceptance sampling. Typically defaults to + sqrt of posterior_threshold and is set to 0.3. 
+ """ + SpecDecodeBaseSampler.__init__( + self, + disable_bonus_tokens=disable_bonus_tokens, + strict_mode=strict_mode) + nn.Module.__init__(self) + self._posterior_threshold = posterior_threshold + self._posterior_alpha = posterior_alpha + + def forward( + self, + target_probs: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_token_ids: torch.Tensor, + ) -> torch.Tensor: + """Sample token ids using typical acceptance sampling. This accepts + or rejects tokens proposed by the draft model using the probability + of each token according to the draft and target models. + + In the worst case where all draft tokens are rejected, it is guaranteed + one token will be emitted. + + In the case where all draft tokens are accepted, the bonus token will be + accepted conditioned on self._disable_bonus_tokens being false. + + Args: + target_probs: The probability distribution over token ids given + context according to the target model. + shape = [batch_size, num_speculative_tokens, vocab_size] + + bonus_token_ids: The "bonus" token ids that are accepted iff all + speculative tokens in a sequence are accepted. + shape = [batch_size, num_bonus_tokens] + + draft_token_ids: The token ids that were sampled from the draft + probabilities. + shape = [batch_size, num_speculative_tokens] + + Returns: + output_token_ids: The token ids sampled via rejection sampling, + or -1 if unable to sample a token because the previous token + was rejected. + shape = [batch_size, num_speculative_tokens + num_bonus_tokens] + """ + # Only perform shape/dtype/device checking in strict mode, as it adds + # overhead. + if self._strict_mode: + self._raise_if_incorrect_input(target_probs, draft_token_ids, + bonus_token_ids) + accepted = self._evaluate_accepted_tokens(target_probs, + draft_token_ids) + recovered_token_ids = self._replacement_token_ids(target_probs) + output_token_ids = self._create_output(accepted, recovered_token_ids, + draft_token_ids, + bonus_token_ids) + return output_token_ids + + def _evaluate_accepted_tokens(self, target_probs, draft_token_ids): + r""" + Evaluates and returns a mask of accepted tokens based on the + posterior probabilities. + + Parameters: + ---------- + target_probs : torch.Tensor + A tensor of shape (batch_size, k, vocab_size) representing + the probabilities of each token in the vocabulary for each + position in the proposed sequence. This is the distribution + generated by the target model. + draft_token_ids : torch.Tensor + A tensor of shape (batch_size, k) representing the proposed + token ids. + + A draft token_id x_{n+k} is accepted if it satisfies the + following condition + + .. math:: + p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > + \min \left( \epsilon, \delta * \exp \left( + -H(p_{\text{original}}( + \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right) + + where :math:`p_{\text{original}}` corresponds to target_probs + and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters + specified using self._posterior_threshold and self._posterior_alpha + + This method computes the posterior probabilities for the given + draft token ids based on the provided target probabilities. It + calculates the entropy of the posterior distribution and determines + a dynamic threshold for each token position using the provided + posterior_threshold and posterior_alpha values. The method then + returns a boolean mask indicating which tokens can be accepted. 
+ + Returns: + ------- + torch.Tensor + A boolean tensor of shape (batch_size, k) where each element + indicates whether the corresponding draft token has been accepted + or rejected. True indicates acceptance and false indicates + rejection. + + """ + device = target_probs.device + candidates_prob = torch.gather( + target_probs, dim=-1, + index=draft_token_ids.unsqueeze(-1)).squeeze(-1) + # A small constant added to prevent computing the logarithm of zero, + # which can lead to undefined values. + epsilon = 1e-5 + posterior_entropy = -torch.sum( + target_probs * torch.log(target_probs + epsilon), dim=-1) + threshold = torch.minimum( + torch.ones_like(posterior_entropy, device=device) * + self._posterior_threshold, + torch.exp(-posterior_entropy) * self._posterior_alpha, + ) + accepted_mask = candidates_prob > threshold + return accepted_mask + + def _replacement_token_ids(self, target_probs): + """ + Generate one replacement token ID for each sequence based on target + probabilities. The replacement token is used as the fallback option + if typical acceptance sampling does not accept any draft tokens for + that particular sequence. + + This method computes the token IDs to be replaced by selecting the + token with the highest probability for each sequence in the first + position. The rest of the output is filled with -1. + + Parameters + ---------- + target_probs : torch.Tensor + A tensor of shape (batch_size, k, vocab_size) containing + the target probability distribution + + Returns + ------- + torch.Tensor + A tensor of shape (batch_size, k) with the replacement + token IDs. Only the first column is set, and the rest of the + columns are filled with -1. + """ + max_indices = torch.argmax(target_probs[:, 0, :], dim=1) + output = -torch.ones((target_probs.shape[0], target_probs.shape[1]), + dtype=self.token_id_dtype, + device=target_probs.device) + output[:, 0] = max_indices + return output diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 60eb5b404e2ca..1a26c5c63fedc 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -307,7 +307,7 @@ def forward(self, input_): else: masked_input = input_ # Get the embeddings. - output_parallel = F.embedding(masked_input, self.weight) + output_parallel = F.embedding(masked_input.long(), self.weight) # Mask the output embedding. if self.tp_size > 1: output_parallel.masked_fill_(input_mask.unsqueeze(1), 0) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 11f5758797916..a0d23ecb9effd 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -61,7 +61,6 @@ def _get_quantization_config( f"{model_config.dtype} is not supported for quantization " f"method {model_config.quantization}. Supported dtypes: " f"{supported_dtypes}") - return quant_config elif model_config.sparsity is not None: @@ -89,7 +88,7 @@ def _get_model_initialization_kwargs( vision_language_config: Optional[VisionLanguageConfig] ) -> Dict[str, Any]: """Get extra kwargs for model initialization.""" - extra_kwargs = {} + extra_kwargs: Dict[str, Any] = {} if hasattr(model_class, "supported_lora_modules"): extra_kwargs["lora_config"] = lora_config elif lora_config: @@ -467,7 +466,8 @@ def _filter_subtensors( Filter out all tensors that share the same memory or a subset of the memory of another tensor. 
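A standalone demonstration (plain CPU tensors, not vLLM code) of the storage bookkeeping this filter relies on: a view shares its base tensor's untyped storage, and its address range is contained in the base's range:

```python
import torch

base = torch.arange(12, dtype=torch.float32)
view = base[2:6]  # shares memory with `base`

assert view.untyped_storage().data_ptr() == base.untyped_storage().data_ptr()


def addr_range(t: torch.Tensor):
    # [start, end) byte range covered by the tensor's elements.
    start = t.data_ptr()
    end = t.view(-1)[-1].data_ptr() + t.element_size()
    return start, end


b_start, b_end = addr_range(base)
v_start, v_end = addr_range(view)
assert b_start <= v_start and v_end <= b_end  # `view` is a subtensor of `base`
```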
""" - same_storage_groups = collections.defaultdict(list) + same_storage_groups: Dict[Any, List[Tuple[ + str, torch.Tensor]]] = collections.defaultdict(list) for key, tensor in tensors.items(): if tensor.numel(): ptr = tensor.untyped_storage().data_ptr() @@ -476,7 +476,7 @@ def _filter_subtensors( def get_end_ptr(tensor: torch.Tensor) -> int: return tensor.view(-1)[-1].data_ptr() + tensor.element_size() - result = {} + result: Dict[str, torch.Tensor] = {} for group in same_storage_groups.values(): for k, t in group: a, b = t.data_ptr(), get_end_ptr(t) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index d79fedaea428e..b009ad8c882d4 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -21,6 +21,7 @@ QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.utils import FlexibleArgumentParser tensorizer_error_msg = None @@ -177,8 +178,7 @@ def __post_init__(self): self.deserializer_params['encryption'] = decryption_params @staticmethod - def add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Tensorizer CLI arguments""" # Tensorizer options arg group diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 15675b8fb4c39..f1dcf88016a6e 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -344,7 +344,7 @@ def np_cache_weights_iterator( # dumping the same model weights to numpy at the same time. with get_lock(model_name_or_path, cache_dir): if not os.path.exists(weight_names_file): - weight_names = [] + weight_names: List[str] = [] for bin_file in hf_weights_files: state = torch.load(bin_file, map_location="cpu") for name, param in state.items(): diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 4446914c67c8e..5afb2e1d44d39 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -49,6 +49,7 @@ "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), + "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), @@ -59,6 +60,7 @@ "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"), "XverseForCausalLM": ("xverse", "XverseForCausalLM"), "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), + "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } _EMBEDDING_MODELS = { @@ -72,11 +74,11 @@ _OOT_MODELS: Dict[str, Type[nn.Module]] = {} # Models not supported by ROCm. -_ROCM_UNSUPPORTED_MODELS = [] +_ROCM_UNSUPPORTED_MODELS: List[str] = [] # Models partially supported by ROCm. # Architecture -> Reason. 
-_ROCM_PARTIALLY_SUPPORTED_MODELS = { +_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { "Qwen2ForCausalLM": "Sliding window attention is not yet supported in ROCm's flash attention", "MistralForCausalLM": diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 313762b1353d1..5777611079c66 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -453,8 +453,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - mlp_params_mapping = [] - expert_params_mapping = [] + mlp_params_mapping: List[Tuple[str, str, int]] = [] + expert_params_mapping: List[Tuple[str, str, int]] = [] num_layers = self.config.num_hidden_layers for layer in range(num_layers): diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py new file mode 100644 index 0000000000000..aa4e87228a7e4 --- /dev/null +++ b/vllm/model_executor/models/clip.py @@ -0,0 +1,203 @@ +"""Minimal implementation of CLIPVisionModel intended to be only used +within a vision language model.""" +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from transformers import CLIPVisionConfig +from transformers.models.clip.modeling_clip import CLIPAttention + +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +def get_clip_num_patches(image_size: int, patch_size: int) -> int: + assert image_size % patch_size == 0 + return (image_size // patch_size)**2 + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa +class CLIPVisionEmbeddings(nn.Module): + + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = get_clip_num_patches(self.image_size, + self.patch_size) + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, + self.embed_dim) + self.register_buffer("position_ids", + torch.arange(self.num_positions).expand((1, -1)), + persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + + return embeddings + + +class CLIPMLP(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = 
RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class CLIPEncoderLayer(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.self_attn = CLIPAttention(config) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.mlp = CLIPMLP(config, quant_config=quant_config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]: + + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class CLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self + attention layers. Each layer is a [`CLIPEncoderLayer`]. + + Args: + config: CLIPConfig + """ + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + CLIPEncoderLayer(config=config, quant_config=quant_config) + for _ in range(config.num_hidden_layers) + ]) + + def forward(self, + inputs_embeds: torch.Tensor, + vision_feature_layer: int = -1): + + # Encoder forward pass only up to the required layer + num_layer = len(self.layers) + vision_feature_layer + 1 + hidden_states = inputs_embeds + for encoder_layer in self.layers[:num_layer]: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class CLIPVisionTransformer(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLIPVisionEmbeddings(config) + + # NOTE: This typo of "layrnorm" is not fixed on purpose to match + # the original transformers code and name of the model weights. 
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = CLIPEncoder(config=config, quant_config=quant_config) + + def forward( + self, + pixel_values: torch.Tensor, + vision_feature_layer: int = -1, + ) -> torch.Tensor: + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = self.encoder(inputs_embeds=hidden_states, + vision_feature_layer=vision_feature_layer) + + return hidden_states + + +class CLIPVisionModel(nn.Module): + + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.vision_model = CLIPVisionTransformer(config=config, + quant_config=quant_config) + + def forward(self, + pixel_values: Optional[torch.Tensor] = None, + vision_feature_layer: int = -1): + + return self.vision_model(pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer) + + @property + def device(self): + return next(self.parameters()).device diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 84786921ce1b4..600c2990b3691 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -20,7 +20,7 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch import torch.utils.checkpoint @@ -29,7 +29,7 @@ from transformers import CohereConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -265,10 +265,14 @@ def __init__( config: CohereConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, ): super().__init__() self.config = config - self.vocab_size = config.vocab_size + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size) self.layers = nn.ModuleList([ @@ -302,18 +306,44 @@ def forward( class CohereForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens" + ] + embedding_modules = {"embed_tokens": "input_embeddings"} + embedding_padding_modules = [] + def __init__( self, config: CohereConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.quant_config = quant_config - self.logits_processor = LogitsProcessor(config.vocab_size, + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, scale=config.logit_scale) - self.model = 
CohereModel(config, cache_config, quant_config) + self.model = CohereModel(config, + cache_config, + quant_config, + lora_config=lora_config) self.sampler = Sampler() @torch.no_grad() @@ -330,8 +360,14 @@ def forward( def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.model.embed_tokens.weight, - hidden_states, sampling_metadata) + is_not_lora = hasattr(self.model.embed_tokens, 'weight') + if is_not_lora: + embedding_weights = self.model.embed_tokens.weight + else: + embedding_weights = self.model.embed_tokens.base_layer.weight + + logits = self.logits_processor(embedding_weights, hidden_states, + sampling_metadata) return logits def sample( @@ -352,7 +388,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 27dda00b66af4..65f4ebec5bcf0 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -15,7 +15,7 @@ # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" from functools import lru_cache -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -363,7 +363,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 69b75763e9a3d..b15ed11988c27 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -299,4 +299,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) + # TODO (@robertgshaw2-neuralmagic): move to fp8 linear method + if "c_attn.input_scale" in name or "c_attn.weight_scale" in name: + weight_loader(param, loaded_weight, 'q') + weight_loader(param, loaded_weight, 'k') + weight_loader(param, loaded_weight, 'v') + else: + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 39355b9d3ab44..8e36c54b1c511 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -2,9 +2,7 @@ import torch import torch.nn as nn -# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on -# transformers' impl. 
-from transformers import CLIPVisionModel, LlavaConfig +from transformers import LlavaConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig @@ -15,6 +13,7 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -189,12 +188,11 @@ def _select_image_features(self, image_features: torch.Tensor, *, def _image_pixels_to_features(self, vision_tower: CLIPVisionModel, pixel_values: torch.Tensor) -> torch.Tensor: - # TODO(xwjiang): Maybe port minimal CLIPVisionModel over. - image_outputs = vision_tower(pixel_values.to(vision_tower.device), - output_hidden_states=True) - image_features = image_outputs.hidden_states[ - self.config.vision_feature_layer] + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_tower(pixel_values.to(vision_tower.device), + self.config.vision_feature_layer) return self._select_image_features( image_features, @@ -317,6 +315,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # post_layernorm is not needed in CLIPVisionModel + if "vision_model.post_layernorm" in name: + continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 0ab9afea9ac69..c1158c933c88b 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -4,9 +4,7 @@ import torch import torch.nn as nn from PIL import Image -# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on -# transformers' impl. -from transformers import CLIPVisionModel, LlavaNextConfig +from transformers import LlavaNextConfig from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, unpad_image) from typing_extensions import NotRequired @@ -20,6 +18,7 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData @@ -121,7 +120,7 @@ def __init__(self, if self.vision_language_config.image_input_type == ( VisionLanguageConfig.ImageInputType.PIXEL_VALUES): - self.vision_tower = CLIPVisionModel(config.vision_config) + self.vision_tower = CLIPVisionModel(config=config.vision_config) else: raise TypeError("Image features are not supported by LLaVA-NeXT") @@ -219,12 +218,11 @@ def _select_image_features(self, image_features: torch.Tensor, *, def _image_pixels_to_features(self, vision_tower: CLIPVisionModel, pixel_values: torch.Tensor) -> torch.Tensor: - # TODO(xwjiang): Maybe port minimal CLIPVisionModel over. 
- image_outputs = vision_tower(pixel_values.to(vision_tower.device), - output_hidden_states=True) - image_features = image_outputs.hidden_states[ - self.config.vision_feature_layer] + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_tower(pixel_values.to(vision_tower.device), + self.config.vision_feature_layer) return self._select_image_features( image_features, @@ -430,6 +428,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # post_layernorm is not needed in CLIPVisionModel + if "vision_model.post_layernorm" in name: + continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py new file mode 100644 index 0000000000000..b18269777cd01 --- /dev/null +++ b/vllm/model_executor/models/mlp_speculator.py @@ -0,0 +1,143 @@ +import math +from typing import Iterable, List, Tuple + +import torch +import torch.nn as nn + +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.sequence import SamplerOutput + + +class MLPSpeculatorLayerNorm(nn.Module): + """ + A L2 normalization implementation + ... + Args + ---- + normalized_shape : int + Dimensionality of input data (size of final tensor axis) + eps : float + Safety term to prevent division by zero. Make sure the chosen value + fits in the range of your encoding scheme + (i.e. fp16 requires eps >= 6e-8). 
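A quick numeric check (toy values) of the normalization implemented below: the input is rescaled by `1 / sqrt(mean(x**2) + eps)` before the learnable weight and bias are applied:

```python
import torch

x = torch.tensor([3.0, 4.0])
eps = 1e-06
normed = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
print(normed)  # ~tensor([0.8485, 1.1314]); mean(x**2) = 12.5, sqrt ~ 3.5355
```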
+ """ + + def __init__( + self, + normalized_shape, + eps=1e-06, + ): + super(MLPSpeculatorLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.empty(normalized_shape)) + self.bias = nn.Parameter(torch.empty(normalized_shape)) + self.eps = eps + + def forward(self, x): + xf = x + xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps) + x = xf.type_as(x) + x = self.weight * x + x = x + self.bias + return x + + +class MLPSpeculator(nn.Module): + + def __init__(self, config, **kwargs) -> None: + super().__init__() + self.n_predict = config.n_predict + self.vocab_size = config.vocab_size + self.emb_dim = config.emb_dim + self.inner_dim = config.inner_dim if config.inner_dim != 0 \ + else config.emb_dim + + self.max_speculative_tokens = getattr(config, "max_speculative_tokens", + self.n_predict) + + self.emb = nn.ModuleList([ + VocabParallelEmbedding(config.vocab_size, + self.inner_dim, + org_num_embeddings=config.vocab_size) + for _ in range(self.max_speculative_tokens) + ]) + + self.proj = nn.ModuleList([ + nn.Linear((self.emb_dim if i == 0 else self.inner_dim), + self.inner_dim, + bias=False) for i in range(self.max_speculative_tokens) + ]) + + self.head = nn.ModuleList([ + nn.Linear(self.inner_dim, self.vocab_size, bias=False) + for _ in range(self.max_speculative_tokens) + ]) + self.ln = nn.ModuleList([ + MLPSpeculatorLayerNorm(self.inner_dim) + for _ in range(self.max_speculative_tokens) + ]) + + self.state_weight = 0.5**(0.5 / config.n_predict) + self.emb_weight = math.sqrt( + (1 - self.state_weight**2) * (self.inner_dim / 2)) + self.activation = nn.GELU() + self.config = config + self.logits_processor = LogitsProcessor(config.vocab_size, + config.vocab_size, 1.0) + self.sampler = Sampler() + + def generate_proposals( + self, + input_ids: torch.Tensor, + previous_hidden_states: torch.Tensor, + num_predict_tokens: int, + sampling_metadata: SamplingMetadata, + ) -> List[SamplerOutput]: + if num_predict_tokens > self.max_speculative_tokens: + raise ValueError(f"Max speculative tokens for model is " + f"{self.max_speculative_tokens}, but " + f"{num_predict_tokens} were requested") + + # b x 1 x d + previous_hidden_states = previous_hidden_states.unsqueeze(1) + + # b x 1 + last_tokens = input_ids.unsqueeze(1) + + next_tokens = [] + + for head_index in range(num_predict_tokens): + + # Project and predict + z = self.emb[head_index](last_tokens) # b k d + states = self.proj[head_index](previous_hidden_states) + + # Weighted add of state_weight*state and emb_weight*z + # Let subsequent LN take care of denominator + # state_weight is close to 1, so shouldn't be any precision issues + states.add_(z, alpha=self.emb_weight / self.state_weight) + + states = self.activation(self.ln[head_index](states)) # b k d + # TODO: not yet supporting top_k_tokens_per_head + previous_hidden_states = states + + logits = self.logits_processor(self.head[head_index].weight, + states, sampling_metadata) + + output = self.sampler(logits.flatten(0, 1), sampling_metadata) + last_tokens = output.sampled_token_ids + next_tokens.append(output) + + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + param = params_dict[name.replace("speculator.", "")] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py new file mode 100644 index 
0000000000000..fa20a7c5903d6 --- /dev/null +++ b/vllm/model_executor/models/phi3v.py @@ -0,0 +1,381 @@ +# coding=utf-8 +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + +import torch +import torch.nn as nn +from transformers import CLIPVisionConfig, PretrainedConfig + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.clip import CLIPVisionModel +from vllm.model_executor.models.llama import LlamaModel +from vllm.model_executor.models.vlm_base import VisionLanguageModelBase +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import get_dummy_image_data +from vllm.sequence import SamplerOutput + +_KEYS_TO_MODIFY_MAPPING = { + "model.vision_embed_tokens": "vision_embed_tokens", +} + +CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0, + hidden_act="quick_gelu", + hidden_size=1024, + image_size=336, + intermediate_size=4096, + num_attention_heads=16, + num_channels=3, + num_hidden_layers=24, + patch_size=14, + projection_dim=768) + + +class Phi3ImageEmbeddingBase(nn.Module): + + def __init__(self, wte=None) -> None: + super().__init__() + self.wte = wte + self.layer_idx: int + self.type_feature: str + self.img_processor: CLIPVisionModel + + def set_img_features(self, img_features: torch.FloatTensor) -> None: + self.img_features = img_features + + def set_img_sizes(self, img_sizes: torch.LongTensor) -> None: + self.img_sizes = img_sizes + + def get_img_features(self, + img_embeds: torch.FloatTensor) -> torch.FloatTensor: + LAYER_IDX = self.layer_idx + TYPE_FEATURE = self.type_feature + + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the img_processor + img_feature = self.img_processor(img_embeds, + vision_feature_layer=LAYER_IDX) + + if TYPE_FEATURE == "patch": + patch_feature = img_feature[:, 1:] + return patch_feature + + if TYPE_FEATURE == "cls_patch": + return img_feature + + raise NotImplementedError + + +# adapted from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_embedding_phi3_v.py +class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): + """Phi3 Image embedding with HD transform.""" + + def __init__(self, + vision_language_config: VisionLanguageConfig, + config: PretrainedConfig, + wte=None) -> None: + super().__init__(wte) + + self.image_token_id = 
vision_language_config.image_token_id + # n_embed or hidden_size + hidden_size = config.n_embd if hasattr( + config, 'n_embd') else config.hidden_size + + clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG + self.img_processor = CLIPVisionModel(clip_config) + image_dim_out = config.img_processor['image_dim_out'] + self.num_img_tokens = config.img_processor['num_img_tokens'] + + self.image_dim_out = image_dim_out + self.img_sizes = None + + # global_gn and sub_gn for hd transform, serves as line separator + self.use_hd_transform = config.embd_layer.get('use_hd_transform', + False) + self.with_learnable_separator = config.embd_layer.get( + 'with_learnable_separator', False) + self.hd_transform_order = config.embd_layer.get( + 'hd_transform_order', 'glb_sub') + # with_hd_transform and with_learnable_separator should have same value + assert self.use_hd_transform and self.with_learnable_separator + + # 1024 * 4, merge spatial to channel dimension + self.glb_GN = nn.Parameter(torch.empty([1, 1, self.image_dim_out * 4])) + self.sub_GN = nn.Parameter( + torch.empty([1, 1, 1, self.image_dim_out * 4])) + + dim_projection = hidden_size + depth = 2 + layers = [nn.Linear(image_dim_out * 4, dim_projection)] + for _ in range(1, depth): + layers.extend( + [nn.GELU(), + nn.Linear(dim_projection, dim_projection)]) + self.img_projection = nn.Sequential(*layers) + + self.vocab_size = config.vocab_size + self.img_features = None + + self.layer_idx = config.img_processor.get('layer_idx', -2) + self.type_feature = config.img_processor.get('type_feature', 'patch') + + def forward(self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + image_sizes=None) -> torch.FloatTensor: + """process and merge text embeddings with image embeddings.""" + + img_embeds = pixel_values + img_sizes = image_sizes + + if self.img_features is not None: + img_embeds = self.img_features.clone() + self.img_features = None + + if self.img_sizes is not None: + img_sizes = self.img_sizes + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + positions = torch.nonzero(input_ids == self.image_token_id) + + select = False + + target_device = self.img_projection[0].bias.device + target_dtype = self.img_projection[0].bias.dtype + + if len(positions.tolist()) > 0: + # if self.use_hd_transform and img_sizes: + # img_embeds: (num_images, max_num_crops, 3, H, W) + # img_sizes: (num_images, 2).view(1, -1) + + bs = img_embeds.shape[0] + # Nx(HW)xC + img_features = self.get_img_features(img_embeds.flatten(0, 1)) + base_feat_height = base_feat_width = int( + img_features.shape[1]**0.5) + + # bs x max_num_crops x (24x24) x C + img_features = img_features.view( + bs, -1, base_feat_height * base_feat_width, self.image_dim_out) + C = self.image_dim_out + H = base_feat_height + + output_imgs = [] + output_len = [] + + if isinstance(img_sizes, torch.Tensor): + img_sizes.squeeze_(0) + + for _bs in range(bs): + h, w = img_sizes + h = h // 336 + w = w // 336 + B_ = h * w + + # 1 x (24x24) x 1024 + global_img_feature = img_features[_bs, :1] + + # 1 x 12 x 12 x 4096 + glb_img = global_img_feature \ + .reshape(1, H // 2, 2, H // 2, 2,C) \ + .permute(0, 1, 3, 2, 4, 5) \ + .reshape(1, H // 2, H // 2, 4 * C) + temp_glb_GN = self.sub_GN.repeat(1, H // 2, 1, 1) + + # 1 x 156 x 4096 + glb_img = torch.cat([glb_img, temp_glb_GN], + dim=2).reshape(1, -1, 4 * C) + + # (max_num_crops-1) x (12x12) x C + sub_img = img_features[_bs, 1:] + # 16x574x1024 + # get rid of padding sub_img + sub_img = sub_img[:B_] + + sub_img = 
sub_img.reshape(B_, H // 2, 2, H // 2, 2, C) \ + .permute(0, 1, 3, 2, 4, 5).reshape(B_, -1, 4 * C) + sub_img = sub_img.reshape(1, h, w, 12, 12, -1) \ + .permute(0, 1, 3, 2, 4, 5) \ + .reshape(1, h * 12, w * 12, 4 * C) + temp_sub_GN = self.sub_GN.repeat(1, h * 12, 1, 1) + sub_img = torch.cat([sub_img, temp_sub_GN], + dim=2).reshape(1, -1, 4 * C) + # (1, num_img_tokens, 1024*4) + + # glb + sub + if self.hd_transform_order == 'glb_sub': + output_imgs.append( + torch.cat([glb_img, self.glb_GN, sub_img], dim=1)) + elif self.hd_transform_order == 'sub_glb': + output_imgs.append( + torch.cat([sub_img, self.glb_GN, glb_img], dim=1)) + + temp_len = int((h * w + 1) * 144 + 1 + (h + 1) * 12) + output_len.append(temp_len) + + num_img_tokens = output_len + img_set_tensor = [] + for _output_img in output_imgs: + img_feature_proj = self.img_projection( + _output_img.to(target_device, target_dtype)) + img_set_tensor.append(img_feature_proj) + select = True + + input_ids.clamp_min_(0).clamp_max_(self.vocab_size) + + hidden_states = self.wte(input_ids) + + if select: + idx = 0 + for i, cnt in enumerate(num_img_tokens): + hidden_states[positions[idx, 0], + positions[idx, 1]:positions[idx, 1] + + cnt] = (img_set_tensor[i].to( + hidden_states.device, hidden_states.dtype)) + idx += cnt + + return hidden_states.squeeze(0) + + +class Phi3VImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, 1 + num_patches, num_channels, height, width)""" + + image_sizes: torch.Tensor + """Shape: (batch_size, 2)""" + + +@MULTIMODAL_REGISTRY.register_image_pixel_input() +@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) +class Phi3VForCausalLM(VisionLanguageModelBase): + + def __init__(self, + config: PretrainedConfig, + vision_language_config: VisionLanguageConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__(vision_language_config) + self.config = config + self.model = LlamaModel(config, cache_config, quant_config) + self.vision_embed_tokens = Phi3HDImageEmbedding( + vision_language_config, config, self.model.embed_tokens) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Phi3VImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_sizes = kwargs.pop("image_sizes", None) + + expected_input_type = self.vision_language_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + if expected_input_type != ImageInputType.PIXEL_VALUES: + raise ValueError( + f"Unexpected image input type: {expected_input_type}." 
+ "Phi3v only support pixel_values input currently.") + + if pixel_values is not None and image_sizes is not None: + return Phi3VImagePixelInputs(type="pixel_values", + data=pixel_values, + image_sizes=image_sizes) + + return None + + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, **kwargs: object): + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + inputs_embeds = self.vision_embed_tokens( + input_ids, image_input["data"], image_input["image_sizes"]) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.model(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head.weight, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + # post_layernorm is not needed in CLIPVisionModel + if "vision_model.post_layernorm" in name: + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in name: + name = name.replace(key_to_modify, new_key) + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # We only do sharding for language model + # and not vision model for now. + if "vision_embed_tokens" in name and self.vision_embed_tokens: + continue + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d22ea6b79de0f..b6ea6ab396642 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -28,6 +28,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput +from vllm.utils import print_warning_once class QWenMLP(nn.Module): @@ -288,6 +289,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip loading visual weights to support Qwen-VL models + # in cases with text-only inputs + # TODO: add support for Qwen-VL + if (name not in params_dict + and name.startswith("transformer.visual.")): + print_warning_once( + "Only text inputs are allowed. 
Images won't be handled " + "until Qwen-VL models are fully supported.") + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 9a4829a27873e..b5d13bb6b937c 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -46,6 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput +from vllm.utils import print_warning_once class Qwen2MLP(nn.Module): @@ -375,6 +376,19 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + print_warning_once( + f"Found kv scale in the checkpoint (e.g. {name}), " + "but not found the expected name in the model " + f"(e.g. {remapped_kv_scale_name}). kv-scale is " + "not loaded.") + continue + else: + name = remapped_kv_scale_name param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7ad84f51b7e4c..f95de56f39b57 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -386,18 +386,10 @@ def from_sampling_metadata( presence_penalties += [0] * prefill_len frequency_penalties += [0] * prefill_len repetition_penalties += [1] * prefill_len - if do_penalties: - prompt_tokens.extend([] for _ in range(prefill_len)) - output_tokens.extend([] for _ in range(prefill_len)) if seq_group.do_sample: sample_lens = len(seq_group.sample_indices) assert sample_lens == len(seq_ids) - for seq_id in seq_ids: - seq_data = seq_group.seq_data[seq_id] - if do_penalties: - prompt_tokens.append(seq_data.prompt_token_ids) - output_tokens.append(seq_data.output_token_ids) temperatures += [temperature] * len(seq_ids) top_ps += [top_p] * len(seq_ids) top_ks += [top_k] * len(seq_ids) @@ -424,6 +416,20 @@ def from_sampling_metadata( sampling_seeds.append(seq_seeds) sample_indices.extend(seq_group.sample_indices) + if do_penalties: + for seq_group in sampling_metadata.seq_groups: + seq_ids = seq_group.seq_ids + if (seq_group.is_prompt + and sampling_params.prompt_logprobs is not None): + prefill_len = len(seq_group.prompt_logprob_indices) + prompt_tokens.extend([] for _ in range(prefill_len)) + output_tokens.extend([] for _ in range(prefill_len)) + if seq_group.do_sample: + for seq_id in seq_ids: + seq_data = seq_group.seq_data[seq_id] + prompt_tokens.append(seq_data.prompt_token_ids) + output_tokens.append(seq_data.output_token_ids) + sampling_tensors = SamplingTensors.from_lists( temperatures, top_ps, top_ks, min_ps, presence_penalties, frequency_penalties, repetition_penalties, sampling_seeds, diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index c6311d60e0bdd..509f791d27c6f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -79,6 +79,8 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str, if config.hf_config.model_type in ("llava", "llava_next"): full_prompt = f"{image_prompt}\n{text_prompt}" + elif 
config.hf_config.model_type == 'phi3_v': + full_prompt = f"{image_prompt}\n{text_prompt}" else: raise ValueError( f"Unsupported model type: {config.hf_config.model_type}") diff --git a/vllm/sequence.py b/vllm/sequence.py index 2f27bf33b166e..287e1b9df6165 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -123,7 +123,7 @@ def __init__( output_token_ids = [] self.prompt_token_ids = prompt_token_ids - self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids) + self._prompt_token_ids_tuple = tuple(prompt_token_ids) self.output_token_ids = output_token_ids self.cumulative_logprob = 0.0 # The number of tokens that are computed (that run against the model). @@ -414,6 +414,7 @@ class SequenceGroup: for an embedding model. encoder_seq: Optional, the single encoder sequence. Should be None unless you are working with an encoder/decoder model. + trace_headers: OpenTelemetry trace headers. """ def __init__( @@ -426,6 +427,7 @@ def __init__( embeddings: Optional[List[float]] = None, pooling_params: Optional[PoolingParams] = None, encoder_seq: Optional[Sequence] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -441,6 +443,7 @@ def __init__( self.embeddings = embeddings self.pooling_params = pooling_params self.encoder_seq = encoder_seq + self.trace_headers = trace_headers @property def prompt(self) -> Optional[str]: @@ -791,6 +794,9 @@ class SamplerOutput: # Spec decode metrics populated by workers. spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + # Optional last hidden states from the model. + hidden_states: Optional[torch.Tensor] = None + def __getitem__(self, idx: int): return self.outputs[idx] @@ -839,6 +845,46 @@ def __eq__(self, other: object): self.__class__) and self.outputs == other.outputs +def get_all_seq_ids( + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. + """ + return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data] + + +class HiddenStates: + """Hidden states corresponding to in-progress sequences. + Used in speculative decoding to pass hidden states from + the target model to the proposer model in the subsequent step. + + seq_ids are the sequence ids of each entry of the batch + dimension of the hidden_states tensor""" + + def __init__(self, seq_group_metadata_list: List[SequenceGroupMetadata], + hidden_states: torch.Tensor): + assert len(seq_group_metadata_list) == len(hidden_states) + self.seq_ids: List[int] = get_all_seq_ids(seq_group_metadata_list) + self.hidden_states: torch.Tensor = hidden_states + + def update(self, seq_group_metadata_list: List[SequenceGroupMetadata], + hidden_states: torch.Tensor) -> None: + """Update hidden states from target model invocation.""" + assert len(seq_group_metadata_list) == len(hidden_states) + self.seq_ids.extend(get_all_seq_ids(seq_group_metadata_list)) + self.hidden_states = torch.cat([self.hidden_states, hidden_states]) + + def prune(self, + seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + """Prune to provided list of sequence ids.""" + seq_ids = get_all_seq_ids(seq_group_metadata_list) + if seq_ids != self.seq_ids: + # Batch contents changed - prune removed sequences. 
+ index = [self.seq_ids.index(seq_id) for seq_id in seq_ids] + self.hidden_states = self.hidden_states[index] + self.seq_ids = seq_ids + + @dataclass class ExecuteModelRequest: """The model execution request.""" @@ -854,6 +900,8 @@ class ExecuteModelRequest: num_lookahead_slots: int = 0 # The number of requests in the running queue. running_queue_size: int = 0 + # Optional hidden states from prior step. + previous_hidden_states: Optional[HiddenStates] = None def clone( self, seq_group_metadata_list: List[SequenceGroupMetadata] @@ -866,4 +914,5 @@ def clone( blocks_to_copy=self.blocks_to_copy.copy(), num_lookahead_slots=self.num_lookahead_slots, running_queue_size=self.running_queue_size, + previous_hidden_states=self.previous_hidden_states, ) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 1bde042086f0b..40516556344e9 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -4,11 +4,10 @@ import torch from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, - SequenceGroupMetadata) + SequenceGroupMetadata, get_all_seq_ids) from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, - sampler_output_to_torch, +from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, split_batch_by_proposal_len) from vllm.worker.worker_base import WorkerBase @@ -98,6 +97,7 @@ def score_proposals( probs=all_probs, token_ids=all_tokens, logprobs=spec_logprobs, + hidden_states=target_sampler_output.hidden_states, ) def _expand_batch( diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index 72d7818eb1177..d236fc0f2cb6b 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from dataclasses import dataclass +from typing import Optional import torch @@ -46,6 +47,9 @@ class SpeculativeScores: # tokens and also non-speculative normal decoding. token_ids: torch.Tensor + # Optional last hidden states from the scoring model. + hidden_states: Optional[torch.Tensor] = None + def __repr__(self): return (f"SpeculativeScores(" f"probs={self.probs.shape}, " diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py new file mode 100644 index 0000000000000..0926e13bedab1 --- /dev/null +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -0,0 +1,87 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.model_executor import SamplingMetadata +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, + SequenceGroupMetadata) +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase +from vllm.worker.model_runner import ModelInput + + +class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker): + """Worker for MLPSpeculator models. + + Not currently compatible with LoRA or chunked prefill. + """ + + @torch.inference_mode() + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + """Run the model forward pass to generate sample_len future tokens. + Returns the list of sampler output, one per layer, along with indicator + of whether torch tensor in sampler output need to be transposed in + latter sampler_output_to_torch logic. + + For mlp spec worker, this indicator shall be True. 
+ """ + self._raise_if_unsupported(execute_model_req) + + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + + (input_tokens, seq_lens, + query_lens) = self._prepare_input_tensors(seq_group_metadata_list) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, seq_lens, query_lens, self.device, + self.model_runner.pin_memory) + + model_outputs = self.model_runner.model.generate_proposals( + input_ids=input_tokens, + previous_hidden_states=execute_model_req.previous_hidden_states. + hidden_states, + num_predict_tokens=sample_len, + sampling_metadata=sampling_metadata) + + assert len(model_outputs) == sample_len + + return model_outputs, True + + def _prepare_input_tensors( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, List[int], List[int]]: + if not seq_group_metadata_list: + return ModelInput.empty(self.device) + + input_tokens: List[int] = [] + seq_lens: List[int] = [] + query_lens: List[int] = [] + + for seq_group_metadata in seq_group_metadata_list: + is_prompt = seq_group_metadata.is_prompt + + for seq_data in seq_group_metadata.seq_data.values(): + seq_data_len = seq_data.get_len() + if is_prompt: + context_len = seq_data.get_num_computed_tokens() + seq_len = min( + seq_data_len, + context_len + seq_group_metadata.token_chunk_size) + tokens = seq_data.get_token_ids()[context_len:seq_len] + seq_lens.append(seq_len) + input_tokens.extend(tokens) + query_lens.append(seq_len - context_len) + else: + seq_lens.append(seq_data_len) + input_tokens.append(seq_data.get_last_token_id()) + query_lens.append(1) + + input_tokens_tensor = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + return input_tokens_tensor, seq_lens, query_lens diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index fe15ea33b5f36..668ceefe6175f 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,10 +1,10 @@ import copy import weakref -from typing import List, Tuple +from typing import Dict, List, Tuple import torch -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, SequenceGroupMetadata) from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase @@ -71,7 +71,7 @@ def sampler_output( sample_len) # Run model sample_len times. - model_outputs = [] + model_outputs: List[SamplerOutput] = [] for _ in range(sample_len): model_output = super().execute_model( execute_model_req=copied_execute_model_req) @@ -132,7 +132,7 @@ def _shallow_copy_inputs( # Shallow-copy the list of SequenceGroupMetadata. This allows us to # append tokens and change is_prompt without external side-effects. - new_seq_group_metadata_list = [] + new_seq_group_metadata_list: List[SequenceGroupMetadata] = [] for old_seq_group_metadata in seq_group_metadata_list: # We must shallow-copy seq_group_metadata as is_prompt could change. 
@@ -140,7 +140,7 @@ def _shallow_copy_inputs( new_seq_group_metadata_list.append(seq_group_metadata) # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} + new_seq_data: Dict[int, SequenceData] = {} for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): new_seq_data[seq_id] = copy.copy(old_seq_data) new_seq_data[ diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 33af588d0ba29..23a3e1649914b 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -48,7 +48,7 @@ def sampler_output( self, execute_model_req: ExecuteModelRequest, sample_len: int, - ) -> Tuple[Optional[List[SamplerOutput]], bool]: + ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]: """NGram match algo to pick proposal candidate. Returns the list of sampler output, one per SequenceGroupMetadata. @@ -58,8 +58,8 @@ def sampler_output( self._raise_if_unsupported(execute_model_req) has_spec_out = False - token_id_list = [] - token_prob_list = [] + token_id_list: List[Optional[torch.Tensor]] = [] + token_prob_list: List[Optional[torch.Tensor]] = [] for idx, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): seq_data = next(iter(seq_group_metadata.seq_data.values())) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8b147c80690dd..58d3461a25188 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -7,17 +7,19 @@ from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, + HiddenStates, SamplerOutput, SequenceGroupMetadata, + get_all_seq_ids) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.util import (create_sequence_group_output, - get_all_num_logprobs, get_all_seq_ids, + get_all_num_logprobs, get_sampled_token_logprobs, nvtx_range, split_batch_by_proposal_len) from vllm.worker.worker import Worker @@ -104,6 +106,10 @@ def create_worker( proposer_worker = NGramWorker(**draft_worker_kwargs) proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) + elif draft_worker_kwargs[ + "model_config"].hf_config.model_type == "mlp_speculator": + proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) + disable_bonus_tokens = False else: proposer_worker = MultiStepWorker(**draft_worker_kwargs) @@ -155,6 +161,10 @@ def __init__( # Lazy initiazliation. self.scorer: SpeculativeScorer + # Hidden states from target model to pass to proposer + # in the subsequent step. + self.previous_hidden_states: Optional[HiddenStates] = None + def init_device(self) -> None: """Initialize both scorer and proposer models. 
""" @@ -337,6 +347,16 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, assert len(sampler_output) == 1 sampler_output = sampler_output[0] + # Store hidden states from target model execution. + hidden_states = sampler_output.hidden_states + if hidden_states is not None: + if self.previous_hidden_states is None: + self.previous_hidden_states = HiddenStates( + execute_model_req.seq_group_metadata_list, hidden_states) + else: + self.previous_hidden_states.update( + execute_model_req.seq_group_metadata_list, hidden_states) + # Clear device tensors from sampler output. This reduces communication # overhead when the engine runs in a different process than the workers. sampler_output.probs = None @@ -383,6 +403,10 @@ def _run_speculative_decoding_step( """ assert num_lookahead_slots == execute_model_req.num_lookahead_slots + # Pass last hidden states from target model to proposer + execute_model_req.previous_hidden_states = self.previous_hidden_states + self.previous_hidden_states = None + # Generate proposals using draft worker. proposals = self.proposer_worker.get_spec_proposals(execute_model_req) @@ -466,6 +490,20 @@ def _verify_tokens( # metadata. accepted_token_ids[original_indices] = accepted_token_ids.clone() + hidden_states = proposal_scores.hidden_states + if hidden_states is not None: + # Contract hidden states based on accepted tokens + hs_size = hidden_states.shape[1] + hidden_states = hidden_states.reshape(-1, max_proposal_len + 1, + hs_size) + accepted_index = accepted_token_ids + 1 # Convert -1 to 0 + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) + index = accepted_index[:, None, None].expand(-1, 1, hs_size) + hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d + # Store hidden states from target model for subsequent decode step + self.previous_hidden_states = HiddenStates(seq_group_metadata_list, + hidden_states) + return accepted_token_ids, logprobs def _create_output_sampler_list( @@ -516,13 +554,13 @@ def _create_output_sampler_list( topk_indices_by_step = topk_indices_by_step.tolist() # Construct the output on a per-step, per-sequence basis. - sampler_output_list = [] + sampler_output_list: List[SamplerOutput] = [] for step_index in range(num_steps): if all(token_id == -1 for token_id in accepted_token_ids_by_step[step_index]): break - step_output_token_ids = [] + step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): # Each sequence may have a different num_logprobs; retrieve it. 
num_logprobs = num_logprobs_per_seq[sequence_index] diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 278db94bfc0da..d3e280e6843b8 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -65,9 +65,13 @@ def get_spec_proposals( # token_ids is like [batch] format in proposal_len size list, # while if it is false, the format would be [proposal_len] # in batch size list + hidden_states = execute_model_req.previous_hidden_states + if hidden_states is not None: + hidden_states.prune(nonzero_proposal_len_seqs) nonzero_execute_model_req = ExecuteModelRequest( seq_group_metadata_list=nonzero_proposal_len_seqs, num_lookahead_slots=proposal_len, + previous_hidden_states=hidden_states, ) maybe_sampler_output, transposed = self._worker.sampler_output( execute_model_req=nonzero_execute_model_req, diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 60ed9d39eb8d6..80710419e602d 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -10,14 +10,6 @@ SeqId = int -def get_all_seq_ids( - seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]: - """Given a list of SequenceGroupMetadata, create a list of all - sequence ids. - """ - return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data] - - def get_all_num_logprobs( seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: """Given a list of SequenceGroupMetadata, create a list of all num_logprobs. @@ -26,10 +18,10 @@ def get_all_num_logprobs( sequence. """ - all_num_logprobs = [] + all_num_logprobs: List[int] = [] for seq_group_metadata in seq_group_metadata_list: num_logprobs = seq_group_metadata.sampling_params.logprobs - if seq_group_metadata.sampling_params.logprobs is None: + if num_logprobs is None: num_logprobs = 0 all_num_logprobs.append(num_logprobs) diff --git a/vllm/tracing.py b/vllm/tracing.py new file mode 100644 index 0000000000000..ba6732cab68f2 --- /dev/null +++ b/vllm/tracing.py @@ -0,0 +1,104 @@ +import os +from typing import Mapping, Optional + +from vllm.logger import init_logger +from vllm.utils import run_once + +TRACE_HEADERS = ["traceparent", "tracestate"] + +logger = init_logger(__name__) + +_is_otel_installed = False +try: + from opentelemetry.context.context import Context + from opentelemetry.sdk.environment_variables import ( + OTEL_EXPORTER_OTLP_TRACES_PROTOCOL) + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.semconv.ai import SpanAttributes as BaseSpanAttributes + from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider + from opentelemetry.trace.propagation.tracecontext import ( + TraceContextTextMapPropagator) + _is_otel_installed = True +except ImportError: + + class Context: # type: ignore + pass + + class BaseSpanAttributes: # type: ignore + pass + + class SpanKind: # type: ignore + pass + + class Tracer: # type: ignore + pass + + +def is_otel_installed() -> bool: + return _is_otel_installed + + +def init_tracer(instrumenting_module_name: str, + otlp_traces_endpoint: str) -> Optional[Tracer]: + assert is_otel_installed(), ("OpenTelemetry packages must be installed " + "prior to initializing a tracer") + trace_provider = TracerProvider() + + span_exporter = get_span_exporter(otlp_traces_endpoint) + trace_provider.add_span_processor(BatchSpanProcessor(span_exporter)) + set_tracer_provider(trace_provider) + + tracer = trace_provider.get_tracer(instrumenting_module_name) + 
return tracer + + +def get_span_exporter(endpoint): + protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc") + if protocol == "grpc": + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( + OTLPSpanExporter) + elif protocol == "http/protobuf": + from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( + OTLPSpanExporter) + else: + raise ValueError( + f"Unsupported OTLP protocol '{protocol}' is configured") + + return OTLPSpanExporter(endpoint=endpoint) + + +def extract_trace_context( + headers: Optional[Mapping[str, str]]) -> Optional[Context]: + if is_otel_installed(): + headers = headers or {} + return TraceContextTextMapPropagator().extract(headers) + else: + return None + + +def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]: + + return {h: headers[h] for h in TRACE_HEADERS if h in headers} + + +class SpanAttributes(BaseSpanAttributes): + # The following span attribute names are added here because they are missing + # from the Semantic Conventions for LLM. + LLM_REQUEST_ID = "gen_ai.request.id" + LLM_REQUEST_BEST_OF = "gen_ai.request.best_of" + LLM_REQUEST_N = "gen_ai.request.n" + LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" + LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" + LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" + LLM_LATENCY_E2E = "gen_ai.latency.e2e" + + +def contains_trace_headers(headers: Mapping[str, str]) -> bool: + return any(h in headers for h in TRACE_HEADERS) + + +@run_once +def log_tracing_disabled_warning() -> None: + logger.warning( + "Received a request with trace context but tracing is disabled") diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index ada84018212a0..60fc756a12e3d 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,3 +1,4 @@ +import contextlib from typing import Dict, Optional, Type from transformers import PretrainedConfig @@ -5,7 +6,13 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - JAISConfig, MPTConfig, RWConfig) + JAISConfig, MLPSpeculatorConfig, + MPTConfig, RWConfig) + +if VLLM_USE_MODELSCOPE: + from modelscope import AutoConfig +else: + from transformers import AutoConfig logger = init_logger(__name__) @@ -16,8 +23,13 @@ "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) "jais": JAISConfig, + "mlp_speculator": MLPSpeculatorConfig, } +for name, cls in _CONFIG_REGISTRY.items(): + with contextlib.suppress(ValueError): + AutoConfig.register(name, cls) + def get_config(model: str, trust_remote_code: bool, @@ -26,10 +38,6 @@ def get_config(model: str, rope_scaling: Optional[dict] = None, rope_theta: Optional[float] = None) -> PretrainedConfig: try: - if VLLM_USE_MODELSCOPE: - from modelscope import AutoConfig - else: - from transformers import AutoConfig config = AutoConfig.from_pretrained( model, trust_remote_code=trust_remote_code, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 0e486928824ca..d8170858c2a9a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -5,6 +5,7 @@ # `FalconConfig` class from the official HuggingFace transformers library. 
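For reference, a tiny standalone sketch (made-up header values) of what the `extract_trace_headers`/`contains_trace_headers` helpers above do: only the two W3C trace-context headers are forwarded and everything else is ignored.

```python
TRACE_HEADERS = ["traceparent", "tracestate"]

request_headers = {
    "traceparent": "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
    "tracestate": "congo=t61rcWkgMzE",
    "content-type": "application/json",
}

forwarded = {h: request_headers[h] for h in TRACE_HEADERS if h in request_headers}
has_trace = any(h in request_headers for h in TRACE_HEADERS)

assert has_trace
assert sorted(forwarded) == ["traceparent", "tracestate"]
```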
from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.jais import JAISConfig +from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig __all__ = [ @@ -13,4 +14,5 @@ "MPTConfig", "RWConfig", "JAISConfig", + "MLPSpeculatorConfig", ] diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py new file mode 100644 index 0000000000000..dd1d92b861b81 --- /dev/null +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -0,0 +1,50 @@ +from typing import List, Optional + +from transformers import PretrainedConfig + + +class MLPSpeculatorConfig(PretrainedConfig): + model_type = "mlp_speculator" + + attribute_map = { + "hidden_size": "emb_dim", + } + + def __init__(self, + vocab_size: int = 32000, + emb_dim: int = 4096, + inner_dim: int = 0, + n_predict: int = 3, + top_k_tokens_per_head: Optional[List[int]] = None, + n_candidates: int = 5, + **kwargs): + """ + Initialize an MLPSpeculatorConfig + + Args: + vocab_size: int + the model vocab size + emb_dim: int + the model embedding dimension + inner_dim: int + the inner dimension of the model. If 0, will be the emb_dim. + n_predict: int + the number of lookaheads for the speculator + top_k_tokens_per_head: List[int] + Number of tokens to consider from each head when forming the + candidate tree. + For each candidate branch in the tree, head n produces topk[n] + additional sub-branches. + n_candidates: int + number of child candidates to create per sequence + """ + if top_k_tokens_per_head is None: + top_k_tokens_per_head = [5, 4, 3] + assert len(top_k_tokens_per_head) == n_predict + self.vocab_size = vocab_size + self.emb_dim = emb_dim + self.inner_dim = inner_dim + self.n_predict = n_predict + self.top_k_tokens_per_head = top_k_tokens_per_head + self.n_candidates = n_candidates + super().__init__(**kwargs) diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index f064c26c3f40c..e8e53f4946efa 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -44,7 +44,7 @@ def decode_prompt_logprobs_inplace( read_offset = 0 next_iter_prefix_offset = 0 next_iter_read_offset = 0 - next_iter_tokens = [] + next_iter_tokens: List[str] = [] prev_tokens = None for token_position, prompt_logprobs_for_token in enumerate( diff --git a/vllm/utils.py b/vllm/utils.py index ef0602987a9e3..de5fa81eea229 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,3 +1,4 @@ +import argparse import asyncio import datetime import enum @@ -14,18 +15,19 @@ from functools import lru_cache, partial, wraps from platform import uname from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic, - Hashable, List, Optional, OrderedDict, Tuple, TypeVar, + Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar, Union) import numpy as np import psutil import torch +import torch.types +from typing_extensions import ParamSpec import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import enable_trace_function_call, init_logger -T = TypeVar("T") logger = init_logger(__name__) STR_DTYPE_TO_TORCH_DTYPE = { @@ -37,6 +39,17 @@ "fp8_e5m2": torch.uint8, } +P = ParamSpec('P') +K = TypeVar("K") +T = TypeVar("T") + + +class _Sentinel: + ... 
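To make the intent of the `_Sentinel` object above concrete, here is a minimal sketch (toy data) of the pinned-aware eviction scheme used by the `LRUCache.remove_oldest` change just below: the sentinel marks the "every entry is pinned" case so it can be distinguished from any real key.

```python
from collections import OrderedDict

class _SentinelSketch:
    ...

ALL_PINNED = _SentinelSketch()

cache = OrderedDict([("a", 1), ("b", 2), ("c", 3)])  # "a" is least recently used
pinned = {"a"}

# The oldest *unpinned* key is evicted; the sentinel comes back only if
# everything is pinned.
lru_key = next((key for key in cache if key not in pinned), ALL_PINNED)
assert lru_key == "b"

pinned = {"a", "b", "c"}
lru_key = next((key for key in cache if key not in pinned), ALL_PINNED)
assert lru_key is ALL_PINNED   # caller raises instead of evicting a pinned entry
```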
+ + +ALL_PINNED_SENTINEL = _Sentinel() + class Device(enum.Enum): GPU = enum.auto() @@ -61,6 +74,7 @@ class LRUCache(Generic[T]): def __init__(self, capacity: int): self.cache: OrderedDict[Hashable, T] = OrderedDict() + self.pinned_items: Set[Hashable] = set() self.capacity = capacity def __contains__(self, key: Hashable) -> bool: @@ -96,14 +110,36 @@ def put(self, key: Hashable, value: T) -> None: self.cache.move_to_end(key) self._remove_old_if_needed() + def pin(self, key: Hashable) -> None: + """ + Pins a key in the cache preventing it from being + evicted in the LRU order. + """ + if key not in self.cache: + raise ValueError(f"Cannot pin key: {key} not in cache.") + self.pinned_items.add(key) + + def _unpin(self, key: Hashable) -> None: + self.pinned_items.remove(key) + def _on_remove(self, key: Hashable, value: Optional[T]): pass - def remove_oldest(self): + def remove_oldest(self, remove_pinned=False): if not self.cache: return - key, value = self.cache.popitem(last=False) - self._on_remove(key, value) + + if not remove_pinned: + # pop the oldest item in the cache that is not pinned + lru_key = next( + (key for key in self.cache if key not in self.pinned_items), + ALL_PINNED_SENTINEL) + if lru_key is ALL_PINNED_SENTINEL: + raise RuntimeError("All items are pinned, " + "cannot remove oldest from the cache.") + else: + lru_key = next(iter(self.cache)) + self.pop(lru_key) def _remove_old_if_needed(self) -> None: while len(self.cache) > self.capacity: @@ -114,13 +150,16 @@ def pop(self, default_value: Optional[T] = None) -> Optional[T]: run_on_remove = key in self.cache value: Optional[T] = self.cache.pop(key, default_value) + # remove from pinned items + if key in self.pinned_items: + self._unpin(key) if run_on_remove: self._on_remove(key, value) return value def clear(self): while len(self.cache) > 0: - self.remove_oldest() + self.remove_oldest(remove_pinned=True) self.cache.clear() @@ -155,6 +194,32 @@ def is_tpu() -> bool: return libtpu is not None +@lru_cache(maxsize=None) +def is_xpu() -> bool: + from importlib.metadata import PackageNotFoundError, version + + # UPSTREAM SYNC: nm-vllm can be either nightly or release. + try: + version_nm_vllm = version("nm-vllm") + except PackageNotFoundError: + version_nm_vllm = version("nm-vllm-nightly") + is_xpu_flag = "xpu" in version_nm_vllm + # vllm is not build with xpu + if not is_xpu_flag: + return False + try: + import intel_extension_for_pytorch as ipex # noqa: F401 + _import_ipex = True + except ImportError as e: + logger.warning("Import Error for IPEX: %s", e.msg) + _import_ipex = False + # ipex dependency is not ready + if not _import_ipex: + logger.warning("not found ipex lib") + return False + return hasattr(torch, "xpu") and torch.xpu.is_available() + + @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -176,7 +241,7 @@ def random_uuid() -> str: @lru_cache(maxsize=None) -def get_vllm_instance_id(): +def get_vllm_instance_id() -> str: """ If the environment variable VLLM_INSTANCE_ID is set, return it. Otherwise, return a random UUID. @@ -192,7 +257,7 @@ def in_wsl() -> bool: return "microsoft" in " ".join(uname()).lower() -def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: +def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]: """Take a blocking function, and run it on in an executor thread. 
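A compact sketch (toy blocking function, not the real vLLM code) of the wrap-and-offload pattern that `make_async` builds on: the blocking call runs in the default executor so the event loop stays responsive.

```python
import asyncio
import time
from functools import partial

def blocking_add(a: int, b: int) -> int:
    time.sleep(0.1)   # stands in for blocking work
    return a + b

async def main() -> None:
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(None, partial(blocking_add, 1, 2))
    assert result == 3

asyncio.run(main())
```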
This function prevents the blocking function from blocking the @@ -200,7 +265,7 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: The code in this function needs to be thread safe. """ - def _async_wrapper(*args, **kwargs) -> asyncio.Future: + def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: loop = asyncio.get_event_loop() p_func = partial(func, *args, **kwargs) return loop.run_in_executor(executor=None, func=p_func) @@ -325,7 +390,7 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def chunk_list(lst, chunk_size): +def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]: """Yield successive chunk_size chunks from lst.""" return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] @@ -336,7 +401,7 @@ def cdiv(a: int, b: int) -> int: def _generate_random_fp8( - tensor: torch.tensor, + tensor: torch.Tensor, low: float, high: float, ) -> None: @@ -398,7 +463,10 @@ def create_kv_caches_with_random_flash( torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) scale = head_size**-0.5 - key_caches, value_caches = [], [] + + key_caches: List[torch.Tensor] = [] + value_caches: List[torch.Tensor] = [] + for _ in range(num_layers): key_value_cache = torch.empty(size=key_value_cache_shape, dtype=torch_dtype, @@ -429,7 +497,7 @@ def create_kv_caches_with_random( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=torch_dtype).element_size() key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - key_caches = [] + key_caches: List[torch.Tensor] = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, @@ -444,7 +512,7 @@ def create_kv_caches_with_random( key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) - value_caches = [] + value_caches: List[torch.Tensor] = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, @@ -474,6 +542,9 @@ def is_pin_memory_available() -> bool: print_warning_once("Using 'pin_memory=False' as WSL is detected. " "This may slow down the performance.") return False + elif is_xpu(): + print_warning_once("Pin memory is not supported on XPU.") + return False elif is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False @@ -484,13 +555,17 @@ def is_pin_memory_available() -> bool: class CudaMemoryProfiler: - def __init__(self, device=None): + def __init__(self, device: Optional[torch.types.Device] = None): self.device = device def current_memory_usage(self) -> float: # Return the memory usage in bytes. - torch.cuda.reset_peak_memory_stats(self.device) - mem = torch.cuda.max_memory_allocated(self.device) + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + elif is_xpu(): + torch.xpu.reset_peak_memory_stats(self.device) + mem = torch.xpu.max_memory_allocated(self.device) return mem def __enter__(self): @@ -560,13 +635,13 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() -def merge_dicts(dict1: Dict[Any, List[Any]], - dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]: +def merge_dicts(dict1: Dict[K, List[T]], + dict2: Dict[K, List[T]]) -> Dict[K, List[T]]: """Merge 2 dicts that have key -> List of items. When a key conflicts, the values in dict1 is prioritized. 
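A quick worked example of the merge behaviour described above (toy dicts): for conflicting keys, the values from `dict1` come first.

```python
from collections import defaultdict
from typing import Dict, List

def merge_dicts_sketch(dict1: Dict[str, List[int]],
                       dict2: Dict[str, List[int]]) -> Dict[str, List[int]]:
    merged: Dict[str, List[int]] = defaultdict(list)
    for key, value in dict1.items():
        merged[key].extend(value)
    for key, value in dict2.items():
        merged[key].extend(value)
    return dict(merged)

assert merge_dicts_sketch({"a": [1], "b": [2]}, {"a": [10], "c": [3]}) == {
    "a": [1, 10],   # dict1's values are listed first
    "b": [2],
    "c": [3],
}
```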
""" - merged_dict = defaultdict(list) + merged_dict: Dict[K, List[T]] = defaultdict(list) for key, value in dict1.items(): merged_dict[key].extend(value) @@ -577,7 +652,7 @@ def merge_dicts(dict1: Dict[Any, List[Any]], return dict(merged_dict) -def init_cached_hf_modules(): +def init_cached_hf_modules() -> None: """ Lazy initialization of the Hugging Face modules. """ @@ -613,7 +688,7 @@ def find_library(lib_name: str) -> str: return locs[0] -def find_nccl_library(): +def find_nccl_library() -> str: """ We either use the library file specified by the `VLLM_NCCL_SO_PATH` environment variable, or we find the library file brought by PyTorch. @@ -728,3 +803,33 @@ def cuda_device_count_stateless() -> int: # after https://github.com/pytorch/pytorch/pull/122815 is released. return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) + + +#From: https://stackoverflow.com/a/4104188/2749989 +def run_once(f): + + def wrapper(*args, **kwargs) -> Any: + if not wrapper.has_run: # type: ignore[attr-defined] + wrapper.has_run = True # type: ignore[attr-defined] + return f(*args, **kwargs) + + wrapper.has_run = False # type: ignore[attr-defined] + return wrapper + + +class FlexibleArgumentParser(argparse.ArgumentParser): + """ArgumentParser that allows both underscore and dash in names.""" + + def parse_args(self, args=None, namespace=None): + if args is None: + args = sys.argv[1:] + + # Convert underscores to dashes and vice versa in argument names + processed_args = [] + for arg in args: + if arg.startswith('--'): + processed_args.append('--' + arg[len('--'):].replace('_', '-')) + else: + processed_args.append(arg) + + return super().parse_args(processed_args, namespace) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 341b177d4af2a..fbd1343fea19c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -4,7 +4,7 @@ import torch from vllm.attention import get_attn_backend -from vllm.config import CacheConfig, ModelConfig, ParallelConfig +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available) @@ -25,10 +25,12 @@ def __init__( cache_config: CacheConfig, model_config: ModelConfig, parallel_config: ParallelConfig, + device_config: DeviceConfig, ) -> None: self.cache_config = cache_config self.model_config = model_config self.parallel_config = parallel_config + self.device_config = device_config self.head_size = model_config.get_head_size() self.num_layers = model_config.get_num_layers(parallel_config) @@ -55,7 +57,8 @@ def __init__( ) # Initialize the cache. 
- self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda") + self.gpu_cache = self._allocate_kv_cache( + self.num_gpu_blocks, self.device_config.device_type) self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") def _allocate_kv_cache( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 476e9ba3bb463..a321eafce1a2f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -86,6 +86,7 @@ def __init__( kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, vision_language_config: Optional[VisionLanguageConfig] = None, + return_hidden_states: bool = False, ): self.model_config = model_config self.parallel_config = parallel_config @@ -96,6 +97,7 @@ def __init__( self.load_config = load_config self.is_driver_worker = is_driver_worker self.vision_language_config = vision_language_config + self.return_hidden_states = return_hidden_states self.device = self.device_config.device self.pin_memory = is_pin_memory_available() @@ -116,15 +118,17 @@ def __init__( self.graph_block_tables = np.zeros( (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), dtype=np.int32) + num_attn_heads = self.model_config.get_num_attention_heads( + self.parallel_config) self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), + num_attn_heads, self.model_config.get_head_size(), self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, - ) + ) if num_attn_heads else None # Create processor for multi-modal data if self.vision_language_config is not None: @@ -762,11 +766,19 @@ def execute_model( return None # Sample the next token. - output = self.model.sample( + output: SamplerOutput = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, ) + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + assert seq_group_metadata_list is not None + if seq_group_metadata_list[0].is_prompt: + hidden_states = hidden_states.index_select( + 0, sampling_metadata.selected_token_indices) + output.hidden_states = hidden_states + return output @torch.inference_mode() @@ -779,8 +791,8 @@ def profile_run(self) -> None: # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request # passed in, which contains a lora from the lora warmup path. 
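To illustrate the `return_hidden_states` path added to `execute_model` above (shapes are made up): only the hidden states at the sampled positions are kept before being attached to the `SamplerOutput`.

```python
import torch

hidden_size = 4
# Seven scheduled prompt tokens across two sequences; the last token of each
# sequence is the position that actually gets sampled.
hidden_states = torch.arange(7 * hidden_size, dtype=torch.float32).reshape(7, -1)
selected_token_indices = torch.tensor([2, 6])

last_token_hidden_states = hidden_states.index_select(0, selected_token_indices)
assert last_token_hidden_states.shape == (2, hidden_size)
```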
- dummy_lora_requests = [] - dummy_lora_requests_per_seq = [] + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] if self.lora_config: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): @@ -866,6 +878,11 @@ def remove_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_lora(lora_id) + def list_loras(self) -> Set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7a378a862d0c0..c60764ef1bed8 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -70,6 +70,14 @@ def __init__( assert not self.lora_config, ( "To be tested: vision language model with LoRA settings.") + # Return hidden states from target model if the draft model is an + # mlp_speculator + speculative_args = {} if speculative_config is None \ + or (speculative_config.draft_model_config.model == + model_config.model) \ + or (speculative_config.draft_model_config.hf_config.model_type != + "mlp_speculator") else {"return_hidden_states": True} + ModelRunnerClass = (EmbeddingModelRunner if self.model_config.embedding_mode else ModelRunner) self.model_runner = ModelRunnerClass( @@ -83,6 +91,7 @@ def __init__( kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, vision_language_config=vision_language_config, + **speculative_args, ) # Uninitialized cache engine. Will be initialized by # initialize_cache. @@ -205,7 +214,8 @@ def initialize_cache(self, num_gpu_blocks: int, def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config) + self.parallel_config, + self.device_config) self.gpu_cache = self.cache_engine.gpu_cache def _warm_up_model(self) -> None: @@ -323,6 +333,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_runner.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self.model_runner.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.model_runner.list_loras() diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 258f31de17d87..dc09718de4a32 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -70,6 +70,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError + @abstractmethod + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError + @abstractmethod def list_loras(self) -> Set[int]: raise NotImplementedError @@ -86,6 +90,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise ValueError(f"{type(self)} does not support LoRA") + def pin_lora(self, lora_id: int) -> bool: + return ValueError( + f"{type(self)} does not support LoRA") # type: ignore + def list_loras(self) -> Set[int]: raise ValueError(f"{type(self)} does not support LoRA") @@ -99,8 +107,8 @@ class WorkerWrapperBase: """ def __init__(self, - worker_module_name=None, - worker_class_name=None, + worker_module_name: str, + worker_class_name: str, trust_remote_code: bool = False) -> None: self.worker_module_name = worker_module_name 
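The `speculative_args` expression in `worker.py` above is easier to read as a positive condition; here is an equivalent sketch (stand-in objects, hypothetical model names) of when `return_hidden_states` gets enabled.

```python
from types import SimpleNamespace

def needs_hidden_states(speculative_config, target_model: str) -> bool:
    """True only for a separate draft model whose type is mlp_speculator."""
    return (speculative_config is not None
            and speculative_config.draft_model_config.model != target_model
            and speculative_config.draft_model_config.hf_config.model_type
            == "mlp_speculator")

draft = SimpleNamespace(
    model="example-org/llama-3-8b-speculator",   # hypothetical model name
    hf_config=SimpleNamespace(model_type="mlp_speculator"))
spec_config = SimpleNamespace(draft_model_config=draft)

assert needs_hidden_states(spec_config, "meta-llama/Meta-Llama-3-8B")
assert not needs_hidden_states(None, "meta-llama/Meta-Llama-3-8B")
```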
self.worker_class_name = worker_class_name diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py new file mode 100644 index 0000000000000..f30de703e805d --- /dev/null +++ b/vllm/worker/xpu_model_runner.py @@ -0,0 +1,417 @@ +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn + +from vllm.attention import get_attn_backend +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + VisionLanguageConfig) +from vllm.distributed import broadcast_tensor_dict +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model +from vllm.sampling_params import SamplingParams +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad +from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata + +logger = init_logger(__name__) + +_PAD_SLOT_ID = -1 +_BATCH_SIZE_ALIGNMENT = 8 +_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ + _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) +] + + +class XPUModelRunner: + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + *args, + **kwargs, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.lora_config = lora_config + self.load_config = load_config + self.cache_config = cache_config + self.vision_language_config = vision_language_config + self.is_driver_worker = is_driver_worker + + self.sliding_window = model_config.get_sliding_window() + self.device_config = device_config + self.device = self.device_config.device + + self.kv_cache_dtype = kv_cache_dtype + self.block_size = cache_config.block_size + self.max_context_len_to_capture = ( + self.model_config.max_context_len_to_capture + if self.model_config is not None else 0) + + self.attn_backend = get_attn_backend( + self.model_config.get_num_attention_heads(self.parallel_config), + self.model_config.get_head_size(), + self.model_config.get_num_kv_heads(self.parallel_config), + self.model_config.get_sliding_window(), + self.model_config.dtype, + self.kv_cache_dtype, + self.block_size, + ) + + # Lazy initialization. + self.model: nn.Module # Set after init_Model + + def load_model(self) -> None: + with CudaMemoryProfiler() as m: + self.model = get_model( + model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + cache_config=self.cache_config, + ) + + self.model_memory_usage = m.consumed_memory + logger.info("Loading model weights took %.4f GB", + self.model_memory_usage / float(2**30)) + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + + @torch.inference_mode() + def profile_run(self) -> None: + # Enable top-k sampling to reflect the accurate memory usage. 
+ sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for vision encoding, which needs + # to be accounted for when calculating the GPU blocks for + # vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + + seq_data = SequenceData([0] * seq_len) + dummy_multi_modal_data = None + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=None, + multi_modal_data=dummy_multi_modal_data, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. + num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + self.execute_model(seqs, kv_caches) + torch.xpu.synchronize() + return + + def prepare_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, + Optional[torch.Tensor]]: + multi_modal_input = None + if self.is_driver_worker: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. + if is_prompt: + (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_input + ) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, + attn_metadata) = self._prepare_decode(seq_group_metadata_list) + seq_lens = [] + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + # subquery_lens is not needed if chunked prefill is not + # supported. Since CPU worker doesn't support chunked prefill + # just use seq_lens instead. + seq_lens, + self.device, + pin_memory=False) + # Broadcast the metadata. 
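A quick arithmetic check (illustrative numbers) of the dummy-sequence sizing used in `profile_run` above: the token budget is spread as evenly as possible across `max_num_seqs` sequences.

```python
max_num_batched_tokens = 10
max_num_seqs = 4

seq_lens = [
    max_num_batched_tokens // max_num_seqs
    + (group_id < max_num_batched_tokens % max_num_seqs)
    for group_id in range(max_num_seqs)
]

assert seq_lens == [3, 3, 2, 2]
assert sum(seq_lens) == max_num_batched_tokens
```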
+ metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": + sampling_metadata.selected_token_indices, + } + metadata_dict.update(attn_metadata.asdict_zerocopy()) + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + input_tokens = metadata_dict.pop("input_tokens") + input_positions = metadata_dict.pop("input_positions") + selected_token_indices = metadata_dict.pop( + "selected_token_indices") + attn_metadata = self.attn_backend.make_metadata(**metadata_dict) + sampling_metadata = SamplingMetadata( + seq_groups=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=0, + ) + + return (input_tokens, input_positions, attn_metadata, + sampling_metadata, multi_modal_input) + + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + block_tables: List[List[int]] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + assert seq_group_metadata.token_chunk_size == 1 + + seq_ids = list(seq_group_metadata.seq_data.keys()) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append(generation_token) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append(position) + + seq_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + seq_lens.append(seq_len) + + block_table = seq_group_metadata.block_tables[seq_id] + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + max_decode_seq_len = max(seq_lens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=self.device) + + max_block_table_len = max( + len(block_table) for block_table in block_tables) + block_tables = make_tensor_with_pad( + block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seqlen_q=None, + max_seqlen=None, + seq_lens_tensor=seq_lens_tensor, + max_decode_seq_len=max_decode_seq_len, + num_prefill_tokens=0, + num_decode_tokens=len(input_tokens), + num_prefills=0, + block_tables=block_tables, + ) + return ( + input_tokens, + input_positions, + attn_metadata, + ) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + kv_caches: List[torch.Tensor], + ) -> Optional[SamplerOutput]: + (input_tokens, input_positions, attn_metadata, sampling_metadata, + multi_modal_input + ) = 
self.prepare_input_tensors(seq_group_metadata_list) + + model_executable = self.model + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + + hidden_states = model_executable(**execute_model_kwargs) + + # Compute the logits. + logits = self.model.compute_logits(hidden_states, sampling_metadata) + + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return None + + # Sample the next token. + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + return output + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + Optional[torch.Tensor]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + multi_modal_input_list: List[torch.Tensor] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + computed_len = seq_data.get_num_computed_tokens() + seq_len = len(prompt_tokens) + + seq_lens.append(seq_len) # Prompt token num + input_tokens.extend(prompt_tokens) # Token ids + + # Token position ids + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.extend(list(range(computed_len, seq_len))) + + if seq_group_metadata.multi_modal_data: + multi_modal_input_list.append( + seq_group_metadata.multi_modal_data.data) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.extend([_PAD_SLOT_ID] * seq_len) + continue + + # Compute the slot mapping. + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, seq_len - sliding_window). + # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
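The slot-mapping comment above is easy to verify by hand. In the sketch below, the block table `[0, 1, 0]` is an assumed layout in which the first physical block is reused once the window slides past its original contents, and no tokens are assumed to be computed yet (`computed_len == 0`).

```python
_PAD_SLOT_ID = -1
block_size = 4
sliding_window = 8
block_table = [0, 1, 0]   # assumed physical block layout
prompt_len = 10

start_idx = max(0, prompt_len - sliding_window)
slot_mapping = []
for i in range(prompt_len):
    if i < start_idx:
        slot_mapping.append(_PAD_SLOT_ID)   # token outside the sliding window
        continue
    block_number = block_table[i // block_size]
    block_offset = i % block_size
    slot_mapping.append(block_number * block_size + block_offset)

assert slot_mapping == [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]
```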
+ start_idx = 0 + if self.sliding_window is not None: + start_idx = max(0, seq_len - self.sliding_window) + + for i in range(computed_len, seq_len): + if i < start_idx: + slot_mapping.append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // + self.block_size] # type: ignore + block_offset = i % self.block_size # type: ignore + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + if multi_modal_input_list: + assert self.vision_language_config, ( + "Multi-modal inputs are only supported by " + "vision language models.") + multi_modal_input = torch.cat(multi_modal_input_list, + dim=0).to(self.device) + else: + multi_modal_input = None + + num_prompt_tokens = len(input_tokens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) # type: ignore + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) # type: ignore + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) # type: ignore + + max_seqlen = max(seq_lens) + tmp = [0] + tmp.extend(seq_lens) + seqlen = torch.tensor(tmp) + seqlen_q = torch.cumsum(seqlen, dim=0).to(device=self.device) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=True, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seqlen_q=seqlen_q, + max_seqlen=max_seqlen, + seq_lens_tensor=None, + max_decode_seq_len=None, + num_prefills=len(seq_lens), + num_prefill_tokens=num_prompt_tokens, + num_decode_tokens=0, + block_tables=torch.tensor([], device=self.device, dtype=torch.int), + ) + return (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_input) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py new file mode 100644 index 0000000000000..773ee9f8159e1 --- /dev/null +++ b/vllm/worker/xpu_worker.py @@ -0,0 +1,193 @@ +"""A XPU worker class.""" +import gc +import os +from typing import List, Optional, Tuple + +import intel_extension_for_pytorch # noqa: F401 +import oneccl_bindings_for_pytorch # noqa: F401 +import torch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.logger import init_logger +from vllm.model_executor import set_random_seed +from vllm.utils import is_xpu +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.worker import Worker +from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.xpu_model_runner import XPUModelRunner + +logger = init_logger(__name__) + + +class XPUWorker(LoraNotSupportedWorkerBase, Worker): + """A worker class that executes (a partition of) the model on a GPU. + + Each worker is associated with a single XPU device. The worker is + responsible for maintaining the KV cache and executing the model on the + XPU. In case of distributed inference, each worker is assigned a partition + of the model. 
+ """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + vision_language_config: Optional[VisionLanguageConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, + is_driver_worker: bool = False, + ) -> None: + assert device_config.device_type == "xpu" + assert is_xpu() + + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.load_config = load_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." + + self.vision_language_config = vision_language_config + if self.vision_language_config: + assert not self.lora_config, ( + "To be tested: vision language model with LoRA settings.") + + self.model_runner = XPUModelRunner( # type: ignore + model_config, + parallel_config, + scheduler_config, + device_config, + cache_config, + load_config=self.load_config, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker, + vision_language_config=vision_language_config, + ) + # Uninitialized cache engine. Will be initialized by + # initialize_cache. + self.cache_engine: CacheEngine + self.gpu_cache: List[torch.Tensor] + + def init_device(self) -> None: + if self.device_config.device.type == "xpu" and is_xpu(): + self.device = torch.device(f"xpu:{self.local_rank}") + torch.xpu.set_device(self.device) + torch.xpu.empty_cache() + self.init_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + self.init_worker_distributed_environment() + # Initialize the model. + set_random_seed(self.model_config.seed) + + # keep this method for `empty_cache` and `synchronize` api + @torch.inference_mode() + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + torch.xpu.empty_cache() + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + self.model_runner.profile_run() + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. 
+ torch.xpu.synchronize() + used_memory = torch.xpu.memory_allocated() + total_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + free_gpu_memory = total_gpu_memory - used_memory + + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + peak_memory = self.init_gpu_memory - free_gpu_memory + assert peak_memory > 0, ( + "Error in memory profiling. This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + + cache_block_size = self.get_cache_block_size_bytes() + num_gpu_blocks = int( + (total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) + num_gpu_blocks = max(num_gpu_blocks, 0) + num_cpu_blocks = max(num_cpu_blocks, 0) + gc.collect() + torch.xpu.empty_cache() + return num_gpu_blocks, num_cpu_blocks + + def _warm_up_model(self) -> None: + # IPEX don't support capture graph yet + pass + + def init_worker_distributed_environment(self) -> None: + """Initialize the distributed environment.""" + + parallel_config = self.parallel_config + rank = self.rank + distributed_init_method = self.distributed_init_method + + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch " + "world size does not match parallel_config.world_size " + f"({torch_world_size} vs. {parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + # use sockets as default Level zero IPC exchange backend. By + # default oneccl will use `drmfd` as mechanism which need extra + # dependency (libdrm and drm headers) on your system. + ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", + "sockets") + os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE + init_distributed_environment( + world_size=parallel_config.world_size, + rank=rank, + distributed_init_method=distributed_init_method, + local_rank=self.local_rank, + backend="ccl") + + ensure_model_parallel_initialized( + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size)
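Finally, the KV-cache block budgeting in `determine_num_available_blocks` above boils down to simple arithmetic. A worked example with made-up numbers follows; the real `cache_block_size` depends on the model, dtype, and block size.

```python
GiB = 1 << 30

total_gpu_memory = 16 * GiB
gpu_memory_utilization = 0.9
peak_memory = 6 * GiB              # measured by the profiling forward pass
cache_block_size = 2 * (1 << 20)   # assumed bytes per KV-cache block
swap_space_bytes = 4 * GiB

num_gpu_blocks = max(
    int((total_gpu_memory * gpu_memory_utilization - peak_memory)
        // cache_block_size), 0)
num_cpu_blocks = max(int(swap_space_bytes // cache_block_size), 0)

assert num_gpu_blocks == 4300
assert num_cpu_blocks == 2048
```

After this calculation the worker frees the profiling allocations (`gc.collect()` plus an XPU cache flush) so the computed budget is actually available for the KV cache.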