From 8b95d03ec34d88c5a05bbf4f92d8ffe5057c7da5 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 26 Feb 2024 22:49:42 -0500 Subject: [PATCH] GHA Benchmark : Automatic benchmarking on manual trigger (#46) Summary: Add benchmarking workflow and action that runs the benchmarks on a manual trigger. Test: Try it locally. Successful GHA Benchmark Run - https://github.com/neuralmagic/neuralmagic-vllm/actions/runs/8019392326 --------- Co-authored-by: varun Co-authored-by: Varun Sundar Rabindranath --- .github/actions/nm-benchmark/action.yml | 15 ++++++- .github/data/nm_benchmark_configs_list.txt | 2 + .github/workflows/nm-benchmark.yml | 39 ++++++++++++++++--- .../workflows/scripts/nm-run-benchmarks.sh | 21 ++++++---- neuralmagic/benchmarks/common.py | 16 ++++++-- .../benchmarks/configs/benchmark_serving.json | 6 +++ .../configs/benchmark_throughput.json | 3 ++ .../benchmarks/requirements-benchmark.txt | 1 + .../benchmarks/run_benchmark_serving.py | 26 ++++++++++--- .../benchmarks/run_benchmark_throughput.py | 33 +++++++++++----- 10 files changed, 130 insertions(+), 32 deletions(-) create mode 100644 .github/data/nm_benchmark_configs_list.txt mode change 100644 => 100755 .github/workflows/scripts/nm-run-benchmarks.sh diff --git a/.github/actions/nm-benchmark/action.yml b/.github/actions/nm-benchmark/action.yml index 37fb2a60e4c8b..32ccf215fa563 100644 --- a/.github/actions/nm-benchmark/action.yml +++ b/.github/actions/nm-benchmark/action.yml @@ -1,17 +1,30 @@ name: run vllm benchmarks description: 'run vllm benchmarks' inputs: + benchmark_config_list_file: + description: 'Path to a file containing a list of benchmark-configs to run benchmarks with. For reference look at .github/data/nm_benchmark_configs_list.txt' + required: true output_directory: description: 'output directory to store the benchmark results' required: true + python: + description: 'python version, e.g. 3.10.12' + required: true + venv: + description: 'name for python virtual environment' + required: true runs: using: composite steps: - id: benchmark run: | mkdir -p ${{ inputs.output_directory }} + COMMIT=${{ github.sha }} + VENV="${{ inputs.venv }}-${COMMIT:0:7}" + source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate + pip3 install -r neuralmagic/benchmarks/requirements-benchmark.txt SUCCESS=0 - .github/workflows/scripts/nm-run-benchmarks.sh ${{ inputs.output_directory }} || SUCCESS=$? + .github/workflows/scripts/nm-run-benchmarks.sh ${{ inputs.benchmark_config_list_file }} ${{ inputs.output_directory }} || SUCCESS=$? echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT" exit ${SUCCESS} shell: bash diff --git a/.github/data/nm_benchmark_configs_list.txt b/.github/data/nm_benchmark_configs_list.txt new file mode 100644 index 0000000000000..97f1a5057cf69 --- /dev/null +++ b/.github/data/nm_benchmark_configs_list.txt @@ -0,0 +1,2 @@ +neuralmagic/benchmarks/configs/benchmark_serving.json +neuralmagic/benchmarks/configs/benchmark_throughput.json diff --git a/.github/workflows/nm-benchmark.yml b/.github/workflows/nm-benchmark.yml index d7a2c5f999b6e..3a01e8efbf0f7 100644 --- a/.github/workflows/nm-benchmark.yml +++ b/.github/workflows/nm-benchmark.yml @@ -6,6 +6,10 @@ on: description: "requested runner label (specifies instance)" type: string required: true + benchmark_config_list_file: + description: "Path to a file containing a list of benchmark-configs to run benchmarks with. 
For reference look at .github/data/nm_benchmark_configs_list.txt" + type: string + required: true timeout: description: "approximate number of minutes to keep instance up (should be at least 20)." type: string @@ -14,6 +18,10 @@ on: description: "git commit hash or branch name" type: string required: true + python: + description: "python version, e.g. 3.10.12" + type: string + required: true jobs: BENCHMARK: @@ -35,16 +43,35 @@ jobs: gitref: ${{ inputs.gitref }} label: ${{ inputs.label }} timeout: ${{ inputs.timeout }} - - # Call the `build` action when available - #- name: build - # id: build - # uses: ./.github/actions/build/ + + - name: setenv + id: setenv + uses: ./.github/actions/nm-set-env/ + with: + hf_home: ${{ secrets.NM_HF_HOME }} + + - name: set python + id: set_python + uses: ./.github/actions/nm-set-python/ + with: + python: ${{ inputs.python }} + venv: TEST + + - name: build + id: build + uses: ./.github/actions/nm-build-vllm/ + with: + Gi_per_thread: 1 + python: ${{ inputs.python }} + venv: TEST - name: run benchmarks uses: ./.github/actions/nm-benchmark/ with: + benchmark_config_list_file: ${{ inputs.benchmark_config_list_file }} output_directory: benchmark-results + python: ${{ inputs.python }} + venv: TEST - name: store benchmark result artifacts uses: actions/upload-artifact@v4 @@ -52,7 +79,7 @@ jobs: with: name: ${{ github.run_id }}-${{ inputs.label }} path: benchmark-results - retention-days: 90 + retention-days: 15 ####################################################### # TODO (Varun) : Remove pause once things are automated diff --git a/.github/workflows/scripts/nm-run-benchmarks.sh b/.github/workflows/scripts/nm-run-benchmarks.sh old mode 100644 new mode 100755 index 81b4af83c0031..9bb975530079c --- a/.github/workflows/scripts/nm-run-benchmarks.sh +++ b/.github/workflows/scripts/nm-run-benchmarks.sh @@ -1,15 +1,22 @@ #!/bin/bash +# GHA uses this script to run benchmarks. set -e set -u -if [ $# -ne 1 ]; +if [ $# -ne 2 ]; then - echo "run_benchmarks needs exactly 1 argument - The output path to store the benchmark results" - exit -1 + echo "run_benchmarks needs exactly 2 arguments: " + echo " 1. Path to a .txt file containing the list of benchmark config paths" + echo " 2. 
The output path to store the benchmark results" + exit 1 fi -output_directory=$1 - -touch $ouptut_directory/bench_test_1.txt -touch $ouptut_directory/bench_test_2.txt \ No newline at end of file +benchmark_config_list_file=$1 +output_directory=$2 + +for bench_config in `cat $benchmark_config_list_file` +do + echo "Running benchmarks for config " $bench_config + python3 -m neuralmagic.benchmarks.run_benchmarks -i $bench_config -o $output_directory +done diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index 292494e3c70d9..e5d7cb531af33 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -4,18 +4,26 @@ from argparse import Namespace from pathlib import Path from typing import NamedTuple, Iterable -# from neuralmagic.tools.call_cmd import call_cmd from vllm.model_executor.weight_utils import prepare_hf_model_weights from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.transformers_utils.config import get_config +# TODO (varun) : find a workaround so we avoid using private methods +from vllm.config import _get_and_verify_max_len -def download_model(hf_model_id: str) -> None: +def download_model(model: str) -> None: """ Downloads a hugging face model to cache """ - prepare_hf_model_weights(hf_model_id) - get_tokenizer(hf_model_id) + prepare_hf_model_weights(model) + get_tokenizer(model) + + +def max_model_length_from_model_id(model: str, + trust_remote_code: bool = False) -> int: + config = get_config(model, trust_remote_code=trust_remote_code) + return _get_and_verify_max_len(config, max_model_len=None) def script_args_to_cla(config: NamedTuple) -> Iterable[list[str]]: diff --git a/neuralmagic/benchmarks/configs/benchmark_serving.json b/neuralmagic/benchmarks/configs/benchmark_serving.json index 55c3b38fe6070..c76a06aa4b4ca 100644 --- a/neuralmagic/benchmarks/configs/benchmark_serving.json +++ b/neuralmagic/benchmarks/configs/benchmark_serving.json @@ -8,6 +8,9 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens": [ + 4096 + ], "sparsity": [], "script_name": "benchmark_serving", "script_args": { @@ -33,6 +36,9 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens": [ + 4096 + ], "sparsity": [], "script_name": "benchmark_serving", "script_args": { diff --git a/neuralmagic/benchmarks/configs/benchmark_throughput.json b/neuralmagic/benchmarks/configs/benchmark_throughput.json index a22466ce44e02..91011097e4b5b 100644 --- a/neuralmagic/benchmarks/configs/benchmark_throughput.json +++ b/neuralmagic/benchmarks/configs/benchmark_throughput.json @@ -8,6 +8,7 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens" : [4096], "script_name": "benchmark_throughput", "script_args": { "backend": [ @@ -44,6 +45,7 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens" : [4096], "script_name": "benchmark_throughput", "script_args": { "backend": [ @@ -87,6 +89,7 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens" : [4096], "script_name": "benchmark_throughput", "script_args": { "backend": [ diff --git a/neuralmagic/benchmarks/requirements-benchmark.txt b/neuralmagic/benchmarks/requirements-benchmark.txt index e2941334976fd..095bba70f1946 100644 --- a/neuralmagic/benchmarks/requirements-benchmark.txt +++ b/neuralmagic/benchmarks/requirements-benchmark.txt @@ -1,3 +1,4 @@ # requirements specific to the vllm neuralmagic 
fork +requests aiohttp datasets diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py index 9783ee6c7a3be..b38d0ce5e0cad 100644 --- a/neuralmagic/benchmarks/run_benchmark_serving.py +++ b/neuralmagic/benchmarks/run_benchmark_serving.py @@ -2,19 +2,20 @@ import subprocess import requests import time +import itertools from typing import NamedTuple, Optional from pathlib import Path from neuralmagic.tools.call_cmd import call_cmd -from neuralmagic.benchmarks.common import download_model, script_args_to_cla, benchmark_configs +from neuralmagic.benchmarks.common import download_model, max_model_length_from_model_id, script_args_to_cla, benchmark_configs from neuralmagic.benchmarks.scripts.common import warmup_server BENCH_SERVER_HOST = "localhost" BENCH_SERVER_PORT = 9000 -def is_server_running(host: str, port: int, timeout=60) -> bool: +def is_server_running(host: str, port: int, timeout=300) -> bool: def try_connection() -> bool: try: @@ -64,12 +65,27 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: script_path = f"neuralmagic.benchmarks.scripts.{config.script_name}" sparsities = [None] if len(config.sparsity) == 0 else config.sparsity - for model in config.models: + + for model, sparsity in itertools.product(config.models, sparsities): + # download model beforehand so the server can start without any holdup download_model(model) - for sparsity in sparsities: - server_cmd = f"python3 -m vllm.entrypoints.api_server --model {model} --tokenizer {model} --host {BENCH_SERVER_HOST} --port {BENCH_SERVER_PORT} --disable-log-requests" + supported_max_model_len = max_model_length_from_model_id(model) + + # If the requested model-len is too big, try running with the maximum supported for this model. 
+
+        max_model_lens = set(
+            map(lambda v: min(v, supported_max_model_len),
+                config.max_model_lens))
+        if (config.max_model_lens != list(max_model_lens)):
+            print(
+                f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}"
+            )
+
+        for max_model_len in max_model_lens:
+
+            server_cmd = f"python3 -m vllm.entrypoints.api_server --model {model} --tokenizer {model} --max-model-len {max_model_len} --host {BENCH_SERVER_HOST} --port {BENCH_SERVER_PORT} --disable-log-requests"
+
             if sparsity:
                 server_cmd += f" --sparsity {sparsity} "

diff --git a/neuralmagic/benchmarks/run_benchmark_throughput.py b/neuralmagic/benchmarks/run_benchmark_throughput.py
index 3b83465954393..54e23d2f6a187 100644
--- a/neuralmagic/benchmarks/run_benchmark_throughput.py
+++ b/neuralmagic/benchmarks/run_benchmark_throughput.py
@@ -3,7 +3,7 @@
 from typing import NamedTuple, Optional

 from neuralmagic.tools.call_cmd import call_cmd
-from neuralmagic.benchmarks.common import script_args_to_cla, benchmark_configs
+from neuralmagic.benchmarks.common import script_args_to_cla, benchmark_configs, max_model_length_from_model_id


 def run_benchmark_throughput_script(config: NamedTuple,
@@ -15,16 +15,31 @@ def run_benchmark_throughput_script(config: NamedTuple,

     script_path = f"neuralmagic.benchmarks.scripts.{config.script_name}"

     for model in config.models:
-        for script_args in script_args_to_cla(config):
-            bench_cmd = (["python3", "-m", f"{script_path}"] + script_args +
-                         ["--model", f"{model}"] + ["--tokenizer", f"{model}"])
-            if output_directory:
-                bench_cmd = bench_cmd + [
-                    "--save-directory", f"{output_directory}"
-                ]
+        supported_max_model_len = max_model_length_from_model_id(model)

-            call_cmd(bench_cmd, stdout=None, stderr=None)
+        # If the requested model-len is too big, try running with the maximum supported for this model.
+        max_model_lens = set(
+            map(lambda v: min(v, supported_max_model_len),
+                config.max_model_lens))
+        if (config.max_model_lens != list(max_model_lens)):
+            print(
+                f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}"
+            )
+
+        for max_model_len in max_model_lens:
+            for script_args in script_args_to_cla(config):
+                bench_cmd = (["python3", "-m", f"{script_path}"] +
+                             script_args + ["--model", f"{model}"] +
+                             ["--tokenizer", f"{model}"] +
+                             ["--max-model-len", f"{max_model_len}"])
+
+                if output_directory:
+                    bench_cmd = bench_cmd + [
+                        "--save-directory", f"{output_directory}"
+                    ]
+
+                call_cmd(bench_cmd, stdout=None, stderr=None)


 if __name__ == '__main__':
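
Note (editorial sketch, not part of the patch): run_benchmark_serving.py and run_benchmark_throughput.py now duplicate the block that clamps the configured max_model_lens to the value returned by max_model_length_from_model_id(). If that duplication grows, the clamping could be hoisted into neuralmagic/benchmarks/common.py; the helper below is a minimal sketch under that assumption, and the name clamp_max_model_lens is hypothetical. Comparing sets rather than lists would also avoid the spurious WARNING that the committed check, config.max_model_lens != list(max_model_lens), can print when set iteration order differs from the configured list or when the config lists duplicate values.

    # Hypothetical helper sketched for neuralmagic/benchmarks/common.py (not part of this patch).
    def clamp_max_model_lens(requested: list[int],
                             supported_max_model_len: int,
                             model: str) -> set[int]:
        """Clamp each requested max-model-len to the model's supported maximum."""
        clamped = {min(v, supported_max_model_len) for v in requested}
        # Warn only when a value was actually reduced; comparing sets keeps
        # reordering and duplicates in the config from triggering the warning.
        if clamped != set(requested):
            print(f"WARNING: max_model_len modified to {clamped} "
                  f"from {requested} for model {model}")
        return clamped

Each runner could then iterate over clamp_max_model_lens(config.max_model_lens, supported_max_model_len, model) instead of repeating the clamping block inline.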