diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 3f38cf5137535..32bd34c431c89 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -301,6 +301,104 @@ run_serving_tests() {
     kill_gpu_processes
 }
 
+run_genai_perf_tests() {
+    # run genai-perf tests
+
+    # $1: a json file specifying genai-perf test cases
+    local genai_perf_test_file
+    genai_perf_test_file=$1
+
+    # Iterate over genai-perf tests
+    jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+        # get the test name
+        test_name=$(echo "$params" | jq -r '.test_name')
+
+        # if TEST_SELECTOR is set, only run the test cases that match the selector
+        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+            echo "Skip test case $test_name."
+            continue
+        fi
+
+        # prepend the current serving engine to the test name
+        test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+        # get common parameters
+        common_params=$(echo "$params" | jq -r '.common_parameters')
+        model=$(echo "$common_params" | jq -r '.model')
+        tp=$(echo "$common_params" | jq -r '.tp')
+        dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+        dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+        port=$(echo "$common_params" | jq -r '.port')
+        num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+        reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+        # get client and server arguments
+        server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+        qps_list=$(echo "$params" | jq -r '.qps_list')
+        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+        echo "Running over qps list $qps_list"
+
+        # check if there are enough GPUs to run the test
+        if [[ $gpu_count -lt $tp ]]; then
+            echo "Required num-shard $tp but only $gpu_count GPU(s) found. Skip test case $test_name."
+            continue
+        fi
+
+        if [[ $reuse_server == "true" ]]; then
+            echo "Reuse previous server for test case $test_name"
+        else
+            kill_gpu_processes
+            bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+                "$server_params" "$common_params"
+        fi
+
+        if wait_for_server; then
+            echo ""
+            echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+        else
+            echo ""
+            echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+            break
+        fi
+
+        # iterate over different QPS
+        for qps in $qps_list; do
+            # remove the surrounding single quote from qps
+            if [[ "$qps" == *"inf"* ]]; then
+                echo "qps was $qps"
+                qps=$num_prompts
+                echo "now qps is $qps"
+            fi
+
+            new_test_name=$test_name"_qps_"$qps
+            backend=$CURRENT_LLM_SERVING_ENGINE
+
+            if [[ "$backend" == *"vllm"* ]]; then
+                backend="vllm"
+            fi
+            # TODO: add output dir.
+ client_command="genai-perf profile \ + -m $model \ + --service-kind openai \ + --backend vllm \ + --endpoint-type chat \ + --streaming \ + --url localhost:$port \ + --request-rate $qps \ + --num-prompts $num_prompts \ + " + + echo "Client command: $client_command" + + eval "$client_command" + + #TODO: process/record outputs + done + done + + kill_gpu_processes + +} prepare_dataset() { @@ -328,12 +426,17 @@ main() { pip install -U transformers + pip install -r requirements-dev.txt + which genai-perf + # check storage df -h ensure_installed wget ensure_installed curl ensure_installed jq + # genai-perf dependency + ensure_installed libb64-0d prepare_dataset @@ -345,6 +448,10 @@ main() { # run the test run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" + # run genai-perf tests + run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" + mv artifacts/ $RESULTS_FOLDER/ + # upload benchmark results to buildkite python3 -m pip install tabulate pandas python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json new file mode 100644 index 0000000000000..edbe9f2df0ce0 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -0,0 +1,23 @@ +[ + { + "test_name": "llama8B_tp1_genai_perf", + "qps_list": [4,8,16,32], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "port": 8000, + "num_prompts": 500, + "reuse_server": false + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "genai_perf_input_parameters": { + } + } +] \ No newline at end of file diff --git a/requirements-test.in b/requirements-test.in index 4b4dc376d1fa5..bc76a91ad5356 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -29,4 +29,7 @@ lm-eval[api]==0.4.4 # required for model evaluation test bitsandbytes>=0.45.0 buildkite-test-collector==0.1.9 +genai_perf==0.0.8 +tritonclient==2.51.0 + numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index f576e42afcbbf..09e009c2e21f4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -37,7 +37,7 @@ audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements-test.in -bitsandbytes>=0.45.0 +bitsandbytes==0.45.0 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator @@ -75,6 +75,8 @@ colorama==0.4.6 # tqdm-multiprocess contourpy==1.3.0 # via matplotlib +cramjam==2.9.0 + # via fastparquet cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 @@ -109,6 +111,8 @@ email-validator==2.2.0 # via pydantic evaluate==0.4.3 # via lm-eval +fastparquet==2024.11.0 + # via genai-perf fastrlock==0.8.2 # via cupy-cuda12x filelock==3.16.1 @@ -130,8 +134,11 @@ fsspec[http]==2024.9.0 # via # datasets # evaluate + # fastparquet # huggingface-hub # torch +genai-perf==0.0.8 + # via -r requirements-test.in genson==1.3.0 # via datamodel-code-generator h11==0.14.0 @@ -186,6 +193,8 @@ jsonschema==4.23.0 # ray jsonschema-specifications==2024.10.1 # via jsonschema +kaleido==0.2.1 + # via genai-perf kiwisolver==1.4.7 # via matplotlib lazy-loader==0.4 @@ -200,6 +209,8 @@ lm-eval[api]==0.4.4 # via -r requirements-test.in lxml==5.3.0 # via sacrebleu +markdown-it-py==3.0.0 + # via rich markupsafe==3.0.2 # via jinja2 matplotlib==3.9.2 @@ -209,6 +220,8 @@ mbstrdecoder==1.1.3 # dataproperty # pytablewriter # 
     #   typepy
+mdurl==0.1.2
+    # via markdown-it-py
 mistral-common[opencv]==1.5.1
     # via
     #   -r requirements-test.in
@@ -249,6 +262,8 @@ numpy==1.26.4
     #   datasets
     #   decord
     #   evaluate
+    #   fastparquet
+    #   genai-perf
     #   librosa
     #   matplotlib
     #   mistral-common
@@ -256,15 +271,18 @@ numpy==1.26.4
     #   numexpr
     #   opencv-python-headless
     #   pandas
+    #   patsy
     #   peft
     #   rouge-score
     #   sacrebleu
     #   scikit-learn
     #   scipy
     #   soxr
+    #   statsmodels
     #   tensorizer
     #   torchvision
     #   transformers
+    #   tritonclient
 nvidia-cublas-cu12==12.4.5.8
     # via
     #   nvidia-cudnn-cu12
@@ -306,30 +324,39 @@ packaging==24.1
     #   datamodel-code-generator
     #   datasets
     #   evaluate
+    #   fastparquet
     #   huggingface-hub
     #   lazy-loader
     #   matplotlib
     #   peft
+    #   plotly
     #   pooch
     #   pytest
     #   pytest-rerunfailures
     #   ray
+    #   statsmodels
     #   transformers
     #   typepy
 pandas==2.2.3
     # via
     #   datasets
     #   evaluate
+    #   fastparquet
+    #   genai-perf
+    #   statsmodels
 pathspec==0.12.1
     # via black
 pathvalidate==3.2.1
     # via pytablewriter
+patsy==1.0.1
+    # via statsmodels
 peft==0.13.2
     # via
     #   -r requirements-test.in
     #   lm-eval
 pillow==10.4.0
     # via
+    #   genai-perf
     #   matplotlib
     #   mistral-common
     #   sentence-transformers
@@ -338,6 +365,8 @@ platformdirs==4.3.6
     # via
     #   black
     #   pooch
+plotly==5.24.1
+    # via genai-perf
 pluggy==1.5.0
     # via pytest
 pooch==1.8.2
@@ -360,7 +389,9 @@ psutil==6.1.0
 py==1.11.0
     # via pytest-forked
 pyarrow==18.0.0
-    # via datasets
+    # via
+    #   datasets
+    #   genai-perf
 pyasn1==0.6.1
     # via rsa
 pybind11==2.13.6
@@ -373,6 +404,8 @@ pydantic[email]==2.9.2
     #   mistral-common
 pydantic-core==2.23.4
     # via pydantic
+pygments==2.18.0
+    # via rich
 pyparsing==3.2.0
     # via matplotlib
 pytablewriter==1.2.0
@@ -381,14 +414,18 @@ pytest==8.3.3
     # via
     #   -r requirements-test.in
     #   buildkite-test-collector
+    #   genai-perf
     #   pytest-asyncio
     #   pytest-forked
+    #   pytest-mock
     #   pytest-rerunfailures
     #   pytest-shard
 pytest-asyncio==0.24.0
     # via -r requirements-test.in
 pytest-forked==1.6.0
     # via -r requirements-test.in
+pytest-mock==3.14.0
+    # via genai-perf
 pytest-rerunfailures==14.0
     # via -r requirements-test.in
 pytest-shard==0.1.2
@@ -399,6 +436,8 @@ python-dateutil==2.9.0.post0
     #   matplotlib
     #   pandas
     #   typepy
+python-rapidjson==1.20
+    # via tritonclient
 pytz==2024.2
     # via
     #   pandas
@@ -409,9 +448,11 @@ pyyaml==6.0.2
     #   awscli
     #   datamodel-code-generator
     #   datasets
+    #   genai-perf
     #   huggingface-hub
     #   peft
     #   ray
+    #   responses
     #   timm
     #   transformers
 ray[adag]==2.40.0
@@ -438,8 +479,13 @@ requests==2.32.3
     #   mistral-common
     #   pooch
     #   ray
+    #   responses
     #   tiktoken
     #   transformers
+responses==0.25.3
+    # via genai-perf
+rich==13.9.4
+    # via genai-perf
 rouge-score==0.1.2
     # via lm-eval
 rpds-py==0.20.1
@@ -470,6 +516,7 @@ scipy==1.13.1
     #   librosa
     #   scikit-learn
     #   sentence-transformers
+    #   statsmodels
 sentence-transformers==3.2.1
     # via -r requirements-test.in
 sentencepiece==0.2.0
@@ -490,6 +537,8 @@ soxr==0.5.0.post1
     # via librosa
 sqlitedict==2.1.0
     # via lm-eval
+statsmodels==0.14.4
+    # via genai-perf
 sympy==1.13.1
     # via torch
 tabledata==1.3.3
@@ -499,7 +548,9 @@ tabulate==0.9.0
 tcolorpy==0.1.6
     # via pytablewriter
 tenacity==9.0.0
-    # via lm-eval
+    # via
+    #   lm-eval
+    #   plotly
 tensorizer==2.9.0
     # via -r requirements-test.in
 threadpoolctl==3.5.0
@@ -540,6 +591,7 @@ tqdm-multiprocess==0.0.11
     # via lm-eval
 transformers==4.47.0
     # via
+    #   genai-perf
     #   lm-eval
     #   peft
     #   sentence-transformers
@@ -548,6 +600,10 @@ transformers-stream-generator==0.0.5
     # via -r requirements-test.in
 triton==3.1.0
     # via torch
+tritonclient==2.51.0
+    # via
+    #   -r requirements-test.in
+    #   genai-perf
 typepy[datetime]==1.3.2
     # via
     #   dataproperty
     #   mbstrdecoder
@@ -555,6 +611,7 @@ typepy[datetime]==1.3.2
     #   tabledata
 typing-extensions==4.12.2
     # via
+    #   bitsandbytes
     #   huggingface-hub
     #   librosa
     #   mistral-common
@@ -563,10 +620,12 @@ typing-extensions==4.12.2
     #   torch
 tzdata==2024.2
     # via pandas
-urllib3==1.26.20
+urllib3==2.2.3
     # via
     #   botocore
     #   requests
+    #   responses
+    #   tritonclient
 word2number==1.1
     # via lm-eval
 xxhash==3.5.0