From 8b95d03ec34d88c5a05bbf4f92d8ffe5057c7da5 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 26 Feb 2024 22:49:42 -0500 Subject: [PATCH] GHA Benchmark : Automatic benchmarking on manual trigger (#46) Summary: Add benchmarking workflow and action that runs the benchmarks on a manual trigger. Test: Try it locally. Successful GHA Benchmark Run - https://github.com/neuralmagic/neuralmagic-vllm/actions/runs/8019392326 --------- Co-authored-by: varun Co-authored-by: Varun Sundar Rabindranath --- .github/actions/nm-benchmark/action.yml | 15 ++++++- .github/data/nm_benchmark_configs_list.txt | 2 + .github/workflows/nm-benchmark.yml | 39 ++++++++++++++++--- .../workflows/scripts/nm-run-benchmarks.sh | 21 ++++++---- neuralmagic/benchmarks/common.py | 16 ++++++-- .../benchmarks/configs/benchmark_serving.json | 6 +++ .../configs/benchmark_throughput.json | 3 ++ .../benchmarks/requirements-benchmark.txt | 1 + .../benchmarks/run_benchmark_serving.py | 26 ++++++++++--- .../benchmarks/run_benchmark_throughput.py | 33 +++++++++++----- 10 files changed, 130 insertions(+), 32 deletions(-) create mode 100644 .github/data/nm_benchmark_configs_list.txt mode change 100644 => 100755 .github/workflows/scripts/nm-run-benchmarks.sh diff --git a/.github/actions/nm-benchmark/action.yml b/.github/actions/nm-benchmark/action.yml index 37fb2a60e4c8b..32ccf215fa563 100644 --- a/.github/actions/nm-benchmark/action.yml +++ b/.github/actions/nm-benchmark/action.yml @@ -1,17 +1,30 @@ name: run vllm benchmarks description: 'run vllm benchmarks' inputs: + benchmark_config_list_file: + description: 'Path to a file containing a list of benchmark-configs to run benchmarks with. For reference look at .github/data/nm_benchmark_configs_list.txt' + required: true output_directory: description: 'output directory to store the benchmark results' required: true + python: + description: 'python version, e.g. 3.10.12' + required: true + venv: + description: 'name for python virtual environment' + required: true runs: using: composite steps: - id: benchmark run: | mkdir -p ${{ inputs.output_directory }} + COMMIT=${{ github.sha }} + VENV="${{ inputs.venv }}-${COMMIT:0:7}" + source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate + pip3 install -r neuralmagic/benchmarks/requirements-benchmark.txt SUCCESS=0 - .github/workflows/scripts/nm-run-benchmarks.sh ${{ inputs.output_directory }} || SUCCESS=$? + .github/workflows/scripts/nm-run-benchmarks.sh ${{ inputs.benchmark_config_list_file }} ${{ inputs.output_directory }} || SUCCESS=$? echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT" exit ${SUCCESS} shell: bash diff --git a/.github/data/nm_benchmark_configs_list.txt b/.github/data/nm_benchmark_configs_list.txt new file mode 100644 index 0000000000000..97f1a5057cf69 --- /dev/null +++ b/.github/data/nm_benchmark_configs_list.txt @@ -0,0 +1,2 @@ +neuralmagic/benchmarks/configs/benchmark_serving.json +neuralmagic/benchmarks/configs/benchmark_throughput.json diff --git a/.github/workflows/nm-benchmark.yml b/.github/workflows/nm-benchmark.yml index d7a2c5f999b6e..3a01e8efbf0f7 100644 --- a/.github/workflows/nm-benchmark.yml +++ b/.github/workflows/nm-benchmark.yml @@ -6,6 +6,10 @@ on: description: "requested runner label (specifies instance)" type: string required: true + benchmark_config_list_file: + description: "Path to a file containing a list of benchmark-configs to run benchmarks with. 
For reference look at .github/data/nm_benchmark_configs_list.txt" + type: string + required: true timeout: description: "approximate number of minutes to keep instance up (should be at least 20)." type: string @@ -14,6 +18,10 @@ on: description: "git commit hash or branch name" type: string required: true + python: + description: "python version, e.g. 3.10.12" + type: string + required: true jobs: BENCHMARK: @@ -35,16 +43,35 @@ jobs: gitref: ${{ inputs.gitref }} label: ${{ inputs.label }} timeout: ${{ inputs.timeout }} - - # Call the `build` action when available - #- name: build - # id: build - # uses: ./.github/actions/build/ + + - name: setenv + id: setenv + uses: ./.github/actions/nm-set-env/ + with: + hf_home: ${{ secrets.NM_HF_HOME }} + + - name: set python + id: set_python + uses: ./.github/actions/nm-set-python/ + with: + python: ${{ inputs.python }} + venv: TEST + + - name: build + id: build + uses: ./.github/actions/nm-build-vllm/ + with: + Gi_per_thread: 1 + python: ${{ inputs.python }} + venv: TEST - name: run benchmarks uses: ./.github/actions/nm-benchmark/ with: + benchmark_config_list_file: ${{ inputs.benchmark_config_list_file }} output_directory: benchmark-results + python: ${{ inputs.python }} + venv: TEST - name: store benchmark result artifacts uses: actions/upload-artifact@v4 @@ -52,7 +79,7 @@ jobs: with: name: ${{ github.run_id }}-${{ inputs.label }} path: benchmark-results - retention-days: 90 + retention-days: 15 ####################################################### # TODO (Varun) : Remove pause once things are automated diff --git a/.github/workflows/scripts/nm-run-benchmarks.sh b/.github/workflows/scripts/nm-run-benchmarks.sh old mode 100644 new mode 100755 index 81b4af83c0031..9bb975530079c --- a/.github/workflows/scripts/nm-run-benchmarks.sh +++ b/.github/workflows/scripts/nm-run-benchmarks.sh @@ -1,15 +1,22 @@ #!/bin/bash +# GHA uses this script to run benchmarks. set -e set -u -if [ $# -ne 1 ]; +if [ $# -ne 2 ]; then - echo "run_benchmarks needs exactly 1 argument - The output path to store the benchmark results" - exit -1 + echo "run_benchmarks needs exactly 2 arguments: " + echo " 1. Path to a .txt file containing the list of benchmark config paths" + echo " 2. 
The output path to store the benchmark results" + exit 1 fi -output_directory=$1 - -touch $ouptut_directory/bench_test_1.txt -touch $ouptut_directory/bench_test_2.txt \ No newline at end of file +benchmark_config_list_file=$1 +output_directory=$2 + +for bench_config in `cat $benchmark_config_list_file` +do + echo "Running benchmarks for config " $bench_config + python3 -m neuralmagic.benchmarks.run_benchmarks -i $bench_config -o $output_directory +done diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index 292494e3c70d9..e5d7cb531af33 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -4,18 +4,26 @@ from argparse import Namespace from pathlib import Path from typing import NamedTuple, Iterable -# from neuralmagic.tools.call_cmd import call_cmd from vllm.model_executor.weight_utils import prepare_hf_model_weights from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.transformers_utils.config import get_config +# TODO (varun) : find a workaround so we avoid using private methods +from vllm.config import _get_and_verify_max_len -def download_model(hf_model_id: str) -> None: +def download_model(model: str) -> None: """ Downloads a hugging face model to cache """ - prepare_hf_model_weights(hf_model_id) - get_tokenizer(hf_model_id) + prepare_hf_model_weights(model) + get_tokenizer(model) + + +def max_model_length_from_model_id(model: str, + trust_remote_code: bool = False) -> int: + config = get_config(model, trust_remote_code=trust_remote_code) + return _get_and_verify_max_len(config, max_model_len=None) def script_args_to_cla(config: NamedTuple) -> Iterable[list[str]]: diff --git a/neuralmagic/benchmarks/configs/benchmark_serving.json b/neuralmagic/benchmarks/configs/benchmark_serving.json index 55c3b38fe6070..c76a06aa4b4ca 100644 --- a/neuralmagic/benchmarks/configs/benchmark_serving.json +++ b/neuralmagic/benchmarks/configs/benchmark_serving.json @@ -8,6 +8,9 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens": [ + 4096 + ], "sparsity": [], "script_name": "benchmark_serving", "script_args": { @@ -33,6 +36,9 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens": [ + 4096 + ], "sparsity": [], "script_name": "benchmark_serving", "script_args": { diff --git a/neuralmagic/benchmarks/configs/benchmark_throughput.json b/neuralmagic/benchmarks/configs/benchmark_throughput.json index a22466ce44e02..91011097e4b5b 100644 --- a/neuralmagic/benchmarks/configs/benchmark_throughput.json +++ b/neuralmagic/benchmarks/configs/benchmark_throughput.json @@ -8,6 +8,7 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens" : [4096], "script_name": "benchmark_throughput", "script_args": { "backend": [ @@ -44,6 +45,7 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens" : [4096], "script_name": "benchmark_throughput", "script_args": { "backend": [ @@ -87,6 +89,7 @@ "mistralai/Mistral-7B-Instruct-v0.2", "NousResearch/Llama-2-7b-chat-hf" ], + "max_model_lens" : [4096], "script_name": "benchmark_throughput", "script_args": { "backend": [ diff --git a/neuralmagic/benchmarks/requirements-benchmark.txt b/neuralmagic/benchmarks/requirements-benchmark.txt index e2941334976fd..095bba70f1946 100644 --- a/neuralmagic/benchmarks/requirements-benchmark.txt +++ b/neuralmagic/benchmarks/requirements-benchmark.txt @@ -1,3 +1,4 @@ # requirements specific to the vllm neuralmagic 
fork +requests aiohttp datasets diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py index 9783ee6c7a3be..b38d0ce5e0cad 100644 --- a/neuralmagic/benchmarks/run_benchmark_serving.py +++ b/neuralmagic/benchmarks/run_benchmark_serving.py @@ -2,19 +2,20 @@ import subprocess import requests import time +import itertools from typing import NamedTuple, Optional from pathlib import Path from neuralmagic.tools.call_cmd import call_cmd -from neuralmagic.benchmarks.common import download_model, script_args_to_cla, benchmark_configs +from neuralmagic.benchmarks.common import download_model, max_model_length_from_model_id, script_args_to_cla, benchmark_configs from neuralmagic.benchmarks.scripts.common import warmup_server BENCH_SERVER_HOST = "localhost" BENCH_SERVER_PORT = 9000 -def is_server_running(host: str, port: int, timeout=60) -> bool: +def is_server_running(host: str, port: int, timeout=300) -> bool: def try_connection() -> bool: try: @@ -64,12 +65,27 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: script_path = f"neuralmagic.benchmarks.scripts.{config.script_name}" sparsities = [None] if len(config.sparsity) == 0 else config.sparsity - for model in config.models: + + for model, sparsity in itertools.product(config.models, sparsities): + # download model beforehand so the server can start without any holdup download_model(model) - for sparsity in sparsities: - server_cmd = f"python3 -m vllm.entrypoints.api_server --model {model} --tokenizer {model} --host {BENCH_SERVER_HOST} --port {BENCH_SERVER_PORT} --disable-log-requests" + supported_max_model_len = max_model_length_from_model_id(model) + + # If the requested model-len is too big, try running with the maximum supported for this model. 
+
+        max_model_lens = set(
+            map(lambda v: min(v, supported_max_model_len),
+                config.max_model_lens))
+        if (config.max_model_lens != list(max_model_lens)):
+            print(
+                f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}"
+            )
+
+        for max_model_len in max_model_lens:
+
+            server_cmd = f"python3 -m vllm.entrypoints.api_server --model {model} --tokenizer {model} --max-model-len {max_model_len} --host {BENCH_SERVER_HOST} --port {BENCH_SERVER_PORT} --disable-log-requests"
+
             if sparsity:
                 server_cmd += f" --sparsity {sparsity} "

diff --git a/neuralmagic/benchmarks/run_benchmark_throughput.py b/neuralmagic/benchmarks/run_benchmark_throughput.py
index 3b83465954393..54e23d2f6a187 100644
--- a/neuralmagic/benchmarks/run_benchmark_throughput.py
+++ b/neuralmagic/benchmarks/run_benchmark_throughput.py
@@ -3,7 +3,7 @@
 from typing import NamedTuple, Optional

 from neuralmagic.tools.call_cmd import call_cmd
-from neuralmagic.benchmarks.common import script_args_to_cla, benchmark_configs
+from neuralmagic.benchmarks.common import script_args_to_cla, benchmark_configs, max_model_length_from_model_id


 def run_benchmark_throughput_script(config: NamedTuple,
@@ -15,16 +15,31 @@ def run_benchmark_throughput_script(config: NamedTuple,

     script_path = f"neuralmagic.benchmarks.scripts.{config.script_name}"

     for model in config.models:
-        for script_args in script_args_to_cla(config):
-            bench_cmd = (["python3", "-m", f"{script_path}"] + script_args +
-                         ["--model", f"{model}"] + ["--tokenizer", f"{model}"])
-            if output_directory:
-                bench_cmd = bench_cmd + [
-                    "--save-directory", f"{output_directory}"
-                ]
+        supported_max_model_len = max_model_length_from_model_id(model)

-            call_cmd(bench_cmd, stdout=None, stderr=None)
+        # If the requested model-len is too big, try running with the maximum supported for this model.
+        max_model_lens = set(
+            map(lambda v: min(v, supported_max_model_len),
+                config.max_model_lens))
+        if (config.max_model_lens != list(max_model_lens)):
+            print(
+                f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}"
+            )
+
+        for max_model_len in max_model_lens:
+            for script_args in script_args_to_cla(config):
+                bench_cmd = (["python3", "-m", f"{script_path}"] +
+                             script_args + ["--model", f"{model}"] +
+                             ["--tokenizer", f"{model}"] +
+                             ["--max-model-len", f"{max_model_len}"])
+
+                if output_directory:
+                    bench_cmd = bench_cmd + [
+                        "--save-directory", f"{output_directory}"
+                    ]
+
+                call_cmd(bench_cmd, stdout=None, stderr=None)


 if __name__ == '__main__':
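
Note (editorial sketch, not part of the patch): run_benchmark_serving.py and run_benchmark_throughput.py now duplicate the block that clamps the configured max_model_lens to the value returned by max_model_length_from_model_id(). If that duplication grows, the clamping could be hoisted into neuralmagic/benchmarks/common.py; the helper below is a minimal sketch under that assumption, and the name clamp_max_model_lens is hypothetical. Comparing sets rather than lists would also avoid the spurious WARNING that the committed check, config.max_model_lens != list(max_model_lens), can print when set iteration order differs from the configured list or when the config lists duplicate values.

    # Hypothetical helper sketched for neuralmagic/benchmarks/common.py (not part of this patch).
    def clamp_max_model_lens(requested: list[int],
                             supported_max_model_len: int,
                             model: str) -> set[int]:
        """Clamp each requested max-model-len to the model's supported maximum."""
        clamped = {min(v, supported_max_model_len) for v in requested}
        # Warn only when a value was actually reduced; comparing sets keeps
        # reordering and duplicates in the config from triggering the warning.
        if clamped != set(requested):
            print(f"WARNING: max_model_len modified to {clamped} "
                  f"from {requested} for model {model}")
        return clamped

Each runner could then iterate over clamp_max_model_lens(config.max_model_lens, supported_max_model_len, model) instead of repeating the clamping block inline.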