
Commit 77928e0

Authored by Varun Sundar Rabindranath and rsnm2
Add NM benchmarking scripts & utils (vllm-project#14)
Summary: Add benchmarking scripts and utils. Things to note:

- All files are stored in the `neuralmagic` folder.
- neuralmagic/benchmarks/scripts/* : Actual benchmarking scripts that interact with the vllm engine.
- neuralmagic/benchmarks/configs/* : JSON config files that define what benchmark commands to run.
- neuralmagic/benchmarks/run_*.py : Scripts that consume a config file and run the benchmark scripts.
- neuralmagic/tools : Add tools

Testing: Local testing

---------

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: rsnm2 <rshaw@neuralmagic.com>
1 parent a3f00c5 commit 77928e0

18 files changed: +1926 -0 lines

neuralmagic/__init__.py

Whitespace-only changes.

neuralmagic/benchmarks/README.md

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
# Directory Structure:

- scripts/*.py - Benchmark scripts that perform the metric computation.

- configs/*.json - Config JSON files. These JSONs define what benchmark script to run and what combination of script parameters to use.

- *.py - Benchmark drivers. Given a config JSON, they execute all the commands defined by the config JSON.

# Run Benchmark scripts

All `scripts/benchmark_*.py` files can be executed on their own.

Run `python -m neuralmagic.benchmarks.scripts.<script_name> --help` for a description of the script and how to run it.

# Benchmarking drivers and Configs

Each benchmark driver (`*.py`) takes a JSON config file and an output directory path as input.

As mentioned above, the config file defines what benchmark script to run and what arguments to run it with.

The following is an example config JSON:

```
{
    "description": "Benchmark vllm engine throughput - with dataset",
    "models": [
        "facebook/opt-125m",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    ],
    "sparsity": [],
    "script_name": "benchmark_throughput",
    "script_args": {
        "dataset": [
            "sharegpt",
            "ultrachat"
        ],
        "output-len": [
            128
        ],
        "num-prompts": [
            1000
        ]
    }
}
```

This config tells the benchmark driver to run the benchmark_throughput script on all the listed models with all possible script-args combinations; i.e. the config essentially translates to:

python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model facebook/opt-125m --dataset sharegpt --output-len 128 --num-prompts 1000

python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model facebook/opt-125m --dataset ultrachat --output-len 128 --num-prompts 1000

python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --dataset sharegpt --output-len 128 --num-prompts 1000

python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --dataset ultrachat --output-len 128 --num-prompts 1000
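Under the hood this expansion is just a cartesian product over the `script_args` value lists. Below is a minimal, self-contained sketch of that idea; the repo's actual implementation is `script_args_to_cla` in `neuralmagic/benchmarks/common.py` (added later in this commit), and the hard-coded dict here simply mirrors the example config above.

```
import itertools

# script_args from the example config above.
script_args = {
    "dataset": ["sharegpt", "ultrachat"],
    "output-len": [128],
    "num-prompts": [1000],
}

# One command line per element of the cartesian product of all value lists.
for values in itertools.product(*script_args.values()):
    flags = []
    for name, value in zip(script_args.keys(), values):
        flags.extend([f"--{name}", str(value)])
    print("python -m neuralmagic.benchmarks.scripts.benchmark_throughput "
          + " ".join(flags))
```

Running this prints the first two of the translated commands listed above (one per dataset); adding a `--model` loop over the `models` list yields all four.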
# Benchmarking with driver

```
python3 -m neuralmagic.benchmarks.run_benchmarks -i <path-to-config-file> -o <output-directory-path>
```

# About sparsity

The benchmark configs have a `sparsity` field. Populate this field with proper sparsity identifiers to inform vllm about model sparsity.

For the list of valid sparsity args, check `vllm/model_executor/layers/sparsity/*`.

neuralmagic/benchmarks/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
from neuralmagic.benchmarks.run_benchmark_serving import run_benchmark_serving_script
from neuralmagic.benchmarks.run_benchmark_throughput import run_benchmark_throughput_script

__all__ = ["run_benchmark_serving_script", "run_benchmark_throughput_script"]

neuralmagic/benchmarks/common.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
import itertools
import json

from argparse import Namespace
from pathlib import Path
from typing import NamedTuple, Iterable
# from neuralmagic.tools.call_cmd import call_cmd

from vllm.model_executor.weight_utils import prepare_hf_model_weights
from vllm.transformers_utils.tokenizer import get_tokenizer


def download_model(hf_model_id: str) -> None:
    """
    Downloads a hugging face model to cache
    """
    prepare_hf_model_weights(hf_model_id)
    get_tokenizer(hf_model_id)


def script_args_to_cla(config: NamedTuple) -> Iterable[list[str]]:
    # config is a NamedTuple constructed from some JSON in neuralmagic/benchmarks/configs
    kv = vars(config.script_args)

    keys = kv.keys()
    arg_lists = kv.values()
    assert all(map(lambda le: isinstance(le, list), arg_lists))

    # Empty lists are arguments without any values (e.g. boolean args)
    key_args = []
    for k, v in zip(keys, arg_lists):
        if len(v) == 0:
            key_args.append(k)

    key_args_cla = list(map(lambda k: f"--{k}", key_args))

    # Remove empty lists from arg_lists and remove key args from keys.
    # Materialize both as lists: plain filter() iterators would be exhausted
    # after the first itertools.product iteration.
    arg_lists = list(filter(lambda arg_list: len(arg_list) != 0, arg_lists))
    keys = list(filter(lambda k: k not in key_args, keys))

    for args in itertools.product(*arg_lists):
        # Copy the key-only args so each yielded command line is independent.
        cla = list(key_args_cla)
        for name, value in zip(keys, args):
            cla.extend([f"--{name}", f"{value}"])
        yield cla


def benchmark_configs(config_file_path: Path) -> Iterable[NamedTuple]:
    """
    Given a path to a config file in `neuralmagic/benchmarks/configs/*`, return an
    Iterable of (sub)configs in the file
    """
    assert config_file_path.exists()

    configs = None
    with open(config_file_path, "r") as f:
        configs = json.load(f, object_hook=lambda d: Namespace(**d))
    assert configs is not None

    for config in configs.configs:
        yield config
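For illustration, a short usage sketch of `script_args_to_cla` as defined above. The `Namespace` literal just mimics the shape that `json.load(..., object_hook=lambda d: Namespace(**d))` produces from a config entry; despite the `NamedTuple` annotation, the function only uses attribute access, so a `Namespace` works.

```
from argparse import Namespace

# Mimic a (sub)config parsed from a JSON file in neuralmagic/benchmarks/configs.
# "output-len" shows that hyphenated keys survive the Namespace(**d) round trip.
config = Namespace(
    script_name="benchmark_throughput",
    script_args=Namespace(**{
        "dataset": ["sharegpt", "ultrachat"],
        "output-len": [128],
    }),
)

for cla in script_args_to_cla(config):
    print(cla)
# ['--dataset', 'sharegpt', '--output-len', '128']
# ['--dataset', 'ultrachat', '--output-len', '128']
```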
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
{
    "configs": [
        {
            "description": "Benchmark vllm serving",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "sparsity": [],
            "script_name": "benchmark_serving",
            "script_args": {
                "nr-qps-pair_": [
                    "50,0.5",
                    "100,1",
                    "200,2",
                    "500,5"
                ],
                "best-of": [
                    1
                ],
                "dataset": [
                    "sharegpt"
                ]
            }
        },
        {
            "description": "Benchmark vllm serving",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "sparsity": [],
            "script_name": "benchmark_serving",
            "script_args": {
                "num-prompts_": [
                    50,
                    100
                ],
                "request-rate_": [
                    0.5,
                    "inf"
                ],
                "best-of": [
                    1
                ],
                "dataset": [
                    "sharegpt"
                ]
            }
        }
    ]
}
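The underscore-suffixed keys above (`nr-qps-pair_`, and `num-prompts_`/`request-rate_` in the second config) look like parameters the serving driver handles specially rather than flags passed through verbatim. That driver code is not part of this excerpt, so the exact semantics are an assumption here; under the plausible reading that `"50,0.5"` pairs a request count with a target QPS (request rate), a parse helper might look like:

```
# Hypothetical helper (not from the repo): split an "nr-qps-pair_" entry
# into (num-requests, qps). The real handling lives in the
# run_benchmark_serving driver, which is not shown in this commit excerpt.
def parse_nr_qps_pair(pair: str) -> tuple[int, float]:
    nr, qps = pair.split(",")
    return int(nr), float(qps)

assert parse_nr_qps_pair("50,0.5") == (50, 0.5)
assert parse_nr_qps_pair("500,5") == (500, 5.0)
```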
Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
{
    "configs": [
        {
            "description": "Benchmark vllm engine throughput - with dataset",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "script_name": "benchmark_throughput",
            "script_args": {
                "backend": [
                    "vllm"
                ],
                "dataset": [
                    "sharegpt"
                ],
                "output-len": [
                    128
                ],
                "tensor-parallel-size": [
                    1
                ],
                "n": [
                    1
                ],
                "num-prompts": [
                    1000
                ],
                "seed": [
                    0
                ],
                "dtype": [
                    "auto"
                ]
            }
        },
        {
            "description": "Benchmark vllm engine prefill throughput - synthetic",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "script_name": "benchmark_throughput",
            "script_args": {
                "backend": [
                    "vllm"
                ],
                "input-len": [
                    1,
                    16,
                    32,
                    64,
                    128,
                    256,
                    512,
                    1024
                ],
                "output-len": [
                    1
                ],
                "tensor-parallel-size": [
                    1
                ],
                "n": [
                    1
                ],
                "num-prompts": [
                    1
                ],
                "seed": [
                    0
                ],
                "dtype": [
                    "auto"
                ]
            }
        },
        {
            "description": "Benchmark vllm engine decode throughput - synthetic",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "script_name": "benchmark_throughput",
            "script_args": {
                "backend": [
                    "vllm"
                ],
                "input-len": [
                    2
                ],
                "output-len": [
                    128
                ],
                "tensor-parallel-size": [
                    1
                ],
                "n": [
                    1
                ],
                "num-prompts": [
                    1,
                    4,
                    8,
                    16,
                    32,
                    64
                ],
                "seed": [
                    0
                ],
                "dtype": [
                    "auto"
                ]
            }
        }
    ]
}
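Since the driver runs every listed model against the full cartesian product of the `script_args` lists (per the README above), the number of benchmark invocations per config is easy to sanity-check. A small sketch under that assumption:

```
import math

# Runs per config = (number of models) x (product of script_args list lengths).
# "prefill - synthetic" config above: 4 models; only input-len varies (8 values).
prefill_lens = [1, 8, 1, 1, 1, 1, 1, 1]  # backend, input-len, output-len, tp, n, num-prompts, seed, dtype
print(4 * math.prod(prefill_lens))       # 32 benchmark invocations

# "decode - synthetic" config: 4 models; only num-prompts varies (6 values).
decode_lens = [1, 1, 1, 1, 1, 6, 1, 1]
print(4 * math.prod(decode_lens))        # 24 benchmark invocations
```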
