Commit f2d8307

Merge remote-tracking branch 'upstream/main' into tokenizer-endpoints

2 parents: 72b640d + 67882db

192 files changed: +8498 / -1195 lines


.buildkite/nightly-benchmarks/README.md

Lines changed: 13 additions & 8 deletions
@@ -13,17 +13,24 @@ This benchmark will be *triggered* upon:
 
 **Benchmarking Duration**: about 1hr.
 
-## Configuring the workload for the quick benchmark
+**For benchmarking developers**: please try your best to constraint the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.
 
-The workload of the quick benchmark contains two parts: latency tests in `latency-tests.json`, throughput tests in `throughput-tests.json` and serving tests in `serving-tests.json`.
+
+## Configuring the workload
+
+The benchmarking workload contains three parts:
+- Latency tests in `latency-tests.json`.
+- Throughput tests in `throughput-tests.json`.
+- Serving tests in `serving-tests.json`.
+
+See [descriptions.md](tests/descriptions.md) for detailed descriptions.
 
 ### Latency test
 
 Here is an example of one test inside `latency-tests.json`:
 
 ```json
 [
-    ...
     {
         "test_name": "latency_llama8B_tp1",
         "parameters": {
@@ -34,7 +41,6 @@ Here is an example of one test inside `latency-tests.json`:
             "num_iters": 15
         }
     },
-    ...
 ]
 ```
 
@@ -57,7 +63,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
 
 ```
 [
-    ...
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
@@ -77,7 +82,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
             "num_prompts": 200
        }
    },
-    ...
 ]
 ```
 
@@ -92,7 +96,8 @@ The number of this test is less stable compared to the delay and latency benchma
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
 ## Visualizing the results
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
-The JSON file is also attached within each buildkite job for further analysis.
+The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
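Note: the new README text above says the results are rendered by formatting `descriptions.md` with real benchmarking results. As a rough illustration of that mechanism, here is a minimal Python sketch. The template text below is hypothetical, not the repository's `descriptions.md`; only the placeholder names are grounded, since they mirror the keyword arguments passed to `.format(...)` in the conversion script further down in this commit.

```python
# Hypothetical stand-in for descriptions.md: a plain str.format template whose
# placeholders match the keyword arguments used by the conversion script.
template = """## Latency tests
{latency_tests_markdown_table}

## Throughput tests
{throughput_tests_markdown_table}

## Serving tests
{serving_tests_markdown_table}
"""

# In the real pipeline these tables come from tabulate(); plain strings are
# used here only to show the substitution step. Extra keyword arguments
# (e.g. benchmarking_results_in_json_string) are ignored if the template
# does not reference them.
print(template.format(
    latency_tests_markdown_table="| Test name | Mean latency (ms) |\n|---|---|",
    throughput_tests_markdown_table="| Test name | Tput (req/s) |\n|---|---|",
    serving_tests_markdown_table="| Test name | Mean TTFT (ms) |\n|---|---|",
    benchmarking_results_in_json_string="{}",
))
```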

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ steps:
     plugins:
       - kubernetes:
           podSpec:
+            priorityClassName: perf-benchmark
             containers:
               - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
                 command:

.buildkite/nightly-benchmarks/run-benchmarks-suite.sh

Lines changed: 3 additions & 3 deletions
@@ -343,9 +343,9 @@ main() {
   QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
   # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/serving-tests.json
-  run_latency_tests $QUICK_BENCHMARK_ROOT/latency-tests.json
-  run_throughput_tests $QUICK_BENCHMARK_ROOT/throughput-tests.json
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
+  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 
 
   # postprocess benchmarking results

Lines changed: 158 additions & 121 deletions
@@ -1,4 +1,5 @@
 import json
+import os
 from pathlib import Path
 
 import pandas as pd
@@ -11,145 +12,181 @@
 latency_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "avg_latency": "Average latency (s)",
-    "P10": "P10 (s)",
-    "P25": "P25 (s)",
-    "P50": "P50 (s)",
-    "P75": "P75 (s)",
-    "P90": "P90 (s)",
+    "avg_latency": "Mean latency (ms)",
+    # "P10": "P10 (s)",
+    # "P25": "P25 (s)",
+    "P50": "Median latency (ms)",
+    # "P75": "P75 (s)",
+    # "P90": "P90 (s)",
+    "P99": "P99 latency (ms)",
 }
 
-# thoughput tests and the keys that will be printed into markdown
+# throughput tests and the keys that will be printed into markdown
 throughput_results = []
 throughput_results_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "num_requests": "# of req.",
-    "total_num_tokens": "Total # of tokens",
-    "elapsed_time": "Elapsed time (s)",
+    # "num_requests": "# of req.",
+    # "total_num_tokens": "Total # of tokens",
+    # "elapsed_time": "Elapsed time (s)",
     "requests_per_second": "Tput (req/s)",
-    "tokens_per_second": "Tput (tok/s)",
+    # "tokens_per_second": "Tput (tok/s)",
 }
 
 # serving results and the keys that will be printed into markdown
 serving_results = []
 serving_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "completed": "# of req.",
+    # "completed": "# of req.",
     "request_throughput": "Tput (req/s)",
-    "input_throughput": "Input Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
+    # "input_throughput": "Input Tput (tok/s)",
+    # "output_throughput": "Output Tput (tok/s)",
     "mean_ttft_ms": "Mean TTFT (ms)",
-    # do not say TTFT again to avoid the table getting too wide
-    "median_ttft_ms": "Median",
-    "p99_ttft_ms": "P99",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "median_tpot_ms": "Median",
-    "p99_tpot_ms": "P99",
+    "median_ttft_ms": "Median TTFT (ms)",
+    "p99_ttft_ms": "P99 TTFT (ms)",
+    # "mean_tpot_ms": "Mean TPOT (ms)",
+    # "median_tpot_ms": "Median",
+    # "p99_tpot_ms": "P99",
     "mean_itl_ms": "Mean ITL (ms)",
-    "median_itl_ms": "Median",
-    "p99_itl_ms": "P99",
+    "median_itl_ms": "Median ITL (ms)",
+    "p99_itl_ms": "P99 ITL (ms)",
 }
 
-for test_file in results_folder.glob("*.json"):
-
-    with open(test_file, "r") as f:
-        raw_result = json.loads(f.read())
-
-    if "serving" in str(test_file):
-        # this result is generated via `benchmark_serving.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        serving_results.append(raw_result)
-        continue
-
-    elif "latency" in f.name:
-        # this result is generated via `benchmark_latency.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # get different percentiles
-        for perc in [10, 25, 50, 75, 90]:
-            raw_result.update(
-                {f"P{perc}": raw_result["percentiles"][str(perc)]})
-
-        # add the result to raw_result
-        latency_results.append(raw_result)
-        continue
-
-    elif "throughput" in f.name:
-        # this result is generated via `benchmark_throughput.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        throughput_results.append(raw_result)
-        continue
-
-    print(f"Skipping {test_file}")
-
-latency_results = pd.DataFrame.from_dict(latency_results)
-serving_results = pd.DataFrame.from_dict(serving_results)
-throughput_results = pd.DataFrame.from_dict(throughput_results)
-
-# remapping the key, for visualization purpose
-if not latency_results.empty:
-    latency_results = latency_results[list(
-        latency_column_mapping.keys())].rename(columns=latency_column_mapping)
-if not serving_results.empty:
-    serving_results = serving_results[list(
-        serving_column_mapping.keys())].rename(columns=serving_column_mapping)
-if not throughput_results.empty:
-    throughput_results = throughput_results[list(
-        throughput_results_column_mapping.keys())].rename(
-            columns=throughput_results_column_mapping)
-
-# get markdown tables
-latency_md_table = tabulate(latency_results,
-                            headers='keys',
-                            tablefmt='pipe',
-                            showindex=False)
-serving_md_table = tabulate(serving_results,
-                            headers='keys',
-                            tablefmt='pipe',
-                            showindex=False)
-throughput_md_table = tabulate(throughput_results,
-                               headers='keys',
-                               tablefmt='pipe',
-                               showindex=False)
-
-# document the result
-with open(results_folder / "benchmark_results.md", "w") as f:
+
+def read_markdown(file):
+    if os.path.exists(file):
+        with open(file, "r") as f:
+            return f.read() + "\n"
+    else:
+        return f"{file} not found.\n"
+
+
+def results_to_json(latency, throughput, serving):
+    return json.dumps({
+        'latency': latency.to_dict(),
+        'throughput': throughput.to_dict(),
+        'serving': serving.to_dict()
+    })
+
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file, "r") as f:
+            raw_result = json.loads(f.read())
+
+        if "serving" in str(test_file):
+            # this result is generated via `benchmark_serving.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            serving_results.append(raw_result)
+            continue
+
+        elif "latency" in f.name:
+            # this result is generated via `benchmark_latency.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # get different percentiles
+            for perc in [10, 25, 50, 75, 90, 99]:
+                # Multiply 1000 to convert the time unit from s to ms
+                raw_result.update(
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
+
+            # add the result to raw_result
+            latency_results.append(raw_result)
+            continue
+
+        elif "throughput" in f.name:
+            # this result is generated via `benchmark_throughput.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            throughput_results.append(raw_result)
+            continue
+
+        print(f"Skipping {test_file}")
+
+    latency_results = pd.DataFrame.from_dict(latency_results)
+    serving_results = pd.DataFrame.from_dict(serving_results)
+    throughput_results = pd.DataFrame.from_dict(throughput_results)
+
+    raw_results_json = results_to_json(latency_results, throughput_results,
+                                       serving_results)
+
+    # remapping the key, for visualization purpose
     if not latency_results.empty:
-        f.write("## Latency tests\n")
-        f.write(latency_md_table)
-        f.write("\n")
-    if not throughput_results.empty:
-        f.write("## Throughput tests\n")
-        f.write(throughput_md_table)
-        f.write("\n")
+        latency_results = latency_results[list(
+            latency_column_mapping.keys())].rename(
+                columns=latency_column_mapping)
     if not serving_results.empty:
-        f.write("## Serving tests\n")
-        f.write(serving_md_table)
-        f.write("\n")
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+    if not throughput_results.empty:
+        throughput_results = throughput_results[list(
+            throughput_results_column_mapping.keys())].rename(
+                columns=throughput_results_column_mapping)
+
+    processed_results_json = results_to_json(latency_results,
+                                             throughput_results,
+                                             serving_results)
+
+    # get markdown tables
+    latency_md_table = tabulate(latency_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    serving_md_table = tabulate(serving_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    throughput_md_table = tabulate(throughput_results,
+                                   headers='keys',
+                                   tablefmt='pipe',
+                                   showindex=False)
+
+    # document the result
+    with open(results_folder / "benchmark_results.md", "w") as f:
+
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+        results = results.format(
+            latency_tests_markdown_table=latency_md_table,
+            throughput_tests_markdown_table=throughput_md_table,
+            serving_tests_markdown_table=serving_md_table,
+            benchmarking_results_in_json_string=processed_results_json)
+        f.write(results)
+
+    # document benchmarking results in json
+    with open(results_folder / "benchmark_results.json", "w") as f:
+
+        results = latency_results.to_dict(
+            orient='records') + throughput_results.to_dict(
+                orient='records') + serving_results.to_dict(orient='records')
+        f.write(json.dumps(results))
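Note: the script above ends by writing `benchmark_results.json` as one flat JSON list that concatenates the latency, throughput and serving records (after the display-name renaming). A minimal sketch of how those raw results could be loaded back for further analysis, assuming that layout and that the file has been downloaded from the job's `Artifacts` tab:

```python
import json

import pandas as pd

# Assumes benchmark_results.json was downloaded locally and contains the flat
# list of per-test record dicts written by the script above.
with open("benchmark_results.json", "r") as f:
    records = json.load(f)

df = pd.DataFrame.from_records(records)

# "Test name" and "GPU" appear in every column mapping, so they are present
# for all three test types (assuming at least one result was collected);
# columns that do not apply to a given test type show up as NaN.
print(df[["Test name", "GPU"]].to_string(index=False))
```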
