Skip to content

Commit d057777

Browse files
WoosukKwon authored and ywang96 committed
[Benchmark] Add BurstGPT to benchmark_serving (vllm-project#13063)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Signed-off-by: Louis Ulmer <ulmerlouis@gmail.com>
1 parent 3d7e275 commit d057777

File tree

2 files changed

+47
-1
lines changed

2 files changed

+47
-1
lines changed

benchmarks/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,11 @@ mkdir coco -p
1919
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
2020
unzip coco/train2017.zip -d coco/
2121
```
22+
23+
# Downloading the BurstGPT dataset
24+
25+
You can download the BurstGPT v1.1 dataset by running:
26+
27+
```bash
28+
wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv
29+
```

benchmarks/benchmark_serving.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
3939

4040
import numpy as np
41+
import pandas as pd
4142
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
4243
RequestFuncOutput)
4344
from datasets import load_dataset
@@ -131,6 +132,35 @@ def sample_sharegpt_requests(
131132
return filtered_dataset
132133

133134

def sample_burstgpt_requests(
    dataset_path: str,
    num_requests: int,
    random_seed: int,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int, None]]:
    """Sample request/response lengths from the BurstGPT trace.

    Reads the BurstGPT CSV trace, keeps only successful GPT-4 requests
    (response token count > 0), and samples ``num_requests`` rows. For each
    sampled row a synthetic prompt of the recorded input length is built by
    decoding arbitrary in-vocabulary token ids — only the request/response
    *lengths* come from the real trace, not the text.

    Args:
        dataset_path: Path to the BurstGPT CSV file
            (e.g. BurstGPT_without_fails_2.csv).
        num_requests: Number of requests to sample. When it exceeds the
            number of eligible rows, sampling is done with replacement.
        random_seed: Seed for reproducible sampling.
        tokenizer: Tokenizer used to materialize the synthetic prompts.

    Returns:
        A list of (prompt, input_len, output_len, None) tuples; the trailing
        None keeps the tuple shape consistent with the other samplers
        (slot for multi-modal content, unused here).
    """
    df = pd.read_csv(dataset_path)
    gpt4_df = df[df["Model"] == "GPT-4"]
    # Remove the failed requests (i.e., response length is 0).
    gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
    # Randomly sample num_requests rows; sample with replacement only when
    # more requests are asked for than the trace contains. A single call
    # replaces the previous duplicated if/else around .sample().
    gpt4_df = gpt4_df.sample(n=num_requests,
                             random_state=random_seed,
                             replace=num_requests > len(gpt4_df))
    # Convert the dataframe to a list of rows for positional access
    # (columns 2 and 3 hold the request/response token counts per the
    # BurstGPT trace schema — verify if the trace format changes).
    dataset = gpt4_df.values.tolist()
    input_requests = []
    for i in range(num_requests):
        input_len = int(dataset[i][2])
        output_len = int(dataset[i][3])
        # Build a dummy prompt of exactly input_len tokens; the token ids
        # are arbitrary but kept in-vocabulary via the modulo.
        prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size
                                   for j in range(input_len)])
        input_requests.append((prompt, input_len, output_len, None))
    return input_requests
134164
def sample_sonnet_requests(
135165
dataset_path: str,
136166
num_requests: int,
@@ -830,6 +860,14 @@ def main(args: argparse.Namespace):
830860
fixed_output_len=args.sharegpt_output_len,
831861
)
832862

863+
elif args.dataset_name == "burstgpt":
864+
input_requests = sample_burstgpt_requests(
865+
dataset_path=args.dataset_path,
866+
num_requests=args.num_prompts,
867+
random_seed=args.seed,
868+
tokenizer=tokenizer,
869+
)
870+
833871
elif args.dataset_name == "sonnet":
834872
# Do not format the prompt, pass to message directly
835873
if args.backend == "openai-chat":
@@ -995,7 +1033,7 @@ def main(args: argparse.Namespace):
9951033
"--dataset-name",
9961034
type=str,
9971035
default="sharegpt",
998-
choices=["sharegpt", "sonnet", "random", "hf"],
1036+
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
9991037
help="Name of the dataset to benchmark on.",
10001038
)
10011039
parser.add_argument("--dataset-path",

0 commit comments

Comments
 (0)