Add script for benchmarking serving throughput #145

Merged (43 commits, Jun 15, 2023)
Changes shown below are from 1 commit.

Commits (43)
473c5b8
Minor fix
WoosukKwon Jun 10, 2023
a644a9b
Minor
WoosukKwon Jun 10, 2023
67ed51c
Minor
WoosukKwon Jun 10, 2023
83acd5e
Minor
WoosukKwon Jun 10, 2023
4957281
Add log-requests option to AsyncLLMServer
WoosukKwon Jun 10, 2023
c6b38d2
[WIP] Add benchmark_serving.py
WoosukKwon Jun 10, 2023
5210de0
Minor
WoosukKwon Jun 10, 2023
d4df348
Delete unused files
WoosukKwon Jun 10, 2023
fab12d6
Minor
WoosukKwon Jun 10, 2023
3ddadf4
Add docstring
WoosukKwon Jun 10, 2023
4269b11
Bugfix
WoosukKwon Jun 10, 2023
af8974d
Minor
WoosukKwon Jun 10, 2023
f8dee6e
Minor
WoosukKwon Jun 10, 2023
d181f10
Add script to launch HF server
WoosukKwon Jun 10, 2023
fc02a02
Add HF backend
WoosukKwon Jun 10, 2023
99d9ce3
Minor
WoosukKwon Jun 10, 2023
bc9ec63
Bugfix
WoosukKwon Jun 10, 2023
9477f2f
Filter out long prompts
WoosukKwon Jun 10, 2023
51a5332
Minor fix
WoosukKwon Jun 10, 2023
6b0d77b
Merge branch 'main' into benchmark-llama
WoosukKwon Jun 10, 2023
00d158d
Repeat failed requests
WoosukKwon Jun 10, 2023
0c55c40
Stream=False
WoosukKwon Jun 10, 2023
bcb8e16
Minor
WoosukKwon Jun 10, 2023
6a7baaa
Prune short sequences
WoosukKwon Jun 10, 2023
071b4aa
Add 1 hour timeout
WoosukKwon Jun 10, 2023
983cf97
Increase timeout
WoosukKwon Jun 10, 2023
b55b1ee
Add shortcut
WoosukKwon Jun 11, 2023
c45a2dd
Simplify
WoosukKwon Jun 11, 2023
66f8c60
Merge branch 'opt' into benchmark-llama
WoosukKwon Jun 11, 2023
a1b513e
n -> best_of
WoosukKwon Jun 11, 2023
72d6a63
Minor
WoosukKwon Jun 11, 2023
44bc461
Add latency stats
WoosukKwon Jun 11, 2023
6990fc5
Increase max_best_of in HF server
WoosukKwon Jun 11, 2023
2c610bd
Merge branch 'main' into benchmark-llama
WoosukKwon Jun 11, 2023
5687f10
hf -> tgi
WoosukKwon Jun 13, 2023
672fbbd
Add HF backend
WoosukKwon Jun 13, 2023
60bccc4
Fix batching
WoosukKwon Jun 13, 2023
b7fcade
Fix a bug & Add tqdm
WoosukKwon Jun 13, 2023
6accbfd
Minor
WoosukKwon Jun 14, 2023
c7360d1
Fix
WoosukKwon Jun 15, 2023
bf1bae6
Comment
WoosukKwon Jun 15, 2023
7bebe29
Add docstring
WoosukKwon Jun 15, 2023
5c1b852
Comment
WoosukKwon Jun 15, 2023
Add HF backend
WoosukKwon committed Jun 10, 2023
commit fc02a02bc71391c499960b11ab7523413f1ccf59
48 changes: 34 additions & 14 deletions benchmarks/benchmark_serving.py
@@ -1,11 +1,16 @@
 """Benchmark the online serving throughput.

-On the server side, run:
+On the server side, run one of the following commands:
+    (CacheFlow backend)
     python -m cacheflow.entrypoints.simple_fastapi_frontend \
         --disable-log-requests --model <your_model>

+    (HuggingFace backend)
+    ./launch_hf_server.sh <your_model>
+
 On the client side, run:
     python benchmarks/benchmark_serving.py \
+        --backend <backend> \
         --tokenizer <your_model> --dataset <target_dataset> \
         --request-rate <request_rate>
 """
@@ -91,22 +96,34 @@ async def get_request(


 async def send_request(
+    backend: str,
     api_url: str,
     prompt: str,
     output_len: int,
     n: int,
     use_beam_search: bool,
 ) -> None:
     headers = {"User-Agent": "Benchmark Client"}
-    pload = {
-        "prompt": prompt,
-        "n": n,
-        "use_beam_search": use_beam_search,
-        "temperature": 0.0 if use_beam_search else 1.0,
-        "top_p": 1.0,
-        "max_tokens": output_len,
-        "ignore_eos": True,
-    }
+    if backend == "cacheflow":
+        pload = {
+            "prompt": prompt,
+            "n": n,
+            "use_beam_search": use_beam_search,
+            "temperature": 0.0 if use_beam_search else 1.0,
+            "top_p": 1.0,
+            "max_tokens": output_len,
+            "ignore_eos": True,
+        }
+    elif backend == "huggingface":
+        assert n == 1
+        assert not use_beam_search
+        pload = {
+            "inputs": prompt,
+            "parameters": {"max_new_tokens": output_len, "do_sample": True},
+        }
+    else:
+        raise ValueError(f"Unknown backend: {backend}")

     async with aiohttp.ClientSession() as session:
         async with session.post(api_url, headers=headers, json=pload) as response:
             chunks = []
@@ -116,6 +133,7 @@ async def send_request(


 async def benchmark(
+    backend: str,
     api_url: str,
     input_requests: List[Tuple[str, int]],
     n: int,
@@ -124,8 +142,8 @@
 ) -> None:
     tasks: List[asyncio.Task] = []
     async for prompt, output_len in get_request(input_requests, request_rate):
-        task = asyncio.create_task(
-            send_request(api_url, prompt, output_len, n, use_beam_search))
+        task = asyncio.create_task(send_request(backend, api_url, prompt,
+                                                output_len, n, use_beam_search))
         tasks.append(task)
     await asyncio.gather(*tasks)

@@ -138,13 +156,15 @@ def main(args: argparse.Namespace):
     api_url = f"http://{args.host}:{args.port}/generate"
     tokenizer = get_tokenizer(args.tokenizer)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
-    asyncio.run(benchmark(api_url, input_requests, args.n, args.use_beam_search,
-                          args.request_rate))
+    asyncio.run(benchmark(args.backend, api_url, input_requests, args.n,
+                          args.use_beam_search, args.request_rate))


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Benchmark the online serving throughput.")
+    parser.add_argument("--backend", type=str, default="cacheflow",
+                        choices=["cacheflow", "huggingface"])
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8001)
     parser.add_argument("--dataset", type=str, required=True,
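For readers who want to try the new --backend option outside the full benchmark, the sketch below distills the request logic this commit adds into a standalone script. It mirrors the payload dispatch in send_request from the diff above; the example prompt, the 16-token output length, and sending a single hard-coded request are illustrative assumptions, while port 8001 and the /generate route follow the script's defaults.

"""Standalone sketch of the per-backend request logic added in this commit.

Distilled from the diff above; the prompt and token count are illustrative
placeholders, not values from the PR.
"""
import asyncio

import aiohttp


def build_payload(backend: str, prompt: str, output_len: int,
                  n: int, use_beam_search: bool) -> dict:
    # Mirror the dispatch in send_request(): the CacheFlow server takes its own
    # sampling fields, while the HuggingFace (TGI-style) server expects an
    # "inputs"/"parameters" payload and here only n=1 sampling is supported.
    if backend == "cacheflow":
        return {
            "prompt": prompt,
            "n": n,
            "use_beam_search": use_beam_search,
            "temperature": 0.0 if use_beam_search else 1.0,
            "top_p": 1.0,
            "max_tokens": output_len,
            "ignore_eos": True,
        }
    elif backend == "huggingface":
        assert n == 1 and not use_beam_search
        return {
            "inputs": prompt,
            "parameters": {"max_new_tokens": output_len, "do_sample": True},
        }
    raise ValueError(f"Unknown backend: {backend}")


async def send_one(backend: str, api_url: str) -> str:
    # Send a single generation request and return the raw response body.
    payload = build_payload(backend, prompt="Hello, my name is",
                            output_len=16, n=1, use_beam_search=False)
    headers = {"User-Agent": "Benchmark Client"}
    async with aiohttp.ClientSession() as session:
        async with session.post(api_url, headers=headers,
                                json=payload) as response:
            return await response.text()


if __name__ == "__main__":
    # Assumes a server is already running; the port and /generate route match
    # the benchmark script's defaults.
    print(asyncio.run(send_one("cacheflow", "http://localhost:8001/generate")))

With a CacheFlow server started as described in the docstring, running this prints the raw response for one request; switching the backend argument to "huggingface" targets a server launched via launch_hf_server.sh, with the URL adjusted to wherever that server listens.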