Serving Benchmark Refactoring #2433

Merged: 40 commits, Feb 13, 2024
Commits (40)
15cf77e
initial refactor
ywang96 Jan 13, 2024
1a29d37
iterate
ywang96 Jan 13, 2024
5962f8a
iterate - add openai completions
ywang96 Jan 14, 2024
b3f41cb
iterate
ywang96 Jan 14, 2024
1f31236
add deepspeed-mii
ywang96 Jan 16, 2024
fb871f4
add trt-llm and cleanup
ywang96 Jan 17, 2024
c36ee7f
fix top_p for TGI
ywang96 Jan 17, 2024
8d44415
add endpoint arg
ywang96 Jan 17, 2024
e302f24
fix backend options
ywang96 Jan 17, 2024
3610e29
rename & format
ywang96 Jan 17, 2024
ebb7742
dynamic backend choices
ywang96 Jan 19, 2024
4c11279
changes from PR #2532
ywang96 Jan 22, 2024
c3fa3a8
add streaming & remove openai dependency
ywang96 Jan 22, 2024
941b51a
clean up and add note for deepspeed mii
ywang96 Jan 29, 2024
a09bd8b
add ttfts to outputs
ywang96 Jan 30, 2024
95b206f
minor changes on commend args
ywang96 Jan 30, 2024
2dd2e93
typo
ywang96 Jan 30, 2024
150e23e
minor edit on args
ywang96 Jan 30, 2024
ae3d7c3
fix mii payload & adding catch
ywang96 Jan 31, 2024
8a6dc51
convert metrics to dataclass
ywang96 Feb 11, 2024
d9838dd
refactor
ywang96 Feb 11, 2024
7fff37c
Merge branch 'main' into benchmark-refactor
ywang96 Feb 11, 2024
14c05fb
fix dataclasses
ywang96 Feb 11, 2024
b05a838
fix import
ywang96 Feb 11, 2024
59975d4
adding progressbar
ywang96 Feb 12, 2024
a0f7971
option to disable tqdm
ywang96 Feb 12, 2024
1c83ef3
add benchmark to ci
ywang96 Feb 12, 2024
8d51ed2
add wget
ywang96 Feb 12, 2024
a325bca
fix wget
ywang96 Feb 12, 2024
f691f72
replace wget with curl
ywang96 Feb 12, 2024
693f282
Merge branch 'main' into benchmark-refactor
ywang96 Feb 12, 2024
4ec4bc5
ypaf
ywang96 Feb 12, 2024
80574d8
yapf
ywang96 Feb 12, 2024
e42eb3e
fix CI
ywang96 Feb 12, 2024
af5be05
comments
ywang96 Feb 12, 2024
5ce50df
move files back to top-level
ywang96 Feb 12, 2024
1b7a2e9
inline common metrics
simon-mo Feb 13, 2024
43692d5
add artifact upload
simon-mo Feb 13, 2024
71984b0
fix flag
simon-mo Feb 13, 2024
78526b9
Apply suggestions from code review
simon-mo Feb 13, 2024
14 changes: 10 additions & 4 deletions .buildkite/run-benchmarks.sh
@@ -6,27 +6,31 @@ set -o pipefail
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

(wget && curl) || (apt-get update && apt-get install -y wget curl)
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# run benchmarks and upload the result to buildkite
# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend openai \
--dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
--model meta-llama/Llama-2-7b-chat-hf \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer meta-llama/Llama-2-7b-chat-hf 2>&1 | tee benchmark_serving.txt
--tokenizer meta-llama/Llama-2-7b-chat-hf \
--save-result \
2>&1 | tee benchmark_serving.txt
bench_serving_exit_code=$?
kill $server_pid

@@ -44,7 +48,7 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
tail -n 5 benchmark_serving.txt >> benchmark_results.md # last 5 lines
tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
@@ -61,3 +65,5 @@ fi
if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code
fi

/workspace/buildkite-agent artifact upload openai-*.json
284 changes: 284 additions & 0 deletions benchmarks/backend_request_func.py
@@ -0,0 +1,284 @@
import json
import os
import time
from dataclasses import dataclass
from typing import Optional

import aiohttp
from tqdm.asyncio import tqdm

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    best_of: int = 1
    use_beam_search: bool = False


@dataclass
class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0
    ttft: float = 0
    prompt_len: int = 0


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    body = data.decode("utf-8").lstrip("data:")
                    output.generated_text = json.loads(body)["generated_text"]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_vllm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "prompt": request_func_input.prompt,
            "n": 1,
            "best_of": request_func_input.best_of,
            "use_beam_search": request_func_input.use_beam_search,
            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "ignore_eos": True,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    # When streaming, '\0' is appended to the end of the response.
                    body = data.decode("utf-8").strip("\0")
                    output.generated_text = json.loads(
                        body)["text"][0][len(request_func_input.prompt):]
                    output.success = True

                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as resp:
                if resp.status == 200:
                    async for data in resp.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    body = data.decode("utf-8").lstrip("data:")
                    output.generated_text = json.loads(body)["text_output"]
                    output.success = True

                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
        assert not request_func_input.use_beam_search

        payload = {
            "prompts": request_func_input.prompt,
            "max_new_tokens": request_func_input.output_len,
            "ignore_eos": True,
            "do_sample": True,
            "temperature":
            0.01,  # deepspeed-mii does not accept 0.0 temperature.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder.
        # https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as resp:
                if resp.status == 200:
                    parsed_resp = await resp.json()
                    output.latency = time.perf_counter() - st
                    output.generated_text = parsed_resp[0]["generated_text"]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("v1/completions")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk in response.content:
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        chunk = chunk.strip()
                        if not chunk:
                            continue

                        chunk = chunk.decode("utf-8").lstrip("data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
                            body = json.loads(chunk)
                            generated_text += body["choices"][0]["text"]

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_vllm,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "tensorrt-llm": async_request_trt_llm,
}
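
All of the backend helpers above follow the same contract: build a backend-specific payload, send the (usually streaming) request, and record success, latency, TTFT, and the generated text in a RequestFuncOutput. As a rough illustration only, the hypothetical sketch below shows how a driver such as benchmarks/benchmark_serving.py might dispatch through the ASYNC_REQUEST_FUNCS registry; the prompt, token counts, URL, and model name are placeholders rather than values taken from this PR.

# Hypothetical usage sketch, not part of this diff. Assumes a local
# OpenAI-compatible server; prompt, lengths, URL, and model are placeholders.
import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput


async def main():
    # Look up the request function registered for the "openai" backend.
    request_func = ASYNC_REQUEST_FUNCS["openai"]
    request_input = RequestFuncInput(
        prompt="Hello, my name is",
        api_url="http://localhost:8000/v1/completions",
        prompt_len=5,
        output_len=16,
        model="meta-llama/Llama-2-7b-chat-hf",
    )
    output = await request_func(request_func_input=request_input)
    if output.success:
        print(f"ttft={output.ttft:.3f}s latency={output.latency:.3f}s")
        print(output.generated_text)


if __name__ == "__main__":
    asyncio.run(main())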