Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
[Misc] Make Serving Benchmark More User-friendly (vllm-project#5044)
Browse files Browse the repository at this point in the history
  • Loading branch information
ywang96 authored and robertgshaw2-neuralmagic committed Jun 8, 2024
1 parent 8768b3f commit e7e376f
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
6 changes: 6 additions & 0 deletions benchmarks/backend_request_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ async def async_request_tgi(
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
Expand Down Expand Up @@ -280,6 +283,9 @@ async def async_request_openai_completions(
output.generated_text = generated_text
output.success = True
output.latency = latency
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
Expand Down
29 changes: 26 additions & 3 deletions benchmarks/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,11 @@ def calculate_metrics(
else:
actual_output_lens.append(0)

if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
Expand All @@ -228,9 +233,9 @@ def calculate_metrics(
1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots) * 1000,
median_tpot_ms=np.median(tpots) * 1000,
p99_tpot_ms=np.percentile(tpots, 99) * 1000,
mean_tpot_ms=np.mean(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
)

return metrics, actual_output_lens
Expand All @@ -252,6 +257,24 @@ async def benchmark(
else:
raise ValueError(f"Unknown backend: {backend}")

print("Starting initial single prompt test run...")
test_prompt, test_prompt_len, test_output_len = input_requests[0]
test_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
raise ValueError(
"Initial test run failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {test_output.error}")
else:
print("Initial test run completed. Starting main benchmark run...")
print(f"Traffic request rate: {request_rate}")

pbar = None if disable_tqdm else tqdm(total=len(input_requests))
Expand Down

0 comments on commit e7e376f

Please sign in to comment.