This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Benchmarking : Misc updates #95

Merged 6 commits on Mar 11, 2024
4 changes: 3 additions & 1 deletion neuralmagic/benchmarks/scripts/backend_request_func.py
@@ -111,7 +111,9 @@ async def async_request_vllm(
"n": 1,
"best_of": request_func_input.best_of,
"use_beam_search": request_func_input.use_beam_search,
"temperature": 0.0 if request_func_input.use_beam_search else 1.0,
# TODO (varun) : Make temperature configurable
#"temperature": 0.0 if request_func_input.use_beam_search else 1.0,
"temperature": 0.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"ignore_eos": True,
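The request payload now pins temperature to 0.0, i.e. greedy decoding, so the server picks the argmax token at every step and benchmark runs are reproducible with respect to sampling. A minimal sketch of the same setting with vLLM's offline API (the model name is a placeholder, not taken from this PR):

# Sketch only: greedy decoding with vLLM's offline API; model name is illustrative.
# temperature=0.0 selects the highest-probability token at each step, so repeated
# runs on the same prompt produce identical completions.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(n=1, temperature=0.0, top_p=1.0, max_tokens=64)
outputs = llm.generate(["Explain tensor parallelism in one sentence."], params)
print(outputs[0].outputs[0].text)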
9 changes: 5 additions & 4 deletions neuralmagic/benchmarks/scripts/benchmark_serving.py
@@ -33,8 +33,7 @@
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer
from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_benchmark_io
# TODO (move this to scripts)
from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_serving_request_io
from .datasets_registry import get_dataset, DatasetArgs

from neuralmagic.benchmarks.scripts.backend_request_func import (
@@ -100,7 +99,9 @@ def calculate_metrics(
total_output += output_len
total_input += input_requests[i][1]
latencies.append(outputs[i].latency)
tpots.append((outputs[i].latency - outputs[i].ttft) / output_len)
if output_len > 1:
tpots.append(
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
ttfts.append(outputs[i].ttft)
completed += 1
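The per-request TPOT (time per output token) now excludes the first token: the time to first token (TTFT) is subtracted from the total latency and the remainder is divided by output_len - 1, the number of decode steps after the first token; single-token outputs are skipped to avoid division by zero. A hedged sketch of the calculation in isolation:

# Sketch of the TPOT formula used above, with illustrative numbers.
# latency: end-to-end request latency in seconds
# ttft:    time to first token in seconds
# output_len: number of generated tokens
def time_per_output_token(latency: float, ttft: float, output_len: int) -> float:
    assert output_len > 1, "TPOT is undefined for single-token outputs"
    return (latency - ttft) / (output_len - 1)

# Example: 2.0 s total, 0.5 s to first token, 16 tokens generated
# -> (2.0 - 0.5) / 15 = 0.1 s per subsequent token.
print(time_per_output_token(2.0, 0.5, 16))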

Expand Down Expand Up @@ -167,7 +168,7 @@ async def benchmark(backend: str, api_url: str, model_id: str,

# Dump model i/o
if log_model_io:
print_benchmark_io(outputs)
print_serving_request_io(input_requests, outputs)

metrics = calculate_metrics(
input_requests=input_requests,
54 changes: 33 additions & 21 deletions neuralmagic/benchmarks/scripts/benchmark_throughput.py
@@ -13,7 +13,7 @@
from pathlib import Path
from typing import List, Optional, Tuple
from transformers import AutoTokenizer
from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus
from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs
from .datasets_registry import get_dataset, DatasetArgs


@@ -25,21 +25,21 @@ def get_tensor_parallel_size(args: argparse.Namespace) -> int:
return tensor_parallel_size


def run_vllm(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
sparsity: Optional[str],
) -> float:
def run_vllm(requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
sparsity: Optional[str],
num_warmup_prompts: int,
log_model_io: bool = False) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
model=model,
@@ -53,13 +53,15 @@ def run_vllm(
enforce_eager=enforce_eager,
)

warmup_vllm_engine(engine=llm, model=model, num_prompts=1000)
warmup_vllm_engine(engine=llm, model=model, num_prompts=num_warmup_prompts)

# Add the requests to the engine.
for prompt, _, output_len in requests:
sampling_params = SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
# TODO (varun) Make temperature configurable
#temperature=0.0 if use_beam_search else 1.0,
temperature=0.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
@@ -74,9 +76,12 @@

start = time.perf_counter()
# FIXME(woosuk): Do not use internal method.
llm._run_engine(use_tqdm=True)
outputs = llm._run_engine(use_tqdm=True)
end = time.perf_counter()

if log_model_io:
print_request_outputs(outputs)

return end - start


@@ -96,7 +101,7 @@ def main(args: argparse.Namespace):
num_samples=args.num_prompts,
max_len=2048,
seed=42,
))
fixed_output_len=args.output_len))
else:
# Make a synthetic dataset.
requests = generate_synthetic_requests(args.input_len, args.output_len,
@@ -114,7 +119,9 @@
args.dtype,
args.max_model_len,
args.enforce_eager,
sparsity=args.sparsity)
sparsity=args.sparsity,
num_warmup_prompts=args.num_warmup_prompts,
log_model_io=args.log_model_io)

total_prompt_tokens = sum(prompt_len for _, prompt_len, _ in requests)
total_output_tokens = sum(output_len for _, _, output_len in requests)
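With the request tuples shaped as (prompt, prompt_len, output_len), these sums give the total prompt and generated token counts for the run; dividing by the elapsed time returned from run_vllm gives the usual throughput figures. A hedged sketch of that arithmetic (how the script records these numbers downstream is not shown in this hunk):

# Sketch: turning the totals above into throughput numbers; values are illustrative.
requests = [("prompt a", 128, 256), ("prompt b", 64, 256)]  # (prompt, prompt_len, output_len)
elapsed_time = 4.0  # seconds, as returned by run_vllm in this sketch

total_prompt_tokens = sum(prompt_len for _, prompt_len, _ in requests)   # 192
total_output_tokens = sum(output_len for _, _, output_len in requests)   # 512

request_throughput = len(requests) / elapsed_time                              # 0.5 requests/s
token_throughput = (total_prompt_tokens + total_output_tokens) / elapsed_time  # 176 tokens/s
print(request_throughput, token_throughput)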
@@ -189,10 +196,15 @@ def main(args: argparse.Namespace):
type=int,
default=1000,
help="Number of prompts to process.")
parser.add_argument("--num-warmup-prompts",
type=int,
default=1000,
help="Number of prompts to do warmups with.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument("--log-model-io", action="store_true")
parser.add_argument(
'--max-model-len',
type=int,
28 changes: 23 additions & 5 deletions neuralmagic/benchmarks/scripts/common.py
@@ -12,7 +12,7 @@
from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import get_tokenizer
from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR
from .backend_request_func import RequestFuncInput, async_request_vllm
from .backend_request_func import RequestFuncInput, RequestFuncOutput, async_request_vllm
from ...tools.call_cmd import call_cmd


@@ -204,9 +204,27 @@ def instantiate_benchmark_results_dict(benchmarking_script_name: str,
return result_dict


def print_benchmark_io(results: List[RequestOutput]) -> None:
def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int,
n_output_tokens: int) -> str:
return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n"


def print_request_outputs(results: List[RequestOutput]) -> None:
for result in results:
output = result.outputs[0]
print(
f"\n\n inputs({len(result.prompt_token_ids)}): {result.prompt}\n output({len(output.token_ids)}): {output.text}"
)
io_log = format_io_log(result.prompt, output.text,
len(result.prompt_token_ids),
len(output.token_ids))
print(f"\n{io_log}")


def print_serving_request_io(inputs: List[Tuple[str, int, int]],
outputs: List[RequestFuncOutput]) -> None:
"""
inputs: list of tuples where the tuple is [prompt, prompt_length, output_length],
outputs: list of RequestFuncOutput that is the output from the serving case (benchmark_serving.py)
Format and print the inputs and outputs.
"""
for i, o in zip(inputs, outputs):
io_log = format_io_log(i[0], o.generated_text, i[1], i[2])
print(f"\n{io_log}")
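format_io_log produces one block per request containing the prompt and the generated text with their token counts; print_request_outputs drives it from offline RequestOutput objects, while print_serving_request_io pairs the serving benchmark's input tuples with their RequestFuncOutput results. A small sketch of the formatting helper on its own (the sample prompt, output, and token counts are illustrative):

# Sketch: what a single format_io_log block looks like; inputs are made up.
def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int,
                  n_output_tokens: int) -> str:
    return (f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n"
            f"==== output({n_output_tokens}) ==\n{output_text}\n")

print(format_io_log("What is 2 + 2?", "2 + 2 equals 4.", 7, 6))
# === Prompt (7) ==
# What is 2 + 2?
# ==== output(6) ==
# 2 + 2 equals 4.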