diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py b/neuralmagic/benchmarks/scripts/benchmark_throughput.py index 0607067e3817d..f49de1ec27a3f 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_throughput.py +++ b/neuralmagic/benchmarks/scripts/benchmark_throughput.py @@ -10,10 +10,12 @@ import time from datetime import datetime from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, cast from transformers import AutoTokenizer +from vllm.inputs import PromptStrictInputs + from .common import (generate_synthetic_requests, num_available_gpus, print_request_outputs, warmup_vllm_engine) from .datasets_registry import DatasetArgs, get_dataset @@ -77,8 +79,7 @@ def run_vllm(requests: List[Tuple[str, int, int]], ) # FIXME(woosuk): Do not use internal method. llm._add_request( - prompt=prompt, - prompt_token_ids=None, + inputs=cast(PromptStrictInputs, prompt), params=sampling_params, )