diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 48cdffb8afcc1..8269481e178da 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -21,6 +21,7 @@ def main(args: argparse.Namespace): tensor_parallel_size=args.tensor_parallel_size, max_num_seqs=args.batch_size, max_num_batched_tokens=args.batch_size * args.input_len, + trust_remote_code=args.trust_remote_code, ) sampling_params = SamplingParams( @@ -74,5 +75,7 @@ def run_to_completion(profile: bool = False): parser.add_argument('--use-beam-search', action='store_true') parser.add_argument('--num-iters', type=int, default=3, help='Number of iterations to run.') + parser.add_argument('--trust-remote-code', action='store_true', + help='trust remote code from huggingface') args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index b0705ec0fe80b..d691c8a5f702a 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -177,7 +177,7 @@ def main(args: argparse.Namespace): np.random.seed(args.seed) api_url = f"http://{args.host}:{args.port}/generate" - tokenizer = get_tokenizer(args.tokenizer) + tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code) input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer) benchmark_start_time = time.time() @@ -227,5 +227,7 @@ def main(args: argparse.Namespace): "Otherwise, we use Poisson process to synthesize " "the request arrival times.") parser.add_argument("--seed", type=int, default=0) + parser.add_argument('--trust-remote-code', action='store_true', + help='trust remote code from huggingface') args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 8a8639cf5425e..b2bea8520565d 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -74,7 +74,7 @@ def run_vllm( tokenizer=tokenizer, tensor_parallel_size=tensor_parallel_size, seed=seed, - trust_remote_code=trust_remote_code + trust_remote_code=trust_remote_code, ) # Add the requests to the engine. @@ -111,7 +111,8 @@ def run_hf( trust_remote_code: bool, ) -> float: assert not use_beam_search - llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) + llm = AutoModelForCausalLM.from_pretrained(model, + torch_dtype=torch.float16, trust_remote_code=trust_remote_code) if llm.config.model_type == "llama": # To enable padding in the HF backend. tokenizer.pad_token = tokenizer.eos_token @@ -173,8 +174,9 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code) elif args.backend == "hf": assert args.tensor_parallel_size == 1 - elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.use_beam_search, args.hf_max_batch_size) + elapsed_time = run_hf( + requests, args.model, tokenizer, args.n, args.use_beam_search, + args.hf_max_batch_size, args.trust_remote_code) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 8fe664bb9400f..373c4812264a0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -585,7 +585,8 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]: # A separate tokenizer to map token IDs to strings. tokenizer = get_tokenizer(engine_args.tokenizer, - tokenizer_mode=engine_args.tokenizer_mode) + tokenizer_mode=engine_args.tokenizer_mode, + trust_remote_code=engine_args.trust_remote_code) uvicorn.run(app, host=args.host,