From 8c4b2592fb953d1a8f880d42ebb1b28eaa94d0a6 Mon Sep 17 00:00:00 2001
From: Ricardo Lu <37237570+gesanqiu@users.noreply.github.com>
Date: Thu, 20 Jul 2023 08:06:15 +0800
Subject: [PATCH] fix: enable trust-remote-code in api server & benchmark.
 (#509)

---
 benchmarks/benchmark_latency.py       |  3 +++
 benchmarks/benchmark_serving.py       |  4 +++-
 benchmarks/benchmark_throughput.py    | 10 ++++++----
 vllm/entrypoints/openai/api_server.py |  3 ++-
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 48cdffb8afcc1..8269481e178da 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -21,6 +21,7 @@ def main(args: argparse.Namespace):
         tensor_parallel_size=args.tensor_parallel_size,
         max_num_seqs=args.batch_size,
         max_num_batched_tokens=args.batch_size * args.input_len,
+        trust_remote_code=args.trust_remote_code,
     )
 
     sampling_params = SamplingParams(
@@ -74,5 +75,7 @@ def run_to_completion(profile: bool = False):
     parser.add_argument('--use-beam-search', action='store_true')
     parser.add_argument('--num-iters', type=int, default=3,
                         help='Number of iterations to run.')
+    parser.add_argument('--trust-remote-code', action='store_true',
+                        help='trust remote code from huggingface')
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index b0705ec0fe80b..d691c8a5f702a 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -177,7 +177,7 @@ def main(args: argparse.Namespace):
     np.random.seed(args.seed)
 
     api_url = f"http://{args.host}:{args.port}/generate"
-    tokenizer = get_tokenizer(args.tokenizer)
+    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
 
     benchmark_start_time = time.time()
@@ -227,5 +227,7 @@ def main(args: argparse.Namespace):
                              "Otherwise, we use Poisson process to synthesize "
                              "the request arrival times.")
     parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument('--trust-remote-code', action='store_true',
+                        help='trust remote code from huggingface')
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 8a8639cf5425e..b2bea8520565d 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -74,7 +74,7 @@ def run_vllm(
         tokenizer=tokenizer,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
-        trust_remote_code=trust_remote_code
+        trust_remote_code=trust_remote_code,
     )
 
     # Add the requests to the engine.
@@ -111,7 +111,8 @@ def run_hf(
     trust_remote_code: bool,
 ) -> float:
     assert not use_beam_search
-    llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    llm = AutoModelForCausalLM.from_pretrained(model,
+        torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
         tokenizer.pad_token = tokenizer.eos_token
@@ -173,8 +174,9 @@ def main(args: argparse.Namespace):
             args.seed, args.n, args.use_beam_search, args.trust_remote_code)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
-        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size)
+        elapsed_time = run_hf(
+            requests, args.model, tokenizer, args.n, args.use_beam_search,
+            args.hf_max_batch_size, args.trust_remote_code)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 8fe664bb9400f..373c4812264a0 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -585,7 +585,8 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
 
     # A separate tokenizer to map token IDs to strings.
     tokenizer = get_tokenizer(engine_args.tokenizer,
-                              tokenizer_mode=engine_args.tokenizer_mode)
+                              tokenizer_mode=engine_args.tokenizer_mode,
+                              trust_remote_code=engine_args.trust_remote_code)
 
     uvicorn.run(app,
                 host=args.host,
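
Note (not part of the commit): a minimal usage sketch of the new flag. The
model name below is a placeholder for any Hugging Face checkpoint that ships
custom modeling code, and the server invocation assumes EngineArgs exposes
--trust-remote-code on the CLI (the engine_args.trust_remote_code attribute
referenced above suggests it does).

  # Benchmark latency for a model whose repo carries custom code; without
  # --trust-remote-code the tokenizer/model load would raise.
  python benchmarks/benchmark_latency.py \
      --model THUDM/chatglm2-6b --trust-remote-code

  # OpenAI-compatible server; the standalone detokenizer above now honors
  # the same flag as the engine.
  python -m vllm.entrypoints.openai.api_server \
      --model THUDM/chatglm2-6b --trust-remote-code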