From 93d364da3406f5523e5e4772ffbc3c72dac7bbf4 Mon Sep 17 00:00:00 2001
From: Pernekhan Utemuratov
Date: Thu, 26 Sep 2024 15:47:00 -0700
Subject: [PATCH] [Bugfix] Include encoder prompts len to non-stream api usage
 response (#8861)

---
 vllm/entrypoints/openai/serving_chat.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 94076ea3a51db..254671ef4486a 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -726,6 +726,8 @@ async def chat_completion_full_generator(
 
         assert final_res.prompt_token_ids is not None
         num_prompt_tokens = len(final_res.prompt_token_ids)
+        if final_res.encoder_prompt_token_ids is not None:
+            num_prompt_tokens += len(final_res.encoder_prompt_token_ids)
         num_generated_tokens = sum(
             len(output.token_ids) for output in final_res.outputs)
         usage = UsageInfo(
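
For readers outside the vLLM codebase, the following is a minimal, self-contained sketch of the token accounting this patch corrects: for encoder-decoder models the request carries a second (encoder) prompt whose tokens were previously omitted from the non-streaming usage report. The RequestOutput and CompletionOutput classes below are simplified stand-ins rather than vLLM's real types, and the count_usage helper is hypothetical; only the field names prompt_token_ids, encoder_prompt_token_ids, and outputs mirror the attributes the patch actually reads.

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class CompletionOutput:
    # Stand-in for one generated sequence.
    token_ids: List[int]


@dataclass
class RequestOutput:
    # Stand-in for the final result object the handler receives.
    prompt_token_ids: List[int]
    outputs: List[CompletionOutput]
    # Populated only for encoder-decoder requests; None for
    # decoder-only models.
    encoder_prompt_token_ids: Optional[List[int]] = None


def count_usage(final_res: RequestOutput) -> dict:
    assert final_res.prompt_token_ids is not None
    num_prompt_tokens = len(final_res.prompt_token_ids)
    # The fix: also count the encoder prompt's tokens when present,
    # instead of reporting only the decoder prompt.
    if final_res.encoder_prompt_token_ids is not None:
        num_prompt_tokens += len(final_res.encoder_prompt_token_ids)
    num_generated_tokens = sum(
        len(output.token_ids) for output in final_res.outputs)
    return {
        "prompt_tokens": num_prompt_tokens,
        "completion_tokens": num_generated_tokens,
        "total_tokens": num_prompt_tokens + num_generated_tokens,
    }


# Example: a 4-token decoder prompt plus a 3-token encoder prompt now
# reports prompt_tokens=7, where the pre-fix code reported 4.
res = RequestOutput(
    prompt_token_ids=[1, 2, 3, 4],
    outputs=[CompletionOutput(token_ids=[5, 6])],
    encoder_prompt_token_ids=[7, 8, 9],
)
print(count_usage(res))
# {'prompt_tokens': 7, 'completion_tokens': 2, 'total_tokens': 9}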