From 173f4de2bf79bf44c93f26523fdd1262b95a142c Mon Sep 17 00:00:00 2001
From: pandada8
Date: Sat, 9 Dec 2023 23:58:32 +0800
Subject: [PATCH] Prevent returning partial stop string in vllm worker (#2780)

---
 fastchat/serve/vllm_worker.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index 59ee172a1..deae0b05a 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -22,7 +22,7 @@
     logger,
     worker_id,
 )
-from fastchat.utils import get_context_length
+from fastchat.utils import get_context_length, is_partial_stop

 app = FastAPI()

@@ -119,7 +119,12 @@ async def generate_stream(self, params):
             else:
                 text_outputs = [output.text for output in request_output.outputs]
                 text_outputs = " ".join(text_outputs)
-            # Note: usage is not supported yet
+
+            partial_stop = any(is_partial_stop(text_outputs, i) for i in stop)
+            # prevent yielding partial stop sequence
+            if partial_stop:
+                continue
+
             prompt_tokens = len(request_output.prompt_token_ids)
             completion_tokens = sum(
                 len(output.token_ids) for output in request_output.outputs
@@ -139,6 +144,10 @@ async def generate_stream(self, params):
                 if len(request_output.outputs) == 1
                 else [output.finish_reason for output in request_output.outputs],
             }
+            # Emit twice here to ensure a 'finish_reason' with empty content in the OpenAI API response.
+            # This aligns with the behavior of model_worker.
+            if request_output.finished:
+                yield (json.dumps(ret | {"finish_reason": None}) + "\0").encode()
             yield (json.dumps(ret) + "\0").encode()

     async def generate(self, params):
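
Note (not part of the patch): the change gates each streamed chunk on is_partial_stop, imported from fastchat.utils, so a half-emitted stop string is never sent to the client. The sketch below only illustrates what such a helper does; the actual implementation in fastchat/utils.py may differ.

# Illustrative sketch, not the real fastchat.utils.is_partial_stop:
# return True when `output` ends with a prefix of `stop_str`, so the worker
# can hold back the chunk until the stop string fully matches or diverges.
def is_partial_stop(output: str, stop_str: str) -> bool:
    for i in range(min(len(output), len(stop_str)), 0, -1):
        if stop_str.startswith(output[-i:]):
            return True
    return False

# Example: while streaming, "Hello wor" is a partial match for the stop
# string "world", so that chunk is skipped instead of being yielded.
assert is_partial_stop("Hello wor", "world") is True
assert is_partial_stop("Hello world!", "banana") is False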