From d7aebb63555fd5d46cb76cb2c9b406324559fc51 Mon Sep 17 00:00:00 2001
From: Itay Etelis <92247226+Etelis@users.noreply.github.com>
Date: Mon, 10 Jun 2024 17:22:09 +0300
Subject: [PATCH] [Feature][Frontend]: Continued `stream_options`
 implementation also in CompletionRequest (#5319)

---
 tests/entrypoints/test_openai_server.py       | 236 ++++++++++--------
 vllm/entrypoints/openai/protocol.py           |   9 +
 vllm/entrypoints/openai/serving_chat.py       |  35 ++-
 vllm/entrypoints/openai/serving_completion.py |  26 +-
 4 files changed, 180 insertions(+), 126 deletions(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index b7d0946ba7244..d0fe08ae0ddd2 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -478,8 +478,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
         temperature=0.0,
     )
     single_output = single_completion.choices[0].text
-    single_usage = single_completion.usage
-
     stream = await client.completions.create(model=model_name,
                                              prompt=prompt,
                                              max_tokens=5,
@@ -495,7 +493,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
     assert finish_reason_count == 1
     assert chunk.choices[0].finish_reason == "length"
     assert chunk.choices[0].text
-    assert chunk.usage == single_usage
     assert "".join(chunks) == single_output
 
 
@@ -550,6 +547,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
     assert "".join(chunks) == output
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+)
+async def test_chat_completion_stream_options(server,
+                                              client: openai.AsyncOpenAI,
+                                              model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "What is the capital of France?"
+    }]
+
+    # Test stream=True, stream_options={"include_usage": False}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": False})
+    async for chunk in stream:
+        assert chunk.usage is None
+
+    # Test stream=True, stream_options={"include_usage": True}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True})
+
+    async for chunk in stream:
+        if chunk.choices[0].finish_reason is None:
+            assert chunk.usage is None
+        else:
+            assert chunk.usage is None
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+
+    # Test stream=False, stream_options={"include_usage": None}
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": None})
+
+    # Test stream=False, stream_options={"include_usage": True}
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": True})
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+)
+async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
+                                         model_name: str):
+    prompt = "What is the capital of France?"
+
+    # Test stream=True, stream_options={"include_usage": False}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": False})
+    async for chunk in stream:
+        assert chunk.usage is None
+
+    # Test stream=True, stream_options={"include_usage": True}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True})
+    async for chunk in stream:
+        if chunk.choices[0].finish_reason is None:
+            assert chunk.usage is None
+        else:
+            assert chunk.usage is None
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+
+    # Test stream=False, stream_options={"include_usage": None}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(model=model_name,
+                                        prompt=prompt,
+                                        max_tokens=5,
+                                        temperature=0.0,
+                                        stream=False,
+                                        stream_options={"include_usage": None})
+
+    # Test stream=False, stream_options={"include_usage": True}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(model=model_name,
+                                        prompt=prompt,
+                                        max_tokens=5,
+                                        temperature=0.0,
+                                        stream=False,
+                                        stream_options={"include_usage": True})
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     # just test 1 lora hereafter
@@ -1343,106 +1472,5 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
     assert embeddings.usage.total_tokens == 17
 
 
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
-)
-async def test_stream_options(server, client: openai.AsyncOpenAI,
-                              model_name: str):
-    prompt = "What is the capital of France?"
-
-    # Test stream=True, stream_options=None
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options=None,
-    )
-    chunks = []
-    async for chunk in stream:
-        chunks.append(chunk.choices[0].text)
-    assert len(chunks) > 0
-    assert "usage" not in chunk
-
-    # Test stream=True, stream_options={"include_usage": False}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": False},
-    )
-    chunks = []
-    async for chunk in stream:
-        chunks.append(chunk.choices[0].text)
-    assert len(chunks) > 0
-    assert "usage" not in chunk
-
-    # Test stream=True, stream_options={"include_usage": True}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": True},
-    )
-    chunks = []
-    finish_reason_count = 0
-    async for chunk in stream:
-        if chunk.choices[0].finish_reason is None:
-            assert chunk.usage is None
-            chunks.append(chunk.choices[0].text)
-        else:
-            assert chunk.usage is None
-            finish_reason_count += 1
-
-    # The last message should have usage and no choices
-    last_message = await stream.__anext__()
-    assert last_message.usage is not None
-    assert last_message.usage.prompt_tokens > 0
-    assert last_message.usage.completion_tokens > 0
-    assert last_message.usage.total_tokens == (
-        last_message.usage.prompt_tokens +
-        last_message.usage.completion_tokens)
-    assert last_message.choices == []
-
-    # Test stream=False, stream_options={"include_usage": None}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"include_usage": None},
-        )
-
-    # Test stream=False, stream_options={"include_usage": False}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"include_usage": False},
-        )
-
-    # Test stream=False, stream_options={"include_usage": True}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"include_usage": True},
-        )
-
-
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index fa33318786b9a..9424ccc959d11 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -346,6 +346,7 @@ class CompletionRequest(OpenAIBaseModel):
                                 le=torch.iinfo(torch.long).max)
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
+    stream_options: Optional[StreamOptions] = None
     suffix: Optional[str] = None
     temperature: Optional[float] = 1.0
     top_p: Optional[float] = 1.0
@@ -482,6 +483,14 @@ def check_logprobs(cls, data):
                               " in the interval [0, 5]."))
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise ValueError(
+                "Stream options can only be defined when stream is True.")
+        return data
+
 
 class EmbeddingRequest(BaseModel):
     # Ordered by official OpenAI API documentation
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index c025e7e96826c..dae60e4ec99f1 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -441,25 +441,24 @@ async def chat_completion_stream_generator(
                         yield f"data: {data}\n\n"
                         finish_reason_sent[i] = True
 
-                if (request.stream_options
-                        and request.stream_options.include_usage):
-                    final_usage = UsageInfo(
-                        prompt_tokens=prompt_tokens,
-                        completion_tokens=previous_num_tokens[i],
-                        total_tokens=prompt_tokens +
-                        previous_num_tokens[i],
-                    )
+            if (request.stream_options
+                    and request.stream_options.include_usage):
+                final_usage = UsageInfo(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=previous_num_tokens[i],
+                    total_tokens=prompt_tokens + previous_num_tokens[i],
+                )
 
-                    final_usage_chunk = ChatCompletionStreamResponse(
-                        id=request_id,
-                        object=chunk_object_type,
-                        created=created_time,
-                        choices=[],
-                        model=model_name,
-                        usage=final_usage)
-                    final_usage_data = (final_usage_chunk.model_dump_json(
-                        exclude_unset=True, exclude_none=True))
-                    yield f"data: {final_usage_data}\n\n"
+                final_usage_chunk = ChatCompletionStreamResponse(
+                    id=request_id,
+                    object=chunk_object_type,
+                    created=created_time,
+                    choices=[],
+                    model=model_name,
+                    usage=final_usage)
+                final_usage_data = (final_usage_chunk.model_dump_json(
+                    exclude_unset=True, exclude_none=True))
+                yield f"data: {final_usage_data}\n\n"
 
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 572878b5527dc..c3c40f2b97d14 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -264,7 +264,8 @@ async def completion_stream_generator(
                         )
                     else:
                         final_usage = None
-                    response_json = CompletionStreamResponse(
+
+                    chunk = CompletionStreamResponse(
                         id=request_id,
                         created=created_time,
                         model=model_name,
@@ -276,10 +277,27 @@ async def completion_stream_generator(
                                 finish_reason=finish_reason,
                                 stop_reason=stop_reason,
                             )
-                        ],
-                        usage=final_usage,
-                    ).model_dump_json(exclude_unset=True)
+                        ])
+                    if (request.stream_options
+                            and request.stream_options.include_usage):
+                        chunk.usage = None
+
+                    response_json = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
+
+            if (request.stream_options
+                    and request.stream_options.include_usage):
+                final_usage_chunk = CompletionStreamResponse(
+                    id=request_id,
+                    created=created_time,
+                    model=model_name,
+                    choices=[],
+                    usage=final_usage,
+                )
+                final_usage_data = (final_usage_chunk.model_dump_json(
+                    exclude_unset=True, exclude_none=True))
+                yield f"data: {final_usage_data}\n\n"
+
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             data = self.create_streaming_error_response(str(e))
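
Below is a short client-side sketch (not part of the patch) of how the new CompletionRequest stream_options behaves once this change is applied: intermediate chunks carry text deltas with usage left null, and a single extra final chunk with empty choices reports aggregate token usage. It assumes a vLLM OpenAI-compatible server is already running; the base_url, api_key, and model name are illustrative assumptions borrowed from the tests above, not values mandated by the patch.

    import asyncio

    import openai


    async def main() -> None:
        # Assumed local vLLM server; adjust base_url/api_key/model as needed.
        client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                    api_key="EMPTY")

        stream = await client.completions.create(
            model="HuggingFaceH4/zephyr-7b-beta",
            prompt="What is the capital of France?",
            max_tokens=5,
            temperature=0.0,
            stream=True,
            stream_options={"include_usage": True})

        async for chunk in stream:
            if chunk.choices:
                # Regular chunks: text deltas, usage is None.
                print(chunk.choices[0].text, end="", flush=True)
            else:
                # Final chunk: empty choices, aggregate usage statistics.
                print("\nusage:", chunk.usage)


    asyncio.run(main())

Note that, per the validator added to protocol.py, sending stream_options with stream=False is rejected, which the new tests assert via BadRequestError.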