
[Bugfix]: Fix the incompatibility issue with stream when Thinking is disabled #19135


Merged · 1 commit · Jun 5, 2025
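Summary: as the diff below indicates, when a request disables thinking via chat_template_kwargs ({"enable_thinking": False}), the Qwen3 chat template closes the think block in the prompt itself, so the streaming reasoning parser never sees a think-end token in the generated output and mis-handles the stream. This PR marks reasoning as ended whenever the think-end id is already present in prompt_token_ids.

A minimal sketch of the scenario being fixed (not part of the PR; the model name, endpoint, and API key are assumptions, and the extra_body shape matches the test diff below):

```python
import asyncio

import openai

client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                            api_key="EMPTY")  # assumed local vLLM server


async def main():
    # Streaming request with thinking disabled: the case this PR fixes.
    stream = await client.chat.completions.create(
        model="Qwen/Qwen3-0.6B",  # assumed Qwen3 model served with
                                  # --reasoning-parser qwen3
        messages=[{"role": "user", "content": "Hi, how are you?"}],
        stream=True,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    async for chunk in stream:
        if chunk.choices:
            # Before this fix, deltas could be routed to reasoning_content
            # because the parser never saw a think-end token in the output.
            print(chunk.choices[0].delta)


asyncio.run(main())
```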
116 changes: 86 additions & 30 deletions tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import NamedTuple

import openai # use the official client for correctness check
import pytest
import pytest_asyncio
@@ -22,7 +24,9 @@ def server(): # noqa: F811
"--guided-decoding-backend",
"xgrammar",
"--tool-call-parser",
"hermes"
"hermes",
"--reasoning-parser",
"qwen3",
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -35,9 +39,53 @@ async def client(server):
yield async_client


class TestCase(NamedTuple):
    model_name: str
    stream: bool
    tool_choice: str
    enable_thinking: bool


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(model_name=MODEL_NAME,

[Reviewer comment (Collaborator): shall we just add more parametrize on "stream", "tool_choice" and "enable_thinking" instead of enumerating all of them here? :-) A sketch of this suggestion follows this file's diff.]

                 stream=True,
                 tool_choice="auto",
                 enable_thinking=False),
        TestCase(model_name=MODEL_NAME,
                 stream=False,
                 tool_choice="auto",
                 enable_thinking=False),
        TestCase(model_name=MODEL_NAME,
                 stream=True,
                 tool_choice="required",
                 enable_thinking=False),
        TestCase(model_name=MODEL_NAME,
                 stream=False,
                 tool_choice="required",
                 enable_thinking=False),
        TestCase(model_name=MODEL_NAME,
                 stream=True,
                 tool_choice="auto",
                 enable_thinking=True),
        TestCase(model_name=MODEL_NAME,
                 stream=False,
                 tool_choice="auto",
                 enable_thinking=True),
        TestCase(model_name=MODEL_NAME,
                 stream=True,
                 tool_choice="required",
                 enable_thinking=True),
        TestCase(model_name=MODEL_NAME,
                 stream=False,
                 tool_choice="required",
                 enable_thinking=True),
    ],
)
async def test_function_tool_use(client: openai.AsyncOpenAI,
                                 test_case: TestCase):
    tools = [
        {
            "type": "function",
@@ -126,30 +174,38 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
"forecast for the next 5 days, in fahrenheit?",
},
]

    # Non-streaming test
    chat_completion = await client.chat.completions.create(
        messages=messages,
        model=model_name,
        tools=tools,
        tool_choice="required",
    )

    assert chat_completion.choices[0].message.tool_calls is not None
    assert len(chat_completion.choices[0].message.tool_calls) > 0

    # Streaming test
    stream = await client.chat.completions.create(
        messages=messages,
        model=model_name,
        tools=tools,
        tool_choice="required",
        stream=True,
    )

    output = []
    async for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.tool_calls:
            output.extend(chunk.choices[0].delta.tool_calls)

    assert len(output) > 0
    if not test_case.stream:
        # Non-streaming test
        chat_completion = await client.chat.completions.create(
            messages=messages,
            model=test_case.model_name,
            tools=tools,
            tool_choice=test_case.tool_choice,
            extra_body={
                "chat_template_kwargs": {
                    "enable_thinking": test_case.enable_thinking
                }
            })

        assert chat_completion.choices[0].message.tool_calls is not None
        assert len(chat_completion.choices[0].message.tool_calls) > 0
    else:
        # Streaming test
        stream = await client.chat.completions.create(
            messages=messages,
            model=test_case.model_name,
            tools=tools,
            tool_choice=test_case.tool_choice,
            stream=True,
            extra_body={
                "chat_template_kwargs": {
                    "enable_thinking": test_case.enable_thinking
                }
            })

        output = []
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.tool_calls:
                output.extend(chunk.choices[0].delta.tool_calls)

        assert len(output) > 0
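
A sketch of the collaborator's suggestion above (not part of the merged PR): stacking pytest.mark.parametrize marks lets pytest generate the cross-product of stream, tool_choice, and enable_thinking instead of enumerating all eight TestCase combinations by hand. MODEL_NAME and the client fixture are assumed to come from this test module.

```python
import openai
import pytest


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("tool_choice", ["auto", "required"])
@pytest.mark.parametrize("enable_thinking", [True, False])
async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
                                 stream: bool, tool_choice: str,
                                 enable_thinking: bool):
    # pytest runs this once per combination: 1 * 2 * 2 * 2 = 8 cases,
    # the same coverage as the hand-written TestCase list above.
    ...
```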
16 changes: 15 additions & 1 deletion vllm/entrypoints/openai/serving_chat.py
@@ -689,7 +689,21 @@ async def chat_completion_stream_generator(
            current_token_ids,
            output.token_ids,
        ))

        # When the think-end id appears in prompt_token_ids,
        # i.e. {"enable_thinking": False},
        # set the reasoning status to end and remove the text
        # and token ids related to 'reasoning_content'.
        if res.prompt_token_ids and \
                reasoning_parser.is_reasoning_end(
                    list(res.prompt_token_ids)):
            reasoning_end_arr[i] = True
            current_token_ids = list(output.token_ids)
            if delta_message and delta_message.content:
                current_text = delta_message.content
                delta_message.content = None
            else:
                current_text = ""
        # When encountering think end id in delta_token_ids,
        # set reasoning status to end.
        # Remove the text and token ids related
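
A minimal sketch (not from the PR) of why the prompt-side check works: with {"enable_thinking": False}, the Qwen3 chat template places an empty think block in the prompt, so the think-end token id is already in prompt_token_ids and will never appear in the generated stream. The token id below is an assumption for illustration; the merged code asks the configured ReasoningParser via is_reasoning_end instead of hard-coding an id.

```python
# Assumed </think> token id for illustration only.
THINK_END_TOKEN_ID = 151668


def is_reasoning_end(input_ids: list[int]) -> bool:
    # Mirrors the contract used in the diff above: True once the
    # think-end token has been seen in the given ids.
    return THINK_END_TOKEN_ID in input_ids


# With thinking disabled, the end marker is already in the prompt, so the
# streaming path can mark reasoning finished before any output token arrives
# and route subsequent deltas to content rather than reasoning_content.
prompt_token_ids = [9707, 11, 1246, 525, 498, 30, THINK_END_TOKEN_ID]  # toy ids
assert is_reasoning_end(prompt_token_ids)
```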