[Doc] Update reasoning with stream example to use OpenAI library #14077

Merged: 4 commits, Mar 6, 2025
docs/source/features/reasoning_outputs.md (49 additions & 1 deletion)

@@ -76,7 +76,55 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
}
```

Please note that it is not compatible with the OpenAI Python client library. You can use the `requests` library to make streaming requests instead. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
The OpenAI Python client library does not officially support the `reasoning_content` attribute in streaming output, but the client tolerates extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:

```python
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
stream = client.chat.completions.create(model=model,
                                        messages=messages,
                                        stream=True)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False

for chunk in stream:
    reasoning_content = None
    content = None
    # Check whether the delta carries reasoning_content or regular content
    if hasattr(chunk.choices[0].delta, "reasoning_content"):
        reasoning_content = chunk.choices[0].delta.reasoning_content
    elif hasattr(chunk.choices[0].delta, "content"):
        content = chunk.choices[0].delta.content

    if reasoning_content is not None:
        if not printed_reasoning_content:
            printed_reasoning_content = True
            print("reasoning_content:", end="", flush=True)
        print(reasoning_content, end="", flush=True)
    elif content is not None:
        if not printed_content:
            printed_content = True
            print("\ncontent:", end="", flush=True)
        # Extract and print the content
        print(content, end="", flush=True)
```

Remember to check whether `reasoning_content` exists in the response before accessing it. You can check out the full [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
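
As an alternative sketch (not part of the shipped example), `getattr` with a `None` default expresses the same existence check more compactly:

```python
# Minimal alternative sketch, assuming `stream` was created as above.
for chunk in stream:
    delta = chunk.choices[0].delta
    # getattr returns None when the attribute is absent, so no hasattr check
    # is needed before accessing it.
    reasoning_content = getattr(delta, "reasoning_content", None)
    content = getattr(delta, "content", None)
    if reasoning_content:
        print(reasoning_content, end="", flush=True)
    elif content:
        print(content, end="", flush=True)
```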

## Limitations

examples/online_serving/openai_chat_completion_with_reasoning_streaming.py

@@ -19,73 +19,50 @@
where you want to display chat completions to the user as they are generated
by the model.

Here we do not use the OpenAI Python client library, because it does not support
`reasoning_content` fields in the response.
Remember to check that `content` and `reasoning_content` exist in the
`ChatCompletionChunk`; `content` may not exist, and accessing it then raises an error.
"""

import json

import requests
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

models = requests.get(
    f"{openai_api_base}/models",
    headers={
        "Authorization": f"Bearer {openai_api_key}"
    },
).json()
model = models["data"][0]["id"]
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Streaming chat completions
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
models = client.models.list()
model = models.data[0].id

response = requests.post(
    f"{openai_api_base}/chat/completions",
    headers={"Authorization": f"Bearer {openai_api_key}"},
    json={
        "model": model,
        "messages": messages,
        "stream": True
    },
)
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
stream = client.chat.completions.create(model=model,
                                        messages=messages,
                                        stream=True)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
# Make the streaming request
if response.status_code == 200:
    # Process the streaming response
    for line in response.iter_lines():
        if line:  # Filter out keep-alive new lines
            # Decode the line and parse the JSON
            decoded_line = line.decode("utf-8")
            if decoded_line.startswith("data:"):
                data = decoded_line[5:].strip()  # Remove "data:" prefix
                if data == "[DONE]":  # End of stream
                    print("\nclient: Stream completed.")
                    break
                try:
                    # Parse the JSON data
                    chunk = json.loads(data)
                    reasoning_content = chunk["choices"][0]["delta"].get(
                        "reasoning_content", "")
                    content = chunk["choices"][0]["delta"].get("content", "")

                    if reasoning_content:
                        if not printed_reasoning_content:
                            printed_reasoning_content = True
                            print("reasoning_content:", end="", flush=True)
                        print(reasoning_content, end="", flush=True)
                    elif content:
                        if not printed_content:
                            printed_content = True
                            print("\ncontent:", end="", flush=True)
                        # Extract and print the content
                        print(content, end="", flush=True)
                except json.JSONDecodeError:
                    print("Error decoding JSON:", decoded_line)
else:
    print(f"Error: {response.status_code} - {response.text}")
for chunk in stream:
    reasoning_content = None
    content = None
    # Check whether the delta carries reasoning_content or regular content
    if hasattr(chunk.choices[0].delta, "reasoning_content"):
        reasoning_content = chunk.choices[0].delta.reasoning_content
    elif hasattr(chunk.choices[0].delta, "content"):
        content = chunk.choices[0].delta.content

    if reasoning_content is not None:
        if not printed_reasoning_content:
            printed_reasoning_content = True
            print("reasoning_content:", end="", flush=True)
        print(reasoning_content, end="", flush=True)
    elif content is not None:
        if not printed_content:
            printed_content = True
            print("\ncontent:", end="", flush=True)
        # Extract and print the content
        print(content, end="", flush=True)
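
Note that both the old and new versions of the example assume a vLLM OpenAI-compatible server is already running with a reasoning-capable model and a reasoning parser enabled; the exact invocation is an assumption based on vLLM's reasoning docs around this release, something like `vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --enable-reasoning --reasoning-parser deepseek_r1`. Without a reasoning parser, `reasoning_content` never appears in the streamed deltas.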