@@ -39,9 +39,10 @@ client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="-",
)
+model = client.models.list().data[0].id

completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
    messages=[
        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
    ],
@@ -54,7 +55,7 @@ The next example shows how to use the `guided_regex`. The idea is to generate an

```python
completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
    messages=[
        {
            "role": "user",
@@ -92,26 +93,32 @@ class CarDescription(BaseModel):
json_schema = CarDescription.model_json_schema()

completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
        }
    ],
-    extra_body={"guided_json": json_schema},
+    response_format={
+        "type": "json_schema",
+        "json_schema": {
+            "name": "car-description",
+            "schema": json_schema,
+        },
+    },
)
print(completion.choices[0].message.content)
```
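+
+To catch malformed output early, you can validate the returned string against
+the same Pydantic model (a minimal sketch; it assumes `CarDescription` declares
+the `brand`, `model` and `car_type` fields named in the prompt):
+
+```python
+# Parse and validate the raw JSON string returned by the server;
+# raises pydantic.ValidationError if it does not match the schema.
+car = CarDescription.model_validate_json(completion.choices[0].message.content)
+print(car)
+```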

!!! tip
    While not strictly necessary, it's usually better to indicate in the prompt the
-    JSON schema and how the fields should be populated. This can improve the
+    JSON schema and how the fields should be populated. This can improve the
    results notably in most cases.
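+
+    For instance, the prompt could embed the schema directly. A sketch (the
+    exact wording is illustrative only):
+
+    ```python
+    import json
+
+    # Hypothetical prompt text that surfaces the schema to the model.
+    schema_hint = json.dumps(CarDescription.model_json_schema(), indent=2)
+    prompt = (
+        "Generate a JSON with the brand, model and car_type of the most "
+        f"iconic car from the 90's. Use this JSON schema:\n{schema_hint}"
+    )
+    ```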

Finally we have the `guided_grammar` option, which is probably the most
difficult to use, but it's really powerful. It allows us to define complete
-languages like SQL queries. It works by using a context-free EBNF grammar.
+languages like SQL queries. It works by using a context-free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries:

```python
@@ -130,7 +137,7 @@ simplified_sql_grammar = """
"""

completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
    messages=[
        {
            "role": "user",
@@ -142,7 +149,48 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content)
```

-Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>
+See also: [full example](../../examples/online_serving/structured_outputs)
+
+## Reasoning Outputs
+
+You can also use structured outputs with <project:#reasoning-outputs> for reasoning models.
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
+```
+
+Note that you can use reasoning with any of the provided structured outputs features. The following uses one with a JSON schema:
+
+```python
+from pydantic import BaseModel
+
+
+class People(BaseModel):
+    name: str
+    age: int
+
+
+completion = client.chat.completions.create(
+    model=model,
+    messages=[
+        {
+            "role": "user",
+            "content": "Generate a JSON with the name and age of one random person.",
+        }
+    ],
+    response_format={
+        "type": "json_schema",
+        "json_schema": {
+            "name": "people",
+            "schema": People.model_json_schema()
+        }
+    },
+)
+print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+print("content: ", completion.choices[0].message.content)
+```
+
+See also: [full example](../../examples/online_serving/structured_outputs)

## Experimental Automatic Parsing (OpenAI API)

@@ -163,14 +211,14 @@ class Info(BaseModel):
    age: int

client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
+model = client.models.list().data[0].id
completion = client.beta.chat.completions.parse(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
    ],
    response_format=Info,
-    extra_body=dict(guided_decoding_backend="outlines"),
)

message = completion.choices[0].message
@@ -203,15 +251,13 @@ class MathResponse(BaseModel):
    steps: list[Step]
    final_answer: str

-client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
completion = client.beta.chat.completions.parse(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful expert math tutor."},
        {"role": "user", "content": "Solve 8x + 31 = 2."},
    ],
    response_format=MathResponse,
-    extra_body=dict(guided_decoding_backend="outlines"),
)

message = completion.choices[0].message
@@ -232,11 +278,11 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
Answer: x = -29/8
```

-An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py>
+An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/structured_outputs>

## Offline Inference

-Offline inference allows for the same types of guided decoding.
+Offline inference allows for the same types of structured outputs.
To use it, we'll need to configure the guided decoding using the `GuidedDecodingParams` class inside `SamplingParams`.
The main available options inside `GuidedDecodingParams` are:

@@ -247,7 +293,7 @@ The main available options inside `GuidedDecodingParams` are:
- `structural_tag`

These parameters can be used in the same way as the parameters from the Online
-Serving examples above. An example using the `choice` parameter is
+Serving examples above. An example using the `choice` parameter is
shown below:

```python
@@ -265,4 +311,4 @@ outputs = llm.generate(
print(outputs[0].outputs[0].text)
```
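+
+The other options work analogously. As a minimal sketch (assuming the import
+paths below and substituting any model you have available), a `regex`
+constraint looks like this:
+
+```python
+from vllm import LLM, SamplingParams
+from vllm.sampling_params import GuidedDecodingParams
+
+# Constrain generation to strings matching a simple email-like pattern.
+guided = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
+params = SamplingParams(guided_decoding=guided)
+
+llm = LLM(model="Qwen/Qwen2.5-3B-Instruct")
+outputs = llm.generate(
+    "Generate an example email address for Alan Turing: ",
+    sampling_params=params,
+)
+print(outputs[0].outputs[0].text)
+```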

-Full example: <gh-file:examples/offline_inference/structured_outputs.py>
+See also: [full example](../../examples/online_serving/structured_outputs)