@@ -275,6 +275,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
 
 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
 
 
 class VLLMModel(LLM):
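The new `gpt-oss` entry rides on the same version gate as `glm-4.5`: both are only registered when vLLM newer than 0.10.0 is installed. A minimal standalone sketch of that gating pattern, assuming a try/except import fallback that is illustrative and not taken from the diff:

```python
from packaging import version

try:
    import vllm

    VLLM_INSTALLED = True
    VLLM_VERSION = version.parse(vllm.__version__)
except ImportError:
    VLLM_INSTALLED = False
    VLLM_VERSION = version.parse("0")  # assumed sentinel, never passes the gate

VLLM_SUPPORTED_CHAT_MODELS = []
if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
    # glm-4.5 and gpt-oss both require vLLM > 0.10.0.
    VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
    VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
```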
@@ -1284,6 +1285,7 @@ def set_context():
         previous_texts = [""]
         tool_call = False
         tool_call_texts = [""]
+        full_text = ""
         if self.reasoning_parser:
             set_context()
             chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
@@ -1299,6 +1301,7 @@ def set_context():
             if not choices:
                 yield self._get_final_chat_completion_chunk(chunk)
             else:
+                full_text += chunk["choices"][0]["text"]
                 if self.is_tool_call_chunk_start(chunk):
                     tool_call = True
                 if tool_call:
@@ -1320,6 +1323,7 @@ def set_context():
                         chunk, self.reasoning_parser, previous_texts
                     )
                 i += 1
+        logger.debug("Chat finished, output: %s", full_text)
 
     @vllm_check
     async def async_chat(
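The three hunks above thread a `full_text` accumulator through the streaming loop so the complete generation can be logged once at debug level when the stream ends, rather than per chunk. A standalone sketch of the same accumulate-then-log pattern, where the generator name `relay_with_logging` and the chunk shape are assumptions for illustration:

```python
import logging

logger = logging.getLogger(__name__)


async def relay_with_logging(chunks):
    """Yield streamed chunks unchanged while accumulating the raw text."""
    full_text = ""
    async for chunk in chunks:
        choices = chunk.get("choices", [])
        if choices:
            # Collect the raw text delta before any tool-call or
            # reasoning post-processing rewrites the chunk.
            full_text += choices[0].get("text", "")
        yield chunk
    # One debug record for the whole turn instead of per-chunk noise.
    logger.debug("Chat finished, output: %s", full_text)
```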
@@ -1348,13 +1352,26 @@ async def async_chat(
         ):
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )
 
         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
 
+        lora_request = None
+        lora_model = generate_config.get("lora_name")
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+        tokenizer = await self._get_tokenizer(lora_request)
+
+        full_prompt = self.get_full_context(
+            messages,
+            self.model_family.chat_template,
+            tokenizer=tokenizer,
+            **full_context_kwargs,
+        )
+
         if stream:
             agen = await self.async_generate(
                 full_prompt, generate_config, tools, request_id=request_id
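This last hunk moves prompt construction below config sanitization so that `get_full_context` can render the chat template with a tokenizer matching the requested adapter: `lora_name` is read from `generate_config`, resolved against `self.lora_requests`, and the matching request (or `None`) is handed to `self._get_tokenizer`. A hedged caller-side sketch, assuming a loaded model handle, an adapter registered as "my-adapter", and the usual OpenAI-style response shape:

```python
import asyncio


async def demo(model):
    # `model` stands for a loaded VLLMModel; "my-adapter" must match the
    # lora_name of one of the adapters the model was launched with.
    completion = await model.async_chat(
        messages=[{"role": "user", "content": "Hello!"}],
        generate_config={"lora_name": "my-adapter", "stream": False},
    )
    print(completion["choices"][0]["message"]["content"])

# asyncio.run(demo(model))
```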