
Commit b848c01

FEAT: [model] gpt-oss (#3924)
1 parent f2db1c8 commit b848c01

File tree

9 files changed (+158 additions, −10 deletions)


doc/source/getting_started/installation.rst

Lines changed: 1 addition & 0 deletions

@@ -98,6 +98,7 @@ Currently, supported models include:
 - ``Ernie4.5``
 - ``Qwen3-Instruct``, ``Qwen3-Thinking``, ``Qwen3-Coder``
 - ``glm-4.5``
+- ``gpt-oss``
 .. vllm_end

 To install Xinference and vLLM::
Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+.. _models_llm_gpt-oss:
+
+========================================
+gpt-oss
+========================================
+
+- **Context Length:** 131072
+- **Model Name:** gpt-oss
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** gpt-oss series, OpenAI’s open-weight models designed for powerful reasoning, agentic tasks, and versatile developer use cases.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 20 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 20
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** openai/gpt-oss-20b
+- **Model Hubs**: `Hugging Face <https://huggingface.co/openai/gpt-oss-20b>`__, `ModelScope <https://modelscope.cn/models/openai-mirror/gpt-oss-20b>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name gpt-oss --size-in-billions 20 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 120 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 120
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** openai/gpt-oss-120b
+- **Model Hubs**: `Hugging Face <https://huggingface.co/openai/gpt-oss-120b>`__, `ModelScope <https://modelscope.cn/models/openai-mirror/gpt-oss-120b>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name gpt-oss --size-in-billions 120 --model-format pytorch --quantization ${quantization}
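Once launched, gpt-oss is served behind Xinference's OpenAI-compatible API and can also be driven from the Python client. A minimal sketch, assuming a server already running at the default local endpoint (the URL, prompt, and ``generate_config`` values are illustrative):

    # Minimal sketch: launch gpt-oss through the Xinference Python client.
    # Assumes `xinference-local` is already running at the default endpoint.
    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")

    model_uid = client.launch_model(
        model_name="gpt-oss",
        model_engine="vllm",          # or "transformers"
        model_format="pytorch",
        model_size_in_billions=20,
        quantization="none",
    )
    model = client.get_model(model_uid)

    # OpenAI-style chat call; the response mirrors a ChatCompletion dict.
    response = model.chat(
        messages=[{"role": "user", "content": "Briefly introduce yourself."}],
        generate_config={"max_tokens": 128},
    )
    print(response["choices"][0]["message"]["content"])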

doc/source/user_guide/backends.rst

Lines changed: 1 addition & 0 deletions

@@ -167,6 +167,7 @@ Currently, supported model includes:
 - ``Ernie4.5``
 - ``Qwen3-Instruct``, ``Qwen3-Thinking``, ``Qwen3-Coder``
 - ``glm-4.5``
+- ``gpt-oss``
 .. vllm_end

 .. _sglang_backend:

xinference/model/llm/llm_family.json

Lines changed: 69 additions & 1 deletion
Large diffs are not rendered by default.
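The registry diff is collapsed in this view, but the documentation page above pins down the new entry's fields. As orientation, a hedged Python-dict sketch of what the registration plausibly contains (field names follow xinference's historical llm_family.json schema; the actual JSON in this commit may differ):

    # Hedged sketch of the gpt-oss registry entry. Field names follow the
    # historical llm_family.json schema; values come from the rendered doc
    # page above. The actual JSON in this commit may differ.
    gpt_oss_family = {
        "model_name": "gpt-oss",
        "context_length": 131072,
        "model_lang": ["en"],
        "model_ability": ["chat"],
        "model_description": (
            "gpt-oss series, OpenAI's open-weight models designed for "
            "powerful reasoning, agentic tasks, and versatile developer "
            "use cases."
        ),
        "model_specs": [
            {
                "model_format": "pytorch",
                "model_size_in_billions": 20,
                "quantizations": ["none"],
                "model_id": "openai/gpt-oss-20b",
            },
            {
                "model_format": "pytorch",
                "model_size_in_billions": 120,
                "quantizations": ["none"],
                "model_id": "openai/gpt-oss-120b",
            },
        ],
    }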

xinference/model/llm/transformers/core.py

Lines changed: 13 additions & 5 deletions

@@ -286,12 +286,18 @@ def load(self):

         kwargs = {}

-        dtype = get_device_preferred_dtype(self._device)
-
-        if dtype is not None:
-            kwargs["torch_dtype"] = dtype
+        torch_dtype = self._pytorch_model_config.get("torch_dtype")
+        if torch_dtype is not None:
+            if isinstance(torch_dtype, str) and torch_dtype != "auto":
+                torch_dtype = getattr(torch, torch_dtype)
+            kwargs["torch_dtype"] = torch_dtype
         else:
-            raise ValueError(f"Device {self._device} is not supported in temporary")
+            dtype = get_device_preferred_dtype(self._device)
+
+            if dtype is not None:
+                kwargs["torch_dtype"] = dtype
+            else:
+                raise ValueError(f"Device {self._device} is not supported in temporary")

         kwargs["revision"] = self._pytorch_model_config.get(
             "revision", self.model_spec.model_revision

@@ -327,6 +333,8 @@ def load(self):
             reasoning_content, enable_thinking=enable_thinking
         )

+        logger.debug("Loading Transformers model with kwargs: %s", kwargs)
+
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
         else:
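The load path now prefers a user-supplied ``torch_dtype`` over the device-preferred default, resolving any dtype string other than "auto" via ``getattr(torch, ...)``. A standalone sketch of that resolution logic (the helper name is illustrative, not xinference API):

    # Standalone sketch of the dtype resolution added above; the helper name
    # `resolve_torch_dtype` is illustrative, not xinference API.
    from typing import Optional, Union

    import torch


    def resolve_torch_dtype(
        configured: Optional[str], device_default: torch.dtype
    ) -> Union[torch.dtype, str]:
        """Return the value to pass to transformers as `torch_dtype`."""
        if configured is None:
            # No user override: keep the device-preferred dtype.
            return device_default
        if configured == "auto":
            # "auto" is forwarded verbatim so transformers decides per weight.
            return "auto"
        # Any other string names a torch attribute, e.g. "bfloat16".
        return getattr(torch, configured)


    print(resolve_torch_dtype("bfloat16", torch.float16))  # torch.bfloat16
    print(resolve_torch_dtype(None, torch.float16))        # torch.float16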

xinference/model/llm/utils.py

Lines changed: 5 additions & 0 deletions

@@ -465,6 +465,7 @@ def set_context():
             chat_context_var.set(ctx)

         previous_texts = [""]
+        full_text = ""
         # Process chunks
         if reasoning_parser:
             set_context()

@@ -476,10 +477,14 @@ def set_context():
                 # usage
                 chat_chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
+                if choices[0].get("text"):
+                    full_text += choices[0]["text"]  # type: ignore
+
                 chat_chunk = cls._to_chat_completion_chunk(
                     chunk, reasoning_parser, previous_texts
                 )
             yield chat_chunk
+        logger.debug("Chat finished, output: %s", full_text)

     @staticmethod
     def _to_chat_completion(
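The addition is an accumulate-then-log pattern: each chunk's text delta is concatenated so the complete reply can be logged once when the stream ends, rather than chunk by chunk. A self-contained sketch under assumed OpenAI-style chunk dicts:

    # Self-contained sketch of the accumulate-then-log pattern; chunk dicts
    # mimic OpenAI-style completion chunks, and `logger` is a module logger.
    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)


    def stream_with_logging(chunks):
        full_text = ""
        for chunk in chunks:
            choices = chunk.get("choices") or []
            if choices and choices[0].get("text"):
                full_text += choices[0]["text"]
            yield chunk
        # One debug record for the whole reply instead of one per chunk.
        logger.debug("Chat finished, output: %s", full_text)


    for _ in stream_with_logging(
        [{"choices": [{"text": "Hello"}]}, {"choices": [{"text": " world"}]}]
    ):
        pass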

xinference/model/llm/vllm/core.py

Lines changed: 20 additions & 3 deletions

@@ -275,6 +275,7 @@ class VLLMGenerateConfig(TypedDict, total=False):

 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")


 class VLLMModel(LLM):
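Registration is gated on the installed vLLM version, since only releases newer than 0.10.0 can serve these models. A minimal sketch of the gating idiom with ``packaging`` (how xinference actually derives the two flags is an assumption here):

    # Sketch of the version-gated registration idiom; how xinference actually
    # derives VLLM_INSTALLED / VLLM_VERSION may differ from this fallback.
    from packaging import version

    try:
        import vllm
        VLLM_INSTALLED = True
        VLLM_VERSION = version.parse(vllm.__version__)
    except ImportError:
        VLLM_INSTALLED = False
        VLLM_VERSION = version.parse("0")

    VLLM_SUPPORTED_CHAT_MODELS = []
    if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
        # Only advertise models the installed engine can actually serve.
        VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
        VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")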
@@ -1284,6 +1285,7 @@ def set_context():
         previous_texts = [""]
         tool_call = False
         tool_call_texts = [""]
+        full_text = ""
         if self.reasoning_parser:
             set_context()
             chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)

@@ -1299,6 +1301,7 @@ def set_context():
             if not choices:
                 yield self._get_final_chat_completion_chunk(chunk)
             else:
+                full_text += chunk["choices"][0]["text"]
                 if self.is_tool_call_chunk_start(chunk):
                     tool_call = True
                 if tool_call:

@@ -1320,6 +1323,7 @@ def set_context():
                     chunk, self.reasoning_parser, previous_texts
                 )
             i += 1
+        logger.debug("Chat finished, output: %s", full_text)

     @vllm_check
     async def async_chat(
@@ -1348,13 +1352,26 @@ async def async_chat(
         ):
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )

         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)

+        lora_request = None
+        lora_model = generate_config.get("lora_name")
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+        tokenizer = await self._get_tokenizer(lora_request)
+
+        full_prompt = self.get_full_context(
+            messages,
+            self.model_family.chat_template,
+            tokenizer=tokenizer,
+            **full_context_kwargs,
+        )
+
         if stream:
             agen = await self.async_generate(
                 full_prompt, generate_config, tools, request_id=request_id
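The prompt is now built only after any requested LoRA adapter has been resolved, so ``get_full_context`` renders the chat template with the tokenizer that matches the adapter. A standalone sketch of the lookup (``LoraRequest`` here is a stand-in dataclass, not vLLM's class):

    # Illustrative sketch of the LoRA lookup that now precedes prompt
    # building; `LoraRequest` is a stand-in dataclass, not vLLM's class.
    from dataclasses import dataclass
    from typing import List, Optional


    @dataclass
    class LoraRequest:
        lora_name: str
        lora_path: str


    def find_lora(
        name: Optional[str], registered: List[LoraRequest]
    ) -> Optional[LoraRequest]:
        """Return the registered adapter matching `name`, else None."""
        if name is None:
            return None
        for lora in registered:
            if lora.lora_name == name:
                return lora
        return None


    loras = [LoraRequest("math-adapter", "/adapters/math")]
    print(find_lora("math-adapter", loras))  # the matching LoraRequest
    print(find_lora(None, loras))            # None -> base model tokenizer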

xinference/types.py

Lines changed: 1 addition & 0 deletions

@@ -296,6 +296,7 @@ class PytorchModelConfig(TypedDict, total=False):
     max_pixels: NotRequired[int]
     quantization_config: NotRequired[Dict]
     context_length: NotRequired[int]
+    torch_dtype: NotRequired[str]


 def get_pydantic_model_from_method(
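For reference, a trimmed sketch of the TypedDict with the new field (only a subset of the real keys is shown; since the class is declared with ``total=False``, every key is optional):

    # Trimmed sketch of PytorchModelConfig; only a subset of keys is shown.
    from typing import Dict
    from typing_extensions import NotRequired, TypedDict


    class PytorchModelConfig(TypedDict, total=False):
        quantization_config: NotRequired[Dict]
        context_length: NotRequired[int]
        torch_dtype: NotRequired[str]  # e.g. "float16", "bfloat16", "auto"


    # A config selecting bfloat16 weights for the Transformers loader.
    config: PytorchModelConfig = {"torch_dtype": "bfloat16"}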

xinference/ui/web/ui/src/scenes/launch_model/data/data.js

Lines changed: 1 addition & 1 deletion

@@ -81,12 +81,12 @@ export const featureModels = [
   {
     type: 'llm',
     feature_models: [
+      'gpt-oss',
       'qwen3',
       'Ernie4.5',
       'deepseek-v3-0324',
       'deepseek-r1-0528',
       'deepseek-r1-0528-qwen3',
-      'deepseek-r1-distill-llama',
       'qwen2.5-vl-instruct',
       'glm4-0414',
       'QwQ-32B',
