1 change: 1 addition & 0 deletions doc/source/getting_started/installation.rst
@@ -98,6 +98,7 @@ Currently, supported models include:
- ``Ernie4.5``
- ``Qwen3-Instruct``, ``Qwen3-Thinking``, ``Qwen3-Coder``
- ``glm-4.5``
- ``gpt-oss``
.. vllm_end

To install Xinference and vLLM::
47 changes: 47 additions & 0 deletions doc/source/models/builtin/llm/gpt-oss.rst
@@ -0,0 +1,47 @@
.. _models_llm_gpt-oss:

========================================
gpt-oss
========================================

- **Context Length:** 131072
- **Model Name:** gpt-oss
- **Languages:** en
- **Abilities:** chat
- **Description:** gpt-oss series, OpenAI’s open-weight models designed for powerful reasoning, agentic tasks, and versatile developer use cases.

Specifications
^^^^^^^^^^^^^^


Model Spec 1 (pytorch, 20 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 20
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** openai/gpt-oss-20b
- **Model Hubs**: `Hugging Face <https://huggingface.co/openai/gpt-oss-20b>`__, `ModelScope <https://modelscope.cn/models/openai-mirror/gpt-oss-20b>`__

Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines listed above and ``${quantization}`` with your chosen quantization method::

xinference launch --model-engine ${engine} --model-name gpt-oss --size-in-billions 20 --model-format pytorch --quantization ${quantization}
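
The same launch can also be issued from Python. This is a minimal sketch, assuming a running Xinference endpoint at the default local address; the keyword argument names mirror the CLI flags above and are not taken from this diff::

    from xinference.client import Client

    client = Client("http://localhost:9997")  # assumed default endpoint
    model_uid = client.launch_model(
        model_name="gpt-oss",
        model_engine="vllm",            # or "transformers"
        model_format="pytorch",
        model_size_in_billions=20,
        quantization="none",
    )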


Model Spec 2 (pytorch, 120 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 120
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** openai/gpt-oss-120b
- **Model Hubs**: `Hugging Face <https://huggingface.co/openai/gpt-oss-120b>`__, `ModelScope <https://modelscope.cn/models/openai-mirror/gpt-oss-120b>`__

Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines listed above and ``${quantization}`` with your chosen quantization method::

xinference launch --model-engine ${engine} --model-name gpt-oss --size-in-billions 120 --model-format pytorch --quantization ${quantization}
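
After either model is launched, it can be queried through the client's OpenAI-style chat interface. A minimal sketch, assuming the default local endpoint and that the model UID equals the model name (replace it with the UID printed by the launch command if it differs)::

    from xinference.client import Client

    client = Client("http://localhost:9997")      # assumed default endpoint
    model = client.get_model("gpt-oss")           # model UID returned at launch
    completion = model.chat(
        messages=[{"role": "user", "content": "Summarize what gpt-oss is in one sentence."}],
        generate_config={"max_tokens": 128},
    )
    print(completion["choices"][0]["message"]["content"])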

1 change: 1 addition & 0 deletions doc/source/user_guide/backends.rst
@@ -167,6 +167,7 @@ Currently, supported model includes:
- ``Ernie4.5``
- ``Qwen3-Instruct``, ``Qwen3-Thinking``, ``Qwen3-Coder``
- ``glm-4.5``
- ``gpt-oss``
.. vllm_end

.. _sglang_backend:
70 changes: 69 additions & 1 deletion xinference/model/llm/llm_family.json

Large diffs are not rendered by default.

18 changes: 13 additions & 5 deletions xinference/model/llm/transformers/core.py
@@ -286,12 +286,18 @@ def load(self):

kwargs = {}

dtype = get_device_preferred_dtype(self._device)

if dtype is not None:
kwargs["torch_dtype"] = dtype
torch_dtype = self._pytorch_model_config.get("torch_dtype")
if torch_dtype is not None:
if isinstance(torch_dtype, str) and torch_dtype != "auto":
torch_dtype = getattr(torch, torch_dtype)
kwargs["torch_dtype"] = torch_dtype
else:
raise ValueError(f"Device {self._device} is not supported in temporary")
dtype = get_device_preferred_dtype(self._device)

if dtype is not None:
kwargs["torch_dtype"] = dtype
else:
raise ValueError(f"Device {self._device} is not supported in temporary")

kwargs["revision"] = self._pytorch_model_config.get(
"revision", self.model_spec.model_revision
@@ -327,6 +333,8 @@ def load(self):
reasoning_content, enable_thinking=enable_thinking
)

logger.debug("Loading Transformers model with kwargs: %s", kwargs)

if self._check_tensorizer_integrity():
self._model, self._tokenizer = self._load_tensorizer(**kwargs)
else:
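
For readers of the hunk above: an explicit ``torch_dtype`` from the pytorch model config now takes precedence over the device-preferred dtype, and string values other than ``"auto"`` are resolved to ``torch`` attributes. A standalone sketch of that resolution logic; the helper name and call shape are hypothetical, not part of the diff:

    import torch

    def resolve_torch_dtype(pytorch_model_config, device_preferred_dtype):
        # Explicit config wins; "auto" and torch.dtype values pass through unchanged.
        torch_dtype = pytorch_model_config.get("torch_dtype")
        if torch_dtype is not None:
            if isinstance(torch_dtype, str) and torch_dtype != "auto":
                torch_dtype = getattr(torch, torch_dtype)  # e.g. "bfloat16" -> torch.bfloat16
            return torch_dtype
        # Otherwise fall back to the previous behaviour based on the detected device.
        if device_preferred_dtype is not None:
            return device_preferred_dtype
        raise ValueError("device is not supported")

    assert resolve_torch_dtype({"torch_dtype": "bfloat16"}, torch.float16) is torch.bfloat16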
5 changes: 5 additions & 0 deletions xinference/model/llm/utils.py
@@ -465,6 +465,7 @@ def set_context():
chat_context_var.set(ctx)

previous_texts = [""]
full_text = ""
# Process chunks
if reasoning_parser:
set_context()
@@ -476,10 +477,14 @@ def set_context():
# usage
chat_chunk = cls._get_final_chat_completion_chunk(chunk)
else:
if choices[0].get("text"):
full_text += choices[0]["text"] # type: ignore

chat_chunk = cls._to_chat_completion_chunk(
chunk, reasoning_parser, previous_texts
)
yield chat_chunk
logger.debug("Chat finished, output: %s", full_text)

@staticmethod
def _to_chat_completion(
23 changes: 20 additions & 3 deletions xinference/model/llm/vllm/core.py
@@ -275,6 +275,7 @@ class VLLMGenerateConfig(TypedDict, total=False):

if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")


class VLLMModel(LLM):
@@ -1284,6 +1285,7 @@ def set_context():
previous_texts = [""]
tool_call = False
tool_call_texts = [""]
full_text = ""
if self.reasoning_parser:
set_context()
chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
@@ -1299,6 +1301,7 @@ def set_context():
if not choices:
yield self._get_final_chat_completion_chunk(chunk)
else:
full_text += chunk["choices"][0]["text"]
if self.is_tool_call_chunk_start(chunk):
tool_call = True
if tool_call:
@@ -1320,6 +1323,7 @@ def set_context():
chunk, self.reasoning_parser, previous_texts
)
i += 1
logger.debug("Chat finished, output: %s", full_text)

@vllm_check
async def async_chat(
@@ -1348,13 +1352,26 @@ async def async_chat(
):
full_context_kwargs["tools"] = tools
assert self.model_family.chat_template is not None
full_prompt = self.get_full_context(
messages, self.model_family.chat_template, **full_context_kwargs
)

generate_config = self._sanitize_chat_config(generate_config)
stream = generate_config.get("stream", None)

lora_request = None
lora_model = generate_config.get("lora_name")
if lora_model is not None:
for lora in self.lora_requests:
if lora_model == lora.lora_name:
lora_request = lora
break
tokenizer = await self._get_tokenizer(lora_request)

full_prompt = self.get_full_context(
messages,
self.model_family.chat_template,
tokenizer=tokenizer,
**full_context_kwargs,
)

if stream:
agen = await self.async_generate(
full_prompt, generate_config, tools, request_id=request_id
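
The ``async_chat`` hunk above now resolves an optional LoRA adapter from ``generate_config["lora_name"]`` before building the prompt, so the tokenizer belonging to that adapter is the one passed to ``get_full_context``. The lookup reduces to a simple scan over the registered requests; a hypothetical standalone helper illustrating it:

    def find_lora_request(lora_name, lora_requests):
        # Return the registered request whose lora_name matches, or None for the base model.
        if lora_name is None:
            return None
        for lora in lora_requests:
            if lora.lora_name == lora_name:
                return lora
        return None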
1 change: 1 addition & 0 deletions xinference/types.py
@@ -296,6 +296,7 @@ class PytorchModelConfig(TypedDict, total=False):
max_pixels: NotRequired[int]
quantization_config: NotRequired[Dict]
context_length: NotRequired[int]
torch_dtype: NotRequired[str]


def get_pydantic_model_from_method(
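
With ``torch_dtype`` added to ``PytorchModelConfig``, a config dict such as the following sketch should now type-check; the second key is existing and shown only for illustration:

    from xinference.types import PytorchModelConfig

    config: PytorchModelConfig = {
        "torch_dtype": "bfloat16",   # new key: "auto" or any torch dtype name, e.g. "float16"
        "context_length": 131072,    # existing optional key, shown for illustration
    }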
2 changes: 1 addition & 1 deletion xinference/ui/web/ui/src/scenes/launch_model/data/data.js
@@ -81,12 +81,12 @@ export const featureModels = [
{
type: 'llm',
feature_models: [
'gpt-oss',
'qwen3',
'Ernie4.5',
'deepseek-v3-0324',
'deepseek-r1-0528',
'deepseek-r1-0528-qwen3',
'deepseek-r1-distill-llama',
'qwen2.5-vl-instruct',
'glm4-0414',
'QwQ-32B',