
Commit b848c01

FEAT: [model] gpt-oss (#3924)
1 parent f2db1c8 commit b848c01

File tree

9 files changed (+158 additions, −10 deletions)


doc/source/getting_started/installation.rst

Lines changed: 1 addition & 0 deletions

@@ -98,6 +98,7 @@ Currently, supported models include:
 - ``Ernie4.5``
 - ``Qwen3-Instruct``, ``Qwen3-Thinking``, ``Qwen3-Coder``
 - ``glm-4.5``
+- ``gpt-oss``
 .. vllm_end

 To install Xinference and vLLM::
Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+.. _models_llm_gpt-oss:
+
+========================================
+gpt-oss
+========================================
+
+- **Context Length:** 131072
+- **Model Name:** gpt-oss
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** gpt-oss series, OpenAI’s open-weight models designed for powerful reasoning, agentic tasks, and versatile developer use cases.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 20 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 20
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** openai/gpt-oss-20b
+- **Model Hubs**: `Hugging Face <https://huggingface.co/openai/gpt-oss-20b>`__, `ModelScope <https://modelscope.cn/models/openai-mirror/gpt-oss-20b>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name gpt-oss --size-in-billions 20 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 120 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 120
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** openai/gpt-oss-120b
+- **Model Hubs**: `Hugging Face <https://huggingface.co/openai/gpt-oss-120b>`__, `ModelScope <https://modelscope.cn/models/openai-mirror/gpt-oss-120b>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name gpt-oss --size-in-billions 120 --model-format pytorch --quantization ${quantization}
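Once launched, gpt-oss is served behind Xinference's OpenAI-compatible API and can also be driven from the Python client. A minimal sketch, assuming a server already running at the default local endpoint (the URL, prompt, and ``generate_config`` values are illustrative):

    # Minimal sketch: launch gpt-oss through the Xinference Python client.
    # Assumes `xinference-local` is already running at the default endpoint.
    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")

    model_uid = client.launch_model(
        model_name="gpt-oss",
        model_engine="vllm",          # or "transformers"
        model_format="pytorch",
        model_size_in_billions=20,
        quantization="none",
    )
    model = client.get_model(model_uid)

    # OpenAI-style chat call; the response mirrors a ChatCompletion dict.
    response = model.chat(
        messages=[{"role": "user", "content": "Briefly introduce yourself."}],
        generate_config={"max_tokens": 128},
    )
    print(response["choices"][0]["message"]["content"])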

doc/source/user_guide/backends.rst

Lines changed: 1 addition & 0 deletions

@@ -167,6 +167,7 @@ Currently, supported model includes:
 - ``Ernie4.5``
 - ``Qwen3-Instruct``, ``Qwen3-Thinking``, ``Qwen3-Coder``
 - ``glm-4.5``
+- ``gpt-oss``
 .. vllm_end

 .. _sglang_backend:

xinference/model/llm/llm_family.json

Lines changed: 69 additions & 1 deletion
Large diffs are not rendered by default.
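The registry diff is collapsed in this view, but the documentation page above pins down the new entry's fields. As orientation, a hedged Python-dict sketch of what the registration plausibly contains (field names follow xinference's historical llm_family.json schema; the actual JSON in this commit may differ):

    # Hedged sketch of the gpt-oss registry entry. Field names follow the
    # historical llm_family.json schema; values come from the rendered doc
    # page above. The actual JSON in this commit may differ.
    gpt_oss_family = {
        "model_name": "gpt-oss",
        "context_length": 131072,
        "model_lang": ["en"],
        "model_ability": ["chat"],
        "model_description": (
            "gpt-oss series, OpenAI's open-weight models designed for "
            "powerful reasoning, agentic tasks, and versatile developer "
            "use cases."
        ),
        "model_specs": [
            {
                "model_format": "pytorch",
                "model_size_in_billions": 20,
                "quantizations": ["none"],
                "model_id": "openai/gpt-oss-20b",
            },
            {
                "model_format": "pytorch",
                "model_size_in_billions": 120,
                "quantizations": ["none"],
                "model_id": "openai/gpt-oss-120b",
            },
        ],
    }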

xinference/model/llm/transformers/core.py

Lines changed: 13 additions & 5 deletions

@@ -286,12 +286,18 @@ def load(self):

         kwargs = {}

-        dtype = get_device_preferred_dtype(self._device)
-
-        if dtype is not None:
-            kwargs["torch_dtype"] = dtype
+        torch_dtype = self._pytorch_model_config.get("torch_dtype")
+        if torch_dtype is not None:
+            if isinstance(torch_dtype, str) and torch_dtype != "auto":
+                torch_dtype = getattr(torch, torch_dtype)
+            kwargs["torch_dtype"] = torch_dtype
         else:
-            raise ValueError(f"Device {self._device} is not supported in temporary")
+            dtype = get_device_preferred_dtype(self._device)
+
+            if dtype is not None:
+                kwargs["torch_dtype"] = dtype
+            else:
+                raise ValueError(f"Device {self._device} is not supported in temporary")

         kwargs["revision"] = self._pytorch_model_config.get(
             "revision", self.model_spec.model_revision

@@ -327,6 +333,8 @@ def load(self):
             reasoning_content, enable_thinking=enable_thinking
         )

+        logger.debug("Loading Transformers model with kwargs: %s", kwargs)
+
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
         else:
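The load path now prefers a user-supplied ``torch_dtype`` over the device-preferred default, resolving any dtype string other than "auto" via ``getattr(torch, ...)``. A standalone sketch of that resolution logic (the helper name is illustrative, not xinference API):

    # Standalone sketch of the dtype resolution added above; the helper name
    # `resolve_torch_dtype` is illustrative, not xinference API.
    from typing import Optional, Union

    import torch


    def resolve_torch_dtype(
        configured: Optional[str], device_default: torch.dtype
    ) -> Union[torch.dtype, str]:
        """Return the value to pass to transformers as `torch_dtype`."""
        if configured is None:
            # No user override: keep the device-preferred dtype.
            return device_default
        if configured == "auto":
            # "auto" is forwarded verbatim so transformers decides per weight.
            return "auto"
        # Any other string names a torch attribute, e.g. "bfloat16".
        return getattr(torch, configured)


    print(resolve_torch_dtype("bfloat16", torch.float16))  # torch.bfloat16
    print(resolve_torch_dtype(None, torch.float16))        # torch.float16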

xinference/model/llm/utils.py

Lines changed: 5 additions & 0 deletions

@@ -465,6 +465,7 @@ def set_context():
             chat_context_var.set(ctx)

         previous_texts = [""]
+        full_text = ""
         # Process chunks
         if reasoning_parser:
             set_context()

@@ -476,10 +477,14 @@ def set_context():
                 # usage
                 chat_chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
+                if choices[0].get("text"):
+                    full_text += choices[0]["text"]  # type: ignore
+
                 chat_chunk = cls._to_chat_completion_chunk(
                     chunk, reasoning_parser, previous_texts
                 )
             yield chat_chunk
+        logger.debug("Chat finished, output: %s", full_text)

     @staticmethod
     def _to_chat_completion(
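The addition is an accumulate-then-log pattern: each chunk's text delta is concatenated so the complete reply can be logged once when the stream ends, rather than chunk by chunk. A self-contained sketch under assumed OpenAI-style chunk dicts:

    # Self-contained sketch of the accumulate-then-log pattern; chunk dicts
    # mimic OpenAI-style completion chunks, and `logger` is a module logger.
    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)


    def stream_with_logging(chunks):
        full_text = ""
        for chunk in chunks:
            choices = chunk.get("choices") or []
            if choices and choices[0].get("text"):
                full_text += choices[0]["text"]
            yield chunk
        # One debug record for the whole reply instead of one per chunk.
        logger.debug("Chat finished, output: %s", full_text)


    for _ in stream_with_logging(
        [{"choices": [{"text": "Hello"}]}, {"choices": [{"text": " world"}]}]
    ):
        pass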

xinference/model/llm/vllm/core.py

Lines changed: 20 additions & 3 deletions

@@ -275,6 +275,7 @@ class VLLMGenerateConfig(TypedDict, total=False):

 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")


 class VLLMModel(LLM):
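Registration is gated on the installed vLLM version, since only releases newer than 0.10.0 can serve these models. A minimal sketch of the gating idiom with ``packaging`` (how xinference actually derives the two flags is an assumption here):

    # Sketch of the version-gated registration idiom; how xinference actually
    # derives VLLM_INSTALLED / VLLM_VERSION may differ from this fallback.
    from packaging import version

    try:
        import vllm
        VLLM_INSTALLED = True
        VLLM_VERSION = version.parse(vllm.__version__)
    except ImportError:
        VLLM_INSTALLED = False
        VLLM_VERSION = version.parse("0")

    VLLM_SUPPORTED_CHAT_MODELS = []
    if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
        # Only advertise models the installed engine can actually serve.
        VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
        VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")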
@@ -1284,6 +1285,7 @@ def set_context():
         previous_texts = [""]
         tool_call = False
         tool_call_texts = [""]
+        full_text = ""
         if self.reasoning_parser:
             set_context()
             chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)

@@ -1299,6 +1301,7 @@ def set_context():
             if not choices:
                 yield self._get_final_chat_completion_chunk(chunk)
             else:
+                full_text += chunk["choices"][0]["text"]
                 if self.is_tool_call_chunk_start(chunk):
                     tool_call = True
                 if tool_call:

@@ -1320,6 +1323,7 @@ def set_context():
                     chunk, self.reasoning_parser, previous_texts
                 )
             i += 1
+        logger.debug("Chat finished, output: %s", full_text)

     @vllm_check
     async def async_chat(
@@ -1348,13 +1352,26 @@ async def async_chat(
         ):
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )

         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)

+        lora_request = None
+        lora_model = generate_config.get("lora_name")
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+        tokenizer = await self._get_tokenizer(lora_request)
+
+        full_prompt = self.get_full_context(
+            messages,
+            self.model_family.chat_template,
+            tokenizer=tokenizer,
+            **full_context_kwargs,
+        )
+
         if stream:
             agen = await self.async_generate(
                 full_prompt, generate_config, tools, request_id=request_id
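The prompt is now built only after any requested LoRA adapter has been resolved, so ``get_full_context`` renders the chat template with the tokenizer that matches the adapter. A standalone sketch of the lookup (``LoraRequest`` here is a stand-in dataclass, not vLLM's class):

    # Illustrative sketch of the LoRA lookup that now precedes prompt
    # building; `LoraRequest` is a stand-in dataclass, not vLLM's class.
    from dataclasses import dataclass
    from typing import List, Optional


    @dataclass
    class LoraRequest:
        lora_name: str
        lora_path: str


    def find_lora(
        name: Optional[str], registered: List[LoraRequest]
    ) -> Optional[LoraRequest]:
        """Return the registered adapter matching `name`, else None."""
        if name is None:
            return None
        for lora in registered:
            if lora.lora_name == name:
                return lora
        return None


    loras = [LoraRequest("math-adapter", "/adapters/math")]
    print(find_lora("math-adapter", loras))  # the matching LoraRequest
    print(find_lora(None, loras))            # None -> base model tokenizer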

xinference/types.py

Lines changed: 1 addition & 0 deletions

@@ -296,6 +296,7 @@ class PytorchModelConfig(TypedDict, total=False):
     max_pixels: NotRequired[int]
     quantization_config: NotRequired[Dict]
     context_length: NotRequired[int]
+    torch_dtype: NotRequired[str]


 def get_pydantic_model_from_method(
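For reference, a trimmed sketch of the TypedDict with the new field (only a subset of the real keys is shown; since the class is declared with ``total=False``, every key is optional):

    # Trimmed sketch of PytorchModelConfig; only a subset of keys is shown.
    from typing import Dict
    from typing_extensions import NotRequired, TypedDict


    class PytorchModelConfig(TypedDict, total=False):
        quantization_config: NotRequired[Dict]
        context_length: NotRequired[int]
        torch_dtype: NotRequired[str]  # e.g. "float16", "bfloat16", "auto"


    # A config selecting bfloat16 weights for the Transformers loader.
    config: PytorchModelConfig = {"torch_dtype": "bfloat16"}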

xinference/ui/web/ui/src/scenes/launch_model/data/data.js

Lines changed: 1 addition & 1 deletion

@@ -81,12 +81,12 @@ export const featureModels = [
   {
     type: 'llm',
     feature_models: [
+      'gpt-oss',
       'qwen3',
       'Ernie4.5',
       'deepseek-v3-0324',
       'deepseek-r1-0528',
       'deepseek-r1-0528-qwen3',
-      'deepseek-r1-distill-llama',
       'qwen2.5-vl-instruct',
       'glm4-0414',
       'QwQ-32B',
