Skip to content

Commit 2892810

Browse files
committed
feat(model/llm): add glm-4.1v model
1 parent 39fd59e commit 2892810

File tree

2 files changed

+24
-14
lines changed

2 files changed

+24
-14
lines changed

xinference/model/llm/llm_family.json

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18251,7 +18251,7 @@
1825118251
"reasoning_end_tag": "</think>"
1825218252
},
1825318253
{
18254-
"version": 1,
18254+
"version": 2,
1825518255
"context_length": 65536,
1825618256
"model_name": "glm-4.1v",
1825718257
"model_lang": [
@@ -18267,11 +18267,22 @@
1826718267
{
1826818268
"model_format": "pytorch",
1826918269
"model_size_in_billions": 9,
18270-
"quantizations": [
18271-
"none"
18272-
],
18273-
"model_revision": "master",
18274-
"model_id": "ZhipuAI/GLM-4.1V-9B-Base"
18270+
"model_src": {
18271+
"huggingface": {
18272+
"quantizations": [
18273+
"none"
18274+
],
18275+
"model_id": "THUDM/GLM-4.1V-9B-Base",
18276+
"model_revision": "34507daeedba84517747844915f08f191521a83a"
18277+
},
18278+
"modelscope": {
18279+
"quantizations": [
18280+
"none"
18281+
],
18282+
"model_id": "ZhipuAI/GLM-4.1V-9B-Base",
18283+
"model_revision": "master"
18284+
}
18285+
}
1827518286
}
1827618287
],
1827718288
"chat_template": "",

xinference/model/llm/transformers/multimodal/glm4_1v.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from .....core.model import register_batching_multimodal_models
2323
from .....core.scheduler import InferenceRequest
2424
from .....model.utils import select_device
25-
from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
25+
from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
2626
from ...utils import _decode_image
2727
from ..core import register_non_default_model
2828
from ..utils import get_max_src_len
@@ -37,7 +37,7 @@
3737
class Glm4_1VModel(PytorchMultiModalModel):
3838
@classmethod
3939
def match_json(
40-
cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
40+
cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
4141
) -> bool:
4242
family = model_family.model_family or model_family.model_name
4343
if "glm-4.1v" in family.lower():
@@ -56,17 +56,16 @@ def load_processor(self):
5656
)
5757

5858
def load_multimodal_model(self):
59-
from transformers import AutoModel
60-
from transformers import Glm4vConfig
59+
from transformers import AutoModel, Glm4vConfig
6160

6261
kwargs = {"device_map": self._device}
6362
kwargs = self.apply_bnb_quantization(kwargs)
6463

6564
model = AutoModel.from_pretrained(
66-
self.model_path,
67-
trust_remote_code=True,
68-
**kwargs,
69-
)
65+
self.model_path,
66+
trust_remote_code=True,
67+
**kwargs,
68+
)
7069
self._model = model.eval()
7170
# Specify hyperparameters for generation
7271
self._model.generation_config = Glm4vConfig.from_pretrained(

0 commit comments

Comments (0)