xorbitsai
diff --git a/‎doc/source/gen_docs.py‎
Lines changed: 88 additions & 37 deletions b/‎doc/source/gen_docs.py‎
Lines changed: 88 additions & 37 deletions
diff --git a/‎doc/source/models/builtin/embedding/index.rst‎
Lines changed: 4 additions & 0 deletions b/‎doc/source/models/builtin/embedding/index.rst‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎doc/source/models/builtin/image/flux.2-dev.rst‎
Lines changed: 29 additions & 0 deletions b/‎doc/source/models/builtin/image/flux.2-dev.rst‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎doc/source/models/builtin/image/hunyuanocr.rst‎
Lines changed: 22 additions & 0 deletions b/‎doc/source/models/builtin/image/hunyuanocr.rst‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎doc/source/models/builtin/image/index.rst‎
Lines changed: 6 additions & 0 deletions b/‎doc/source/models/builtin/image/index.rst‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎doc/source/models/builtin/llm/index.rst‎
Lines changed: 8 additions & 1 deletion b/‎doc/source/models/builtin/llm/index.rst‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎doc/source/models/builtin/llm/minimax-m2.rst‎
Lines changed: 1 addition & 1 deletion b/‎doc/source/models/builtin/llm/minimax-m2.rst‎
Lines changed: 1 addition & 1 deletion
@@ -98,44 +98,95 @@ def mock_engine_libraries():
 # Mock platform checks BEFORE importing xinference modules
 def mock_platform_checks():
     """Mock platform and hardware checks for documentation generation"""
-    from unittest.mock import patch
-    import sys
-    import platform
-
-    # Mock platform system for MLX (make it appear as Apple Silicon)
-    sys.platform = "darwin"
-    platform.system = lambda: "Darwin"
-    platform.processor = lambda: "arm"
-
-    # Mock vLLM platform checks
-    import xinference.model.llm.vllm.core as vllm_core
-    vllm_core.VLLMModel._is_linux = lambda: True
-    vllm_core.VLLMModel._has_cuda_device = lambda: True
-    vllm_core.VLLMChatModel._is_linux = lambda: True
-    vllm_core.VLLMChatModel._has_cuda_device = lambda: True
-    vllm_core.VLLMMultiModel._is_linux = lambda: True
-    vllm_core.VLLMMultiModel._has_cuda_device = lambda: True
-
-    # Mock SGLang platform checks if available
+    # Import and mock engine checks without modifying system-wide platform settings
     try:
-        import xinference.model.llm.sglang.core as sglang_core
-        sglang_core.SGLANGModel._is_linux = lambda: True
-        sglang_core.SGLANGModel._has_cuda_device = lambda: True
-        sglang_core.SGLANGChatModel._is_linux = lambda: True
-        sglang_core.SGLANGChatModel._has_cuda_device = lambda: True
-        sglang_core.SGLANGVisionModel._is_linux = lambda: True
-        sglang_core.SGLANGVisionModel._has_cuda_device = lambda: True
-    except ImportError:
-        pass
-
-    # Mock LMDEPLOY platform checks if available
-    try:
-        import xinference.model.llm.lmdeploy.core as lmdeploy_core
-        lmdeploy_core.LMDeployModel._is_linux = lambda: True
-        lmdeploy_core.LMDeployModel._has_cuda_device = lambda: True
-        lmdeploy_core.LMDeployChatModel._is_linux = lambda: True
-        lmdeploy_core.LMDeployChatModel._has_cuda_device = lambda: True
-    except ImportError:
+        # Mock vLLM platform checks
+        import xinference.model.llm.vllm.core as vllm_core
+        vllm_core.VLLMModel._is_linux = lambda: True
+        vllm_core.VLLMModel._has_cuda_device = lambda: True
+        vllm_core.VLLMChatModel._is_linux = lambda: True
+        vllm_core.VLLMChatModel._has_cuda_device = lambda: True
+        vllm_core.VLLMMultiModel._is_linux = lambda: True
+        vllm_core.VLLMMultiModel._has_cuda_device = lambda: True
+
+        # Mock SGLang platform checks if available
+        try:
+            import xinference.model.llm.sglang.core as sglang_core
+            sglang_core.SGLANGModel._is_linux = lambda: True
+            sglang_core.SGLANGModel._has_cuda_device = lambda: True
+            sglang_core.SGLANGChatModel._is_linux = lambda: True
+            sglang_core.SGLANGChatModel._has_cuda_device = lambda: True
+            sglang_core.SGLANGVisionModel._is_linux = lambda: True
+            sglang_core.SGLANGVisionModel._has_cuda_device = lambda: True
+        except ImportError:
+            pass
+
+        # Mock LMDEPLOY platform checks if available
+        try:
+            import xinference.model.llm.lmdeploy.core as lmdeploy_core
+            lmdeploy_core.LMDeployModel._is_linux = lambda: True
+            lmdeploy_core.LMDeployModel._has_cuda_device = lambda: True
+            lmdeploy_core.LMDeployChatModel._is_linux = lambda: True
+            lmdeploy_core.LMDeployChatModel._has_cuda_device = lambda: True
+        except ImportError:
+            pass
+
+        # Mock MLX engine platform checks by monkey-patching the imports within MLX module
+        try:
+            # First, let's monkey-patch sys and platform imports within the MLX module only
+            import xinference.model.llm.mlx.core as mlx_core
+
+            # Create mock objects that look like sys.platform and platform functions
+            class MockSys:
+                platform = "darwin"
+
+            class MockPlatform:
+                @staticmethod
+                def system():
+                    return "Darwin"
+
+                @staticmethod
+                def processor():
+                    return "arm"
+
+            # Store original references
+            original_mlx_match = mlx_core.MLXModel.match_json
+            original_mlx_chat_match = mlx_core.MLXChatModel.match_json
+            original_mlx_vision_match = mlx_core.MLXVisionModel.match_json
+
+            # Now create wrapper functions that replace sys and platform only during the platform check
+            def create_wrapped_match_json(original_match):
+                def wrapped_match_json(cls, llm_family, llm_spec, quantization):
+                    # Temporarily replace sys and platform in the MLX module
+                    import sys as original_sys
+                    import platform as original_platform
+
+                    # Replace sys and platform temporarily
+                    mlx_core.sys = MockSys()
+                    mlx_core.platform = MockPlatform()
+
+                    try:
+                        # Call the original match_json which will now see the mocked platform
+                        result = original_match.__func__(cls, llm_family, llm_spec, quantization)
+                        return result
+                    finally:
+                        # Restore original sys and platform
+                        mlx_core.sys = original_sys
+                        mlx_core.platform = original_platform
+
+                return classmethod(wrapped_match_json)
+
+            # Apply the wrapped match_json methods
+            mlx_core.MLXModel.match_json = create_wrapped_match_json(original_mlx_match)
+            mlx_core.MLXChatModel.match_json = create_wrapped_match_json(original_mlx_chat_match)
+            mlx_core.MLXVisionModel.match_json = create_wrapped_match_json(original_mlx_vision_match)
+
+        except ImportError:
+            pass
+
+    except Exception as e:
+        # If any mocking fails, continue without it
+        print(f"Warning: Could not mock some engine platform checks: {e}")
         pass
 
 mock_platform_checks()
 
@@ -41,6 +41,10 @@ The following is a list of built-in embedding models in Xinference:
 
    e5-large-v2
 
+   gme-qwen2-vl-2b-instruct
+  
+   gme-qwen2-vl-7b-instruct
+  
    gte-base
 
    gte-large
 
@@ -0,0 +1,29 @@
+.. _models_builtin_flux.2-dev:
+
+==========
+FLUX.2-dev
+==========
+
+- **Model Name:** FLUX.2-dev
+- **Model Family:** stable_diffusion
+- **Abilities:** text2image, image2image, inpainting
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** black-forest-labs/FLUX.2-dev
+- **GGUF Model ID**: city96/FLUX.2-dev-gguf
+- **GGUF Quantizations**: BF16, Q2_K, Q3_K_M, Q3_K_S, Q4_0, Q4_1, Q4_K_M, Q4_K_S, Q5_0, Q5_1, Q5_K_M, Q5_K_S, Q6_K, Q8_0
+
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name FLUX.2-dev --model-type image
+
+
+For GGUF quantization, use the command below::
+
+    xinference launch --model-name FLUX.2-dev --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
+
+
@@ -0,0 +1,22 @@
+.. _models_builtin_hunyuanocr:
+
+==========
+HunyuanOCR
+==========
+
+- **Model Name:** HunyuanOCR
+- **Model Family:** ocr
+- **Abilities:** ocr
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** tencent/HunyuanOCR
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name HunyuanOCR --model-type image
+
+
+
@@ -13,18 +13,24 @@ The following is a list of built-in image models in Xinference:
 
    cogview4
 
+   deepseek-ocr
+  
    flux.1-dev
 
    flux.1-kontext-dev
 
    flux.1-schnell
 
+   flux.2-dev
+  
    got-ocr2_0
 
    hunyuandit-v1.2
 
    hunyuandit-v1.2-distilled
 
+   hunyuanocr
+  
    kolors
 
    qwen-image
 
@@ -366,6 +366,11 @@ The following is a list of built-in LLM in Xinference:
      - 32768
      - The MiniCPM4 series are highly efficient large language models (LLMs) designed explicitly for end-side devices, which achieve this efficiency through systematic innovation in four key dimensions: model architecture, training data, training algorithms, and inference systems.
 
+   * - :ref:`minimax-m2 <models_llm_minimax-m2>`
+     - chat, tools, reasoning
+     - 196608
+     - MiniMax-M2, a Mini model built for Max coding & agentic workflows.
+
    * - :ref:`mistral-instruct-v0.1 <models_llm_mistral-instruct-v0.1>`
      - chat
      - 8192
@@ -534,7 +539,7 @@ The following is a list of built-in LLM in Xinference:
    * - :ref:`qwen3 <models_llm_qwen3>`
      - chat, reasoning, hybrid, tools
      - 40960
-     - Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support
+     - Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
 
    * - :ref:`qwen3-coder <models_llm_qwen3-coder>`
      - chat, tools
@@ -846,6 +851,8 @@ The following is a list of built-in LLM in Xinference:
 
    minicpm4
 
+   minimax-m2
+  
    mistral-instruct-v0.1
 
    mistral-instruct-v0.2
 
@@ -52,7 +52,7 @@ Model Spec 3 (mlx, 230 Billion)
 - **Model Format:** mlx
 - **Model Size (in billions):** 230
 - **Quantizations:** 3bit, 4bit, 5bit, 6bit, 8bit
-- **Engines**: 
+- **Engines**: MLX
 - **Model ID:** mlx-community/MiniMax-M2-{quantization}
 - **Model Hubs**:  `Hugging Face <https://huggingface.co/mlx-community/MiniMax-M2-{quantization}>`__, `ModelScope <https://modelscope.cn/models/mlx-community/MiniMax-M2-{quantization}>`__