FEAT: [model] Qwen-Image (#3916)

qinxuye · web-flow · commit f2db1c8f6200 · 2025-08-06T16:13:16.000+08:00
diff --git a/doc/source/models/builtin/image/index.rst b/doc/source/models/builtin/image/index.rst
@@ -27,6 +27,8 @@ The following is a list of built-in image models in Xinference:
   
    kolors
   
+   qwen-image
+  
    sd-turbo
   
    sd3-medium
diff --git a/doc/source/models/builtin/image/qwen-image.rst b/doc/source/models/builtin/image/qwen-image.rst
@@ -0,0 +1,20 @@
+.. _models_builtin_qwen-image:
+
+==========
+Qwen-Image
+==========
+
+- **Model Name:** Qwen-Image
+- **Model Family:** stable_diffusion
+- **Abilities:** text2image
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** Qwen/Qwen-Image
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name Qwen-Image --model-type image
+
diff --git a/doc/source/models/model_abilities/image.rst b/doc/source/models/model_abilities/image.rst
@@ -46,9 +46,12 @@ The Text-to-image API is supported with the following models in Xinference:
 * sd3.5-large-turbo
 * FLUX.1-schnell
 * FLUX.1-dev
+* Flux.1-Kontext-dev
 * Kolors
 * hunyuandit-v1.2
 * hunyuandit-v1.2-distilled
+* cogview4
+* Qwen-Image
 
 
 Quickstart
@@ -211,6 +214,8 @@ Below list default options that used from v0.16.1.
 +-------------------+-----------------------+----------------------+------------------+
 | sd3.5-large-turbo | text_encoder_3        | N/A                  | True             |
 +-------------------+-----------------------+----------------------+------------------+
+| Qwen-Image        | text_encoder          | N/A                  | False            |
++-------------------+-----------------------+----------------------+------------------+
 
 .. note::
 
diff --git a/xinference/core/worker.py b/xinference/core/worker.py
@@ -817,10 +817,7 @@ def _create_virtual_env_manager(
             # we specify python_path explicitly
             # sometimes uv would find other versions.
             python_path = pathlib.Path(sys.executable)
-        kw = {}
-        if XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED:
-            kw["skip_installed"] = XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED
-        virtual_env_manager.create_env(python_path=python_path, **kw)
+        virtual_env_manager.create_env(python_path=python_path)
         return virtual_env_manager
 
     @classmethod
@@ -847,6 +844,8 @@ def _prepare_virtual_env(
             packages.extend(virtual_env_packages)
         conf.pop("packages", None)
         conf.pop("inherit_pip_config", None)
+        if XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED:
+            conf["skip_installed"] = XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED
 
         logger.info(
             "Installing packages %s in virtual env %s, with settings(%s)",
diff --git a/xinference/model/image/model_spec.json b/xinference/model/image/model_spec.json
@@ -175,6 +175,39 @@
       "no_build_isolation": true
     }
   },
+  {
+    "version": 2,
+    "model_name": "Qwen-Image",
+    "model_family": "stable_diffusion",
+    "model_ability": [
+      "text2image"
+    ],
+    "model_src": {
+      "huggingface": {
+        "model_id": "Qwen/Qwen-Image",
+        "model_revision": "4516c4d3058302ff35cd86c62ffa645d039fefad"
+      },
+      "modelscope": {
+        "model_id": "Qwen/Qwen-Image",
+        "model_revision": "master"
+      }
+    },
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder",
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "guidance_scale": 1.0
+    },
+    "virtualenv": {
+      "packages": [
+        "git+https://github.com/huggingface/diffusers.git",
+        "#system_numpy#"
+      ],
+      "no_build_isolation": true
+    }
+  },
   {
     "version": 2,
     "model_name": "sd3-medium",
diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py
@@ -254,6 +254,14 @@ def load(self):
                 self._model = FluxKontextPipeline.from_pretrained(
                     self._model_path, **self._kwargs
                 )
+            elif "qwen" in self._model_spec.model_name.lower():
+                # TODO: remove this branch when auto pipeline supports
+                # Qwen-Image
+                from diffusers import DiffusionPipeline
+
+                self._model = DiffusionPipeline.from_pretrained(
+                    self._model_path, **self._kwargs
+                )
             else:
                 raise
         self._load_to_device(self._model)
@@ -348,11 +356,19 @@ def _quantize_text_encoder(self, quantize_text_encoder: Optional[str]):
             return
 
         if not quantize_text_encoder:
+            logger.debug("No text encoder quantization")
             return
 
         quantization_method = self._kwargs.pop("text_encoder_quantize_method", "bnb")
         quantization = self._kwargs.pop("text_encoder_quantization", "8-bit")
 
+        logger.debug(
+            "Quantize text encoder %s with method %s, quantization %s",
+            quantize_text_encoder,
+            quantization_method,
+            quantization,
+        )
+
         torch_dtype = self._torch_dtype
         for text_encoder_name in quantize_text_encoder.split(","):
             quantization_kwargs: Dict[str, Any] = {}
@@ -389,8 +405,13 @@ def _quantize_transformer(self):
 
         if not quantization:
             # skip if no quantization specified
+            logger.debug("No transformer quantization")
             return
 
+        logger.debug(
+            "Quantize transformer with %s, quantization %s", method, quantization
+        )
+
         torch_dtype = self._torch_dtype
         transformer_cls = self._get_layer_cls("transformer")
         quantization_config = self._get_quantize_config(
@@ -409,6 +430,7 @@ def _quantize_transformer_gguf(self):
 
         # GGUF transformer
         torch_dtype = self._torch_dtype
+        logger.debug("Quantize transformer with gguf file %s", self._gguf_model_path)
         self._kwargs["transformer"] = self._get_layer_cls(
             "transformer"
         ).from_single_file(
diff --git a/xinference/ui/web/ui/src/scenes/launch_model/data/data.js b/xinference/ui/web/ui/src/scenes/launch_model/data/data.js
@@ -113,6 +113,7 @@ export const featureModels = [
   {
     type: 'image',
     feature_models: [
+      'Qwen-Image',
       'FLUX.1-dev',
       'FLUX.1-Kontext-dev',
       'FLUX.1-schnell',

Original file line number	Diff line number	Diff line change
`@@ -113,6 +113,7 @@ export const featureModels = [`
`113`	`113`	`{`
`114`	`114`	`type: 'image',`
`115`	`115`	`feature_models: [`
	`116`	`+ 'Qwen-Image',`
`116`	`117`	`'FLUX.1-dev',`
`117`	`118`	`'FLUX.1-Kontext-dev',`
`118`	`119`	`'FLUX.1-schnell',`