Distribute and complete onnxruntime tests (decoder models) #2278

Merged on May 28, 2025 (27 commits)

Commits (all by IlyasMoutawwakil)
2cc29a0 (May 23) added test_decoders.py
4da3df2 (May 23) fix position ids for single batch and more complete decoder testing f…
d208f6b (May 23) support merging seq2seq models when used as decoders and add more tests
608da13 (May 23) fix pipe tests
f34f6e7 (May 23) update phi min transformers version (broken by cache position refacto…
2e1a700 (May 23) remove deprecated bloom modeling
00ec0c7 (May 23) update opt onnx config to the one with position ids
88dc4a8 (May 23) remove all complex deprecated modeling
5f9419e (May 23) get_supported_model_type_for_task should only return suooprted model …
478fd57 (May 24) update min transformers
6aa3a17 (May 24) use transformers like api for use_cache and add can_use_cache and is_…
7da7015 (May 24) testing
f9f7395 (May 25) fix
8785be6 (May 25) fix
5f81515 (May 25) remove unnecessary
6e3bff1 (May 25) simply qwen3
aacf172 (May 25) docs
2b0137f (May 25) qwen-moe
7041a89 (May 25) model type shenanigans
de244d5 (May 25) fix
088b265 (May 26) use test models from optimum-internal-hf with proper metadata
af7c6bb (May 27) Update optimum/onnxruntime/modeling_decoder.py
0149349 (May 27) keep supported model types
2d9d7ea (May 27) Merge branch 'distribute-tests' of https://github.com/huggingface/opt…
fbcf3a1 (May 27) Merge branch 'main' into distribute-tests
2cf507c (May 27) optimum model
8821d97 (May 27) fix failing test by forcing export
6 changes: 4 additions & 2 deletions .github/workflows/test_onnxruntime.yml

```diff
@@ -27,9 +27,11 @@ jobs:
       matrix:
         python-version: [3.9]
         runs-on: [ubuntu-22.04]
-        test_file: [
+        test_file:
+          [
             test_timm.py,
-            test_modeling.py, # todo: split into test_encoder, test_decoder and test_encoder_decoder
+            test_decoder.py,
+            test_modeling.py,
             test_diffusion.py,
             test_optimization.py,
             test_quantization.py,
```
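The new test_decoder.py entry gives the decoder tests their own CI job, running in parallel with the other files. A rough local equivalent of that matrix entry, as a sketch (the tests/onnxruntime/ prefix is an assumption based on the usual repository layout):

```python
import pytest

# Each CI matrix entry runs exactly one test file, so the new decoder tests
# get their own job instead of sharing one with test_modeling.py.
# The tests/onnxruntime/ prefix is assumed, not taken from the diff.
raise SystemExit(pytest.main(["tests/onnxruntime/test_decoder.py", "-v"]))
```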
2 changes: 2 additions & 0 deletions docs/source/exporters/onnx/overview.mdx

```diff
@@ -92,6 +92,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
 - PoolFormer
 - PVT
 - Qwen2(Qwen1.5)
+- Qwen3
+- Qwen3-MoE
 - RegNet
 - RemBERT
 - ResNet
```
18 changes: 9 additions & 9 deletions optimum/exporters/onnx/base.py

```diff
@@ -938,19 +938,19 @@ def post_process_exported_models(
             path, models_and_onnx_configs, onnx_files_subpaths
         )

-        # Attempt to merge only if the decoder was exported without/with past, and ignore seq2seq models exported with text-generation task
-        if len(onnx_files_subpaths) >= 3 and self.use_past is True:
-            decoder_path = Path(path, onnx_files_subpaths[1])
-            decoder_with_past_path = Path(path, onnx_files_subpaths[2])
-            decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx")
+        # Attempt to merge only if the decoder was exported without/with past
+        onnx_decoder_path = Path(path, ONNX_DECODER_NAME + ".onnx")
+        onnx_decoder_with_past_path = Path(path, ONNX_DECODER_WITH_PAST_NAME + ".onnx")
+        decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx")
+        if onnx_decoder_path.is_file() and onnx_decoder_with_past_path.is_file() and self.use_past is True:
             try:
+                from ...onnx import merge_decoders
+
                 # The decoder with past does not output the cross attention past key values as they are constant,
                 # hence the need for strict=False
-                from ...onnx import merge_decoders
-
                 merge_decoders(
-                    decoder=decoder_path,
-                    decoder_with_past=decoder_with_past_path,
+                    decoder=onnx_decoder_path,
+                    decoder_with_past=onnx_decoder_with_past_path,
                     save_path=decoder_merged_path,
                     strict=False,
                 )
```
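The rewrite keys the merge on the presence of the decoder files themselves rather than on positional indices into onnx_files_subpaths, which made it work for seq2seq models used as decoders too. A minimal sketch of the same merge through the public helper (the .onnx file names below follow optimum's conventional decoder naming and stand in for a real export directory):

```python
# Minimal sketch of the merge performed above, via the public helper.
# File names are placeholders following optimum's usual decoder naming.
from optimum.onnx import merge_decoders

merge_decoders(
    decoder="decoder_model.onnx",                      # without-past decoder
    decoder_with_past="decoder_with_past_model.onnx",  # with-past decoder
    save_path="decoder_model_merged.onnx",
    # strict=False because the with-past decoder does not re-emit the
    # constant cross-attention key/value outputs.
    strict=False,
)
```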
41 changes: 13 additions & 28 deletions optimum/exporters/onnx/model_configs.py

```diff
@@ -337,7 +337,7 @@ class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):


 # OPT does not take position_ids as input for transfomers < v4.46, needs it for transformers >= v4.46
-if is_transformers_version(">=", "4.45.99"):
+if is_transformers_version(">=", "4.46.0"):

     class OPTOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
         DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
@@ -352,7 +352,6 @@ class OPTOnnxConfig(TextDecoderOnnxConfig):

 class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     DEFAULT_ONNX_OPSET = 14  # Llama now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
-
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator)
     DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
@@ -371,6 +370,14 @@ class Qwen2OnnxConfig(LlamaOnnxConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.37.0")


+class Qwen3OnnxConfig(LlamaOnnxConfig):
+    MIN_TRANSFORMERS_VERSION = version.parse("4.51.0")
+
+
+class Qwen3MoeOnnxConfig(LlamaOnnxConfig):
+    MIN_TRANSFORMERS_VERSION = version.parse("4.51.0")
+
+
 class GemmaOnnxConfig(LlamaOnnxConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator)
     DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator
@@ -385,7 +392,7 @@ class GraniteOnnxConfig(LlamaOnnxConfig):
 class PhiOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     DEFAULT_ONNX_OPSET = 14  # Phi now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
-    MIN_TRANSFORMERS_VERSION = version.parse("4.36.0")
+    MIN_TRANSFORMERS_VERSION = version.parse("4.42.0")
@@ -430,33 +437,11 @@ class BloomOnnxConfig(TextDecoderOnnxConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (
         BloomDummyPastKeyValuesGenerator,
     ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
-
-    DEFAULT_ONNX_OPSET = 14  # Bloom uses F.scaled_dot_product_attention
+    MIN_TRANSFORMERS_VERSION = version.parse("4.44.0")
     DUMMY_PKV_GENERATOR_CLASS = BloomDummyPastKeyValuesGenerator
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_layers="n_layer", num_attention_heads="n_head")
-
-    def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str):
-        if is_transformers_version(">=", "4.44"):
-            super().add_past_key_values(inputs_or_outputs, direction)
-        else:
-            if direction not in ["inputs", "outputs"]:
-                raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given')
-
-            if direction == "inputs":
-                decoder_sequence_name = "past_sequence_length"
-                name = "past_key_values"
-            else:
-                decoder_sequence_name = "past_sequence_length + 1"
-                name = "present"
-
-            for i in range(self._normalized_config.num_layers):
-                inputs_or_outputs[f"{name}.{i}.key"] = {
-                    0: "batch_size x num_heads",
-                    2: decoder_sequence_name,
-                }
-                inputs_or_outputs[f"{name}.{i}.value"] = {
-                    0: "batch_size x num_heads",
-                    1: decoder_sequence_name,
-                }
+    DEFAULT_ONNX_OPSET = 14  # Bloom uses aten::triu that requires opset>=14, and F.scaled_dot_product_attention
```
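Qwen3 and Qwen3-MoE inherit everything from LlamaOnnxConfig and only pin a minimum transformers version, so no new export logic is involved. A quick way to exercise the new configs end to end, as a sketch (the checkpoint ID is only an example):

```python
# Sketch: exporting and running a qwen3-type checkpoint through ONNX Runtime.
# The model ID is an example; export=True forces a fresh ONNX export, as the
# PR's final commit does to fix a failing test.
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

model_id = "Qwen/Qwen3-0.6B"  # example qwen3 checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True)

inputs = tokenizer("Hello there,", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```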
4 changes: 3 additions & 1 deletion optimum/exporters/onnx/utils.py

```diff
@@ -85,11 +85,13 @@
     "phi",
     "phi3",
     "qwen2",
+    "qwen3",
+    "qwen3-moe",
     "granite",
 }


-if is_transformers_version(">=", "4.45.99"):
+if is_transformers_version(">=", "4.46.0"):
     MODEL_TYPES_REQUIRING_POSITION_IDS.add("opt")
```
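Both version gates now compare against "4.46.0" rather than the "4.45.99" workaround, matching the transformers release in which OPT started consuming position_ids. A self-contained sketch of the gating pattern (the is_transformers_version function below is a stand-in for optimum's internal helper, which supports more operators):

```python
# Stand-in for optimum's internal is_transformers_version helper, shown only
# to illustrate the gating pattern above.
from packaging import version
import transformers

def is_transformers_version(op: str, ref: str) -> bool:
    assert op == ">="  # only the operator used in the gate above
    return version.parse(transformers.__version__) >= version.parse(ref)

MODEL_TYPES_REQUIRING_POSITION_IDS = {"phi", "phi3", "qwen2", "qwen3", "qwen3-moe", "granite"}

# OPT only takes position_ids from transformers 4.46 onward, so it is added
# to the set conditionally rather than unconditionally.
if is_transformers_version(">=", "4.46.0"):
    MODEL_TYPES_REQUIRING_POSITION_IDS.add("opt")
```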
22 changes: 21 additions & 1 deletion optimum/exporters/tasks.py

```diff
@@ -997,6 +997,23 @@ class TasksManager:
             "token-classification",
             onnx="Qwen2OnnxConfig",
         ),
+        "qwen3": supported_tasks_mapping(
+            "feature-extraction",
+            "feature-extraction-with-past",
+            "text-generation",
+            "text-generation-with-past",
+            "text-classification",
+            onnx="Qwen3OnnxConfig",
+        ),
+        "qwen3-moe": supported_tasks_mapping(
+            "feature-extraction",
+            "feature-extraction-with-past",
+            "text-generation",
+            "text-generation-with-past",
+            "text-classification",
+            "token-classification",
+            onnx="Qwen3MoeOnnxConfig",
+        ),
         "llama": supported_tasks_mapping(
             "feature-extraction",
             "feature-extraction-with-past",
@@ -1475,12 +1492,15 @@ def get_supported_model_type_for_task(task: str, exporter: str) -> List[str]:
         """
         Returns the list of supported architectures by the exporter for a given task. Transformers-specific.
         """
-        return [
+
+        supported_model_types = [
             model_type.replace("-", "_")
             for model_type in TasksManager._SUPPORTED_MODEL_TYPE
             if task in TasksManager._SUPPORTED_MODEL_TYPE[model_type][exporter]
         ]
+
+        return supported_model_types

     @staticmethod
     def synonyms_for_task(task: str) -> Set[str]:
         synonyms = [k for k, v in TasksManager._SYNONYM_TASK_MAP.items() if v == task]
```
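Unrolling the early return into a named supported_model_types variable changes nothing functionally but leaves a seam for filtering the list later. Calling the helper is unchanged; a sketch, assuming the public TasksManager import path:

```python
# Sketch: asking the ONNX exporter which architectures support a task.
from optimum.exporters.tasks import TasksManager

supported = TasksManager.get_supported_model_type_for_task("text-generation", exporter="onnx")

# After this PR the list should include "qwen3" and "qwen3_moe"; note that
# the helper normalizes hyphens in model types to underscores.
print(sorted(supported))
```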