
Commit b771e47

[CI] post-GptOss fixes for green CI (#39929)
1 parent eb6e26a commit b771e47

File tree

10 files changed: +21 -16 lines changed

docs/source/en/_toctree.yml

Lines changed: 2 additions & 2 deletions
@@ -511,6 +511,8 @@
         title: GPT2
       - local: model_doc/gpt_bigcode
         title: GPTBigCode
+      - local: model_doc/gpt_oss
+        title: GptOss
       - local: model_doc/gptsan-japanese
         title: GPTSAN Japanese
       - local: model_doc/gpt-sw3
@@ -617,8 +619,6 @@
         title: OLMoE
       - local: model_doc/open-llama
         title: Open-Llama
-      - local: model_doc/openai_moe
-        title: OpenAIMoe
       - local: model_doc/opt
         title: OPT
       - local: model_doc/pegasus

docs/source/en/main_classes/quantization.md

Lines changed: 4 additions & 0 deletions
@@ -65,6 +65,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
 
 [[autodoc]] HqqConfig
 
+## Mxfp4Config
+
+[[autodoc]] Mxfp4Config
+
 ## FbgemmFp8Config
 
 [[autodoc]] FbgemmFp8Config
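For orientation, a minimal sketch of how the newly documented `Mxfp4Config` would typically be passed to `from_pretrained`, in the same way as the other quantization configs on this page. The checkpoint name and the `dequantize` flag below are assumptions for illustration, not part of this diff:

```python
from transformers import AutoModelForCausalLM, Mxfp4Config

# Assumed checkpoint name, used only for illustration.
checkpoint = "openai/gpt-oss-20b"

# Assumption: dequantize=True loads the MXFP4-packed weights back in higher
# precision; omit it to keep the quantized format.
quantization_config = Mxfp4Config(dequantize=True)

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=quantization_config,
    device_map="auto",
)
```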

docs/source/en/model_doc/openai_moe.md renamed to docs/source/en/model_doc/gpt_oss.md

Lines changed: 8 additions & 8 deletions
@@ -24,11 +24,11 @@ rendered properly in your Markdown viewer.
 </div>
 </div>
 
-# OpenAIMoE
+# GptOss
 
 ## Overview
 
-The OpenAIMoE model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+The GptOss model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
 <INSERT SHORT SUMMARY HERE>
 
 The abstract from the paper is the following:
@@ -43,16 +43,16 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface
 The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
 
 
-## OpenAIMoeConfig
+## GptOssConfig
 
-[[autodoc]] OpenAIMoeConfig
+[[autodoc]] GptOssConfig
 
-## OpenAIMoeModel
+## GptOssModel
 
-[[autodoc]] OpenAIMoeModel
+[[autodoc]] GptOssModel
 - forward
 
-## OpenAIMoeForCausalLM
+## GptOssForCausalLM
 
-[[autodoc]] OpenAIMoeForCausalLM
+[[autodoc]] GptOssForCausalLM
 - forward
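As a quick usage sketch, the renamed classes load through the Auto API like any other causal-LM architecture; the checkpoint name below is an assumption for illustration, substitute an actual GptOss checkpoint:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed checkpoint name, for illustration only.
checkpoint = "openai/gpt-oss-20b"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Resolves to GptOssForCausalLM via the model's config.
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```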

src/transformers/models/granitemoe/modeling_granitemoe.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.jetmoe.modeling_jetmoe.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,

src/transformers/models/jamba/modeling_jamba.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func with gate->router
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func with gate->router
 def load_balancing_loss_func(
     router_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,

src/transformers/models/jetmoe/modeling_jetmoe.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,

src/transformers/models/olmoe/modeling_olmoe.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,

src/transformers/models/phimoe/modeling_phimoe.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,

src/transformers/models/qwen2_moe/modeling_qwen2_moe.py

Lines changed: 0 additions & 1 deletion
@@ -59,7 +59,6 @@
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
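With the `# Copied from` marker removed here, `qwen2_moe` becomes the canonical definition that the redirected comments in the files above now point to. For reference, a simplified, self-contained sketch of what a Switch-Transformers-style load-balancing loss computes — not the library implementation, which additionally handles per-layer logit tuples and attention masks:

```python
import torch
import torch.nn.functional as F


def load_balancing_loss_sketch(gate_logits: torch.Tensor, num_experts: int, top_k: int = 2) -> torch.Tensor:
    """Auxiliary MoE load-balancing loss for one layer's router logits.

    gate_logits: [num_tokens, num_experts]
    """
    routing_weights = F.softmax(gate_logits, dim=-1)                  # [tokens, experts]
    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)  # [tokens, top_k]
    expert_mask = F.one_hot(selected_experts, num_experts).float()    # [tokens, top_k, experts]

    # Fraction of tokens routed to each expert vs. mean router probability per expert;
    # the product is smallest when both are uniform across experts.
    tokens_per_expert = expert_mask.mean(dim=(0, 1))                  # [experts]
    router_prob_per_expert = routing_weights.mean(dim=0)              # [experts]

    return num_experts * torch.sum(tokens_per_expert * router_prob_per_expert)


# Example: random logits for 8 tokens routed over 4 experts.
loss = load_balancing_loss_sketch(torch.randn(8, 4), num_experts=4)
```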

utils/check_config_attributes.py

Lines changed: 2 additions & 0 deletions
@@ -345,6 +345,8 @@
         "IdeficsConfig": True,
         "IdeficsVisionConfig": True,
         "IdeficsPerceiverConfig": True,
+        # TODO: @Arthur/Joao (`hidden_act` unused)
+        "GptOssConfig": True,
     }
 )
