2 changes: 1 addition & 1 deletion src/transformers/models/gemma3n/configuration_gemma3n.py
@@ -502,10 +502,10 @@ def __init__(
**kwargs,
):
super().__init__(**kwargs)
self.architecture = architecture
self.initializer_range = initializer_range
self.do_pooling = do_pooling
self.model_args = model_args # named "model_args" for BC with timm
self.architecture = architecture
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.vocab_offset = vocab_offset
@@ -41,6 +41,8 @@ class TimmWrapperConfig(PretrainedConfig):
imagenet models is set to `None` due to occlusions in the label descriptions.

Args:
architecture (`str`, *optional*, defaults to `"resnet50"`):
The timm architecture to load.
Comment on lines 43 to 45

Member Author:

cc @qubvel on these changes! TimmWrapperConfig does not have an architecture field even though it's required in modeling - was it intentional?

@qubvel (Contributor), Sep 18, 2025:

@Cyrilvallez looks like it's indeed missing! I probably mixed it up with architectureS in a base class.

Member Author:

Alright, perfect - thanks for double-checking!

initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
do_pooling (`bool`, *optional*, defaults to `True`):
@@ -65,11 +67,13 @@ class TimmWrapperConfig(PretrainedConfig):

def __init__(
self,
architecture: str = "resnet50",
initializer_range: float = 0.02,
do_pooling: bool = True,
model_args: Optional[dict[str, Any]] = None,
**kwargs,
):
self.architecture = architecture
self.initializer_range = initializer_range
self.do_pooling = do_pooling
self.model_args = model_args # named "model_args" for BC with timm
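Side note on the config change above: with `architecture` now stored on the config, a tiny wrapper config can be built directly from kwargs instead of loading a pretrained checkpoint. A minimal sketch, not part of the diff - it assumes timm is installed and that the `resnet26` builder accepts a `channels` override, as used in the updated timm_wrapper test later in this PR:

```python
from transformers import TimmWrapperConfig

# Build a small config entirely from kwargs - no Hub download needed.
config = TimmWrapperConfig(
    architecture="resnet26",                    # timm architecture to instantiate
    model_args={"channels": (16, 16, 16, 16)},  # shrink the backbone for fast tests
)

# The field is now a regular config attribute, so it round-trips through
# save_pretrained / from_pretrained like any other value.
assert config.architecture == "resnet26"
```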
10 changes: 1 addition & 9 deletions tests/models/chameleon/test_modeling_chameleon.py
@@ -76,7 +76,7 @@ def __init__(
pad_token_id=0,
vq_num_embeds=5,
vq_embed_dim=5,
vq_channel_multiplier=[1, 4],
vq_channel_multiplier=[1, 2],
vq_img_token_start_id=10, # has to be less than vocab size when added with vq_num_embeds
scope=None,
):
@@ -255,10 +255,6 @@ def test_model_rope_scaling(self, scaling_type):
def test_batching_equivalence(self):
pass

@unittest.skip("Chameleon VQ model cannot be squishes more due to hardcoded layer params in model code")
def test_model_is_small(self):
pass


class ChameleonVision2SeqModelTester(ChameleonModelTester):
def __init__(self, parent, image_size=10, **kwargs):
@@ -321,10 +317,6 @@ def test_disk_offload_bin(self):
def test_disk_offload_safetensors(self):
pass

@unittest.skip("Chameleon VQ model cannot be squishes more due to hardcoded layer params in model code")
def test_model_is_small(self):
pass

@unittest.skip("Chameleon applies key/query norm which doesn't work with packing")
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
4 changes: 0 additions & 4 deletions tests/models/emu3/test_modeling_emu3.py
@@ -359,10 +359,6 @@ def test_initialization(self):
def test_generate_with_static_cache(self):
pass

# @unittest.skip("Emu3 can't be smaller than currently if we want to downsample images")
# def test_model_is_small(self):
# pass


@require_torch
class Emu3IntegrationTest(unittest.TestCase):
20 changes: 10 additions & 10 deletions tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -70,7 +70,7 @@ def __init__(
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
image_feature_pool_shape=[7, 7, 256],
image_feature_pool_shape=[7, 7, 32],
coordinate_size=6,
shape_size=6,
num_labels=3,
@@ -106,6 +106,14 @@ def __init__(
self.num_choices = num_choices
self.scope = scope
self.range_bbox = range_bbox
detectron2_config = LayoutLMv2Config.get_default_detectron2_config()
# We need to make the model smaller
detectron2_config["MODEL.RESNETS.DEPTH"] = 50
detectron2_config["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 4
detectron2_config["MODEL.RESNETS.STEM_OUT_CHANNELS"] = 4
detectron2_config["MODEL.FPN.OUT_CHANNELS"] = 32
detectron2_config["MODEL.RESNETS.NUM_GROUPS"] = 1
self.detectron2_config = detectron2_config

def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -158,13 +166,9 @@ def prepare_config_and_inputs(self):
image_feature_pool_shape=self.image_feature_pool_shape,
coordinate_size=self.coordinate_size,
shape_size=self.shape_size,
detectron2_config_args=self.detectron2_config,
)

# use smaller resnet backbone to make tests faster
config.detectron2_config_args["MODEL.RESNETS.DEPTH"] = 18
config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64
config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1

return config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels

def create_and_check_model(
@@ -422,10 +426,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):

check_hidden_states_output(inputs_dict, config, model_class)

@unittest.skip(reason="We cannot configure detectron2 to output a smaller backbone")
def test_model_is_small(self):
pass

@slow
def test_model_from_pretrained(self):
model_name = "microsoft/layoutlmv2-base-uncased"
4 changes: 0 additions & 4 deletions tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -441,10 +441,6 @@ def test_sdpa_can_dispatch_on_flash(self):
def test_multi_gpu_data_parallel_forward(self):
pass

@unittest.skip(reason="We cannot configure to output a smaller model.")
def test_model_is_small(self):
pass


@require_torch
class Qwen2_5_VLIntegrationTest(unittest.TestCase):
4 changes: 0 additions & 4 deletions tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -394,10 +394,6 @@ def test_sdpa_can_dispatch_on_flash(self):
def test_multi_gpu_data_parallel_forward(self):
pass

@unittest.skip(reason="We cannot configure to output a smaller model.")
def test_model_is_small(self):
pass


@require_torch
class Qwen2VLIntegrationTest(unittest.TestCase):
11 changes: 4 additions & 7 deletions tests/models/timm_wrapper/test_modeling_timm_wrapper.py
@@ -53,14 +53,15 @@ class TimmWrapperModelTester:
def __init__(
self,
parent,
model_name="timm/resnet18.a1_in1k",
batch_size=3,
image_size=32,
num_channels=3,
is_training=True,
):
self.parent = parent
self.model_name = model_name
self.architecture = "resnet26"
# We need this to make the model smaller
self.model_args = {"channels": (16, 16, 16, 16)}
self.batch_size = batch_size
self.image_size = image_size
self.num_channels = num_channels
@@ -73,7 +74,7 @@ def prepare_config_and_inputs(self):
return config, pixel_values

def get_config(self):
return TimmWrapperConfig.from_pretrained(self.model_name)
return TimmWrapperConfig(architecture=self.architecture, model_args=self.model_args)

def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
@@ -166,10 +167,6 @@ def test_initialization(self):
def test_mismatched_shapes_have_properly_initialized_weights(self):
pass

@unittest.skip(reason="Need to use a timm model and there is no tiny model available.")
def test_model_is_small(self):
pass

def test_gradient_checkpointing(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
model = TimmWrapperModel._from_config(config)
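For illustration, here is a hedged usage sketch of what the new tester setup builds (not part of the diff; it requires timm, mirrors the values in the tester above, and assumes the standard direct-from-config instantiation works for TimmWrapperModel):

```python
import torch

from transformers import TimmWrapperConfig, TimmWrapperModel

config = TimmWrapperConfig(
    architecture="resnet26",
    model_args={"channels": (16, 16, 16, 16)},  # tiny channels keep the test model small
)
model = TimmWrapperModel(config)  # builds the timm backbone with randomly initialized weights
model.eval()

# Shapes follow the tester: batch_size=3, num_channels=3, image_size=32.
pixel_values = torch.randn(3, 3, 32, 32)
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)
```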
20 changes: 14 additions & 6 deletions tests/models/xcodec/test_modeling_xcodec.py
@@ -39,7 +39,7 @@
if is_torch_available():
import torch

from transformers import XcodecModel
from transformers import DacConfig, HubertConfig, XcodecModel


@require_torch
@@ -51,7 +51,7 @@ def __init__(
num_channels=1,
sample_rate=16000,
codebook_size=1024,
num_samples=400,
num_samples=256,
is_training=False,
):
self.parent = parent
@@ -61,6 +61,16 @@ def __init__(
self.codebook_size = codebook_size
self.is_training = is_training
self.num_samples = num_samples
self.acoustic_model_config = DacConfig(
decoder_hidden_size=8, encoder_hidden_size=8, codebook_size=16, downsampling_ratios=[16, 16]
)
self.semantic_model_config = HubertConfig(
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=2,
intermediate_size=12,
conv_dim=(4, 4, 4, 4, 4, 4, 4),
)

def prepare_config_and_inputs(self):
config = self.get_config()
@@ -86,6 +96,8 @@ def get_config(self):
sample_rate=self.sample_rate,
audio_channels=self.num_channels,
codebook_size=self.codebook_size,
acoustic_model_config=self.acoustic_model_config,
semantic_model_config=self.semantic_model_config,
)

def create_and_check_model_forward(self, config, inputs_dict):
@@ -151,10 +163,6 @@ def test_gradient_checkpointing_backward_compatibility(self):
model = model_class(config)
self.assertTrue(model.is_gradient_checkpointing)

@unittest.skip(reason="We cannot configure to output a smaller model.")
def test_model_is_small(self):
pass

@unittest.skip(reason="The XcodecModel does not have `inputs_embeds` logics")
def test_inputs_embeds(self):
pass