Merged

Commits (34)
762b651  initial design draft  (zucchini-nlp, Sep 10, 2025)
02e22c6  delete  (zucchini-nlp, Sep 10, 2025)
e744875  fix a few tests  (zucchini-nlp, Sep 10, 2025)
63532bf  fix  (zucchini-nlp, Sep 10, 2025)
1f62d6f  fix the rest of tests  (zucchini-nlp, Sep 11, 2025)
c203ffd  common-kwargs  (zucchini-nlp, Sep 11, 2025)
725a479  why the runner complains about typing with "|"?  (zucchini-nlp, Sep 11, 2025)
d8ca683  revert  (zucchini-nlp, Sep 11, 2025)
8ff15f7  forgot to delete  (zucchini-nlp, Sep 11, 2025)
b0e8120  update  (zucchini-nlp, Sep 11, 2025)
9f761c6  fix last issues  (zucchini-nlp, Sep 11, 2025)
f935cff  add more detalis in docs  (zucchini-nlp, Sep 16, 2025)
e6a77d8  pin the latest hub release  (zucchini-nlp, Sep 24, 2025)
01841b3  merge main  (zucchini-nlp, Sep 24, 2025)
5a42630  fix tests for new models  (zucchini-nlp, Sep 24, 2025)
fe4ba56  also fast image processor  (zucchini-nlp, Sep 24, 2025)
6e8d77e  fix copies  (zucchini-nlp, Sep 24, 2025)
ba41992  image processing ast validated  (zucchini-nlp, Sep 25, 2025)
601985c  Merge remote-tracking branch 'upstream/main' into validate-processor-…  (zucchini-nlp, Sep 25, 2025)
3233a70  fix more tests  (zucchini-nlp, Sep 25, 2025)
909b98e  typo.and fix copies  (zucchini-nlp, Sep 25, 2025)
9b0bc0c  Merge branch 'main' into validate-processor-kwargs  (zucchini-nlp, Sep 25, 2025)
4410dd3  bump  (zucchini-nlp, Sep 25, 2025)
121931c  merge main  (zucchini-nlp, Oct 3, 2025)
1daa883  style  (zucchini-nlp, Oct 3, 2025)
bd902fb  Merge remote-tracking branch 'upstream/main' into validate-processor-…  (zucchini-nlp, Oct 7, 2025)
b8385a2  fix some tests  (zucchini-nlp, Oct 7, 2025)
69448bb  fix copies  (zucchini-nlp, Oct 8, 2025)
d253615  pin rc4 and mark all TypedDict as non-total  (zucchini-nlp, Oct 8, 2025)
0c52d03  Merge branch 'main' into validate-processor-kwargs  (zucchini-nlp, Oct 8, 2025)
7a4e79f  delete typed dict adaptor  (zucchini-nlp, Oct 8, 2025)
0395b54  address comments  (zucchini-nlp, Oct 8, 2025)
34c9ec7  delete optionals  (zucchini-nlp, Oct 8, 2025)
774c260  frigit to fix copies  (zucchini-nlp, Oct 8, 2025)
2 changes: 1 addition & 1 deletion setup.py
@@ -114,7 +114,7 @@
"GitPython<3.1.19",
"hf-doc-builder>=0.3.0",
"hf_xet",
"huggingface-hub==1.0.0.rc2",
"huggingface-hub==1.0.0.rc4",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"jinja2>=3.1.0",
2 changes: 1 addition & 1 deletion src/transformers/dependency_versions_table.py
@@ -23,7 +23,7 @@
"GitPython": "GitPython<3.1.19",
"hf-doc-builder": "hf-doc-builder>=0.3.0",
"hf_xet": "hf_xet",
"huggingface-hub": "huggingface-hub==1.0.0.rc2",
"huggingface-hub": "huggingface-hub==1.0.0.rc4",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"jinja2": "jinja2>=3.1.0",
5 changes: 5 additions & 0 deletions src/transformers/image_processing_utils_fast.py
@@ -18,6 +18,7 @@
from typing import Any, Optional, Union

import numpy as np
+from huggingface_hub.dataclasses import validate_typed_dict

from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from .image_transforms import (
@@ -710,6 +711,10 @@ def _validate_preprocess_kwargs(
def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
# args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names)

+# Perform type validation on received kwargs
+validate_typed_dict(self.valid_kwargs, kwargs)

# Set default kwargs from self. This ensures that if a kwarg is not provided
# by the user, it gets its default value from the instance, or is set to None.
for kwarg_name in self._valid_kwargs_names:
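For context, a minimal sketch of what the added check does (assumes huggingface_hub 1.0.0.rc4, which exposes validate_typed_dict and, per the call above, takes the TypedDict class followed by the received values; ExampleKwargs is hypothetical):

from typing import TypedDict

from huggingface_hub.dataclasses import validate_typed_dict

class ExampleKwargs(TypedDict, total=False):
    do_resize: bool
    crop_pct: float

validate_typed_dict(ExampleKwargs, {"do_resize": True, "crop_pct": 0.9})  # passes
validate_typed_dict(ExampleKwargs, {"do_resize": "yes"})  # expected to raise: "yes" is not a bool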
10 changes: 9 additions & 1 deletion src/transformers/models/aria/modular_aria.py
@@ -38,7 +38,7 @@
)
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import PreTrainedModel
-from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging
from ..auto import CONFIG_MAPPING, AutoConfig, AutoTokenizer
@@ -904,7 +904,15 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
return num_patches


+class AriaImagesKwargs(ImagesKwargs, total=False):
+    split_image: bool
+    max_image_size: int
+    min_image_size: int
+
+
 class AriaProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: AriaImagesKwargs
+
_defaults = {
"text_kwargs": {
"padding": False,
10 changes: 9 additions & 1 deletion src/transformers/models/aria/processing_aria.py
@@ -24,13 +24,21 @@

from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
-from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import TensorType
from ..auto import AutoTokenizer


+class AriaImagesKwargs(ImagesKwargs, total=False):
+    split_image: bool
+    max_image_size: int
+    min_image_size: int
+
+
 class AriaProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: AriaImagesKwargs
+
_defaults = {
"text_kwargs": {
"padding": False,
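A usage sketch of the new typed image kwargs (checkpoint name, prompt, and values are illustrative, not taken from this diff):

import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")  # example checkpoint
image = np.zeros((490, 490, 3), dtype=np.uint8)  # dummy image
inputs = processor(
    text="Describe this image.",
    images=image,
    split_image=True,    # declared as bool in AriaImagesKwargs
    max_image_size=980,  # declared as int; a mistyped value would now fail validation
)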
4 changes: 2 additions & 2 deletions src/transformers/models/beit/image_processing_beit.py
@@ -55,15 +55,15 @@
logger = logging.get_logger(__name__)


-class BeitImageProcessorKwargs(ImagesKwargs):
+class BeitImageProcessorKwargs(ImagesKwargs, total=False):
r"""
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g.
ADE20k). The background label will be replaced by 255.
"""

-    do_reduce_labels: Optional[bool]
+    do_reduce_labels: bool


@requires(backends=("vision",))
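The pattern above repeats across this diff: once a kwargs class is declared with total=False, every key is already optional by omission, so the Optional wrappers (which only permit an explicit None value) are dropped. A standalone sketch with hypothetical class names:

from typing import Optional, TypedDict

class BeforeKwargs(TypedDict):        # total=True by default: the key is required
    do_reduce_labels: Optional[bool]  # Optional only means None is an allowed value

class AfterKwargs(TypedDict, total=False):  # the key may be omitted entirely
    do_reduce_labels: bool                  # and an explicit None no longer validates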
@@ -123,8 +123,8 @@ def get_resize_output_image_size(
return new_height, new_width


-class BridgeTowerImageProcessorKwargs(ImagesKwargs):
-    size_divisor: Optional[int]
+class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False):
+    size_divisor: int


class BridgeTowerImageProcessor(BaseImageProcessor):
@@ -33,7 +33,7 @@
from ...utils import TensorType, auto_docstring


-class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
+class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -46,9 +46,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
"""

-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int


@lru_cache(maxsize=10)
@@ -303,7 +303,7 @@ def get_optimal_tiled_canvas(
return best_grid


-class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
+class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -316,9 +316,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
"""

-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int


@auto_docstring
@@ -729,7 +729,7 @@ def compute_segments(
return segmentation, segments


-class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
+class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -745,9 +745,9 @@ class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
Path to the directory containing the segmentation masks.
"""

-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]

4 changes: 2 additions & 2 deletions src/transformers/models/convnext/image_processing_convnext.py
@@ -50,14 +50,14 @@
logger = logging.get_logger(__name__)


-class ConvNextImageProcessorKwargs(ImagesKwargs):
+class ConvNextImageProcessorKwargs(ImagesKwargs, total=False):
"""
crop_pct (`float`, *optional*):
Percentage of the image to crop. Only has an effect if size < 384. Can be
overridden by `crop_pct` in the `preprocess` method.
"""

-    crop_pct: Optional[float]
+    crop_pct: float


@requires(backends=("vision",))
@@ -49,7 +49,7 @@
logger = logging.get_logger(__name__)


-class DeepseekVLImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -50,7 +50,7 @@
logger = logging.get_logger(__name__)


-class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -71,9 +71,9 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):

min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
high_res_resample: Union["PILImageResampling", int]
high_res_image_mean: Union[float, list[float], tuple[float, ...]]
high_res_image_std: Union[float, list[float], tuple[float, ...]]


class DeepseekVLHybridImageProcessor(BaseImageProcessor):
@@ -429,7 +429,7 @@ def prepare_inputs_for_generation(
return model_inputs


-class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -450,9 +450,9 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):

min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
high_res_resample: Union["PILImageResampling", int]
high_res_image_mean: Union[float, list[float], tuple[float, ...]]
high_res_image_std: Union[float, list[float], tuple[float, ...]]


class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
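A short sketch of what the widened annotations accept (my reading of the Union types above; PIL resampling filters are plain ints under the hood, so both forms below should validate):

from transformers.image_utils import PILImageResampling

high_res_resample = PILImageResampling.BICUBIC  # enum form
high_res_resample_alt = 3                       # equivalent raw int (BICUBIC)
high_res_image_mean = 0.5                       # scalar now allowed
high_res_image_mean_alt = [0.5, 0.5, 0.5]       # per-channel list still allowed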
@@ -82,7 +82,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name


-class DeformableDetrImageProcessorKwargs(ImagesKwargs):
+class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -98,9 +98,9 @@ class DeformableDetrImageProcessorKwargs(ImagesKwargs):
Path to the directory containing the segmentation masks.
"""

-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]

8 changes: 4 additions & 4 deletions src/transformers/models/detr/image_processing_detr.py
@@ -84,7 +84,7 @@
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


-class DetrImageProcessorKwargs(ImagesKwargs):
+class DetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -100,9 +100,9 @@ class DetrImageProcessorKwargs(ImagesKwargs):
Path to the directory containing the segmentation masks.
"""

-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]

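Note that `annotations` and `masks_path` stay Optional in these detection processors: None is a meaningful value there (no annotations supplied), unlike the flags above that are merely omittable. A minimal sketch (dummy image, empty COCO-style annotation):

import numpy as np
from transformers import DetrImageProcessor

processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
image = np.zeros((480, 640, 3), dtype=np.uint8)
annotations = {"image_id": 0, "annotations": []}  # minimal COCO detection stub
inputs = processor(images=image, annotations=annotations, return_tensors="pt")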
4 changes: 3 additions & 1 deletion src/transformers/models/dia/processing_dia.py
@@ -55,7 +55,9 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False):
"generation": True,
"sampling_rate": 44100,
},
"common_kwargs": {"return_tensors": "pt"},
"common_kwargs": {
"return_tensors": "pt",
},
}


6 changes: 3 additions & 3 deletions src/transformers/models/donut/image_processing_donut.py
@@ -52,16 +52,16 @@
import PIL


-class DonutImageProcessorKwargs(ImagesKwargs):
+class DonutImageProcessorKwargs(ImagesKwargs, total=False):
"""
do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
Whether to resize the image using thumbnail method.
do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
"""

-    do_thumbnail: Optional[bool]
-    do_align_long_axis: Optional[bool]
+    do_thumbnail: bool
+    do_align_long_axis: bool


@requires(backends=("vision",))
10 changes: 5 additions & 5 deletions src/transformers/models/dpt/image_processing_dpt.py
@@ -64,7 +64,7 @@
logger = logging.get_logger(__name__)


-class DPTImageProcessorKwargs(ImagesKwargs):
+class DPTImageProcessorKwargs(ImagesKwargs, total=False):
"""
ensure_multiple_of (`int`, *optional*, defaults to 1):
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
@@ -78,10 +78,10 @@ class DPTImageProcessorKwargs(ImagesKwargs):
ADE20k). The background label will be replaced by 255.
"""

-    ensure_multiple_of: Optional[int]
-    size_divisor: Optional[int]
-    keep_aspect_ratio: Optional[bool]
-    do_reduce_labels: Optional[bool]
+    ensure_multiple_of: int
+    size_divisor: int
+    keep_aspect_ratio: bool
+    do_reduce_labels: bool


def get_resize_output_image_size(
@@ -50,13 +50,13 @@
logger = logging.get_logger(__name__)


-class EfficientLoFTRImageProcessorKwargs(ImagesKwargs):
+class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False):
r"""
do_grayscale (`bool`, *optional*, defaults to `True`):
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
"""

-    do_grayscale: Optional[bool] = True
+    do_grayscale: bool


# Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale
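This hunk also drops the "= True" assignment: a TypedDict only declares key types, so the assignment never acted as a usable runtime default; defaults live on the processor instance and are filled in during preprocess() (see the default-filling loop in image_processing_utils_fast above). A standalone sketch with a hypothetical class name:

from typing import TypedDict

class GrayscaleKwargs(TypedDict, total=False):
    do_grayscale: bool  # no "= True" here; the default comes from the instance

kwargs: GrayscaleKwargs = {}                     # omitting the key is fine
do_grayscale = kwargs.get("do_grayscale", True)  # instance-style default lookup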
@@ -44,7 +44,7 @@
logger = logging.get_logger(__name__)


-class EfficientNetImageProcessorKwargs(ImagesKwargs):
+class EfficientNetImageProcessorKwargs(ImagesKwargs, total=False):
"""
rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
Whether to rescale the image between [-scale_range/2, scale_range/2] instead of [0, scale_range].
6 changes: 3 additions & 3 deletions src/transformers/models/emu3/image_processing_emu3.py
@@ -47,9 +47,9 @@
logger = logging.get_logger(__name__)


-class Emu3ImageProcessorKwargs(ImagesKwargs):
-    ratio: Optional[str]
-    image_area: Optional[int]
+class Emu3ImageProcessorKwargs(ImagesKwargs, total=False):
+    ratio: str
+    image_area: int


def smart_resize(