Merged

Commits (34)
762b651  initial design draft  (zucchini-nlp, Sep 10, 2025)
02e22c6  delete  (zucchini-nlp, Sep 10, 2025)
e744875  fix a few tests  (zucchini-nlp, Sep 10, 2025)
63532bf  fix  (zucchini-nlp, Sep 10, 2025)
1f62d6f  fix the rest of tests  (zucchini-nlp, Sep 11, 2025)
c203ffd  common-kwargs  (zucchini-nlp, Sep 11, 2025)
725a479  why the runner complains about typing with "|"?  (zucchini-nlp, Sep 11, 2025)
d8ca683  revert  (zucchini-nlp, Sep 11, 2025)
8ff15f7  forgot to delete  (zucchini-nlp, Sep 11, 2025)
b0e8120  update  (zucchini-nlp, Sep 11, 2025)
9f761c6  fix last issues  (zucchini-nlp, Sep 11, 2025)
f935cff  add more detalis in docs  (zucchini-nlp, Sep 16, 2025)
e6a77d8  pin the latest hub release  (zucchini-nlp, Sep 24, 2025)
01841b3  merge main  (zucchini-nlp, Sep 24, 2025)
5a42630  fix tests for new models  (zucchini-nlp, Sep 24, 2025)
fe4ba56  also fast image processor  (zucchini-nlp, Sep 24, 2025)
6e8d77e  fix copies  (zucchini-nlp, Sep 24, 2025)
ba41992  image processing ast validated  (zucchini-nlp, Sep 25, 2025)
601985c  Merge remote-tracking branch 'upstream/main' into validate-processor-…  (zucchini-nlp, Sep 25, 2025)
3233a70  fix more tests  (zucchini-nlp, Sep 25, 2025)
909b98e  typo.and fix copies  (zucchini-nlp, Sep 25, 2025)
9b0bc0c  Merge branch 'main' into validate-processor-kwargs  (zucchini-nlp, Sep 25, 2025)
4410dd3  bump  (zucchini-nlp, Sep 25, 2025)
121931c  merge main  (zucchini-nlp, Oct 3, 2025)
1daa883  style  (zucchini-nlp, Oct 3, 2025)
bd902fb  Merge remote-tracking branch 'upstream/main' into validate-processor-…  (zucchini-nlp, Oct 7, 2025)
b8385a2  fix some tests  (zucchini-nlp, Oct 7, 2025)
69448bb  fix copies  (zucchini-nlp, Oct 8, 2025)
d253615  pin rc4 and mark all TypedDict as non-total  (zucchini-nlp, Oct 8, 2025)
0c52d03  Merge branch 'main' into validate-processor-kwargs  (zucchini-nlp, Oct 8, 2025)
7a4e79f  delete typed dict adaptor  (zucchini-nlp, Oct 8, 2025)
0395b54  address comments  (zucchini-nlp, Oct 8, 2025)
34c9ec7  delete optionals  (zucchini-nlp, Oct 8, 2025)
774c260  frigit to fix copies  (zucchini-nlp, Oct 8, 2025)
2 changes: 1 addition & 1 deletion setup.py
@@ -114,7 +114,7 @@
"GitPython<3.1.19",
"hf-doc-builder>=0.3.0",
"hf_xet",
"huggingface-hub==1.0.0.rc2",
"huggingface-hub==1.0.0.rc4",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"jinja2>=3.1.0",
2 changes: 1 addition & 1 deletion src/transformers/dependency_versions_table.py
@@ -23,7 +23,7 @@
"GitPython": "GitPython<3.1.19",
"hf-doc-builder": "hf-doc-builder>=0.3.0",
"hf_xet": "hf_xet",
"huggingface-hub": "huggingface-hub==1.0.0.rc2",
"huggingface-hub": "huggingface-hub==1.0.0.rc4",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"jinja2": "jinja2>=3.1.0",
5 changes: 5 additions & 0 deletions src/transformers/image_processing_utils_fast.py
@@ -18,6 +18,7 @@
from typing import Any, Optional, Union

import numpy as np
+from huggingface_hub.dataclasses import validate_typed_dict

from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from .image_transforms import (
@@ -710,6 +711,10 @@ def _validate_preprocess_kwargs(
def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
# args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names)

+# Perform type validation on received kwargs
+validate_typed_dict(self.valid_kwargs, kwargs)

# Set default kwargs from self. This ensures that if a kwarg is not provided
# by the user, it gets its default value from the instance, or is set to None.
for kwarg_name in self._valid_kwargs_names:
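For context, a minimal sketch of what the added check does (assumes huggingface_hub 1.0.0.rc4, which exposes validate_typed_dict and, per the call above, takes the TypedDict class followed by the received values; ExampleKwargs is hypothetical):

from typing import TypedDict

from huggingface_hub.dataclasses import validate_typed_dict

class ExampleKwargs(TypedDict, total=False):
    do_resize: bool
    crop_pct: float

validate_typed_dict(ExampleKwargs, {"do_resize": True, "crop_pct": 0.9})  # passes
validate_typed_dict(ExampleKwargs, {"do_resize": "yes"})  # expected to raise: "yes" is not a bool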
10 changes: 9 additions & 1 deletion src/transformers/models/aria/modular_aria.py
@@ -38,7 +38,7 @@
)
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import PreTrainedModel
-from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging
from ..auto import CONFIG_MAPPING, AutoConfig, AutoTokenizer
@@ -904,7 +904,15 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
return num_patches


+class AriaImagesKwargs(ImagesKwargs, total=False):
+    split_image: bool
+    max_image_size: int
+    min_image_size: int
+
+
 class AriaProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: AriaImagesKwargs
+
_defaults = {
"text_kwargs": {
"padding": False,
10 changes: 9 additions & 1 deletion src/transformers/models/aria/processing_aria.py
@@ -24,13 +24,21 @@

from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
-from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import TensorType
from ..auto import AutoTokenizer


+class AriaImagesKwargs(ImagesKwargs, total=False):
+    split_image: bool
+    max_image_size: int
+    min_image_size: int
+
+
 class AriaProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: AriaImagesKwargs
+
_defaults = {
"text_kwargs": {
"padding": False,
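A usage sketch of the new typed image kwargs (checkpoint name, prompt, and values are illustrative, not taken from this diff):

import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")  # example checkpoint
image = np.zeros((490, 490, 3), dtype=np.uint8)  # dummy image
inputs = processor(
    text="Describe this image.",
    images=image,
    split_image=True,    # declared as bool in AriaImagesKwargs
    max_image_size=980,  # declared as int; a mistyped value would now fail validation
)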
4 changes: 2 additions & 2 deletions src/transformers/models/beit/image_processing_beit.py
@@ -55,15 +55,15 @@
logger = logging.get_logger(__name__)


-class BeitImageProcessorKwargs(ImagesKwargs):
+class BeitImageProcessorKwargs(ImagesKwargs, total=False):
r"""
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g.
ADE20k). The background label will be replaced by 255.
"""

-    do_reduce_labels: Optional[bool]
+    do_reduce_labels: bool


@requires(backends=("vision",))
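The pattern above repeats across this diff: once a kwargs class is declared with total=False, every key is already optional by omission, so the Optional wrappers (which only permit an explicit None value) are dropped. A standalone sketch with hypothetical class names:

from typing import Optional, TypedDict

class BeforeKwargs(TypedDict):        # total=True by default: the key is required
    do_reduce_labels: Optional[bool]  # Optional only means None is an allowed value

class AfterKwargs(TypedDict, total=False):  # the key may be omitted entirely
    do_reduce_labels: bool                  # and an explicit None no longer validates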
@@ -123,8 +123,8 @@ def get_resize_output_image_size(
return new_height, new_width


-class BridgeTowerImageProcessorKwargs(ImagesKwargs):
-    size_divisor: Optional[int]
+class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False):
+    size_divisor: int


class BridgeTowerImageProcessor(BaseImageProcessor):
@@ -33,7 +33,7 @@
from ...utils import TensorType, auto_docstring


-class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
+class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -46,9 +46,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
"""

-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int


@lru_cache(maxsize=10)
@@ -303,7 +303,7 @@ def get_optimal_tiled_canvas(
return best_grid


-class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
+class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -316,9 +316,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
"""

-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int


@auto_docstring
@@ -729,7 +729,7 @@ def compute_segments(
return segmentation, segments


-class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
+class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -745,9 +745,9 @@ class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
Path to the directory containing the segmentation masks.
"""

-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]

4 changes: 2 additions & 2 deletions src/transformers/models/convnext/image_processing_convnext.py
@@ -50,14 +50,14 @@
logger = logging.get_logger(__name__)


-class ConvNextImageProcessorKwargs(ImagesKwargs):
+class ConvNextImageProcessorKwargs(ImagesKwargs, total=False):
"""
crop_pct (`float`, *optional*):
Percentage of the image to crop. Only has an effect if size < 384. Can be
overridden by `crop_pct` in the `preprocess` method.
"""

-    crop_pct: Optional[float]
+    crop_pct: float


@requires(backends=("vision",))
@@ -49,7 +49,7 @@
logger = logging.get_logger(__name__)


-class DeepseekVLImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -50,7 +50,7 @@
logger = logging.get_logger(__name__)


-class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -71,9 +71,9 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):

min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
high_res_resample: Union["PILImageResampling", int]
high_res_image_mean: Union[float, list[float], tuple[float, ...]]
high_res_image_std: Union[float, list[float], tuple[float, ...]]


class DeepseekVLHybridImageProcessor(BaseImageProcessor):
@@ -429,7 +429,7 @@ def prepare_inputs_for_generation(
return model_inputs


-class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -450,9 +450,9 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):

min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
high_res_resample: Union["PILImageResampling", int]
high_res_image_mean: Union[float, list[float], tuple[float, ...]]
high_res_image_std: Union[float, list[float], tuple[float, ...]]


class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
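A short sketch of what the widened annotations accept (my reading of the Union types above; PIL resampling filters are plain ints under the hood, so both forms below should validate):

from transformers.image_utils import PILImageResampling

high_res_resample = PILImageResampling.BICUBIC  # enum form
high_res_resample_alt = 3                       # equivalent raw int (BICUBIC)
high_res_image_mean = 0.5                       # scalar now allowed
high_res_image_mean_alt = [0.5, 0.5, 0.5]       # per-channel list still allowed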
@@ -82,7 +82,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name


-class DeformableDetrImageProcessorKwargs(ImagesKwargs):
+class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -98,9 +98,9 @@ class DeformableDetrImageProcessorKwargs(ImagesKwargs):
Path to the directory containing the segmentation masks.
"""

-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]

8 changes: 4 additions & 4 deletions src/transformers/models/detr/image_processing_detr.py
@@ -84,7 +84,7 @@
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


-class DetrImageProcessorKwargs(ImagesKwargs):
+class DetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -100,9 +100,9 @@ class DetrImageProcessorKwargs(ImagesKwargs):
Path to the directory containing the segmentation masks.
"""

-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]

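Note that `annotations` and `masks_path` stay Optional in these detection processors: None is a meaningful value there (no annotations supplied), unlike the flags above that are merely omittable. A minimal sketch (dummy image, empty COCO-style annotation):

import numpy as np
from transformers import DetrImageProcessor

processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
image = np.zeros((480, 640, 3), dtype=np.uint8)
annotations = {"image_id": 0, "annotations": []}  # minimal COCO detection stub
inputs = processor(images=image, annotations=annotations, return_tensors="pt")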
4 changes: 3 additions & 1 deletion src/transformers/models/dia/processing_dia.py
@@ -55,7 +55,9 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False):
"generation": True,
"sampling_rate": 44100,
},
"common_kwargs": {"return_tensors": "pt"},
"common_kwargs": {
"return_tensors": "pt",
},
}


6 changes: 3 additions & 3 deletions src/transformers/models/donut/image_processing_donut.py
@@ -52,16 +52,16 @@
import PIL


-class DonutImageProcessorKwargs(ImagesKwargs):
+class DonutImageProcessorKwargs(ImagesKwargs, total=False):
"""
do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
Whether to resize the image using thumbnail method.
do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
"""

-    do_thumbnail: Optional[bool]
-    do_align_long_axis: Optional[bool]
+    do_thumbnail: bool
+    do_align_long_axis: bool


@requires(backends=("vision",))
10 changes: 5 additions & 5 deletions src/transformers/models/dpt/image_processing_dpt.py
@@ -64,7 +64,7 @@
logger = logging.get_logger(__name__)


-class DPTImageProcessorKwargs(ImagesKwargs):
+class DPTImageProcessorKwargs(ImagesKwargs, total=False):
"""
ensure_multiple_of (`int`, *optional*, defaults to 1):
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
@@ -78,10 +78,10 @@ class DPTImageProcessorKwargs(ImagesKwargs):
ADE20k). The background label will be replaced by 255.
"""

-    ensure_multiple_of: Optional[int]
-    size_divisor: Optional[int]
-    keep_aspect_ratio: Optional[bool]
-    do_reduce_labels: Optional[bool]
+    ensure_multiple_of: int
+    size_divisor: int
+    keep_aspect_ratio: bool
+    do_reduce_labels: bool


def get_resize_output_image_size(
@@ -50,13 +50,13 @@
logger = logging.get_logger(__name__)


-class EfficientLoFTRImageProcessorKwargs(ImagesKwargs):
+class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False):
r"""
do_grayscale (`bool`, *optional*, defaults to `True`):
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
"""

-    do_grayscale: Optional[bool] = True
+    do_grayscale: bool


# Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale
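This hunk also drops the "= True" assignment: a TypedDict only declares key types, so the assignment never acted as a usable runtime default; defaults live on the processor instance and are filled in during preprocess() (see the default-filling loop in image_processing_utils_fast above). A standalone sketch with a hypothetical class name:

from typing import TypedDict

class GrayscaleKwargs(TypedDict, total=False):
    do_grayscale: bool  # no "= True" here; the default comes from the instance

kwargs: GrayscaleKwargs = {}                     # omitting the key is fine
do_grayscale = kwargs.get("do_grayscale", True)  # instance-style default lookup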
@@ -44,7 +44,7 @@
logger = logging.get_logger(__name__)


-class EfficientNetImageProcessorKwargs(ImagesKwargs):
+class EfficientNetImageProcessorKwargs(ImagesKwargs, total=False):
"""
rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
Whether to rescale the image between [-scale_range/2, scale_range/2] instead of [0, scale_range].
6 changes: 3 additions & 3 deletions src/transformers/models/emu3/image_processing_emu3.py
@@ -47,9 +47,9 @@
logger = logging.get_logger(__name__)


-class Emu3ImageProcessorKwargs(ImagesKwargs):
-    ratio: Optional[str]
-    image_area: Optional[int]
+class Emu3ImageProcessorKwargs(ImagesKwargs, total=False):
+    ratio: str
+    image_area: int


def smart_resize(