Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 4 additions & 16 deletions src/transformers/image_processing_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,25 +362,13 @@ def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
"""
image_processor_dict = image_processor_dict.copy()
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

# The `size` parameter is a dict and was previously an int or tuple in feature extractors.
# We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
# dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
if "size" in kwargs and "size" in image_processor_dict:
image_processor_dict["size"] = kwargs.pop("size")
if "crop_size" in kwargs and "crop_size" in image_processor_dict:
image_processor_dict["crop_size"] = kwargs.pop("crop_size")

image_processor_dict.update({k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__})
image_processor = cls(**image_processor_dict)

# Update image_processor with kwargs if needed
to_remove = []
for key, value in kwargs.items():
# Remove kwargs that are used to initialize the image processor attributes
for key in list(kwargs):
if hasattr(image_processor, key):
setattr(image_processor, key, value)
to_remove.append(key)
for key in to_remove:
kwargs.pop(key, None)
kwargs.pop(key)

logger.info(f"Image processor {image_processor}")
if return_unused_kwargs:
Expand Down
1 change: 1 addition & 0 deletions src/transformers/image_processing_utils_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
input_data_format = None
device = None
model_input_names = ["pixel_values"]
image_seq_length = None
valid_kwargs = ImagesKwargs
unused_kwargs = None

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,18 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False):
"""
max_patches (`int`, *optional*):
Maximum number of patches to extract.
patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
is_vqa (`bool`, *optional*, defaults to `False`):
Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
rendered onto the input images.
header_text (`Union[list[str], str]`, *optional*):
Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
"""

max_patches: int
patch_size: dict[str, int]
is_vqa: bool
header_text: Optional[Union[list[str], str]]


Expand Down
4 changes: 4 additions & 0 deletions src/transformers/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,9 @@ class methods and docstrings.
- `'np'`: Return NumPy `np.ndarray` objects.
disable_grouping (`bool`, *optional*):
Whether to group images by shapes when processing or not, only relevant for fast image processing.
image_seq_length (`int`, *optional*):
The number of image tokens to be used for each image in the input.
Added for backward compatibility but this should be set as a processor attribute in future models.
Comment on lines +222 to +224
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for nit-picking after the PR is merged. For my own understanding, do we need it here? Not all image processors are VLM-specific, and image_seq_length isn't always needed for them.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries. I agree with you that we shouldn't really have this here, as it should be a processor attribute — but a lot of models on the Hub have it as an image processor attribute, so I put it here for backward compatibility (BC)...

"""

do_convert_rgb: Optional[bool]
Expand All @@ -239,6 +242,7 @@ class methods and docstrings.
device: Annotated[Optional[str], device_validator()]
return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
disable_grouping: Optional[bool]
image_seq_length: Optional[int]


class VideosKwargs(TypedDict, total=False):
Expand Down
1 change: 1 addition & 0 deletions tests/models/pix2struct/test_processing_pix2struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor", max_patches=1024, patch_size={"height": 8, "width": 8})
print("image_processor", image_processor)
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")

processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
Expand Down