Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
1abefc9
[Bugfix] Remove incorrect torchvision requirement from PIL backend im…
Lidang-Jiang Mar 27, 2026
035abaa
[Bugfix] Remove redundant @requires(backends=("vision",)) from PIL ba…
Lidang-Jiang Mar 27, 2026
1417bc1
update
ArthurZucker Mar 27, 2026
be82ad8
remove torch when its not necessary
ArthurZucker Mar 27, 2026
ed3111a
remove if typechecking
ArthurZucker Mar 27, 2026
c078455
fix import shinanigans
ArthurZucker Mar 27, 2026
4099913
marvellous that's how we protect torch :)
ArthurZucker Mar 27, 2026
6e011bf
beit is torchvisionbackend
ArthurZucker Mar 27, 2026
a1bda86
more import cleanup
ArthurZucker Mar 27, 2026
6e906b5
fiixup
ArthurZucker Mar 27, 2026
d266fed
fix-repo
ArthurZucker Mar 27, 2026
756aa7c
update
ArthurZucker Mar 27, 2026
2f0267a
style
ArthurZucker Mar 27, 2026
f9d73c9
fixes
ArthurZucker Mar 27, 2026
678dbc5
up
ArthurZucker Mar 27, 2026
5a0e02d
more
ArthurZucker Mar 27, 2026
e8ea722
fix repo
ArthurZucker Mar 27, 2026
b15369e
up
ArthurZucker Mar 27, 2026
a5c7481
update
ArthurZucker Mar 27, 2026
b747308
fix imports
ArthurZucker Mar 27, 2026
58ddf7f
style
ArthurZucker Mar 27, 2026
f5956fd
fix check copies
ArthurZucker Mar 27, 2026
5ef4900
arf
ArthurZucker Mar 27, 2026
5df7f82
converter up
ArthurZucker Mar 27, 2026
8d445e5
fix?
ArthurZucker Mar 27, 2026
0a2c4f9
fix copies
ArthurZucker Mar 27, 2026
e153efd
fix for func
ArthurZucker Mar 30, 2026
2e76424
style
ArthurZucker Mar 30, 2026
c2510fe
ignore
ArthurZucker Mar 30, 2026
93239c1
type
ArthurZucker Mar 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/transformers/image_processing_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ def resize(
self,
image: np.ndarray,
size: SizeDict,
resample: Union["PILImageResampling", "tvF.InterpolationMode", int] | None = None,
resample: "PILImageResampling | None" = None,
reducing_gap: int | None = None,
**kwargs,
) -> np.ndarray:
Expand Down Expand Up @@ -628,7 +628,7 @@ def _preprocess(
images: list[np.ndarray],
do_resize: bool,
size: SizeDict,
resample: Union["PILImageResampling", "tvF.InterpolationMode", int] | None,
resample: "PILImageResampling | None",
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
Expand Down
7 changes: 2 additions & 5 deletions src/transformers/models/aria/image_processing_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torchvision.transforms.v2 import functional as tvF

from ...image_processing_backends import TorchvisionBackend
from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution
from ...image_transforms import divide_to_patches
from ...image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring, is_torchvision_available


if is_torchvision_available():
from torchvision.transforms.v2 import functional as tvF
from ...utils import TensorType, auto_docstring


class AriaImageProcessorKwargs(ImagesKwargs, total=False):
Expand Down
30 changes: 23 additions & 7 deletions src/transformers/models/aria/image_processing_pil_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,29 @@
SizeDict,
get_image_size,
)
from ...processing_utils import Unpack
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring
from ...utils.import_utils import requires
from .image_processing_aria import AriaImageProcessorKwargs


@requires(backends=("vision", "torch", "torchvision"))
# Adapted from transformers.models.aria.image_processing_aria.AriaImageProcessorKwargs
class AriaImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    max_image_size (`int`, *optional*, defaults to `self.max_image_size`):
        Maximum image size. Must be either 490 or 980.
    min_image_size (`int`, *optional*, defaults to `self.min_image_size`):
        Minimum image size. Images smaller than this in any dimension will be scaled up.
    split_resolutions (`list[list[int]]`, *optional*, defaults to `self.split_resolutions`):
        A list of possible resolutions as (height, width) pairs for splitting high-resolution images into patches.
    split_image (`bool`, *optional*, defaults to `self.split_image`):
        Whether to split the image into patches using the best matching resolution from `split_resolutions`.
    """

    # Aria-specific preprocessing options layered on top of the shared `ImagesKwargs`.
    # NOTE(review): this class is declared locally rather than imported from
    # `.image_processing_aria`, presumably so the PIL backend module does not pull in
    # that module's top-level `torch` / `torchvision` imports - confirm and keep the
    # two definitions in sync.
    # `total=False` is passed to the base; assuming `ImagesKwargs` is a `TypedDict`,
    # every key declared below is optional for callers.
    max_image_size: int
    min_image_size: int
    split_resolutions: list[list[int]]
    split_image: bool


@auto_docstring
class AriaImageProcessorPil(PilBackend):
model_input_names = ["pixel_values", "pixel_mask", "num_crops"]
Expand Down Expand Up @@ -65,7 +81,7 @@ def _resize_for_patching(
self,
image: np.ndarray,
target_resolution: tuple,
resample: "PILImageResampling | int | None",
resample: "PILImageResampling | None",
) -> np.ndarray:
"""Resize an image to a target resolution while maintaining aspect ratio."""
new_height, new_width = get_patch_output_size(
Expand All @@ -90,7 +106,7 @@ def get_image_patches(
image: np.ndarray,
grid_pinpoints: list[list[int]],
patch_size: int,
resample: "PILImageResampling | int | None",
resample: "PILImageResampling | None",
) -> list[np.ndarray]:
"""
Process an image with variable resolutions by dividing it into patches.
Expand Down Expand Up @@ -131,7 +147,7 @@ def _preprocess(
min_image_size: int = 336,
split_resolutions: list[list[int]] | None = None,
split_image: bool = False,
resample: "PILImageResampling | int | None" = None,
resample: "PILImageResampling | None" = None,
**kwargs,
) -> BatchFeature:
if max_image_size not in [490, 980]:
Expand Down
7 changes: 1 addition & 6 deletions src/transformers/models/aria/modeling_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,7 @@
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
TransformersKwargs,
auto_docstring,
can_return_tuple,
torch_compilable_check,
)
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
from ...utils.generic import maybe_autocast, merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from ..auto import AutoModel
Expand Down
10 changes: 1 addition & 9 deletions src/transformers/models/aria/modular_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import torch
from huggingface_hub.dataclasses import strict
from torch import nn
from torchvision.transforms.v2 import functional as tvF

from ... import initialization as init
from ...activations import ACT2FN
Expand All @@ -39,8 +40,6 @@
TransformersKwargs,
auto_docstring,
can_return_tuple,
is_torch_available,
is_torchvision_available,
logging,
)
from ..auto import CONFIG_MAPPING, AutoConfig, AutoTokenizer
Expand Down Expand Up @@ -323,13 +322,6 @@ def forward(self, key_value_states: torch.Tensor, attn_mask: torch.Tensor | None
return out


if is_torch_available():
import torch

if is_torchvision_available():
from torchvision.transforms.v2 import functional as tvF


class AriaImageProcessorKwargs(ImagesKwargs, total=False):
r"""
max_image_size (`int`, *optional*, defaults to `self.max_image_size`):
Expand Down
14 changes: 5 additions & 9 deletions src/transformers/models/beit/image_processing_beit.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@

from typing import Union

import torch
import torch.nn.functional as F
from torchvision.transforms.v2 import functional as tvF

from ...image_processing_backends import TorchvisionBackend
from ...image_processing_utils import BatchFeature
from ...image_transforms import group_images_by_shape, reorder_images
Expand All @@ -27,15 +31,7 @@
SizeDict,
)
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available


if is_torch_available():
import torch
import torch.nn.functional as F

if is_torchvision_available():
from torchvision.transforms.v2 import functional as tvF
from ...utils import TensorType, auto_docstring, is_torch_available


class BeitImageProcessorKwargs(ImagesKwargs, total=False):
Expand Down
28 changes: 18 additions & 10 deletions src/transformers/models/beit/image_processing_pil_beit.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@
"""Image processor class for BEiT."""

import numpy as np
import torch
import torch.nn.functional as F
from torchvision.transforms.v2 import functional as tvF

from ...image_processing_backends import PilBackend
from ...image_processing_utils import BatchFeature
Expand All @@ -28,14 +25,24 @@
PILImageResampling,
SizeDict,
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, is_torch_available
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring
from ...utils.import_utils import requires
from .image_processing_beit import BeitImageProcessorKwargs


# Adapted from transformers.models.beit.image_processing_beit.BeitImageProcessorKwargs
class BeitImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
        is used for background, and background itself is not included in all classes of a dataset (e.g.
        ADE20k). The background label will be replaced by 255.
    """

    # BEiT-specific preprocessing option layered on top of the shared `ImagesKwargs`.
    # NOTE(review): this class is declared locally rather than imported from
    # `.image_processing_beit`, presumably so the PIL backend module does not pull in
    # that module's top-level `torch` / `torchvision` imports - confirm and keep the
    # two definitions in sync.
    # `total=False` is passed to the base; assuming `ImagesKwargs` is a `TypedDict`,
    # the key declared below is optional for callers.
    do_reduce_labels: bool


@auto_docstring
@requires(backends=("vision", "torch", "torchvision"))
class BeitImageProcessorPil(PilBackend):
"""PIL backend for BEiT with reduce_label support."""

Expand Down Expand Up @@ -124,7 +131,7 @@ def _preprocess(
images: list[np.ndarray],
do_resize: bool,
size: SizeDict,
resample: "PILImageResampling | tvF.InterpolationMode | int | None",
resample: PILImageResampling | None,
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
Expand Down Expand Up @@ -152,6 +159,7 @@ def _preprocess(

return processed_images

@requires(backends=("torch",))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Image

cc @LysandreJik updated to work as such

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As seen with yu, I don't think this was initially designed for object-methods themselves, only for module-root objects

def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple] | None = None):
"""
Converts the output of [`BeitForSemanticSegmentation`] into semantic segmentation maps.
Expand All @@ -168,8 +176,8 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple]
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
"""
if not is_torch_available():
raise ImportError("PyTorch is required for post_process_semantic_segmentation")
import torch
import torch.nn.functional as F

logits = outputs.logits

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
from typing import Union

import numpy as np
import torch
from torchvision.transforms.v2 import functional as tvF

from ...image_processing_backends import TorchvisionBackend
from ...image_processing_utils import BatchFeature
Expand All @@ -29,14 +31,7 @@
get_image_size,
)
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available


if is_torch_available():
import torch

if is_torchvision_available():
from torchvision.transforms.v2 import functional as tvF
from ...utils import TensorType, auto_docstring, is_torch_available


def get_resize_output_image_size(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,57 @@
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
PILImageResampling,
SizeDict,
get_image_size,
)
from ...processing_utils import Unpack
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring
from ...utils.import_utils import requires
from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs, get_resize_output_image_size


@requires(backends=("vision", "torch", "torchvision"))
# Adapted from transformers.models.bridgetower.image_processing_bridgetower.BridgeTowerImageProcessorKwargs
class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
        The size by which to make sure both the height and width can be divided.
    """

    # BridgeTower-specific preprocessing option layered on top of the shared `ImagesKwargs`.
    # NOTE(review): this class is declared locally rather than imported from
    # `.image_processing_bridgetower`, presumably so the PIL backend module does not pull
    # in that module's top-level `torch` / `torchvision` imports - confirm and keep the
    # two definitions in sync.
    # `total=False` is passed to the base; assuming `ImagesKwargs` is a `TypedDict`,
    # the key declared below is optional for callers.
    size_divisor: int


# Adapted from transformers.models.bridgetower.image_processing_bridgetower.get_resize_output_image_size
def get_resize_output_image_size(
    input_image: np.ndarray,
    shorter: int = 800,
    longer: int = 1333,
    size_divisor: int = 32,
) -> tuple[int, int]:
    """
    Compute the `(height, width)` an image should be resized to.

    The shorter side is scaled to `shorter` while keeping the aspect ratio. If the longer side
    then exceeds `longer`, both sides are scaled down so that it equals `longer`. Each side is
    rounded to the nearest integer and finally floored to a multiple of `size_divisor`.

    Args:
        input_image (`np.ndarray`):
            Image whose size is read with `channel_dim=ChannelDimension.FIRST`.
        shorter (`int`, *optional*, defaults to 800):
            Target length of the shorter side before the `longer` cap is applied.
        longer (`int`, *optional*, defaults to 1333):
            Maximum allowed length of the longer side.
        size_divisor (`int`, *optional*, defaults to 32):
            Both returned dimensions are multiples of this value.

    Returns:
        `tuple[int, int]`: The output `(height, width)`.
    """
    height, width = get_image_size(input_image, channel_dim=ChannelDimension.FIRST)

    # Bring the shorter side to exactly `shorter`; the other side follows the aspect ratio.
    ratio = shorter / min(height, width)
    if height < width:
        target_height, target_width = shorter, ratio * width
    else:
        target_height, target_width = ratio * height, shorter

    # Shrink both sides if the longer one overshoots the cap.
    longest_side = max(target_height, target_width)
    if longest_side > longer:
        shrink = longer / longest_side
        target_height, target_width = shrink * target_height, shrink * target_width

    def _snap(value):
        # Round half up to an int, then floor to the nearest multiple of `size_divisor`.
        return int(value + 0.5) // size_divisor * size_divisor

    return _snap(target_height), _snap(target_width)


@auto_docstring
class BridgeTowerImageProcessorPil(PilBackend):
"""PIL backend for BridgeTower with custom resize and center_crop."""
Expand Down Expand Up @@ -57,7 +98,7 @@ def resize(
self,
image: np.ndarray,
size: SizeDict,
resample: "PILImageResampling | int | None",
resample: "PILImageResampling | None",
size_divisor: int = 32,
**kwargs,
) -> np.ndarray:
Expand All @@ -82,7 +123,7 @@ def _preprocess(
images: list[np.ndarray],
do_resize: bool,
size: SizeDict,
resample: "PILImageResampling | int | None",
resample: "PILImageResampling | None",
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
Expand Down
10 changes: 3 additions & 7 deletions src/transformers/models/chameleon/image_processing_chameleon.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

import numpy as np
import PIL.Image
import torch
from torchvision.transforms.v2 import functional as tvF

from ...image_processing_backends import TorchvisionBackend
from ...image_utils import (
Expand All @@ -23,15 +25,9 @@
SizeDict,
)
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import auto_docstring, is_torch_available, is_torchvision_available, logging
from ...utils import auto_docstring, logging


if is_torch_available():
import torch

if is_torchvision_available():
from torchvision.transforms.v2 import functional as tvF

logger = logging.get_logger(__name__)


Expand Down
12 changes: 3 additions & 9 deletions src/transformers/models/chmv2/image_processing_chmv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,16 @@
from typing import Union

import torch
import torch.nn.functional as F
from torchvision.transforms.v2 import functional as tvF

from ...image_processing_backends import TorchvisionBackend
from ...image_processing_base import BatchFeature
from ...image_transforms import group_images_by_shape, reorder_images
from ...image_utils import ChannelDimension, ImageInput, PILImageResampling, SizeDict
from ...modeling_outputs import DepthEstimatorOutput
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available, requires_backends


if is_torchvision_available():
import torchvision.transforms.v2.functional as tvF


if is_torch_available():
import torch.nn.functional as F
from ...utils import TensorType, auto_docstring, is_torch_available, requires_backends


class CHMv2ImageProcessorKwargs(ImagesKwargs, total=False):
Expand Down
Loading
Loading