Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
93abcf9
first part of the fix
ArthurZucker Mar 26, 2026
5d0c0c2
fix torch imports
ArthurZucker Mar 26, 2026
2950cd2
revert
ArthurZucker Mar 26, 2026
2d3049e
fix: make from transformers import * work without torch
ArthurZucker Mar 26, 2026
4028c51
up
ArthurZucker Mar 26, 2026
500d73c
style of this
ArthurZucker Mar 26, 2026
483b44f
revert: remove src/models changes, keep only core import fixes
ArthurZucker Mar 26, 2026
e3c50a9
nit
ArthurZucker Mar 26, 2026
ade0cd1
the mega quidproquo
ArthurZucker Mar 26, 2026
d0cab8e
use `requires(backends=...)`
ArthurZucker Mar 26, 2026
c463b18
more pil fixes
ArthurZucker Mar 26, 2026
93969d2
fixes
ArthurZucker Mar 26, 2026
9feaf6f
temp update
ArthurZucker Mar 26, 2026
8545852
up?
ArthurZucker Mar 26, 2026
e463e97
is this it?
ArthurZucker Mar 26, 2026
9de946a
style?
ArthurZucker Mar 26, 2026
015b6b8
revert a bunch of ai shit
ArthurZucker Mar 26, 2026
5bf5056
pi0 requires this
ArthurZucker Mar 26, 2026
01d4a97
revert some stuffs
ArthurZucker Mar 26, 2026
f8ab9e1
upd
ArthurZucker Mar 26, 2026
a940f1a
the fix
ArthurZucker Mar 26, 2026
440b9de
yups
ArthurZucker Mar 26, 2026
1509c73
ah
ArthurZucker Mar 26, 2026
c11d84f
up
ArthurZucker Mar 26, 2026
9814fc4
up
ArthurZucker Mar 26, 2026
910801c
fix
ArthurZucker Mar 26, 2026
81e1305
yes?
ArthurZucker Mar 26, 2026
4b43926
update
ArthurZucker Mar 26, 2026
f63fa89
up
ArthurZucker Mar 26, 2026
007bbdb
nits
ArthurZucker Mar 26, 2026
ef26d04
up
ArthurZucker Mar 26, 2026
8669a89
up
ArthurZucker Mar 26, 2026
21f38de
order
ArthurZucker Mar 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,28 @@ jobs:
- store_artifacts:
path: ~/transformers/installed.txt
- run: make check-repository-consistency
- run:
name: "Test import with all backends (torch + PIL + torchvision)"
command: python -c "from transformers import *" || (echo '🚨 import failed with all backends. Fix unprotected imports!! 🚨'; exit 1)
- run:
name: "Test import with torch only (no PIL, no torchvision)"
command: |
uv pip uninstall Pillow torchvision -q
python -c "from transformers import *" || (echo '🚨 import failed with torch only (no PIL). Fix unprotected imports!! 🚨'; exit 1)
uv pip install -e ".[quality]" -q
- run:
name: "Test import with PIL only (no torch, no torchvision)"
command: |
uv pip uninstall torch torchvision torchaudio -q
python -c "from transformers import *" || (echo '🚨 import failed with PIL only (no torch). Fix unprotected imports!! 🚨'; exit 1)
uv pip install -e ".[quality]" -q
- run:
name: "Test import with torch + PIL, no torchvision"
command: |
uv pip uninstall torchvision -q
python -c "from transformers import *" || (echo '🚨 import failed with torch+PIL but no torchvision. Fix unprotected imports!! 🚨'; exit 1)
uv pip install -e ".[quality]" -q


workflows:
version: 2
Expand Down
4 changes: 3 additions & 1 deletion src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@
name for name in dir(dummy_vision_objects) if not name.startswith("_")
]
else:
_import_structure["image_processing_backends"] = ["PilBackend", "TorchvisionBackend"]
_import_structure["image_processing_backends"] = ["PilBackend"]
_import_structure["image_processing_base"] = ["ImageProcessingMixin"]
_import_structure["image_processing_utils"] = ["BaseImageProcessor"]
_import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
Expand All @@ -345,6 +345,8 @@
name for name in dir(dummy_torchvision_objects) if not name.startswith("_")
]
else:
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

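torchvision depends on torch, so `TorchvisionBackend` is registered here (in the torchvision-available branch) rather than in the vision-only branch above.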

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

second most important

_import_structure.setdefault("image_processing_backends", [])
_import_structure["image_processing_backends"] += ["TorchvisionBackend"]
_import_structure["video_processing_utils"] = ["BaseVideoProcessor"]

# PyTorch-backed objects
Expand Down
6 changes: 3 additions & 3 deletions src/transformers/image_processing_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,8 @@
is_torchvision_available,
is_vision_available,
logging,
requires_backends,
)
from .utils.import_utils import is_rocm_platform, is_torchdynamo_compiling
from .utils.import_utils import is_rocm_platform, is_torchdynamo_compiling, requires


if is_vision_available():
Expand All @@ -81,11 +80,11 @@
logger = logging.get_logger(__name__)


@requires(backends=("torch", "torchvision"))
class TorchvisionBackend(BaseImageProcessor):
"""Torchvision backend for GPU-accelerated batched image processing."""

def __init__(self, **kwargs: Unpack[ImagesKwargs]):
requires_backends(self, "torchvision")
super().__init__(**kwargs)
self._set_attributes(**kwargs)

Expand Down Expand Up @@ -407,6 +406,7 @@ def _preprocess(
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)


@requires(backends=("vision",))
class PilBackend(BaseImageProcessor):
"""PIL/NumPy backend for portable CPU-only image processing."""

Expand Down
16 changes: 7 additions & 9 deletions src/transformers/models/aria/image_processing_pil_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,12 @@
get_image_size,
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, is_torchvision_available
from ...utils import TensorType, auto_docstring
from ...utils.import_utils import requires
from .image_processing_aria import AriaImageProcessorKwargs


if is_torchvision_available():
from torchvision.transforms.v2 import functional as tvF


@requires(backends=("vision", "torch", "torchvision"))
@auto_docstring
class AriaImageProcessorPil(PilBackend):
model_input_names = ["pixel_values", "pixel_mask", "num_crops"]
Expand Down Expand Up @@ -67,7 +65,7 @@ def _resize_for_patching(
self,
image: np.ndarray,
target_resolution: tuple,
resample: "PILImageResampling | tvF.InterpolationMode | int | None",
resample: "PILImageResampling | int | None",
) -> np.ndarray:
"""Resize an image to a target resolution while maintaining aspect ratio."""
new_height, new_width = get_patch_output_size(
Expand All @@ -92,7 +90,7 @@ def get_image_patches(
image: np.ndarray,
grid_pinpoints: list[list[int]],
patch_size: int,
resample: "PILImageResampling | tvF.InterpolationMode | int | None",
resample: "PILImageResampling | int | None",
) -> list[np.ndarray]:
"""
Process an image with variable resolutions by dividing it into patches.
Expand All @@ -104,7 +102,7 @@ def get_image_patches(
A list of possible resolutions as (height, width) pairs.
patch_size (`int`):
Size of each square patch to divide the image into.
resample (`PILImageResampling | tvF.InterpolationMode | int | None`):
resample (`PILImageResampling | int | None`):
Resampling filter to use when resizing.

Returns:
Expand Down Expand Up @@ -133,7 +131,7 @@ def _preprocess(
min_image_size: int = 336,
split_resolutions: list[list[int]] | None = None,
split_image: bool = False,
resample: "PILImageResampling | tvF.InterpolationMode | int | None" = None,
resample: "PILImageResampling | int | None" = None,
**kwargs,
) -> BatchFeature:
if max_image_size not in [490, 980]:
Expand Down
15 changes: 6 additions & 9 deletions src/transformers/models/beit/image_processing_pil_beit.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
"""Image processor class for BEiT."""

import numpy as np
import torch
import torch.nn.functional as F
from torchvision.transforms.v2 import functional as tvF

from ...image_processing_backends import PilBackend
from ...image_processing_utils import BatchFeature
Expand All @@ -26,19 +29,13 @@
SizeDict,
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available
from ...utils import TensorType, auto_docstring, is_torch_available
from ...utils.import_utils import requires
from .image_processing_beit import BeitImageProcessorKwargs


if is_torch_available():
import torch
import torch.nn.functional as F

if is_torchvision_available():
from torchvision.transforms.v2 import functional as tvF


@auto_docstring
@requires(backends=("vision", "torch", "torchvision"))
class BeitImageProcessorPil(PilBackend):
"""PIL backend for BEiT with reduce_label support."""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,12 @@
SizeDict,
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, is_torchvision_available
from ...utils import TensorType, auto_docstring
from ...utils.import_utils import requires
from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs, get_resize_output_image_size


if is_torchvision_available():
from torchvision.transforms.v2 import functional as tvF


@requires(backends=("vision", "torch", "torchvision"))
@auto_docstring
class BridgeTowerImageProcessorPil(PilBackend):
"""PIL backend for BridgeTower with custom resize and center_crop."""
Expand Down Expand Up @@ -59,7 +57,7 @@ def resize(
self,
image: np.ndarray,
size: SizeDict,
resample: "PILImageResampling | tvF.InterpolationMode | int | None",
resample: "PILImageResampling | int | None",
size_divisor: int = 32,
**kwargs,
) -> np.ndarray:
Expand All @@ -84,7 +82,7 @@ def _preprocess(
images: list[np.ndarray],
do_resize: bool,
size: SizeDict,
resample: "PILImageResampling | tvF.InterpolationMode | int | None",
resample: "PILImageResampling | int | None",
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
logging,
requires_backends,
)
from ...utils.import_utils import requires
from .image_processing_conditional_detr import (
ConditionalDetrImageProcessorKwargs,
compute_segments,
Expand Down Expand Up @@ -262,6 +263,7 @@ def prepare_coco_panoptic_annotation(
return new_target


@requires(backends=("vision", "torch", "torchvision"))
@auto_docstring
class ConditionalDetrImageProcessorPil(PilBackend):
resample = PILImageResampling.BILINEAR
Expand Down Expand Up @@ -676,6 +678,7 @@ def _preprocess(
]
return encoded_inputs

@requires(backends=("vision", "torch"))
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
):
Expand Down Expand Up @@ -736,6 +739,7 @@ def post_process_object_detection(

return results

@requires(backends=("vision", "torch"))
def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple[int, int]] | None = None):
"""
Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
Expand Down Expand Up @@ -784,6 +788,7 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple[i

return semantic_segmentation

@requires(backends=("vision", "torch"))
def post_process_instance_segmentation(
self,
outputs,
Expand Down Expand Up @@ -872,6 +877,7 @@ def post_process_instance_segmentation(
results.append({"segmentation": segmentation, "segments_info": segments})
return results

@requires(backends=("vision", "torch"))
def post_process_panoptic_segmentation(
self,
outputs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
requires_backends,
)
from ...utils.generic import can_return_tuple, merge_with_config_defaults
from ...utils.import_utils import requires
from ...utils.output_capturing import OutputRecorder, capture_outputs
from ..deformable_detr.modeling_deformable_detr import inverse_sigmoid
from ..detr.image_processing_detr import DetrImageProcessor
Expand Down Expand Up @@ -173,6 +174,7 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple[i


class ConditionalDetrImageProcessorPil(DetrImageProcessorPil):
@requires(backends=("vision", "torch"))
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
):
Expand Down Expand Up @@ -233,6 +235,7 @@ def post_process_object_detection(

return results

@requires(backends=("vision", "torch"))
def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple[int, int]] | None = None):
"""
Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,12 @@
SizeDict,
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, is_torchvision_available
from ...utils import TensorType, auto_docstring
from ...utils.import_utils import requires
from .image_processing_convnext import ConvNextImageProcessorKwargs


if is_torchvision_available():
from torchvision.transforms.v2 import functional as tvF


@requires(backends=("vision", "torch", "torchvision"))
@auto_docstring
class ConvNextImageProcessorPil(PilBackend):
"""PIL backend for ConvNeXT with custom resize."""
Expand All @@ -57,7 +55,7 @@ def resize(
self,
image: np.ndarray,
size: SizeDict,
resample: "PILImageResampling | tvF.InterpolationMode | int | None",
resample: "PILImageResampling | int | None",
crop_pct: float = 224 / 256,
**kwargs,
) -> np.ndarray:
Expand Down Expand Up @@ -98,7 +96,7 @@ def _preprocess(
images: list[np.ndarray],
do_resize: bool,
size: SizeDict,
resample: "PILImageResampling | tvF.InterpolationMode | int | None",
resample: "PILImageResampling | int | None",
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,11 @@
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring
from ...utils.import_utils import requires
from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs


@requires(backends=("vision", "torch", "torchvision"))
@auto_docstring
class DeepseekVLImageProcessorPil(PilBackend):
resample = PILImageResampling.BICUBIC
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,14 @@
from typing import Union

import torch
import torchvision.transforms.v2.functional as tvF

from ...image_processing_backends import TorchvisionBackend
from ...image_processing_utils import BatchFeature, get_size_dict
from ...image_transforms import group_images_by_shape, reorder_images
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring, is_torchvision_available


if is_torchvision_available():
import torchvision.transforms.v2.functional as tvF
from ...utils import TensorType, auto_docstring


class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
Expand Down Expand Up @@ -176,8 +173,8 @@ def _preprocess(
size: SizeDict,
high_res_size: SizeDict,
min_size: int,
resample: "PILImageResampling | tvF.InterpolationMode | int | None",
high_res_resample: "PILImageResampling | tvF.InterpolationMode | int | None",
resample: "PILImageResampling | int | None",
high_res_resample: "PILImageResampling | int | None",
do_rescale: bool,
rescale_factor: float,
do_normalize: bool,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,16 @@
SizeDict,
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available
from ...utils import TensorType, auto_docstring, is_torch_available
from ...utils.import_utils import requires
from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs


if is_torch_available():
import torch
if is_torchvision_available():
import torchvision.transforms.v2.functional as tvF


@requires(backends=("vision", "torch", "torchvision"))
@auto_docstring
class DeepseekVLHybridImageProcessorPil(PilBackend):
resample = PILImageResampling.BICUBIC
Expand Down Expand Up @@ -150,8 +150,8 @@ def _preprocess(
size: SizeDict,
high_res_size: SizeDict,
min_size: int,
resample: "PILImageResampling | tvF.InterpolationMode | int | None",
high_res_resample: "PILImageResampling | tvF.InterpolationMode | int | None",
resample: "PILImageResampling | int | None",
high_res_resample: "PILImageResampling | int | None",
do_rescale: bool,
rescale_factor: float,
do_normalize: bool,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,7 @@
from ...modeling_outputs import BaseModelOutputWithPooling, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
TransformersKwargs,
auto_docstring,
can_return_tuple,
torch_compilable_check,
)
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
from ..auto import AutoModel
from .configuration_deepseek_vl_hybrid import DeepseekVLHybridConfig

Expand Down
Loading
Loading