huggingface · ArthurZucker · Mar 27, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -175,6 +175,28 @@ jobs:
             - store_artifacts:
                   path: ~/transformers/installed.txt
             - run: make check-repository-consistency
+            - run:
+                name: "Test import with all backends (torch + PIL + torchvision)"
+                command: python -c "from transformers import *" || (echo '🚨 import failed with all backends. Fix unprotected imports!! 🚨'; exit 1)
+            - run:
+                name: "Test import with torch only (no PIL, no torchvision)"
+                command: |
+                    uv pip uninstall Pillow torchvision -q
+                    python -c "from transformers import *" || (echo '🚨 import failed with torch only (no PIL). Fix unprotected imports!! 🚨'; exit 1)
+                    uv pip install -e ".[quality]" -q
+            - run:
+                name: "Test import with PIL only (no torch, no torchvision)"
+                command: |
+                    uv pip uninstall torch torchvision torchaudio -q
+                    python -c "from transformers import *" || (echo '🚨 import failed with PIL only (no torch). Fix unprotected imports!! 🚨'; exit 1)
+                    uv pip install -e ".[quality]" -q
+            - run:
+                name: "Test import with torch + PIL, no torchvision"
+                command: |
+                    uv pip uninstall torchvision -q
+                    python -c "from transformers import *" || (echo '🚨 import failed with torch+PIL but no torchvision. Fix unprotected imports!! 🚨'; exit 1)
+                    uv pip install -e ".[quality]" -q
+
 
 workflows:
     version: 2

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -330,7 +330,7 @@
         name for name in dir(dummy_vision_objects) if not name.startswith("_")
     ]
 else:
-    _import_structure["image_processing_backends"] = ["PilBackend", "TorchvisionBackend"]
+    _import_structure["image_processing_backends"] = ["PilBackend"]
     _import_structure["image_processing_base"] = ["ImageProcessingMixin"]
     _import_structure["image_processing_utils"] = ["BaseImageProcessor"]
     _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
@@ -345,6 +345,8 @@
         name for name in dir(dummy_torchvision_objects) if not name.startswith("_")
     ]
 else:
+    _import_structure.setdefault("image_processing_backends", [])
+    _import_structure["image_processing_backends"] += ["TorchvisionBackend"]
     _import_structure["video_processing_utils"] = ["BaseVideoProcessor"]
 
 # PyTorch-backed objects

diff --git a/src/transformers/image_processing_backends.py b/src/transformers/image_processing_backends.py
@@ -58,9 +58,8 @@
     is_torchvision_available,
     is_vision_available,
     logging,
-    requires_backends,
 )
-from .utils.import_utils import is_rocm_platform, is_torchdynamo_compiling
+from .utils.import_utils import is_rocm_platform, is_torchdynamo_compiling, requires
 
 
 if is_vision_available():
@@ -81,11 +80,11 @@
 logger = logging.get_logger(__name__)
 
 
+@requires(backends=("torch", "torchvision"))
 class TorchvisionBackend(BaseImageProcessor):
     """Torchvision backend for GPU-accelerated batched image processing."""
 
     def __init__(self, **kwargs: Unpack[ImagesKwargs]):
-        requires_backends(self, "torchvision")
         super().__init__(**kwargs)
         self._set_attributes(**kwargs)
 
@@ -407,6 +406,7 @@ def _preprocess(
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
 
+@requires(backends=("vision",))
 class PilBackend(BaseImageProcessor):
     """PIL/NumPy backend for portable CPU-only image processing."""
 

diff --git a/src/transformers/models/aria/image_processing_pil_aria.py b/src/transformers/models/aria/image_processing_pil_aria.py
@@ -25,14 +25,12 @@
     get_image_size,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, is_torchvision_available
+from ...utils import TensorType, auto_docstring
+from ...utils.import_utils import requires
 from .image_processing_aria import AriaImageProcessorKwargs
 
 
-if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as tvF
-
-
+@requires(backends=("vision", "torch", "torchvision"))
 @auto_docstring
 class AriaImageProcessorPil(PilBackend):
     model_input_names = ["pixel_values", "pixel_mask", "num_crops"]
@@ -67,7 +65,7 @@ def _resize_for_patching(
         self,
         image: np.ndarray,
         target_resolution: tuple,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        resample: "PILImageResampling | int | None",
     ) -> np.ndarray:
         """Resize an image to a target resolution while maintaining aspect ratio."""
         new_height, new_width = get_patch_output_size(
@@ -92,7 +90,7 @@ def get_image_patches(
         image: np.ndarray,
         grid_pinpoints: list[list[int]],
         patch_size: int,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        resample: "PILImageResampling | int | None",
     ) -> list[np.ndarray]:
         """
         Process an image with variable resolutions by dividing it into patches.
@@ -104,7 +102,7 @@ def get_image_patches(
                 A list of possible resolutions as (height, width) pairs.
             patch_size (`int`):
                 Size of each square patch to divide the image into.
-            resample (`PILImageResampling | tvF.InterpolationMode | int | None`):
+            resample (`PILImageResampling | int | None`):
                 Resampling filter to use when resizing.
 
         Returns:
@@ -133,7 +131,7 @@ def _preprocess(
         min_image_size: int = 336,
         split_resolutions: list[list[int]] | None = None,
         split_image: bool = False,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None" = None,
+        resample: "PILImageResampling | int | None" = None,
         **kwargs,
     ) -> BatchFeature:
         if max_image_size not in [490, 980]:

diff --git a/src/transformers/models/beit/image_processing_pil_beit.py b/src/transformers/models/beit/image_processing_pil_beit.py
@@ -14,6 +14,9 @@
 """Image processor class for BEiT."""
 
 import numpy as np
+import torch
+import torch.nn.functional as F
+from torchvision.transforms.v2 import functional as tvF
 
 from ...image_processing_backends import PilBackend
 from ...image_processing_utils import BatchFeature
@@ -26,19 +29,13 @@
     SizeDict,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available
+from ...utils import TensorType, auto_docstring, is_torch_available
+from ...utils.import_utils import requires
 from .image_processing_beit import BeitImageProcessorKwargs
 
 
-if is_torch_available():
-    import torch
-    import torch.nn.functional as F
-
-if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as tvF
-
-
 @auto_docstring
+@requires(backends=("vision", "torch", "torchvision"))
 class BeitImageProcessorPil(PilBackend):
     """PIL backend for BEiT with reduce_label support."""
 

diff --git a/src/transformers/models/bridgetower/image_processing_pil_bridgetower.py b/src/transformers/models/bridgetower/image_processing_pil_bridgetower.py
@@ -24,14 +24,12 @@
     SizeDict,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, is_torchvision_available
+from ...utils import TensorType, auto_docstring
+from ...utils.import_utils import requires
 from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs, get_resize_output_image_size
 
 
-if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as tvF
-
-
+@requires(backends=("vision", "torch", "torchvision"))
 @auto_docstring
 class BridgeTowerImageProcessorPil(PilBackend):
     """PIL backend for BridgeTower with custom resize and center_crop."""
@@ -59,7 +57,7 @@ def resize(
         self,
         image: np.ndarray,
         size: SizeDict,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        resample: "PILImageResampling | int | None",
         size_divisor: int = 32,
         **kwargs,
     ) -> np.ndarray:
@@ -84,7 +82,7 @@ def _preprocess(
         images: list[np.ndarray],
         do_resize: bool,
         size: SizeDict,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        resample: "PILImageResampling | int | None",
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,

diff --git a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py
@@ -57,6 +57,7 @@
     logging,
     requires_backends,
 )
+from ...utils.import_utils import requires
 from .image_processing_conditional_detr import (
     ConditionalDetrImageProcessorKwargs,
     compute_segments,
@@ -262,6 +263,7 @@ def prepare_coco_panoptic_annotation(
     return new_target
 
 
+@requires(backends=("vision", "torch", "torchvision"))
 @auto_docstring
 class ConditionalDetrImageProcessorPil(PilBackend):
     resample = PILImageResampling.BILINEAR
@@ -676,6 +678,7 @@ def _preprocess(
             ]
         return encoded_inputs
 
+    @requires(backends=("vision", "torch"))
     def post_process_object_detection(
         self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
     ):
@@ -736,6 +739,7 @@ def post_process_object_detection(
 
         return results
 
+    @requires(backends=("vision", "torch"))
     def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple[int, int]] | None = None):
         """
         Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
@@ -784,6 +788,7 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple[i
 
         return semantic_segmentation
 
+    @requires(backends=("vision", "torch"))
     def post_process_instance_segmentation(
         self,
         outputs,
@@ -872,6 +877,7 @@ def post_process_instance_segmentation(
             results.append({"segmentation": segmentation, "segments_info": segments})
         return results
 
+    @requires(backends=("vision", "torch"))
     def post_process_panoptic_segmentation(
         self,
         outputs,

diff --git a/src/transformers/models/conditional_detr/modular_conditional_detr.py b/src/transformers/models/conditional_detr/modular_conditional_detr.py
@@ -34,6 +34,7 @@
     requires_backends,
 )
 from ...utils.generic import can_return_tuple, merge_with_config_defaults
+from ...utils.import_utils import requires
 from ...utils.output_capturing import OutputRecorder, capture_outputs
 from ..deformable_detr.modeling_deformable_detr import inverse_sigmoid
 from ..detr.image_processing_detr import DetrImageProcessor
@@ -173,6 +174,7 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple[i
 
 
 class ConditionalDetrImageProcessorPil(DetrImageProcessorPil):
+    @requires(backends=("vision", "torch"))
     def post_process_object_detection(
         self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
     ):
@@ -233,6 +235,7 @@ def post_process_object_detection(
 
         return results
 
+    @requires(backends=("vision", "torch"))
     def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple[int, int]] | None = None):
         """
         Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.

diff --git a/src/transformers/models/convnext/image_processing_pil_convnext.py b/src/transformers/models/convnext/image_processing_pil_convnext.py
@@ -26,14 +26,12 @@
     SizeDict,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, is_torchvision_available
+from ...utils import TensorType, auto_docstring
+from ...utils.import_utils import requires
 from .image_processing_convnext import ConvNextImageProcessorKwargs
 
 
-if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as tvF
-
-
+@requires(backends=("vision", "torch", "torchvision"))
 @auto_docstring
 class ConvNextImageProcessorPil(PilBackend):
     """PIL backend for ConvNeXT with custom resize."""
@@ -57,7 +55,7 @@ def resize(
         self,
         image: np.ndarray,
         size: SizeDict,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        resample: "PILImageResampling | int | None",
         crop_pct: float = 224 / 256,
         **kwargs,
     ) -> np.ndarray:
@@ -98,7 +96,7 @@ def _preprocess(
         images: list[np.ndarray],
         do_resize: bool,
         size: SizeDict,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        resample: "PILImageResampling | int | None",
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,

diff --git a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py
@@ -34,9 +34,11 @@
 )
 from ...processing_utils import Unpack
 from ...utils import TensorType, auto_docstring
+from ...utils.import_utils import requires
 from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs
 
 
+@requires(backends=("vision", "torch", "torchvision"))
 @auto_docstring
 class DeepseekVLImageProcessorPil(PilBackend):
     resample = PILImageResampling.BICUBIC

diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py
@@ -22,17 +22,14 @@
 from typing import Union
 
 import torch
+import torchvision.transforms.v2.functional as tvF
 
 from ...image_processing_backends import TorchvisionBackend
 from ...image_processing_utils import BatchFeature, get_size_dict
 from ...image_transforms import group_images_by_shape, reorder_images
 from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
 from ...processing_utils import ImagesKwargs, Unpack
-from ...utils import TensorType, auto_docstring, is_torchvision_available
-
-
-if is_torchvision_available():
-    import torchvision.transforms.v2.functional as tvF
+from ...utils import TensorType, auto_docstring
 
 
 class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
@@ -176,8 +173,8 @@ def _preprocess(
         size: SizeDict,
         high_res_size: SizeDict,
         min_size: int,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
-        high_res_resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        resample: "PILImageResampling | int | None",
+        high_res_resample: "PILImageResampling | int | None",
         do_rescale: bool,
         rescale_factor: float,
         do_normalize: bool,

diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py
@@ -34,16 +34,16 @@
     SizeDict,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available
+from ...utils import TensorType, auto_docstring, is_torch_available
+from ...utils.import_utils import requires
 from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs
 
 
 if is_torch_available():
     import torch
-if is_torchvision_available():
-    import torchvision.transforms.v2.functional as tvF
 
 
+@requires(backends=("vision", "torch", "torchvision"))
 @auto_docstring
 class DeepseekVLHybridImageProcessorPil(PilBackend):
     resample = PILImageResampling.BICUBIC
@@ -150,8 +150,8 @@ def _preprocess(
         size: SizeDict,
         high_res_size: SizeDict,
         min_size: int,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
-        high_res_resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        resample: "PILImageResampling | int | None",
+        high_res_resample: "PILImageResampling | int | None",
         do_rescale: bool,
         rescale_factor: float,
         do_normalize: bool,

diff --git a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
@@ -29,12 +29,7 @@
 from ...modeling_outputs import BaseModelOutputWithPooling, ModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import (
-    TransformersKwargs,
-    auto_docstring,
-    can_return_tuple,
-    torch_compilable_check,
-)
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
 from ..auto import AutoModel
 from .configuration_deepseek_vl_hybrid import DeepseekVLHybridConfig