
Commit c2ab93f

shallow copy to avoid deepcopy errors

1 parent 73f91c5 commit c2ab93f
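
For background on the commit message: `copy.deepcopy` recurses into every attribute of the processor, so any attribute that cannot be deep-copied (for example, an object holding a thread lock or some other un-picklable handle) raises a TypeError, while a shallow `dict.copy()` duplicates only the top-level references and sidesteps the problem. A minimal sketch of the failure mode, using a hypothetical Handle class rather than a real processor attribute:

    import copy
    import threading

    class Handle:
        """Hypothetical attribute wrapping an un-deepcopy-able resource."""
        def __init__(self):
            self.lock = threading.Lock()  # locks cannot be pickled or deep-copied

    state = {"handle": Handle(), "size": 224}

    try:
        copy.deepcopy(state)  # recurses into Handle and hits the lock
    except TypeError as err:
        print(f"deepcopy failed: {err}")  # cannot pickle '_thread.lock' object

    shallow = state.copy()  # succeeds: copies only the top-level dict
    assert shallow["handle"] is state["handle"]  # same underlying object

The trade-off is that the shallow copy shares its values with `self.__dict__`, so downstream code must avoid mutating those values in place.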

File tree

2 files changed: +44 −30 lines
  src/transformers/processing_utils.py
  tests/test_processor_utils.py


src/transformers/processing_utils.py

Lines changed: 7 additions & 30 deletions
@@ -654,30 +654,16 @@ def to_dict(self) -> dict[str, Any]:
         Returns:
             `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
         """
-        output = copy.deepcopy(self.__dict__)
+        # shallow copy to avoid deepcopy errors
+        output = self.__dict__.copy()
 
-        # Get the kwargs in `__init__`.
         sig = inspect.signature(self.__init__)
-        # Only save the attributes that are presented in the kwargs of `__init__`.
-        # or in the attributes
-        attrs_to_save = list(sig.parameters) + self.__class__.attributes
-        # extra attributes to be kept
-        attrs_to_save += ["auto_map"]
-
-        if "tokenizer" in output:
-            del output["tokenizer"]
-        if "qformer_tokenizer" in output:
-            del output["qformer_tokenizer"]
-        if "protein_tokenizer" in output:
-            del output["protein_tokenizer"]
-        if "char_tokenizer" in output:
-            del output["char_tokenizer"]
-        if "chat_template" in output:
-            del output["chat_template"]
+        attrs_to_save = list(sig.parameters) + self.__class__.attributes + ["auto_map"]
+
+        for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]:
+            output.pop(key, None)
 
         def save_public_processor_class(dictionary):
-            # make sure private name "_processor_class" is correctly
-            # saved as "processor_class"
             _processor_class = dictionary.pop("_processor_class", None)
             if _processor_class is not None:
                 dictionary["processor_class"] = _processor_class
@@ -687,33 +673,24 @@ def save_public_processor_class(dictionary):
             return dictionary
 
         def cast_array_to_list(dictionary):
-            """
-            Numpy arrays are not serialiazable but can be in pre-processing dicts.
-            This function casts arrays to list, recusring through the nested configs as well.
-            """
             for key, value in dictionary.items():
                 if isinstance(value, np.ndarray):
                     dictionary[key] = value.tolist()
                 elif isinstance(value, dict):
                     dictionary[key] = cast_array_to_list(value)
             return dictionary
 
-        # Special case, add `audio_tokenizer` dict which points to model weights and path
         if "audio_tokenizer" in output:
             audio_tokenizer_dict = {
                 "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__,
                 "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path,
             }
             output["audio_tokenizer"] = audio_tokenizer_dict
 
-        # Serialize attributes as a dict
         output = {
             k: v.to_dict() if isinstance(v, PushToHubMixin) else v
             for k, v in output.items()
-            if (
-                k in attrs_to_save  # keep all attributes that have to be serialized
-                and v.__class__.__name__ != "BeamSearchDecoderCTC"  # remove attributes with that are objects
-            )
+            if k in attrs_to_save and v.__class__.__name__ != "BeamSearchDecoderCTC"
         }
         output = cast_array_to_list(output)
         output = save_public_processor_class(output)
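
The consolidated filtering above leans on two standard-library idioms: `inspect.signature` to collect the `__init__` parameter names for `attrs_to_save`, and `dict.pop(key, None)`, which removes a key without raising KeyError when it is absent. A small self-contained sketch of the same pattern, with a hypothetical Example class standing in for the processor:

    import inspect

    class Example:
        attributes = ["tokenizer1", "tokenizer2"]

        def __init__(self, tokenizer1=None, tokenizer2=None, image_processor=None):
            pass

    sig = inspect.signature(Example.__init__)
    # __init__ parameters, plus the declared attributes, plus the extra "auto_map" key
    attrs_to_save = list(sig.parameters) + Example.attributes + ["auto_map"]
    print(attrs_to_save)
    # ['self', 'tokenizer1', 'tokenizer2', 'image_processor', 'tokenizer1', 'tokenizer2', 'auto_map']

    output = {"tokenizer": object(), "chat_template": "...", "image_processor": object()}
    # pop() with a default never raises, even for keys that are not present
    for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]:
        output.pop(key, None)
    print(list(output))  # ['image_processor']

Duplicates in `attrs_to_save` are harmless, since the list is only used for membership tests in the comprehension that follows.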

tests/test_processor_utils.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+import tempfile
+
+from transformers.testing_utils import TestCasePlus
+from transformers import ProcessorMixin, AutoTokenizer, PreTrainedTokenizer
+
+
+class ProcessorSavePretrainedMultipleAttributes(TestCasePlus):
+    def test_processor_loads_separate_attributes(self):
+        class OtherProcessor(ProcessorMixin):
+            name = "other-processor"
+
+            attributes = [
+                "tokenizer1",
+                "tokenizer2",
+            ]
+            tokenizer1_class = "AutoTokenizer"
+            tokenizer2_class = "AutoTokenizer"
+
+            def __init__(self,
+                         tokenizer1: PreTrainedTokenizer,
+                         tokenizer2: PreTrainedTokenizer
+                         ):
+                super().__init__(tokenizer1=tokenizer1,
+                                 tokenizer2=tokenizer2)
+
+        tokenizer1 = AutoTokenizer.from_pretrained("google/gemma-3-270m")
+        tokenizer2 = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B")
+
+        processor = OtherProcessor(tokenizer1=tokenizer1,
+                                   tokenizer2=tokenizer2)
+        assert processor.tokenizer1.__class__ != processor.tokenizer2.__class__
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            processor.save_pretrained(save_directory=temp_dir, push_to_hub=False)
+            new_processor = OtherProcessor.from_pretrained(temp_dir)
+
+        assert new_processor.tokenizer1.__class__ != new_processor.tokenizer2.__class__

0 commit comments