
Commit c2ab93f

shallow copy to avoid deepcopy errors

1 parent 73f91c5 commit c2ab93f
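
For background on the commit message: `copy.deepcopy` recurses into every attribute of the processor, so any attribute that cannot be deep-copied (for example, an object holding a thread lock or some other un-picklable handle) raises a TypeError, while a shallow `dict.copy()` duplicates only the top-level references and sidesteps the problem. A minimal sketch of the failure mode, using a hypothetical Handle class rather than a real processor attribute:

    import copy
    import threading

    class Handle:
        """Hypothetical attribute wrapping an un-deepcopy-able resource."""
        def __init__(self):
            self.lock = threading.Lock()  # locks cannot be pickled or deep-copied

    state = {"handle": Handle(), "size": 224}

    try:
        copy.deepcopy(state)  # recurses into Handle and hits the lock
    except TypeError as err:
        print(f"deepcopy failed: {err}")  # cannot pickle '_thread.lock' object

    shallow = state.copy()  # succeeds: copies only the top-level dict
    assert shallow["handle"] is state["handle"]  # same underlying object

The trade-off is that the shallow copy shares its values with `self.__dict__`, so downstream code must avoid mutating those values in place.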

File tree

2 files changed: +44 −30 lines
  src/transformers/processing_utils.py
  tests/test_processor_utils.py


src/transformers/processing_utils.py

Lines changed: 7 additions & 30 deletions
@@ -654,30 +654,16 @@ def to_dict(self) -> dict[str, Any]:
         Returns:
             `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
         """
-        output = copy.deepcopy(self.__dict__)
+        # shallow copy to avoid deepcopy errors
+        output = self.__dict__.copy()
 
-        # Get the kwargs in `__init__`.
         sig = inspect.signature(self.__init__)
-        # Only save the attributes that are presented in the kwargs of `__init__`.
-        # or in the attributes
-        attrs_to_save = list(sig.parameters) + self.__class__.attributes
-        # extra attributes to be kept
-        attrs_to_save += ["auto_map"]
-
-        if "tokenizer" in output:
-            del output["tokenizer"]
-        if "qformer_tokenizer" in output:
-            del output["qformer_tokenizer"]
-        if "protein_tokenizer" in output:
-            del output["protein_tokenizer"]
-        if "char_tokenizer" in output:
-            del output["char_tokenizer"]
-        if "chat_template" in output:
-            del output["chat_template"]
+        attrs_to_save = list(sig.parameters) + self.__class__.attributes + ["auto_map"]
+
+        for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]:
+            output.pop(key, None)
 
         def save_public_processor_class(dictionary):
-            # make sure private name "_processor_class" is correctly
-            # saved as "processor_class"
             _processor_class = dictionary.pop("_processor_class", None)
             if _processor_class is not None:
                 dictionary["processor_class"] = _processor_class
@@ -687,33 +673,24 @@ def save_public_processor_class(dictionary):
             return dictionary
 
         def cast_array_to_list(dictionary):
-            """
-            Numpy arrays are not serialiazable but can be in pre-processing dicts.
-            This function casts arrays to list, recusring through the nested configs as well.
-            """
             for key, value in dictionary.items():
                 if isinstance(value, np.ndarray):
                     dictionary[key] = value.tolist()
                 elif isinstance(value, dict):
                     dictionary[key] = cast_array_to_list(value)
             return dictionary
 
-        # Special case, add `audio_tokenizer` dict which points to model weights and path
         if "audio_tokenizer" in output:
             audio_tokenizer_dict = {
                 "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__,
                 "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path,
             }
             output["audio_tokenizer"] = audio_tokenizer_dict
 
-        # Serialize attributes as a dict
         output = {
             k: v.to_dict() if isinstance(v, PushToHubMixin) else v
             for k, v in output.items()
-            if (
-                k in attrs_to_save  # keep all attributes that have to be serialized
-                and v.__class__.__name__ != "BeamSearchDecoderCTC"  # remove attributes with that are objects
-            )
+            if k in attrs_to_save and v.__class__.__name__ != "BeamSearchDecoderCTC"
         }
         output = cast_array_to_list(output)
         output = save_public_processor_class(output)
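
The consolidated filtering above leans on two standard-library idioms: `inspect.signature` to collect the `__init__` parameter names for `attrs_to_save`, and `dict.pop(key, None)`, which removes a key without raising KeyError when it is absent. A small self-contained sketch of the same pattern, with a hypothetical Example class standing in for the processor:

    import inspect

    class Example:
        attributes = ["tokenizer1", "tokenizer2"]

        def __init__(self, tokenizer1=None, tokenizer2=None, image_processor=None):
            pass

    sig = inspect.signature(Example.__init__)
    # __init__ parameters, plus the declared attributes, plus the extra "auto_map" key
    attrs_to_save = list(sig.parameters) + Example.attributes + ["auto_map"]
    print(attrs_to_save)
    # ['self', 'tokenizer1', 'tokenizer2', 'image_processor', 'tokenizer1', 'tokenizer2', 'auto_map']

    output = {"tokenizer": object(), "chat_template": "...", "image_processor": object()}
    # pop() with a default never raises, even for keys that are not present
    for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]:
        output.pop(key, None)
    print(list(output))  # ['image_processor']

Duplicates in `attrs_to_save` are harmless, since the list is only used for membership tests in the comprehension that follows.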

tests/test_processor_utils.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+import tempfile
+
+from transformers.testing_utils import TestCasePlus
+from transformers import ProcessorMixin, AutoTokenizer, PreTrainedTokenizer
+
+
+class ProcessorSavePretrainedMultipleAttributes(TestCasePlus):
+    def test_processor_loads_separate_attributes(self):
+        class OtherProcessor(ProcessorMixin):
+            name = "other-processor"
+
+            attributes = [
+                "tokenizer1",
+                "tokenizer2",
+            ]
+            tokenizer1_class = "AutoTokenizer"
+            tokenizer2_class = "AutoTokenizer"
+
+            def __init__(self,
+                         tokenizer1: PreTrainedTokenizer,
+                         tokenizer2: PreTrainedTokenizer
+                         ):
+                super().__init__(tokenizer1=tokenizer1,
+                                 tokenizer2=tokenizer2)
+
+        tokenizer1 = AutoTokenizer.from_pretrained("google/gemma-3-270m")
+        tokenizer2 = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B")
+
+        processor = OtherProcessor(tokenizer1=tokenizer1,
+                                   tokenizer2=tokenizer2)
+        assert processor.tokenizer1.__class__ != processor.tokenizer2.__class__
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            processor.save_pretrained(save_directory=temp_dir, push_to_hub=False)
+            new_processor = OtherProcessor.from_pretrained(temp_dir)
+
+        assert new_processor.tokenizer1.__class__ != new_processor.tokenizer2.__class__

0 commit comments