@@ -654,30 +654,16 @@ def to_dict(self) -> dict[str, Any]:
654654 Returns:
655655 `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
656656 """
657- output = copy .deepcopy (self .__dict__ )
657+ # shallow copy to avoid deepcopy errors
658+ output = self .__dict__ .copy ()
658659
659- # Get the kwargs in `__init__`.
660660 sig = inspect .signature (self .__init__ )
661- # Only save the attributes that are presented in the kwargs of `__init__`.
662- # or in the attributes
663- attrs_to_save = list (sig .parameters ) + self .__class__ .attributes
664- # extra attributes to be kept
665- attrs_to_save += ["auto_map" ]
666-
667- if "tokenizer" in output :
668- del output ["tokenizer" ]
669- if "qformer_tokenizer" in output :
670- del output ["qformer_tokenizer" ]
671- if "protein_tokenizer" in output :
672- del output ["protein_tokenizer" ]
673- if "char_tokenizer" in output :
674- del output ["char_tokenizer" ]
675- if "chat_template" in output :
676- del output ["chat_template" ]
661+ attrs_to_save = list (sig .parameters ) + self .__class__ .attributes + ["auto_map" ]
662+
663+ for key in ["tokenizer" , "qformer_tokenizer" , "protein_tokenizer" , "char_tokenizer" , "chat_template" ]:
664+ output .pop (key , None )
677665
678666 def save_public_processor_class (dictionary ):
679- # make sure private name "_processor_class" is correctly
680- # saved as "processor_class"
681667 _processor_class = dictionary .pop ("_processor_class" , None )
682668 if _processor_class is not None :
683669 dictionary ["processor_class" ] = _processor_class
@@ -687,33 +673,24 @@ def save_public_processor_class(dictionary):
687673 return dictionary
688674
689675 def cast_array_to_list (dictionary ):
690- """
691- Numpy arrays are not serializable but can be in pre-processing dicts.
692- This function casts arrays to lists, recursing through the nested configs as well.
693- """
694676 for key , value in dictionary .items ():
695677 if isinstance (value , np .ndarray ):
696678 dictionary [key ] = value .tolist ()
697679 elif isinstance (value , dict ):
698680 dictionary [key ] = cast_array_to_list (value )
699681 return dictionary
700682
701- # Special case, add `audio_tokenizer` dict which points to model weights and path
702683 if "audio_tokenizer" in output :
703684 audio_tokenizer_dict = {
704685 "audio_tokenizer_class" : self .audio_tokenizer .__class__ .__name__ ,
705686 "audio_tokenizer_name_or_path" : self .audio_tokenizer .name_or_path ,
706687 }
707688 output ["audio_tokenizer" ] = audio_tokenizer_dict
708689
709- # Serialize attributes as a dict
710690 output = {
711691 k : v .to_dict () if isinstance (v , PushToHubMixin ) else v
712692 for k , v in output .items ()
713- if (
714- k in attrs_to_save # keep all attributes that have to be serialized
715- and v .__class__ .__name__ != "BeamSearchDecoderCTC" # remove attributes with that are objects
716- )
693+ if k in attrs_to_save and v .__class__ .__name__ != "BeamSearchDecoderCTC"
717694 }
718695 output = cast_array_to_list (output )
719696 output = save_public_processor_class (output )
0 commit comments