diff --git a/paddlenlp/transformers/luke/tokenizer.py b/paddlenlp/transformers/luke/tokenizer.py
index 7653c96526c2..56f8242b87ce 100644
--- a/paddlenlp/transformers/luke/tokenizer.py
+++ b/paddlenlp/transformers/luke/tokenizer.py
@@ -482,6 +482,12 @@ def __call__(
 
         return encode_output
 
+    def __len__(self):
+        """
+        Size of the full vocabulary with the added tokens.
+        """
+        return len(self.encoder) + len(self.added_tokens_encoder)
+
     def tokenize(self, text, add_prefix_space=False):
         """
         Tokenize a string.
@@ -608,7 +614,7 @@ def _convert_token_to_id_with_added_voc(self, token):
 
         return self._convert_token_to_id(token)
 
-    def add_special_tokens(self, token_list: Union[List[int], Dict]):
+    def add_special_tokens(self, token_list: Union[List[int], Dict], replace_additional_special_tokens: bool = True):
         """
         Adding special tokens if you need.
 
@@ -616,14 +622,33 @@ def add_special_tokens(self, token_list: Union[List[int], Dict]):
             token_list (List[int], Dict[List[int]]):
                 The special token list you provided. If you provide a Dict, the key of the Dict must
                 be "additional_special_tokens" and the value must be token list.
+            replace_additional_special_tokens (bool, optional, defaults to True):
+                If True, the existing list of additional special tokens will be replaced by the list provided in
+                `token_list`. Otherwise, `self._additional_special_tokens` is just extended. In the former
+                case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
+                as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the
+                `added_tokens_encoder` and `added_tokens_decoder`. This means that the previous
+                `additional_special_tokens` are still added tokens, and will not be split by the model.
         """
         if isinstance(token_list, dict):
             token_list = token_list["additional_special_tokens"]
+
+        if replace_additional_special_tokens:
+            self._additional_special_tokens = list(token_list)
+        else:
+            self._additional_special_tokens.extend(
+                [token for token in token_list if token not in self._additional_special_tokens]
+            )
         encoder_dict = dict()
         decoder_dict = dict()
+
+        token_id_counter = len(self)
         for token in token_list:
-            encoder_dict[token] = len(self.encoder.keys())
-            decoder_dict[len(self.decoder.keys())] = token
+            if token not in self.added_tokens_encoder:
+                encoder_dict[token] = token_id_counter
+                decoder_dict[token_id_counter] = token
+                token_id_counter += 1
+
         self.added_tokens_encoder.update(encoder_dict)
         self.added_tokens_decoder.update(decoder_dict)
 
diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py
index 535dd657d0e6..6af5cc29e5d4 100644
--- a/paddlenlp/transformers/tokenizer_utils_base.py
+++ b/paddlenlp/transformers/tokenizer_utils_base.py
@@ -801,14 +801,16 @@ def sanitize_special_tokens(self) -> int:
         """
         return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
 
-    def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
+    def add_special_tokens(
+        self, special_tokens_dict: Dict[str, Union[str, AddedToken]], replace_additional_special_tokens=True
+    ) -> int:
         """
         Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
         special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
         current vocabulary).
 
-        Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding
-        matrix of the model so that its embedding matrix matches the tokenizer.
+        When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the
+        model so that its embedding matrix matches the tokenizer.
 
         In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
 
@@ -829,6 +831,13 @@ def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToke
                 Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                 assign the index of the `unk_token` to them).
+            replace_additional_special_tokens (`bool`, *optional*, defaults to `True`):
+                If `True`, the existing list of additional special tokens will be replaced by the list provided in
+                `special_tokens_dict`. Otherwise, `self._additional_special_tokens` is just extended. In the former
+                case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
+                as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the
+                `added_tokens_encoder` and `added_tokens_decoder`. This means that the previous
+                `additional_special_tokens` are still added tokens, and will not be split by the model.
 
         Returns:
             `int`: Number of tokens added to the vocabulary.
@@ -852,25 +861,38 @@ def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToke
         if not special_tokens_dict:
             return 0
 
-        added_tokens = 0
+        added_tokens = []
         for key, value in special_tokens_dict.items():
             assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"
 
             if self.verbose:
                 logger.info(f"Assigning {value} to the {key} key of the tokenizer")
-            setattr(self, key, value)
 
             if key == "additional_special_tokens":
                 assert isinstance(value, (list, tuple)) and all(
                     isinstance(t, (str, AddedToken)) for t in value
                 ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
-                added_tokens += self.add_tokens(value, special_tokens=True)
-            else:
-                assert isinstance(
-                    value, (str, AddedToken)
-                ), f"Token {value} for key {key} should be a str or an AddedToken instance"
-                added_tokens += self.add_tokens([value], special_tokens=True)
+                to_add = []
+                for token in value:
+                    if not replace_additional_special_tokens and str(token) in self.additional_special_tokens:
+                        continue
+                    to_add.append(token)
+                if replace_additional_special_tokens and len(to_add) > 0:
+                    setattr(self, key, list(to_add))
+                else:
+                    self._additional_special_tokens.extend(to_add)
+                added_tokens += to_add
+
+            else:
+                if not isinstance(value, (str, AddedToken)):
+                    raise ValueError(f"Token {value} for key {key} should be a str or an AddedToken instance")
+                setattr(self, key, value)
+                if value not in added_tokens:
+                    added_tokens.append(value)
+
+        # if we are adding tokens that were not part of the vocab, we ought to add them
+        added_tokens = self.add_tokens(added_tokens, special_tokens=True)
 
         return added_tokens
 
     def add_tokens(
diff --git a/tests/transformers/test_tokenizer_common.py b/tests/transformers/test_tokenizer_common.py
index b86db0d6748f..7d78bfb09e0f 100644
--- a/tests/transformers/test_tokenizer_common.py
+++ b/tests/transformers/test_tokenizer_common.py
@@ -1156,6 +1156,31 @@ def test_maximum_encoding_length_pair_input(self):
 
         # self.assertEqual(encoded_masked, encoded_1)
 
+    def test_special_token_addition(self):
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + # Create tokenizer and add an additional special token + tokenizer_1 = tokenizer.from_pretrained(pretrained_name) + tokenizer_1.add_special_tokens({"additional_special_tokens": [""]}) + self.assertEqual(tokenizer_1.additional_special_tokens, [""]) + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_1.save_pretrained(tmp_dir) + # Load the above tokenizer and add the same special token a second time + tokenizer_2 = tokenizer.from_pretrained(pretrained_name) + tokenizer_2.add_special_tokens({"additional_special_tokens": [""]}) + self.assertEqual(tokenizer_2.additional_special_tokens, [""]) + + tokenizer_2.add_special_tokens({"additional_special_tokens": ["", ""]}) + self.assertEqual(tokenizer_2.additional_special_tokens, ["", ""]) + tokenizer_2.add_special_tokens({"additional_special_tokens": ["", ""]}) + self.assertEqual(tokenizer_2.additional_special_tokens, ["", ""]) + + tokenizer_2.add_special_tokens( + {"additional_special_tokens": [""]}, + replace_additional_special_tokens=False, + ) + self.assertEqual(tokenizer_2.additional_special_tokens, ["", "", ""]) + def test_special_tokens_mask(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: