@@ -3814,7 +3814,7 @@ def _xlmroberta_set_vocab(self) -> None:
             remove_whitespaces = tokenizer.clean_up_tokenization_spaces
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])

-            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
         else:
             sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3827,7 +3827,7 @@ def _xlmroberta_set_vocab(self) -> None:
             tokenizer = SentencePieceProcessor()
             tokenizer.LoadFromFile(str(tokenizer_path))

-            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
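
Note on the two vocab_size changes above: taking the max() of the configured and the actual tokenizer size guards against a config.json that understates (or omits) vocab_size, so the pre-sized tokens/scores lists stay long enough for every id the tokenizer can produce. A minimal sketch with made-up numbers, not taken from any particular model:

    # hypothetical values: config.json understates the vocabulary size
    hparams = {"vocab_size": 250000}
    tok_vocab = 250002                                  # tokenizer.vocab_size

    old = hparams.get("vocab_size", tok_vocab)          # 250000 -> lists too short
    new = max(hparams.get("vocab_size", 0), tok_vocab)  # 250002 -> covers every id
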
@@ -3857,33 +3857,26 @@ def _xlmroberta_set_vocab(self) -> None:
             unk_token = tokenizer_config_json.get("unk_token")
             unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))

-            for token_id in range(vocab_size):
+            for token_id in range(tokenizer.vocab_size):
                 piece = tokenizer._convert_id_to_token(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer_json["model"]["vocab"][token_id][1]
-
-                toktype = SentencePieceTokenTypes.NORMAL
-                if token_id == unk_token_id:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif token_id in tokenizer.all_special_ids:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif token_id in added_vocab.values():
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                # No reliable way to detect this, but jina doesn't have any
-                # elif tokenizer.IsByte(token_id):
-                #     toktype = SentencePieceTokenTypes.BYTE
-
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
-
-            if vocab_size > len(tokens):
-                pad_count = vocab_size - len(tokens)
-                logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-                for i in range(1, pad_count + 1):
-                    tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.UNUSED)
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype

         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)
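
The explicit padding fallback dropped above is redundant now that tokens, scores and toktypes are pre-sized to vocab_size: the loop only walks tokenizer.vocab_size ids, skips any id the HF tokenizer maps to None, and every remaining slot keeps its [PAD{i}] placeholder. A small standalone sketch of that pre-fill-then-overwrite pattern, with a made-up id-to-piece dict standing in for tokenizer._convert_id_to_token():

    # hypothetical illustration, not part of the conversion script
    vocab_size = 8                                   # padded embedding width
    id_to_piece = {0: "<s>", 1: "</s>", 2: "hello"}  # ids 3..7 are unmapped

    tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
    for token_id in range(len(id_to_piece)):         # plays the role of tokenizer.vocab_size
        if (piece := id_to_piece.get(token_id)) is not None:
            tokens[token_id] = piece.encode("utf-8")

    # unmapped ids keep their placeholders:
    # [b'<s>', b'</s>', b'hello', b'[PAD3]', ..., b'[PAD7]']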