@@ -190,7 +190,7 @@ class TokenizerArgs:
190
190
tokenizer_path : Optional [Union [Path , str ]] = None
191
191
is_sentencepiece : bool = False
192
192
is_tiktoken : bool = False
193
- is_tokenizers : bool = False
193
+ is_hf_tokenizer : bool = False
194
194
t : Optional [Any ] = None
195
195
196
196
def __post_init__ (self ):
@@ -200,7 +200,7 @@ def __post_init__(self):
200
200
self .t = TiktokenTokenizer (model_path = str (self .tokenizer_path ))
201
201
self .is_tiktoken = True
202
202
self .is_sentencepiece = False
203
- self .is_tokenizers = False
203
+ self .is_hf_tokenizer = False
204
204
return
205
205
except :
206
206
pass
@@ -211,25 +211,25 @@ def __post_init__(self):
211
211
self .t = SentencePieceProcessor (model_file = str (self .tokenizer_path ))
212
212
self .is_tiktoken = False
213
213
self .is_sentencepiece = True
214
- self .is_tokenizers = False
214
+ self .is_hf_tokenizer = False
215
215
return
216
216
except :
217
217
pass
218
218
219
219
try :
220
- from tokenizer .tokenizers import TokenizersTokenizer
220
+ from tokenizer .hf_tokenizer import HFTokenizer
221
221
222
- self .t = TokenizersTokenizer (str (self .tokenizer_path ))
222
+ self .t = HFTokenizer (str (self .tokenizer_path ))
223
223
self .is_tiktoken = False
224
224
self .is_sentencepiece = False
225
- self .is_tokenizers = True
225
+ self .is_hf_tokenizer = True
226
226
return
227
227
except :
228
228
pass
229
229
230
230
self .is_tiktoken = False
231
231
self .is_sentencepiece = False
232
- self .is_tokenizers = False
232
+ self .is_hf_tokenizer = False
233
233
self .t = None
234
234
return
235
235
@@ -241,25 +241,25 @@ def validate_model(
241
241
if model is None :
242
242
return
243
243
244
- if len ( list ( filter ( lambda x : x , [self .is_tiktoken , self .is_tokenizers , self .is_sentencepiece ])) ) != 1 :
244
+ if sum ( [self .is_tiktoken , self .is_hf_tokenizer , self .is_sentencepiece ]) != 1 :
245
245
raise RuntimeError (f"no tokenizer was found at { self .tokenizer_path } " )
246
246
247
247
is_tiktoken = self .is_tiktoken
248
248
is_sentencepiece = self .is_sentencepiece
249
- is_tokenizers = self .is_tokenizers
249
+ is_hf_tokenizer = self .is_hf_tokenizer
250
250
use_tiktoken = model .config .use_tiktoken
251
251
use_tokenizers = model .config .use_tokenizers
252
252
use_sentencepiece = not (use_tiktoken or use_tokenizers )
253
253
254
254
if (
255
255
(is_tiktoken and not use_tiktoken ) or
256
- (is_tokenizers and not use_tokenizers ) or
256
+ (is_hf_tokenizer and not use_tokenizers ) or
257
257
(is_sentencepiece and not use_sentencepiece )
258
258
):
259
259
raise RuntimeError (
260
260
"model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}" .format (
261
261
tokenizer_setting_to_name (use_tiktoken , use_tokenizers ),
262
- tokenizer_setting_to_name (is_tiktoken , is_tokenizers ),
262
+ tokenizer_setting_to_name (is_tiktoken , is_hf_tokenizer ),
263
263
model_description ,
264
264
)
265
265
)
0 commit comments