
Commit ca7f7ee

fix(hf_tokenizer): Rename to HFTokenizer and corresponding flags
#1251  Branch: TokenizersTokenizer-1251
Co-Authored-By: jackkhuu@fb.com
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
1 parent 87bcf5c commit ca7f7ee

File tree

3 files changed, +19 −19 lines changed


tokenizer/tokenizers.py renamed to tokenizer/hf_tokenizer.py (+2 −2)
@@ -16,9 +16,9 @@
 from .base import TokenizerBase


-class TokenizersTokenizer(TokenizerBase):
+class HFTokenizer(TokenizerBase):
     """
-    Wrapper around the `tokenizers` library for API compatibility
+    Wrapper around the Huggingface `tokenizers` library for API compatibility
     """

     def __init__(self, file_path: str):
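The renamed file's body is not shown beyond the constructor signature. As a hedged sketch of what a wrapper like this typically looks like, built on the real Tokenizer.from_file API from the HuggingFace tokenizers package; only HFTokenizer, TokenizerBase, and the constructor signature come from the diff, the encode/decode names and internals are assumptions:

# Hedged sketch only, not the actual file contents. Wraps the HuggingFace
# `tokenizers` package; `Tokenizer.from_file` loads a tokenizer.json.
from typing import List

from tokenizers import Tokenizer

from .base import TokenizerBase


class HFTokenizer(TokenizerBase):
    """
    Wrapper around the Huggingface `tokenizers` library for API compatibility
    """

    def __init__(self, file_path: str):
        self._tokenizer = Tokenizer.from_file(file_path)

    def encode(self, text: str) -> List[int]:
        # Encoding objects expose their token ids via `.ids`
        return self._tokenizer.encode(text).ids

    def decode(self, ids: List[int]) -> str:
        return self._tokenizer.decode(ids)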

torchchat/cli/builder.py (+11 −11)
@@ -190,7 +190,7 @@ class TokenizerArgs:
     tokenizer_path: Optional[Union[Path, str]] = None
     is_sentencepiece: bool = False
     is_tiktoken: bool = False
-    is_tokenizers: bool = False
+    is_hf_tokenizer: bool = False
     t: Optional[Any] = None

     def __post_init__(self):
@@ -200,7 +200,7 @@ def __post_init__(self):
             self.t = TiktokenTokenizer(model_path=str(self.tokenizer_path))
             self.is_tiktoken = True
             self.is_sentencepiece = False
-            self.is_tokenizers = False
+            self.is_hf_tokenizer = False
             return
         except:
             pass
@@ -211,25 +211,25 @@ def __post_init__(self):
             self.t = SentencePieceProcessor(model_file=str(self.tokenizer_path))
             self.is_tiktoken = False
             self.is_sentencepiece = True
-            self.is_tokenizers = False
+            self.is_hf_tokenizer = False
             return
         except:
             pass

         try:
-            from tokenizer.tokenizers import TokenizersTokenizer
+            from tokenizer.hf_tokenizer import HFTokenizer

-            self.t = TokenizersTokenizer(str(self.tokenizer_path))
+            self.t = HFTokenizer(str(self.tokenizer_path))
             self.is_tiktoken = False
             self.is_sentencepiece = False
-            self.is_tokenizers = True
+            self.is_hf_tokenizer = True
             return
         except:
             pass

         self.is_tiktoken = False
         self.is_sentencepiece = False
-        self.is_tokenizers = False
+        self.is_hf_tokenizer = False
         self.t = None
         return
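The context lines above show __post_init__'s detection strategy: probe Tiktoken, then SentencePiece, then the HuggingFace wrapper, and keep the first one that loads. A self-contained illustration of that first-match-wins pattern; the helper and stand-in loaders below are hypothetical, not torchchat code:

# Hypothetical stand-alone demo of the first-match-wins probing used above.
def first_that_loads(path, loaders):
    for name, load in loaders:
        try:
            return name, load(path)
        except Exception:  # the source uses bare `except:`; Exception is safer
            continue
    return None, None

def _reject(path):
    raise ValueError("wrong tokenizer format")

# Stand-in loaders in the same priority order as __post_init__.
loaders = [
    ("tiktoken", _reject),
    ("sentencepiece", _reject),
    ("hf_tokenizer", lambda p: f"loaded {p}"),
]
name, tok = first_that_loads("tokenizer.json", loaders)
assert name == "hf_tokenizer"  # first two probes fail, third succeeds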

@@ -241,25 +241,25 @@ def validate_model(
         if model is None:
             return

-        if len(list(filter(lambda x: x, [self.is_tiktoken, self.is_tokenizers, self.is_sentencepiece]))) != 1:
+        if sum([self.is_tiktoken, self.is_hf_tokenizer, self.is_sentencepiece]) != 1:
             raise RuntimeError(f"no tokenizer was found at {self.tokenizer_path}")

         is_tiktoken = self.is_tiktoken
         is_sentencepiece = self.is_sentencepiece
-        is_tokenizers = self.is_tokenizers
+        is_hf_tokenizer = self.is_hf_tokenizer
         use_tiktoken = model.config.use_tiktoken
         use_tokenizers = model.config.use_tokenizers
         use_sentencepiece = not (use_tiktoken or use_tokenizers)

         if (
             (is_tiktoken and not use_tiktoken) or
-            (is_tokenizers and not use_tokenizers) or
+            (is_hf_tokenizer and not use_tokenizers) or
             (is_sentencepiece and not use_sentencepiece)
         ):
             raise RuntimeError(
                 "model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}".format(
                     tokenizer_setting_to_name(use_tiktoken, use_tokenizers),
-                    tokenizer_setting_to_name(is_tiktoken, is_tokenizers),
+                    tokenizer_setting_to_name(is_tiktoken, is_hf_tokenizer),
                     model_description,
                 )
             )
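The rewritten exclusivity check replaces the old len(list(filter(...))) expression with an equivalent but shorter count: Python's bool is a subclass of int, so summing the flags counts how many are set. A quick standalone illustration:

# True == 1 and False == 0, so sum() counts the set flags directly.
is_tiktoken, is_hf_tokenizer, is_sentencepiece = False, True, False
assert sum([is_tiktoken, is_hf_tokenizer, is_sentencepiece]) == 1  # exactly one: OK

is_tiktoken = True  # two flags set: ambiguous, so validate_model raises
assert sum([is_tiktoken, is_hf_tokenizer, is_sentencepiece]) != 1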

torchchat/model.py (+6 −6)
@@ -272,7 +272,7 @@ class TransformerArgs:
     ffn_dim_multiplier: Optional[int] = None
     # Select the desired tokenizer. Defaults to sentencepiece
     use_tiktoken: bool = False
-    use_tokenizers: bool = False
+    use_hf_tokenizer: bool = False
     max_seq_length: int = 8192
     rope_scaling: Optional[Dict[str, Any]] = None
     # For pipeline parallel
@@ -329,14 +329,14 @@ class ModelArgs:
     model_type: ModelType
     transformer_args: Dict[str, Dict[str, Any]]
     use_tiktoken: bool
-    use_tokenizers: bool
+    use_hf_tokenizer: bool

     def __init__(
         self,
         transformer_args: Dict[str, Dict[str, Any]],
         model_type: ModelType = ModelType.TextOnly,
         use_tiktoken: bool = False,
-        use_tokenizers: bool = False,
+        use_hf_tokenizer: bool = False,
     ) -> None:
         self._sanity_check(transformer_args, model_type)
@@ -345,7 +345,7 @@ def __init__(

         # Model-level attributes
         self.use_tiktoken = use_tiktoken
-        self.use_tokenizers = use_tokenizers
+        self.use_hf_tokenizer = use_hf_tokenizer

     def _sanity_check(
         self,
@@ -372,8 +372,8 @@ def from_params(cls, params_path):
         }

         use_tiktoken = loaded_params.get("use_tiktoken", False)
-        use_tokenizers = loaded_params.get("use_tokenizers", False)
-        return cls(transformer_args, model_type, use_tiktoken, use_tokenizers)
+        use_hf_tokenizer = loaded_params.get("use_hf_tokenizer", False)
+        return cls(transformer_args, model_type, use_tiktoken, use_hf_tokenizer)

     @classmethod
     def from_table(cls, name: str):
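For callers, the flag rename means params files must use the new key. A hypothetical loaded_params dict showing how from_params resolves it; only the use_hf_tokenizer key comes from this commit, the other entries are illustrative:

# Hypothetical params dict, e.g. the result of json.load() on a params file.
loaded_params = {
    "use_tiktoken": False,
    "use_hf_tokenizer": True,  # new key; an old "use_tokenizers" key is ignored
}

# Mirrors from_params: missing keys default to False, so older params files
# without either key fall back to the sentencepiece default.
use_tiktoken = loaded_params.get("use_tiktoken", False)
use_hf_tokenizer = loaded_params.get("use_hf_tokenizer", False)
assert (use_tiktoken, use_hf_tokenizer) == (False, True)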
