Skip to content

Commit 8945cbb

Browse files
author
mark
committed
Avoid a top-level import of tokenizers.
1 parent 9996f34 commit 8945cbb

File tree

1 file changed

+4
-2
lines changed

1 file changed

+4
-2
lines changed

axlearn/experiments/text/gpt/c4_trainer.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,12 @@
4040
```
4141
"""
4242

43-
4443
from axlearn.common.config import InstantiableConfig, config_for_class, config_for_function
4544
from axlearn.common.input_lm import lm_text_preprocessor
4645
from axlearn.common.utils import get_data_dir
4746
from axlearn.experiments.text.common import DataMixtureComponent, vocab
4847
from axlearn.experiments.text.gpt import fuji, gspmd
4948
from axlearn.experiments.text.gpt.common import mixture_train_input_source, tfds_input
50-
from axlearn.experiments.text.gpt.vocabulary_fuji_v3 import FujiV3Vocabulary
5149
from axlearn.experiments.trainer_config_utils import TrainerConfigFn
5250

5351

@@ -59,6 +57,10 @@ def _vocab_cfg(vocab_size: int):
5957
if vocab_size == 128 * 1024:
6058
return config_for_function(vocab).set(sentencepiece_model_name="bpe_128k_c4.model")
6159
if vocab_size == 128256:
60+
# Avoid a global tokenizers dependency.
61+
# pylint: disable-next=import-outside-toplevel
62+
from axlearn.experiments.text.gpt.vocabulary_fuji_v3 import FujiV3Vocabulary
63+
6264
# TikToken.
6365
return config_for_class(FujiV3Vocabulary).set(filename="Llama-3-tokenizer.json")
6466
raise ValueError(f"Tokenizer with vocab size {vocab_size} does not exist.")

0 commit comments

Comments (0)