Skip to content

Commit

Permalink
tokenizer model args
Browse files (browse the repository at this point in the history)
  • Loading branch information
taranais committed Jul 2, 2020
1 parent 4c18b5f commit 97ec29d
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
3 changes: 3 additions & 0 deletions simpletransformers/config/model_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ class LanguageModelingArgs(ModelArgs):
tie_generator_and_discriminator_embeddings: bool = True
tokenizer_name: str = None
vocab_size: int = None
+ clean_text : bool = True
+ handle_chinese_chars : bool = True
+ strip_accents : bool = True
local_rank: int = -1


Expand Down
11 changes: 9 additions & 2 deletions simpletransformers/language_modeling/language_modeling_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,12 @@ def train_tokenizer(self, train_files, tokenizer_name=None, output_dir=None, use
output_dir = self.args.output_dir

if self.args.model_type in ["bert", "electra"]:
- tokenizer = BertWordPieceTokenizer()
+ tokenizer = BertWordPieceTokenizer(
+ clean_text = self.args.clean_text,
+ handle_chinese_chars = self.args.handle_chinese_chars,
+ strip_accents = self.args.strip_accents,
+ lowercase = self.args.do_lower_case
+ )
self.args.special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
self.args.wordpieces_prefix = "##"

Expand All @@ -922,7 +927,9 @@ def train_tokenizer(self, train_files, tokenizer_name=None, output_dir=None, use
wordpieces_prefix="##",
)
else:
- tokenizer = ByteLevelBPETokenizer()
+ tokenizer = ByteLevelBPETokenizer(
+ lowercase = self.args.do_lower_case
+ )

tokenizer.train(
files=train_files,
Expand Down

0 comments on commit 97ec29d

Please sign in to comment.