
Reuse EOS token for EOD to optimize vocabulary size and training efficiency (ServiceNow#73)
tscholak authored Nov 28, 2024
1 parent 3bf38be commit 5e6de1a
Showing 2 changed files with 6 additions and 9 deletions.
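The motivation, as a minimal sketch (not code from the commit; the tokenizer.json path and the "</s>" EOS string are hypothetical, and the file is assumed to already define that EOS token): registering "<|endoftext|>" as an additional special token appends a brand-new id whenever the string is missing from the vocabulary, which forces the model's embedding matrix to be resized, whereas reusing the tokenizer's existing EOS token leaves the vocabulary size unchanged.

from transformers import PreTrainedTokenizerFast

# "tokenizer.json" is a hypothetical path throughout this sketch.

# Old approach: force-register an EOD string as an extra special token.
# If "<|endoftext|>" is not already in the vocabulary, this appends a new
# id, and the model's embedding matrix must be resized to match.
old = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
size_before = len(old)
old.add_special_tokens({"additional_special_tokens": ["<|endoftext|>"]})
size_after = len(old)  # size_before + 1 when the token was not known

# New approach: reuse whatever EOS token the tokenizer already defines.
# Nothing is added, so vocabulary and embedding sizes stay as trained.
# (Assumes "</s>" is already present in the tokenizer file's vocabulary.)
new = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json", eos_token="</s>")
if new.eos_token_id is None:
    raise ValueError("Tokenizer does not have an EOS token.")
eod_id = new.eos_token_id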
1 change: 0 additions & 1 deletion fast_llm/data/config.py
@@ -99,7 +99,6 @@ def _validate(self):
         Assert.in_range_incl(self.rate, 0, 1)
 
 
-EOD = "<|endoftext|>"
 TokenizerFromFile = "TokenizerFromFile"
 
 
14 changes: 6 additions & 8 deletions fast_llm/data/tokenizer.py
@@ -1,6 +1,6 @@
 from transformers import PreTrainedTokenizerFast
 
-from fast_llm.data.config import EOD, TokenizerConfig
+from fast_llm.data.config import TokenizerConfig
 from fast_llm.engine.config_utils.run import log_main_rank
 
 
@@ -11,13 +11,11 @@ class Tokenizer:
 
     def __init__(self, config: TokenizerConfig):
         log_main_rank(f"> loading tokenizer from {config.path} ...")
-        special_tokens = [EOD]
         self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=config.path, errors="replace", max_len=None)
-        self.tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
-        self.eod_id = self.tokenizer.vocab[EOD]
-        # Token->id mapping for additional special-tokens
-        self.special_tokens = {tok: self.tokenizer.vocab[tok] for tok in special_tokens}
-        self._inv_vocab = {v: k for k, v in self.tokenizer.vocab.items()}
+        if self.tokenizer.eos_token_id is None:
+            raise ValueError("Tokenizer does not have an EOS token.")
+        self.eod_id = self.tokenizer.eos_token_id
+        self._inv_vocab = {v: k for k, v in self.vocab.items()}
 
     @property
     def vocab_size(self):
@@ -31,7 +29,7 @@ def vocab(self):
     def inv_vocab(self):
         return self._inv_vocab
 
-    def tokenize(self, text):
+    def tokenize(self, text: str):
         return self.tokenizer.encode(text)
 
     def detokenize(self, token_ids):
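After the change, downstream use looks like the sketch below. Hedged: the direct TokenizerConfig keyword call and the path are assumptions for illustration; Fast-LLM may build its configs through dedicated config machinery instead.

from fast_llm.data.config import TokenizerConfig
from fast_llm.data.tokenizer import Tokenizer

# Hypothetical construction; the real TokenizerConfig may be instantiated
# via the repository's config system rather than a direct keyword call.
config = TokenizerConfig(path="/path/to/tokenizer.json")

# Raises ValueError at load time if the tokenizer file defines no EOS
# token, instead of silently registering a new "<|endoftext|>" id.
tokenizer = Tokenizer(config)

ids = tokenizer.tokenize("Hello, world!")
text = tokenizer.detokenize(ids)

# EOD is now simply an alias for the tokenizer's EOS id.
assert tokenizer.eod_id == tokenizer.tokenizer.eos_token_id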
