Skip to content

Commit

Permalink
Revert "[fsmt tokenizer] support lowercase tokenizer (huggingface#8389)"
Browse files: browse the repository at this point in the history
This reverts commit c3f5372.
  • Loading branch information
fabiocapsouza authored Nov 15, 2020
1 parent 027d1f6 commit 1e9c2fa
Show file tree
Hide file tree
Showing 3 changed files with 1 addition and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -133,14 +133,6 @@ def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder
with open(src_vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))

# detect whether this is a do_lower_case situation, which can be derived by checking whether the
# source vocab contains any uppercase letter (if it does, do_lower_case must be False)
do_lower_case = True
for k in src_vocab.keys():
if not k.islower():
do_lower_case = False
break

tgt_dict = Dictionary.load(tgt_dict_file)
tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
tgt_vocab_size = len(tgt_vocab)
Expand Down Expand Up @@ -215,7 +207,6 @@ def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder
tokenizer_conf = {
"langs": [src_lang, tgt_lang],
"model_max_length": 1024,
"do_lower_case": do_lower_case,
}

print(f"Generating {fsmt_tokenizer_config_file}")
Expand Down
8 changes: 1 addition & 7 deletions src/transformers/tokenization_fsmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ class FSMTTokenizer(PreTrainedTokenizer):
File containing the vocabulary for the target language.
merges_file (:obj:`str`):
File containing the merges.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to lowercase the input when tokenizing.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
Expand Down Expand Up @@ -186,7 +186,6 @@ def __init__(
src_vocab_file=None,
tgt_vocab_file=None,
merges_file=None,
do_lower_case=False,
unk_token="<unk>",
bos_token="<s>",
sep_token="</s>",
Expand All @@ -198,7 +197,6 @@ def __init__(
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file,
merges_file=merges_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
bos_token=bos_token,
sep_token=sep_token,
Expand All @@ -209,7 +207,6 @@ def __init__(
self.src_vocab_file = src_vocab_file
self.tgt_vocab_file = tgt_vocab_file
self.merges_file = merges_file
self.do_lower_case = do_lower_case

# cache of sm.MosesPunctNormalizer instance
self.cache_moses_punct_normalizer = dict()
Expand Down Expand Up @@ -354,9 +351,6 @@ def _tokenize(self, text, lang="en", bypass_tokenizer=False):
# raise ValueError(f"Expected lang={self.src_lang}, but got {lang}")
lang = self.src_lang

if self.do_lower_case:
text = text.lower()

if bypass_tokenizer:
text = text.split()
else:
Expand Down
7 changes: 0 additions & 7 deletions tests/test_tokenization_fsmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,6 @@ def test_match_encode_decode(self):
decoded_text = tokenizer_dec.decode(encoded_ids, skip_special_tokens=True)
self.assertEqual(decoded_text, src_text)

@slow
def test_tokenizer_lower(self):
tokenizer = FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en", do_lower_case=True)
tokens = tokenizer.tokenize("USA is United States of America")
expected = ["us", "a</w>", "is</w>", "un", "i", "ted</w>", "st", "ates</w>", "of</w>", "am", "er", "ica</w>"]
self.assertListEqual(tokens, expected)

@unittest.skip("FSMTConfig.__init__ requires non-optional args")
def test_torch_encode_plus_sent_to_model(self):
pass
Expand Down

0 comments on commit 1e9c2fa

Please sign in to comment.