Skip to content

Commit

Permalink
Add test for Chinese tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
elyase authored Nov 5, 2018
1 parent 0ce2f49 commit 4d124ba
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions tests/tokenization_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ def test_full_tokenizer(self):
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])

def test_chinese(self):
    """BasicTokenizer should emit each CJK character as its own token,
    while leaving the surrounding Latin-script runs intact."""
    basic_tokenizer = tokenization.BasicTokenizer()

    mixed_text = u"ah\u535A\u63A8zz"
    expected_tokens = [u"ah", u"\u535A", u"\u63A8", u"zz"]
    self.assertListEqual(basic_tokenizer.tokenize(mixed_text), expected_tokens)

def test_basic_tokenizer_lower(self):
tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

Expand Down

0 comments on commit 4d124ba

Please sign in to comment.