diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py
index 8d6ede9300a7c1..7c12ecccfed29a 100644
--- a/tests/tokenization_test.py
+++ b/tests/tokenization_test.py
@@ -43,6 +43,13 @@ def test_full_tokenizer(self):
     self.assertListEqual(
         tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
+  def test_chinese(self):
+    tokenizer = tokenization.BasicTokenizer()
+
+    self.assertListEqual(
+        tokenizer.tokenize(u"ah\u535A\u63A8zz"),
+        [u"ah", u"\u535A", u"\u63A8", u"zz"])
+
   def test_basic_tokenizer_lower(self):
     tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
 
diff --git a/tokenization.py b/tokenization.py
index 83bc86e444721d..8cf83720d9ec21 100644
--- a/tokenization.py
+++ b/tokenization.py
@@ -133,6 +133,13 @@ def tokenize(self, text):
     """Tokenizes a piece of text."""
     text = convert_to_unicode(text)
    text = self._clean_text(text)
+    # This was added on November 1st, 2018 for the multilingual and Chinese
+    # models. It is also applied to the English models now, but it doesn't
+    # matter since the English models were not trained on any Chinese data
+    # and generally don't contain any (there are Chinese characters in the
+    # vocabulary only because the English Wikipedia does include some
+    # Chinese words).
+    text = self._tokenize_chinese_chars(text)
     orig_tokens = whitespace_tokenize(text)
     split_tokens = []
     for token in orig_tokens:
@@ -174,7 +181,42 @@ def _run_split_on_punc(self, text):
       i += 1
 
     return ["".join(x) for x in output]
 
+  def _tokenize_chinese_chars(self, text):
+    """Adds whitespace around any CJK character."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if self._is_chinese_char(cp):
+        output.append(" ")
+        output.append(char)
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+  def _is_chinese_char(self, cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "Chinese character" as anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean
+    # characters, despite its name. The modern Korean Hangul alphabet is a
+    # different block, as are Japanese Hiragana and Katakana. Those alphabets
+    # are used to write space-separated words, so they are not treated
+    # specially and are handled like all of the other languages.
+    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+        (cp >= 0x3400 and cp <= 0x4DBF) or  #
+        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+        (cp >= 0x2B820 and cp <= 0x2CEAF) or
+        (cp >= 0xF900 and cp <= 0xFAFF) or  #
+        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+      return True
+
+    return False
+
   def _clean_text(self, text):
     """Performs invalid character removal and whitespace cleanup on text."""
     output = []
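
For reference, here is a minimal, self-contained sketch of what the new padding step does to a mixed Latin/CJK string before whitespace_tokenize splits it. The names _CJK_RANGES, _is_cjk, and pad_cjk_chars are illustrative only and are not part of this repository's API; the real logic lives in BasicTokenizer._tokenize_chinese_chars and _is_chinese_char above.

# -*- coding: utf-8 -*-
# Standalone sketch of the CJK whitespace-padding step added in this diff.
# The names below (_CJK_RANGES, _is_cjk, pad_cjk_chars) are illustrative only;
# the real implementation is BasicTokenizer._tokenize_chinese_chars and
# BasicTokenizer._is_chinese_char.

# Codepoint ranges copied from _is_chinese_char in the diff.
_CJK_RANGES = [
    (0x4E00, 0x9FFF), (0x3400, 0x4DBF), (0x20000, 0x2A6DF),
    (0x2A700, 0x2B73F), (0x2B740, 0x2B81F), (0x2B820, 0x2CEAF),
    (0xF900, 0xFAFF), (0x2F800, 0x2FA1F),
]


def _is_cjk(cp):
  """Returns True if the codepoint falls inside any of the CJK blocks above."""
  return any(lo <= cp <= hi for lo, hi in _CJK_RANGES)


def pad_cjk_chars(text):
  """Surrounds every CJK ideograph with spaces, mirroring the new method."""
  output = []
  for char in text:
    if _is_cjk(ord(char)):
      output.extend([" ", char, " "])
    else:
      output.append(char)
  return "".join(output)


padded = pad_cjk_chars(u"ah\u535A\u63A8zz")
print(padded.split())  # ['ah', '博', '推', 'zz'] on Python 3 -- same tokens as test_chinese

Because each ideograph becomes its own whitespace-separated token, the downstream WordPiece step sees one Chinese character per token, which is exactly what the new test_chinese case asserts.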