1616
def word_tokenize(text, engine="newmm", whitespaces=True):
    """
    Tokenize Thai text into a list of words.

    :param str text: text to be tokenized
    :param str engine: tokenizer to be used
    :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai
    :Parameters for engine:
        * newmm (default) - dictionary-based, Maximum Matching + TCC
        * mm - dictionary-based, Maximum Matching
        * longest - dictionary-based, Longest Matching
        * icu - wrapper for ICU, dictionary-based
        * pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
        * wordcutpy - wrapper for wordcutpy, dictionary-based https://github.com/veer66/wordcutpy
        * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
    :return: list of words, tokenized from the text

    **Example**::
        >>> from pythainlp.tokenize import word_tokenize
        >>> text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
        >>> word_tokenize(text, engine="newmm")
        ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
        >>> word_tokenize(text, engine="icu")
        ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
    """
    # Engine names are compared exactly; the previous version had stray
    # trailing spaces inside several literals ("onecut ", "longest ", ...)
    # which made those branches unreachable.
    if engine == "newmm" or engine == "onecut":
        # BUG FIX: this branch previously imported the ICU segmenter
        # (from .pyicu); "newmm"/"onecut" must use the newmm engine.
        from .newmm import mmcut as segment
    elif engine == "longest" or engine == "longest-matching":
        from .longest import segment
    elif engine == "ulmfit":
        from .newmm import mmcut

        def segment(text):
            # Tokenize with the frozen dictionary used by the ULMFiT model.
            return mmcut(text, trie=FROZEN_DICT_TRIE)

    elif engine == "icu":
        from .pyicu import segment
    elif engine == "deepcut":
        from .deepcut import segment
    elif engine == "pylexto":
        from .pylexto import segment
    elif engine == "wordcutpy":
        from .wordcutpy import segment
    elif engine == "mm" or engine == "multi_cut":
        from .multi_cut import segment
    else:  # default, use "newmm" engine
        from .newmm import mmcut as segment

    if not whitespaces:
        # Drop whitespace-only tokens and strip surrounding spaces.
        # NOTE(review): the tail of this function is cut off in the diff view;
        # this follows the project's standard behavior — confirm against upstream.
        return [token.strip(" ") for token in segment(text) if token.strip(" ")]

    return segment(text)
def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
    """
    Tokenize text using a custom dictionary.

    :param str text: the text to be tokenized
    :param dict custom_dict_trie: a trie created by create_custom_dict_trie
    :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest)
    :return: A list of words, tokenized from a text.
    **Example**::
        >>> from pythainlp.tokenize import dict_word_tokenize, create_custom_dict_trie
        >>> listword = ["แมว", "ดี"]
        >>> data_dict = create_custom_dict_trie(listword)
        >>> dict_word_tokenize("แมวดีดีแมว", data_dict)
        ['แมว', 'ดี', 'ดี', 'แมว']
    """
    # Engine names are compared exactly; the previous version had stray
    # spaces inside the literals ("newmm ", "onecut ", "longest -matching")
    # which made those branches unreachable.
    if engine == "newmm" or engine == "onecut":
        # BUG FIX: this branch previously imported the ICU segmenter
        # (from .pyicu), which is the wrong engine for "newmm"/"onecut"
        # and does not accept a custom dictionary trie.
        from .newmm import mmcut as segment
    elif engine == "longest" or engine == "longest-matching":
        from .longest import segment
    elif engine == "wordcutpy":
        from .wordcutpy import segment

        # wordcutpy takes a plain word list rather than a trie.
        return segment(text, custom_dict_trie.keys())
    elif engine == "mm" or engine == "multi_cut":
        from .multi_cut import segment
    else:  # default, use "newmm" engine
        from .newmm import mmcut as segment

    return segment(text, custom_dict_trie)
0 commit comments