@@ -30,12 +30,15 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
 
     :Example:
 
+    Clause tokenizer::
+
         from pythainlp.tokenize import clause_tokenize
 
         clause_tokenize(["ฉัน","นอน","และ","คุณ","เล่น","มือถือ","ส่วน","น้อง","เขียน","โปรแกรม"])
-        [['ฉัน', 'นอน'],
-        ['และ', 'คุณ', 'เล่น', 'มือถือ'],
-        ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
+        # [['ฉัน', 'นอน'],
+        # ['และ', 'คุณ', 'เล่น', 'มือถือ'],
+        # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
+
     """
     if not doc or not isinstance(doc, list):
         return []
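
Since clause_tokenize takes an already-tokenized List[str], callers would typically run word_tokenize first and feed its output in. A minimal sketch of that pipeline (the chaining is illustrative, not part of this diff; the sample text reuses words from the docstring example above):

    from pythainlp.tokenize import clause_tokenize, word_tokenize

    # Split raw Thai text into words, then group those words into clauses.
    text = "ฉันนอนและคุณเล่นมือถือ"   # drawn from the docstring example
    words = word_tokenize(text)        # -> List[str] of word tokens
    clauses = clause_tokenize(words)   # -> List[List[str]], one inner list per clause
    print(clauses)
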
@@ -81,6 +84,8 @@ def word_tokenize(
         * *nercut* - Dictionary-based maximal matching word segmentation,
           constrained with Thai Character Cluster (TCC) boundaries,
           and combining tokens that are parts of the same named-entity.
+        * *sefr_cut* - wrapper for
+          `SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.
 
     :Note:
         - The parameter **custom_dict** can be provided as an argument \
@@ -173,6 +178,10 @@ def word_tokenize(
173178 elif engine == "nercut" :
174179 from pythainlp .tokenize .nercut import segment
175180
181+ segments = segment (text )
182+ elif engine == "sefr_cut" :
183+ from pythainlp .tokenize .sefr_cut import segment
184+
176185 segments = segment (text )
177186 else :
178187 raise ValueError (
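
The new dispatch branch can be exercised directly through the public API. A short sketch, assuming the external SEFR CUT package from the linked repository is installed (the engine name "sefr_cut" comes from the dispatch above; the sample text is illustrative):

    from pythainlp.tokenize import word_tokenize

    # Route through the new "sefr_cut" engine added in this diff; requires the
    # external SEFR CUT dependency (https://github.com/mrpeerat/SEFR_CUT).
    tokens = word_tokenize("ฉันนอนและคุณเล่นมือถือ", engine="sefr_cut")
    print(tokens)  # a flat List[str] of word tokens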