Sorting tokenizers

bact · bact · commit eee54c17d408 · 2018-10-23T19:25:59.000+07:00
- more details on word tokenizers
- add alias "longest" for longest-matching tokenizer
- remove "dict" tokenizer from document, as there is no implementation in the code
- remove "mm" tokenizer from document, as it is not recommended to use (maintenance mode), but keep the code, so people can call it
- update doc: pythainlp-dev-thai.md
- remove unused import sys
diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md
@@ -50,18 +50,16 @@ word_tokenize(text, engine)
 ```
 text คือ ข้อความในรูปแบบสตริง str เท่านั้น
 
-engine คือ ระบบตัดคำ ปัจจุบัน PyThaiNLP มี 6 engine ดังนี้
+engine คือ ระบบตัดคำ ปัจจุบันมี engine ดังนี้
 
-1. newmm (ค่าเริ่มต้น) - ใช้วิธี Maximum Matching โค้ดชุดใหม่[โดยคุณ Korakot Chaovavanich](https://www.facebook.com/groups/408004796247683/permalink/431283740586455/)
-2. icu - เรียกใช้ตัวตัดคำจาก ICU ตัวดั้งเดิมของ PyThaiNLP (ความแม่นยำต่ำ)
-3. dict - ตัดคำโดยใช้พจานุกรมจาก thaiword.txt ใน corpus (ความแม่นยำปานกลาง) **จะคืนค่า False หากข้อความนั้นไม่สามารถตัดคำได้**
-4. longest-matching - ใช้วิธี Longest Matching
-5. mm - ใช้วิธี Maximum Matching **(โค้ดชุดเก่า อยู่ในสถานะบำรุงรักษาเท่านั้น)**
-6. pylexto - เรียกใช้ตัวตัดคำจาก LexTo ซึ่งเป็น Longest Matching
-7. deepcut - เรียกใช้ [deepcut](https://github.com/rkcosmos/deepcut) ซึ่งตัดคำจากโมเดลการเรียนรู้ของเครื่อง
-8. wordcutpy - เรียกใช้ตัวตัดคำจาก [wordcutpy](https://github.com/veer66/wordcutpy)
+- newmm (ค่าเริ่มต้น) - ใช้พจนานุกรม ด้วยวิธี Maximum Matching โค้ดชุดใหม่[โดยคุณ Korakot Chaovavanich](https://www.facebook.com/groups/408004796247683/permalink/431283740586455/)
+- longest - ใช้พจนานุกรม ด้วยวิธี Longest Matching
+- icu - เรียกใช้ตัวตัดคำจาก ICU ใช้พจนานุกรม (ความแม่นยำต่ำ)
+- pylexto - เรียกใช้ตัวตัดคำจาก LexTo ใช้พจนานุกรม ด้วยวิธี Longest Matching
+- wordcutpy - เรียกใช้ตัวตัดคำจาก [wordcutpy](https://github.com/veer66/wordcutpy) ใช้พจนานุกรม
+- deepcut - เรียกใช้ตัวตัดคำจาก [deepcut](https://github.com/rkcosmos/deepcut) ใช้การเรียนรู้ของเครื่อง
 
-คืนค่าเป็น ''list'' เช่น ['แมว','กิน']
+คืนค่าเป็น ''list'' เช่น ['แมว', 'กิน']
 
 **การใช้งาน**
 
@@ -86,11 +84,11 @@ text คือ ข้อความที่ต้องการตัดค
 
 filename คือ ที่ตั้งไฟล์ที่ต้องการมาเป็นฐานข้อมูลตัดคำ
 
-engine คือ เครื่องมือตัดคำ
-- newmm ตัดคำด้วย newmm
-- wordcutpy ใช้ [wordcutpy](https://github.com/veer66/wordcutpy) ในการตัดคำ
-- mm ตัดคำด้วย mm
-- longest-matching ตัดคำโดยใช้ longest matching
+engine คือ ระบบตัดคำ (ดูรายละเอียดที่ word_tokenize)
+- newmm
+- mm
+- longest
+- wordcutpy
 
 ตัวอย่างการใช้งาน https://gist.github.com/wannaphongcom/1e862583051bf0464b6ef4ed592f739c
 
diff --git a/examples/tokenize.py b/examples/tokenize.py
@@ -20,5 +20,5 @@
 print(word_tokenize(text2))
 # ['กฎหมายแรงงาน']
 
-print(word_tokenize(text2, engine="longest-matching"))
+print(word_tokenize(text2, engine="longest"))
 # ['กฎหมาย', 'แรงงาน']
diff --git a/pythainlp/romanization/pyicu.py b/pythainlp/romanization/pyicu.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 
-import sys
-
 try:
     import icu
 except ImportError:
diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py
@@ -3,7 +3,6 @@
 Sentiment analyzer based on thai2vec ("ulmfit" engine)
 Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
 """
-import sys
 from collections import defaultdict
 
 from pythainlp.corpus import download, get_file
diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
@@ -2,9 +2,8 @@
 """
 Part-Of-Speech tagger
 """
-import sys
 
-ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
+_ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
 
 
 def pos_tag(words, engine="unigram", corpus="orchid"):
@@ -31,11 +30,11 @@ def _tag(text, corpus=None):
             except ImportError:
                 from pythainlp.tools import install_package
 
-                install_package(ARTAGGER_URL)
+                install_package(_ARTAGGER_URL)
                 try:
                     from artagger import Tagger
                 except ImportError:
-                    raise ImportError("Error: Try 'pip install " + ARTAGGER_URL + "'")
+                    raise ImportError("Error: Try 'pip install " + _ARTAGGER_URL + "'")
 
             words = Tagger().tag(" ".join(text))
 
diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py
@@ -24,7 +24,7 @@ def pud_data():
     return model
 
 
-def tag(text, corpus):
+def tag(text, corpus="pud"):
     """
     รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
     if corpus == "orchid":
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
@@ -16,42 +16,48 @@
 
 def word_tokenize(text, engine="newmm", whitespaces=True):
     """
-    :param str text:  the text to be tokenized
-    :param str engine: the engine to tokenize text
-    :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai.
+    :param str text: text to be tokenized
+    :param str engine: tokenizer to be used
+    :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai
     :Parameters for engine:
-        * newmm - Maximum Matching algorithm + TCC
-        * icu -  IBM ICU
-        * longest-matching - Longest matching
-        * mm - Maximum Matching algorithm
-        * pylexto - LexTo
-        * deepcut - Deep Neural Network
-        * wordcutpy - wordcutpy (https://github.com/veer66/wordcutpy)
-    :return: A list of words, tokenized from a text
+        * newmm (default) - dictionary-based, Maximum Matching + TCC
+        * mm - dictionary-based, Maximum Matching
+        * longest - dictionary-based, Longest Matching
+        * icu - wrapper for ICU, dictionary-based
+        * pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
+        * wordcutpy - wrapper for wordcutpy, dictionary-based https://github.com/veer66/wordcutpy
+        * deepcut - wrapper for deepcut, neural-network-based https://github.com/rkcosmos/deepcut
+    :return: list of words, tokenized from the text
 
     **Example**::
-    from pythainlp.tokenize import word_tokenize
-    text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
-    word_tokenize(text, engine="newmm")  # ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
-    word_tokenize(text, engine="icu")  # ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
+        >>> from pythainlp.tokenize import word_tokenize
+        >>> text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
+        >>> word_tokenize(text, engine="newmm")
+        ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
+        >>> word_tokenize(text, engine="icu")
+        ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
     """
-    if engine == "icu":
+    if engine == "newmm" or engine == "onecut":
-        from .pyicu import segment
+        from .newmm import mmcut as segment
-    elif engine == "multi_cut" or engine == "mm":
-        from .multi_cut import segment
+    elif engine == "longest" or engine == "longest-matching":
+        from .longest import segment
     elif engine == "ulmfit":
         from .newmm import mmcut
+
         def segment(text):
             return mmcut(text, trie=FROZEN_DICT_TRIE)
-    elif engine == "longest-matching":
-        from .longest import segment
-    elif engine == "pylexto":
-        from .pylexto import segment
+
+    elif engine == "icu":
+        from .pyicu import segment
     elif engine == "deepcut":
         from .deepcut import segment
+    elif engine == "pylexto":
+        from .pylexto import segment
     elif engine == "wordcutpy":
         from .wordcutpy import segment
-    else:  # default, use "newmm" ("onecut") engine
+    elif engine == "mm" or engine == "multi_cut":
+        from .multi_cut import segment
+    else:  # default, use "newmm" engine
         from .newmm import mmcut as segment
 
     if not whitespaces:
@@ -66,24 +72,26 @@ def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
 
     :param str text: the text to be tokenized
     :param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie
-    :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest-matching)
+    :param str engine: tokenizer engine to use (newmm, wordcutpy, mm, longest)
     :return: A list of words, tokenized from a text.
     **Example**::
         >>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
-        >>> listword=['แมว',"ดี"]
-        >>> data_dict=create_custom_dict_trie(listword)
-        >>> dict_word_tokenize("แมวดีดีแมว",data_dict)
+        >>> listword = ["แมว", "ดี"]
+        >>> data_dict = create_custom_dict_trie(listword)
+        >>> dict_word_tokenize("แมวดีดีแมว", data_dict)
         ['แมว', 'ดี', 'ดี', 'แมว']
     """
-    if engine == "mm" or engine == "multi_cut":
-        from .multi_cut import segment
-    elif engine == "longest-matching":
+    if engine == "newmm" or engine == "onecut":
+        from .newmm import mmcut as segment
+    elif engine == "longest" or engine == "longest-matching":
         from .longest import segment
     elif engine == "wordcutpy":
         from .wordcutpy import segment
 
         return segment(text, custom_dict_trie.keys())
-    else:  # default, use "newmm" ("onecut") engine
+    elif engine == "mm" or engine == "multi_cut":
+        from .multi_cut import segment
+    else:  # default, use "newmm" engine
         from .newmm import mmcut as segment
 
     return segment(text, custom_dict_trie)
diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py
@@ -2,7 +2,6 @@
 """
 Wrapper for deepcut Thai word segmentation
 """
-import sys
 
 try:
     import deepcut
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
@@ -2,7 +2,8 @@
 """
 Longest-matching Thai word segmentation
 
-Based on code from https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py
+Based on code from
+https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py
 """
 import re
 
diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py
@@ -3,7 +3,6 @@
 Wrapper for ICU word segmentation
 """
 import re
-import sys
 
 try:
     import icu
diff --git a/pythainlp/tokenize/pylexto.py b/pythainlp/tokenize/pylexto.py
@@ -2,7 +2,6 @@
 """
 Wrapper for LexTo Thai word segmentation
 """
-import sys
 
 _LEXTO_URL = "https://github.com/PyThaiNLP/pylexto/archive/master.zip"
 
diff --git a/pythainlp/tokenize/wordcutpy.py b/pythainlp/tokenize/wordcutpy.py
@@ -2,7 +2,6 @@
 """
 Wrapper for WordCut Thai word segmentation
 """
-import sys
 
 try:
     from wordcut import Wordcut
diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py
@@ -4,7 +4,6 @@
 Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
 """
 import re
-import sys
 
 from pythainlp.corpus import download, get_file
 from pythainlp.tokenize import word_tokenize
@@ -43,12 +42,11 @@ class ThaiTokenizer:
     def __init__(self, engine="newmm"):
         """
         :parameters for tokenization engine:
-            * newmm - Maximum Matching algorithm + TCC
-            * icu - IBM ICU
-            * longest-matching - Longest matching
-            * mm - Maximum Matching algorithm
-            * pylexto - LexTo
-            * deepcut - Deep Neural Network
+            * newmm - dictionary-based, Maximum Matching algorithm + TCC
+            * longest - dictionary-based, Longest Matching
+            * icu - use ICU, dictionary-based
+            * pylexto - use LexTo, dictionary-based
+            * deepcut - use deepcut, language model-based
         """
         self.engine = engine
         self.__RE_BR = re.compile(r"<\s*br\s*/?>", re.IGNORECASE)
diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py
@@ -3,8 +3,6 @@
 thai2vec - Thai word vector
 Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb
 """
-import sys
-
 from pythainlp.corpus import download as download_data
 from pythainlp.corpus import get_file
 from pythainlp.tokenize import word_tokenize
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -78,7 +78,7 @@ def test_segment_newmm(self):
 
     def test_segment_longest_matching(self):
         self.assertEqual(
-            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest-matching"),
+            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )