Remove obsolete and unused code

bact · bact · commit f309d24bfb3e · 2018-10-30T17:29:17.000+08:00
- Remove pyicu usage in collation (we already have _thkey() for the sort key)
- Remove hunspell usage in spell (we already have Peter Norvig's checker)
- Remove pythainlp.text.Text (nltk Text wrapper) (users can do this themselves)
- Remove pylexto usage in tokenization
diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md
@@ -57,7 +57,6 @@ engine คือ ระบบตัดคำ ปัจจุบันมี engi
 - longest - ใช้พจนานุกรม ด้วยวิธี Longest Matching
 - icu - เรียกใช้ตัวตัดคำจาก ICU ใช้พจนานุกรม (ความแม่นยำต่ำ)
 - wordcutpy - เรียกใช้ตัวตัดคำจาก [wordcutpy](https://github.com/veer66/wordcutpy) ใช้พจนานุกรม
-- pylexto - เรียกใช้ตัวตัดคำจาก LexTo ใช้พจนานุกรม ด้วยวิธี Longest Matching
 - deepcut - เรียกใช้ตัวตัดคำจาก [deepcut](https://github.com/rkcosmos/deepcut) ใช้การเรียนรู้ของเครื่อง
 
 คืนค่าเป็น ''list'' เช่น ['แมว', 'กิน']
diff --git a/examples/collation.py b/examples/collation.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
 
-from pythainlp.collation import collation
+from pythainlp.collation import collate
 
-print(collation(["ไก่", "ไข่", "ก", "ฮา"]))  # ['ก', 'ไก่', 'ไข่', 'ฮา']
+print(collate(["ไก่", "ไข่", "ก", "ฮา"]))  # ['ก', 'ไก่', 'ไข่', 'ฮา']
diff --git a/examples/spell.py b/examples/spell.py
@@ -8,7 +8,6 @@
 
 # spell checker from pythainlp.spell module (generic)
 print(spell("สี่เหลียม"))  # ['สี่เหลี่ยม']
-# print(spell("สี่เหลียม", engine="hunspell"))  # available in some Linux systems
 
 # spell checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's)
 print(pn_tnc_spell("เหลืยม"))
diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py
@@ -1,6 +1,6 @@
 ﻿# -*- coding: utf-8 -*-
 from pythainlp.change import texttoeng, texttothai
-from pythainlp.collation import collation
+from pythainlp.collation import collate
 from pythainlp.date import now
 from pythainlp.keywords import find_keyword
 from pythainlp.rank import rank
diff --git a/pythainlp/collation/__init__.py b/pythainlp/collation/__init__.py
@@ -1,41 +1,27 @@
 # -*- coding: utf-8 -*-
 """
-Thai collation (sort according to dictionary order)
-For Unicode collation, please refer to Unicode Common Locale Data Repository (CLDR)
-https://unicode.org/cldr/charts/latest/collation/th.html
+Thai collation (sort according to Thai dictionary order)
 """
 import re
 
-RE_TONE = re.compile(r"[็-์]")
-RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])")
+_RE_TONE = re.compile(r"[็-์]")
+_RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])")
 
-try:
-    import icu
 
-    thkey = icu.Collator.createInstance(icu.Locale("th_TH")).getSortKey
-except ImportError:
+def _thkey(word):
+    cv = _RE_TONE.sub("", word)  # remove tone
+    cv = _RE_LV_C.sub("\\2\\1", cv)  # switch lead vowel
+    tone = _RE_TONE.sub(" ", word)  # just tone
+    return cv + tone
 
-    def thkey(word):
-        cv = RE_TONE.sub("", word)  # remove tone
-        cv = RE_LV_C.sub("\\2\\1", cv)  # switch lead vowel
-        tone = RE_TONE.sub(" ", word)  # just tone
-        return cv + tone
 
-
-def collation(data):
+def collate(data):
     """
-    :param list data: a list of thai text
-    :return: a list of thai text, sorted alphabetically
+    :param list data: a list of strings
+    :return: a list of strings, sorted alphabetically, according to Thai rules
     **Example**::
         >>> from pythainlp.collation import *
         >>> collation(['ไก่', 'เป็ด', 'หมู', 'วัว'])
         ['ไก่', 'เป็ด', 'วัว', 'หมู']
     """
-    return sorted(data, key=thkey)
-
-
-if __name__ == "__main__":
-    a = collation(["ไก่", "ไข่", "ก", "ฮา"]) == ["ก", "ไก่", "ไข่", "ฮา"]
-    print(a)
-    print(collation(["หลาย", "หญิง"]) == ["หญิง", "หลาย"])
-    print(collation(["ไก่", "เป็ด", "หมู", "วัว"]) == ["ไก่", "เป็ด", "วัว", "หมู"])
+    return sorted(data, key=_thkey)
diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py
@@ -3,18 +3,15 @@
 Spell checking
 """
 
+from .pn import spell as pn_spell
+
 
 def spell(word, engine="pn"):
     """
     :param str word: word to check spelling
     :param str engine:
         * pn - Peter Norvig's algorithm (default)
-        * hunspell - uses hunspell's algorithm, which should already exist in Linux
     :return: list of words
     """
-    if engine == "hunspell":
-        from .hunspell import spell as _spell
-    else:  # default, use "pn" engine
-        from .pn import spell as _spell
 
-    return _spell(word)
+    return pn_spell(word)
diff --git a/pythainlp/spell/hunspell.py b/pythainlp/spell/hunspell.py
diff --git a/pythainlp/tag/old.py b/pythainlp/tag/old.py
@@ -2,7 +2,6 @@
 """
 Unigram Part-Of-Speech Tagger
 """
-import json
 import os
 
 import dill
diff --git a/pythainlp/text.py b/pythainlp/text.py
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
@@ -24,7 +24,6 @@ def word_tokenize(text, engine="newmm", whitespaces=True):
         * longest - dictionary-based, Longest Matching
         * icu - wrapper for ICU, dictionary-based
         * wordcutpy - wrapper for wordcutpy, dictionary-based https://github.com/veer66/wordcutpy
-        * pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
         * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
         * ulmfit - use newmm engine with a specific dictionary for use with thai2vec
     :return: list of words, tokenized from the text
@@ -53,8 +52,6 @@ def segment(text):
         from .deepcut import segment
     elif engine == "wordcutpy":
         from .wordcutpy import segment
-    elif engine == "pylexto":
-        from .pylexto import segment
     elif engine == "mm" or engine == "multi_cut":
         from .multi_cut import segment
     else:  # default, use "newmm" engine
diff --git a/pythainlp/tokenize/etcc.py b/pythainlp/tokenize/etcc.py
@@ -1,9 +1,7 @@
 # -*- coding: utf-8 -*-
 """
 โปรแกรม ETCC ใน Python
-
 พัฒนาโดย นาย วรรณพงษ์  ภัททิยไพบูลย์
-
 19 มิ.ย. 2560
 
 วิธีใช้งาน
@@ -13,58 +11,13 @@
 
 import re
 
+from pythainlp.corpus.alphabet import get_data as thai_alphas
 
-C = [
-    "ก",
-    "ข",
-    "ฃ",
-    "ค",
-    "ฅ",
-    "ฆ",
-    "ง",
-    "จ",
-    "ฉ",
-    "ช",
-    "ฌ",
-    "ซ",
-    "ศ",
-    "ษ",
-    "ส",
-    "ญ",
-    "ฎ",
-    "ฑ",
-    "ด",
-    "ฏ",
-    "ต",
-    "ฐ",
-    "ฑ",
-    "ฒ",
-    "ถ",
-    "ท",
-    "ธ",
-    "ณ",
-    "น",
-    "บ",
-    "ป",
-    "ผ",
-    "พ",
-    "ภ",
-    "ฝ",
-    "ฟ",
-    "ม",
-    "ย",
-    "ร",
-    "ล",
-    "ฬ",
-    "ว",
-    "ห",
-    "ฮ",
-]
-UV = ["็", "ี", "ื", "ิ"]
-UV1 = ["ั", "ี"]
-LV = ["ุ", "ู"]
-c = "[" + "".join(C) + "]"
-uv2 = "[" + "".join(["ั", "ื"]) + "]"
+_UV = ["็", "ี", "ื", "ิ"]
+_UV1 = ["ั", "ี"]
+_LV = ["ุ", "ู"]
+c = "[" + "".join(thai_alphas()) + "]"
+_UV2 = "[" + "".join(["ั", "ื"]) + "]"
 
 
 def etcc(text):
@@ -74,20 +27,18 @@ def etcc(text):
     รับ str
     ส่งออก str
     """
-    if re.search(r"[เแ]" + c + r"[" + "".join(UV) + r"]" + r"\w", text):
-        search = re.findall(r"[เแ]" + c + r"[" + "".join(UV) + r"]" + r"\w", text)
+    if re.search(r"[เแ]" + c + r"[" + "".join(_UV) + r"]" + r"\w", text):
+        search = re.findall(r"[เแ]" + c + r"[" + "".join(_UV) + r"]" + r"\w", text)
         for i in search:
             text = re.sub(i, "/" + i + "/", text)
 
-    if re.search(c + r"[" + "".join(UV1) + r"]" + c + c + r"ุ" + r"์", text):
-        search = re.findall(
-            c + r"[" + "".join(UV1) + r"]" + c + c + r"ุ" + r"์", text
-        )
+    if re.search(c + r"[" + "".join(_UV1) + r"]" + c + c + r"ุ" + r"์", text):
+        search = re.findall(c + r"[" + "".join(_UV1) + r"]" + c + c + r"ุ" + r"์", text)
         for i in search:
             text = re.sub(i, "//" + i + "/", text)
 
-    if re.search(c + uv2 + c, text):
-        search = re.findall(c + uv2 + c, text)
+    if re.search(c + _UV2 + c, text):
+        search = re.findall(c + _UV2 + c, text)
         for i in search:
             text = re.sub(i, "/" + i + "/", text)
     re.sub("//", "/", text)
@@ -103,8 +54,8 @@ def etcc(text):
             text = re.sub(i, "/" + i + "/", text)
     text = re.sub("//", "/", text)
 
-    if re.search(c + "[" + "".join(UV1) + "]" + c + c + "์", text):
-        search = re.findall(c + "[" + "".join(UV1) + "]" + c + c + "์", text)
+    if re.search(c + "[" + "".join(_UV1) + "]" + c + c + "์", text):
+        search = re.findall(c + "[" + "".join(_UV1) + "]" + c + c + "์", text)
         for i in search:
             text = re.sub(i, "/" + i + "/", text)
 
@@ -115,4 +66,4 @@ def etcc(text):
             ii = re.sub("/", "", i)
             text = re.sub(i, ii + "/", text)
 
-    return re.sub("//", "/", text)
+    return re.sub("//", "/", text)
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
@@ -93,4 +93,4 @@ def onecut(text, trie):
 def mmcut(text, trie=None):
     if not trie:
         trie = DEFAULT_DICT_TRIE
-    return list(onecut(text, trie))
+    return list(onecut(text, trie))
diff --git a/pythainlp/tokenize/pylexto.py b/pythainlp/tokenize/pylexto.py
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -4,7 +4,7 @@
 from collections import Counter
 
 from pythainlp.change import texttoeng, texttothai
-from pythainlp.collation import collation
+from pythainlp.collation import collate
 from pythainlp.corpus.conceptnet import edges
 from pythainlp.corpus.tnc import get_word_frequency_all
 from pythainlp.corpus import (
@@ -188,10 +188,10 @@ def test_corpus(self):
         self.assertIsNotNone(provinces.get_data())
         self.assertTrue(len(newthaiword.get_data()) > len(thaiword.get_data()))
 
-    def test_collation(self):
-        self.assertEqual(collation(["ไก่", "กก"]), ["กก", "ไก่"])
+    def test_collate(self):
+        self.assertEqual(collate(["ไก่", "กก"]), ["กก", "ไก่"])
         self.assertEqual(
-            collation(["ไก่", "เป็ด", "หมู", "วัว"]), ["ไก่", "เป็ด", "วัว", "หมู"]
+            collate(["ไก่", "เป็ด", "หมู", "วัว"]), ["ไก่", "เป็ด", "วัว", "หมู"]
         )
 
     def test_normalize(self):