Skip to content

Commit f309d24

Browse files
committed
Remove obsolete and unused code
- Remove pyicu uses in collation (already have _thkey() for sort key) - Remove hunspell uses in spell (already have Peter Norvig's checker) - Remove pythainlp.text.Text (nltk Text wrapper) (user can do this by themselves) - Remove pylexto uses in tokenization
1 parent e04323d commit f309d24

File tree

14 files changed

+38
-196
lines changed

14 files changed

+38
-196
lines changed

docs/pythainlp-dev-thai.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ engine คือ ระบบตัดคำ ปัจจุบันมี engi
5757
- longest - ใช้พจนานุกรม ด้วยวิธี Longest Matching
5858
- icu - เรียกใช้ตัวตัดคำจาก ICU ใช้พจนานุกรม (ความแม่นยำต่ำ)
5959
- wordcutpy - เรียกใช้ตัวตัดคำจาก [wordcutpy](https://github.com/veer66/wordcutpy) ใช้พจนานุกรม
60-
- pylexto - เรียกใช้ตัวตัดคำจาก LexTo ใช้พจนานุกรม ด้วยวิธี Longest Matching
6160
- deepcut - เรียกใช้ตัวตัดคำจาก [deepcut](https://github.com/rkcosmos/deepcut) ใช้การเรียนรู้ของเครื่อง
6261

6362
คืนค่าเป็น ''list'' เช่น ['แมว', 'กิน']

examples/collation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# -*- coding: utf-8 -*-
22

3-
from pythainlp.collation import collation
3+
from pythainlp.collation import collate
44

5-
print(collation(["ไก่", "ไข่", "ก", "ฮา"])) # ['ก', 'ไก่', 'ไข่', 'ฮา']
5+
print(collate(["ไก่", "ไข่", "ก", "ฮา"])) # ['ก', 'ไก่', 'ไข่', 'ฮา']

examples/spell.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
# spell checker from pythainlp.spell module (generic)
1010
print(spell("สี่เหลียม")) # ['สี่เหลี่ยม']
11-
# print(spell("สี่เหลียม", engine="hunspell")) # available in some Linux systems
1211

1312
# spell checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's)
1413
print(pn_tnc_spell("เหลืยม"))

pythainlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22
from pythainlp.change import texttoeng, texttothai
3-
from pythainlp.collation import collation
3+
from pythainlp.collation import collate
44
from pythainlp.date import now
55
from pythainlp.keywords import find_keyword
66
from pythainlp.rank import rank

pythainlp/collation/__init__.py

Lines changed: 12 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,27 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
Thai collation (sort according to dictionary order)
4-
For Unicode collation, please refer to Unicode Common Locale Data Repository (CLDR)
5-
https://unicode.org/cldr/charts/latest/collation/th.html
3+
Thai collation (sort according to Thai dictionary order)
64
"""
75
import re
86

9-
RE_TONE = re.compile(r"[็-์]")
10-
RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])")
7+
_RE_TONE = re.compile(r"[็-์]")
8+
_RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])")
119

12-
try:
13-
import icu
1410

15-
thkey = icu.Collator.createInstance(icu.Locale("th_TH")).getSortKey
16-
except ImportError:
11+
def _thkey(word):
12+
cv = _RE_TONE.sub("", word) # remove tone
13+
cv = _RE_LV_C.sub("\\2\\1", cv) # switch lead vowel
14+
tone = _RE_TONE.sub(" ", word) # just tone
15+
return cv + tone
1716

18-
def thkey(word):
19-
cv = RE_TONE.sub("", word) # remove tone
20-
cv = RE_LV_C.sub("\\2\\1", cv) # switch lead vowel
21-
tone = RE_TONE.sub(" ", word) # just tone
22-
return cv + tone
2317

24-
25-
def collation(data):
18+
def collate(data):
2619
"""
27-
:param list data: a list of thai text
28-
:return: a list of thai text, sorted alphabetically
20+
:param list data: a list of strings
21+
:return: a list of strings, sorted alphabetically, according to Thai rules
2922
**Example**::
3023
>>> from pythainlp.collation import *
3124
>>> collate(['ไก่', 'เป็ด', 'หมู', 'วัว'])
3225
['ไก่', 'เป็ด', 'วัว', 'หมู']
3326
"""
34-
return sorted(data, key=thkey)
35-
36-
37-
if __name__ == "__main__":
38-
a = collation(["ไก่", "ไข่", "ก", "ฮา"]) == ["ก", "ไก่", "ไข่", "ฮา"]
39-
print(a)
40-
print(collation(["หลาย", "หญิง"]) == ["หญิง", "หลาย"])
41-
print(collation(["ไก่", "เป็ด", "หมู", "วัว"]) == ["ไก่", "เป็ด", "วัว", "หมู"])
27+
return sorted(data, key=_thkey)

pythainlp/spell/__init__.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,15 @@
33
Spell checking
44
"""
55

6+
from .pn import spell as pn_spell
7+
68

79
def spell(word, engine="pn"):
810
"""
911
:param str word: word to check spelling
1012
:param str engine:
1113
* pn - Peter Norvig's algorithm (default)
12-
* hunspell - uses hunspell's algorithm, which should already exist in Linux
1314
:return: list of words
1415
"""
15-
if engine == "hunspell":
16-
from .hunspell import spell as _spell
17-
else: # default, use "pn" engine
18-
from .pn import spell as _spell
1916

20-
return _spell(word)
17+
return pn_spell(word)

pythainlp/spell/hunspell.py

Lines changed: 0 additions & 47 deletions
This file was deleted.

pythainlp/tag/old.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
"""
33
Unigram Part-Of-Speech Tagger
44
"""
5-
import json
65
import os
76

87
import dill

pythainlp/text.py

Lines changed: 0 additions & 12 deletions
This file was deleted.

pythainlp/tokenize/__init__.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ def word_tokenize(text, engine="newmm", whitespaces=True):
2424
* longest - dictionary-based, Longest Matching
2525
* icu - wrapper for ICU, dictionary-based
2626
* wordcutpy - wrapper for wordcutpy, dictionary-based https://github.com/veer66/wordcutpy
27-
* pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
2827
* deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
2928
* ulmfit - use newmm engine with a specific dictionary for use with thai2vec
3029
:return: list of words, tokenized from the text
@@ -53,8 +52,6 @@ def segment(text):
5352
from .deepcut import segment
5453
elif engine == "wordcutpy":
5554
from .wordcutpy import segment
56-
elif engine == "pylexto":
57-
from .pylexto import segment
5855
elif engine == "mm" or engine == "multi_cut":
5956
from .multi_cut import segment
6057
else: # default, use "newmm" engine

0 commit comments

Comments
 (0)