Skip to content

Commit 72d49c2

Browse files
authored
Merge branch 'dev' into g2p
2 parents 606528e + c76e786 commit 72d49c2

File tree

17 files changed

+215
-225
lines changed

17 files changed

+215
-225
lines changed

examples/soundex.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
11
# -*- coding: utf-8 -*-
22

3-
from pythainlp.soundex import LK82, Udom83
3+
from pythainlp.soundex import lk82, metasound, udom83
44

5-
print(LK82("รถ") == LK82("รด"))
5+
texts = ["บูรณะ", "บูรณการ", "มัก", "มัค", "มรรค", "ลัก", "รัก", "รักษ์", ""]
6+
for text in texts:
7+
print(
8+
"{} - lk82: {} - udom83: {} - metasound: {}".format(
9+
text, lk82(text), udom83(text), metasound(text)
10+
)
11+
)
612

7-
print(Udom83("วรร") == Udom83("วัน"))
13+
# check equivalence
14+
print(lk82("รถ") == lk82("รด"))
15+
print(udom83("วรร") == udom83("วัน"))
16+
print(metasound("นพ") == metasound("นภ"))

examples/spell.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,28 @@
11
# -*- coding: utf-8 -*-
22

3+
from pythainlp.corpus import ttc
34
from pythainlp.spell import spell
4-
from pythainlp.spell.pn import spell as pn_tnc_spell
5-
from pythainlp.spell.pn import correct as pn_tnc_correct
65
from pythainlp.spell.pn import NorvigSpellChecker
7-
from pythainlp.corpus import ttc
6+
from pythainlp.spell.pn import correct as pn_tnc_correct
7+
from pythainlp.spell.pn import spell as pn_tnc_spell
88

9-
# checker from pythainlp.spell module (generic)
10-
spell("สี่เหลียม") # ['สี่เหลี่ยม']
11-
# spell("สี่เหลียม", engine="hunspell") # available in some Linux systems
9+
# spell checker from pythainlp.spell module (generic)
10+
print(spell("สี่เหลียม")) # ['สี่เหลี่ยม']
11+
# print(spell("สี่เหลียม", engine="hunspell")) # available in some Linux systems
1212

13-
# checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's)
14-
pn_tnc_spell("เหลืยม")
15-
pn_tnc_correct("เหลืยม")
13+
# spell checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's)
14+
print(pn_tnc_spell("เหลืยม"))
15+
print(pn_tnc_correct("เหลืยม"))
1616

17-
# checker from pythainlp.spell.pn module (specified algorithm, custom dictionary)
17+
18+
# spell checker from pythainlp.spell.pn module (specified algorithm, custom dictionary)
1819
ttc_word_freqs = ttc.get_word_frequency_all()
19-
pn_ttc_spell_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs)
20-
pn_ttc_spell_checker.spell("เหลืยม")
21-
pn_ttc_spell_checker.correct("เหลืยม")
20+
pn_ttc_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs)
21+
print(pn_ttc_checker.spell("เหลืยม"))
22+
print(pn_ttc_checker.correct("เหลืยม"))
23+
24+
# apply different dictionary filter when creating spell checker
25+
pn_tnc_checker = NorvigSpellChecker()
26+
print(len(pn_tnc_checker.dictionary()))
27+
pn_tnc_checker_no_filter = NorvigSpellChecker(dict_filter=None)
28+
print(len(pn_tnc_checker_no_filter.dictionary()))

pythainlp/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,13 @@
33
from pythainlp.collation import collation
44
from pythainlp.date import now
55
from pythainlp.keywords import find_keyword
6-
from pythainlp.metasound import metasound
76
from pythainlp.rank import rank
87
from pythainlp.romanization import romanize
98
from pythainlp.sentiment import sentiment
10-
from pythainlp.soundex import LK82, Udom83
9+
from pythainlp.soundex import lk82, metasound, udom83
1110
from pythainlp.spell import spell
1211
from pythainlp.tag import pos_tag
13-
from pythainlp.Text import Text
12+
from pythainlp.text import Text
1413
from pythainlp.tokenize import etcc, sent_tokenize, tcc, word_tokenize
1514
from pythainlp.util import bigrams, ngrams, trigram
1615

pythainlp/corpus/alphabet.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
# -*- coding: utf-8 -*-
2+
"""
3+
Thai alphabets
4+
"""
25

36

47
def get_data():
5-
"""เป็นคำสั่งสำหรับดึงตัวอักษร ก - ฮ ในภาษาไทย
6-
คืนค่า list
8+
"""
9+
Get a list of Thai consonants (from Ko Kai \u0e01 to Ho Nokhuk \u0e2e)
10+
คืนค่า list ที่มีพยัญชนะไทย ก (\u0e01) - ฮ (\u0e2e)
711
"""
812
return [
913
"ก",

pythainlp/corpus/conceptnet.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
2-
32
"""
4-
นี่คือ API สำหรับดึงข้อมูลมาจาก http://conceptnet.io
3+
ดึงข้อมูลจาก http://conceptnet.io
54
"""
65
import requests
76

pythainlp/corpus/country.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
# -*- coding: utf-8 -*-
2+
"""
3+
Country list
4+
"""
25

36

47
def get_data():
58
"""
6-
Return list of countries, in Thai language
9+
Return a list of countries, in Thai language
710
"""
811
return [
912
"อัฟกานิสถาน",

pythainlp/corpus/make-stopword.tool

Lines changed: 0 additions & 58 deletions
This file was deleted.

pythainlp/corpus/provinces.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,4 @@
7474
อุดรธานี,อด,UDN
7575
อุตรดิตถ์,อต,UTT
7676
อุทัยธานี,อน,UTI
77-
อุบลราชธานี,อบ,UBN
77+
อุบลราชธานี,อบ,UBN

pythainlp/corpus/provinces.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
def get_data():
88
"""
9-
Return list of provinces in Thailand, in Thai language
9+
Return a list of provinces in Thailand, in Thai language
1010
"""
1111
return [
1212
"กระบี่",

pythainlp/corpus/wordnet.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
API ตัวใหม่ เริ่มใช้ตั้งแต่ PyThaiNLP 1.4 เป็นต้นไป
3+
WordNet
44
"""
55
import nltk
66

77
try:
88
nltk.data.find("corpora/omw")
9+
except LookupError:
10+
nltk.download("omw")
11+
12+
try:
913
nltk.data.find("corpora/wordnet")
1014
except LookupError:
1115
nltk.download("wordnet")
12-
nltk.download("omw")
1316

1417
from nltk.corpus import wordnet
1518

0 commit comments

Comments
 (0)