Skip to content

Commit e2243df

Browse files
wannaphongbact
andauthored
Add create_wordlist (#502)
* Create words.py * More readable variable names * Make a more generic revise_wordset() that can be used for any tokenize function and set of words (not only newmm and default word list) * Move to pythainlp.corpus.util, add test * Remove corpus.util.* from import in __init__.py * Add docs Co-authored-by: Arthit Suriyawongkul <arthit@gmail.com>
1 parent 3e622d7 commit e2243df

File tree

8 files changed

+193
-23
lines changed

8 files changed

+193
-23
lines changed

docs/api/corpus.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ Modules
2222
.. autofunction:: thai_family_names
2323
.. autofunction:: thai_female_names
2424
.. autofunction:: thai_male_names
25+
26+
ConceptNet
27+
----------
28+
29+
ConceptNet is an open, multilingual knowledge graph.
30+
See: https://github.com/commonsense/conceptnet5/wiki/API
31+
2532
.. autofunction:: pythainlp.corpus.conceptnet.edges
2633

2734
TNC
@@ -34,7 +41,14 @@ TTC
3441

3542
.. autofunction:: pythainlp.corpus.ttc.word_freqs
3643

37-
Wordnet
44+
Util
45+
----
46+
47+
.. autofunction:: pythainlp.corpus.util.find_badwords
48+
.. autofunction:: pythainlp.corpus.util.revise_wordset
49+
.. autofunction:: pythainlp.corpus.util.revise_newmm_default_wordset
50+
51+
WordNet
3852
-------
3953

4054
PyThaiNLP API is an exact copy of NLTK WordNet API.

docs/api/summarize.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
pythainlp.summarize
44
====================================
5-
The :class:`summarize` is thai text summarize.
5+
The :class:`summarize` is a Thai text summarizer.
66

77
Modules
88
-------

docs/api/tag.rst

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,18 @@ Abbreviation Part-of-Speech tag Examples
4040
NLBL Label noun 1, 2, 3, 4, ก, ข, a, b
4141
NCMN Common noun หนังสือ, อาหาร, อาคาร, คน
4242
NTTL Title noun ครู, พลเอก
43-
PPRS Personal pronoun ‡ คุณ, เขา, ฉัน
44-
PDMN Demonstrative pronoun œ นี่, นั้น, ที่นั่น, ที่นี่
43+
PPRS Personal pronoun คุณ, เขา, ฉัน
44+
PDMN Demonstrative pronoun นี่, นั้น, ที่นั่น, ที่นี่
4545
PNTR Interrogative pronoun ใคร, อะไร, อย่างไร
46-
PREL Relative pronoun š ที่, ซึ่ง, อัน, ผู้
47-
VACT Active verb šÎµŠµœ, ทำงาน, ร้องเพลง, กิน
46+
PREL Relative pronoun ที่, ซึ่ง, อัน, ผู้
47+
VACT Active verb ทำงาน, ร้องเพลง, กิน
4848
VSTA Stative verb เห็น, รู้, คือ
4949
VATT Attributive verb อ้วน, ดี, สวย
5050
XVBM Pre-verb auxiliary, before negator "ไม่" เกิด, เกือบ, กำลัง
5151
XVAM Pre-verb auxiliary, after negator "ไม่" ค่อย, น่า, ได้
5252
XVMM Pre-verb, before or after negator "ไม่" ควร, เคย, ต้อง
5353
XVBB Pre-verb auxiliary, in imperative mood กรุณา, จง, เชิญ, อย่า, ห้าม
54-
XVAE Post-verb auxiliary Å ไป, มา, ขึ้น
54+
XVAE Post-verb auxiliary ไป, มา, ขึ้น
5555
DDAN | Definite determiner, after noun without ยี่, นั่น, โน่น, ทั้งหมด
5656
| classifier in between
5757
DDAC | Definite determiner, allowing classifier นี้, นั้น, โน้น, นู้น
@@ -76,12 +76,12 @@ Abbreviation Part-of-Speech tag Examples
7676
CLTV Collective classifier | คู่, กลุ่ม, ฝูง, เชิง, ทาง,
7777
| ด้าน, แบบ, รุ่น
7878
CMTR Measurement classifier กิโลกรัม, แก้ว, ชั่วโมง
79-
CFQC Frequency classifier ‡ ครั้ง, เที่ยว
79+
CFQC Frequency classifier ครั้ง, เที่ยว
8080
CVBL Verbal classifier ม้วน, มัด
8181
JCRG Coordinating conjunction และ, หรือ, แต่
82-
JCMP Comparative conjunction „ กว่า, เหมือนกับ, เท่ากับ
82+
JCMP Comparative conjunction กว่า, เหมือนกับ, เท่ากับ
8383
JSBR Subordinating conjunction เพราะว่า, เนื่องจาก ที่, แม้ว่า, ถ้า
84-
RPRE Preposition ‹ จาก, ละ, ของ, ใต้, บน
84+
RPRE Preposition จาก, ละ, ของ, ใต้, บน
8585
INT Interjection โอ้บ, โอ้, เออ, เอ๋, อ๋อ
8686
FIXN Nominal prefix **การ**\ ทำงาน, **ความ**\ สนุกสนาน
8787
FIXV Adverbial prefix **อย่าง**\ เร็ว

pythainlp/corpus/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@
2828
]
2929

3030
import os
31-
from tinydb import TinyDB
3231

3332
from pythainlp.tools import get_full_data_path, get_pythainlp_path
33+
from tinydb import TinyDB
3434

3535
# Remote and local corpus databases
3636

pythainlp/corpus/common.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"thai_words",
1616
]
1717

18-
from typing import Union
18+
from typing import FrozenSet, List, Union
1919

2020
from pythainlp.corpus import get_corpus
2121

@@ -46,7 +46,7 @@
4646
_THAI_MALE_NAMES_FILENAME = "person_names_male_th.txt"
4747

4848

49-
def countries() -> frozenset:
49+
def countries() -> FrozenSet[str]:
5050
"""
5151
Return a frozenset of country names in Thai such as "แคนาดา", "โรมาเนีย",
5252
"แอลจีเรีย", and "ลาว".
@@ -63,7 +63,7 @@ def countries() -> frozenset:
6363
return _THAI_COUNTRIES
6464

6565

66-
def provinces(details: bool = False) -> Union[frozenset, list]:
66+
def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
6767
"""
6868
Return a frozenset of Thailand province names in Thai such as "กระบี่",
6969
"กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
@@ -106,7 +106,7 @@ def provinces(details: bool = False) -> Union[frozenset, list]:
106106
return _THAI_THAILAND_PROVINCES
107107

108108

109-
def thai_syllables() -> frozenset:
109+
def thai_syllables() -> FrozenSet[str]:
110110
"""
111111
Return a frozenset of Thai syllables such as "กรอบ", "ก็", "๑", "โมบ",
112112
"โมน", "โม่ง", "กา", "ก่า", and, "ก้า".
@@ -123,7 +123,7 @@ def thai_syllables() -> frozenset:
123123
return _THAI_SYLLABLES
124124

125125

126-
def thai_words() -> frozenset:
126+
def thai_words() -> FrozenSet[str]:
127127
"""
128128
Return a frozenset of Thai words such as "กติกา", "กดดัน", "พิษ",
129129
and "พิษภัย". \n(See: `dev/pythainlp/corpus/words_th.txt\
@@ -139,7 +139,7 @@ def thai_words() -> frozenset:
139139
return _THAI_WORDS
140140

141141

142-
def thai_stopwords() -> frozenset:
142+
def thai_stopwords() -> FrozenSet[str]:
143143
"""
144144
Return a frozenset of Thai stopwords such as "มี", "ไป", "ไง", "ขณะ",
145145
"การ", and "ประการหนึ่ง". \n(See: `dev/pythainlp/corpus/stopwords_th.txt\
@@ -155,7 +155,7 @@ def thai_stopwords() -> frozenset:
155155
return _THAI_STOPWORDS
156156

157157

158-
def thai_negations() -> frozenset:
158+
def thai_negations() -> FrozenSet[str]:
159159
"""
160160
Return a frozenset of Thai negation words including "ไม่" and "แต่".
161161
\n(See: `dev/pythainlp/corpus/negations_th.txt\
@@ -171,7 +171,7 @@ def thai_negations() -> frozenset:
171171
return _THAI_NEGATIONS
172172

173173

174-
def thai_family_names() -> frozenset:
174+
def thai_family_names() -> FrozenSet[str]:
175175
"""
176176
Return a frozenset of Thai family names
177177
\n(See: `dev/pythainlp/corpus/family_names_th.txt\
@@ -187,7 +187,7 @@ def thai_family_names() -> frozenset:
187187
return _THAI_FAMLIY_NAMES
188188

189189

190-
def thai_female_names() -> frozenset:
190+
def thai_female_names() -> FrozenSet[str]:
191191
"""
192192
Return a frozenset of Thai female names
193193
\n(See: `dev/pythainlp/corpus/person_names_female_th.txt\
@@ -203,7 +203,7 @@ def thai_female_names() -> frozenset:
203203
return _THAI_FEMALE_NAMES
204204

205205

206-
def thai_male_names() -> frozenset:
206+
def thai_male_names() -> FrozenSet[str]:
207207
"""
208208
Return a frozenset of Thai male names
209209
\n(See: `dev/pythainlp/corpus/person_names_male_th.txt\

pythainlp/corpus/util.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Tool for creating word lists
4+
Code is from Korakot Chaovavanich.
5+
6+
:See also:
7+
* `Facebook post \
8+
<https://www.facebook.com/groups/colab.thailand/permalink/1667821073393244>`_
9+
* `Google Colab \
10+
<https://colab.research.google.com/drive/19kY2jCHONuxmTJM0U8PIE_I5OK1rO-x_>`_
11+
"""
12+
13+
from collections import Counter
14+
from typing import Callable, Iterable, Iterator, List, Set, Tuple
15+
16+
from pythainlp.corpus import thai_words
17+
from pythainlp.tokenize import newmm
18+
from pythainlp.util import Trie
19+
20+
21+
def index_pairs(words: Iterable[str]) -> Iterator[Tuple[int, int]]:
    """
    Return beginning and ending index pairs of words

    Each pair is the `(start, end)` character offset of a word within the
    text formed by concatenating all `words` in order; `end` is exclusive,
    so the `end` of one word is the `start` of the next.

    :param Iterable[str] words: words, in the order they appear in the text
    :return: `(start, end)` offsets, one pair per word
    :rtype: Iterator[Tuple[int, int]]
    """
    start = 0
    for word in words:
        end = start + len(word)
        yield start, end
        # the next word begins exactly where this one ends
        start = end
29+
30+
31+
def find_badwords(
    tokenize: Callable[[str], List[str]],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Find words that do not work well with the `tokenize` function
    for the provided `training_data`.

    Each training sentence is joined back into plain text and tokenized
    again with `tokenize`. A produced token counts as "right" when its
    (start, end) position matches the position of a word in the training
    sentence, and as "wrong" otherwise. A word is considered bad when it
    is wrong more often than it is right.

    :param Callable[[str], List[str]] tokenize: a tokenize function
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: words that are considered to make `tokenize` perform poorly
    :rtype: Set[str]
    """
    right = Counter()
    wrong = Counter()

    for train_words in training_data:
        # Materialize first: the words are traversed twice below (once for
        # the index pairs, once for the join), which would silently produce
        # an empty text if `train_words` were a one-shot iterator.
        train_words = list(train_words)
        train_set = set(index_pairs(train_words))
        test_words = tokenize("".join(train_words))
        for w, p in zip(test_words, index_pairs(test_words)):
            if p in train_set:
                right[w] += 1
            else:
                wrong[w] += 1

    # if wrong more than right, then it's a bad word
    # (Counter returns 0 for words that were never right)
    return {w for w, count in wrong.items() if count > right[w]}
65+
66+
67+
def revise_wordset(
    tokenize: Callable[[str], List[str]],
    orig_words: Iterable[str],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words that could improve tokenization performance of
    a dictionary-based `tokenize` function.

    `orig_words` will be used as a base set for the dictionary.
    Words that do not perform well with `training_data` will be removed.
    The remaining words will be returned.

    :param Callable[[str], List[str]] tokenize: a tokenize function, can be\
        any function that takes a string as input and returns a List[str]
    :param Iterable[str] orig_words: words that used by the tokenize function,\
        will be used as a base for revision
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: words from `orig_words` that remain after the words found to\
        make `tokenize` perform poorly have been removed
    :rtype: Set[str]

    :Example::
    ::

        from pythainlp.corpus import thai_words
        from pythainlp.corpus.util import revise_wordset
        from pythainlp.tokenize.longest import segment
        from pythainlp.util import Trie

        base_words = thai_words()
        more_words = {
            "ถวิล อุดล", "ทองอินทร์ ภูริพัฒน์", "เตียง ศิริขันธ์", "จำลอง ดาวเรือง"
        }
        base_words = base_words.union(more_words)
        dict_trie = Trie(base_words)

        tokenize = lambda text: segment(text, dict_trie)

        training_data = [
            [str, str, str, ...],
            [str, str, str, str, ...],
            ...
        ]

        revised_words = revise_wordset(tokenize, base_words, training_data)
    """
    bad_words = find_badwords(tokenize, training_data)
    return set(orig_words) - bad_words
115+
116+
117+
def revise_newmm_default_wordset(
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words that could improve tokenization performance of
    `pythainlp.tokenize.newmm`, a dictionary-based tokenizer and a default
    tokenizer for PyThaiNLP.

    Words from `pythainlp.corpus.thai_words()` will be used as a base set
    for the dictionary. Words that do not perform well with `training_data`
    will be removed. The remaining words will be returned.

    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: words from `pythainlp.corpus.thai_words()` that remain after\
        the words found to make `newmm` perform poorly have been removed
    :rtype: Set[str]
    """
    orig_words = thai_words()
    # build the trie once, so every tokenize() call reuses the same dictionary
    trie = Trie(orig_words)

    def tokenize(text: str) -> List[str]:
        return newmm.segment(text, custom_dict=trie)

    return revise_wordset(tokenize, orig_words, training_data)

pythainlp/util/__init__.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,12 @@
6868
)
6969
from pythainlp.util.numtoword import bahttext, num_to_thaiword
7070
from pythainlp.util.strftime import thai_strftime
71-
from pythainlp.util.thai import countthai, isthai, isthaichar, \
72-
display_thai_char
71+
from pythainlp.util.thai import (
72+
countthai,
73+
display_thai_char,
74+
isthai,
75+
isthaichar,
76+
)
7377
from pythainlp.util.thaiwordcheck import is_native_thai
7478
from pythainlp.util.time import thai_time, thaiword_to_time, time_to_thaiword
7579
from pythainlp.util.trie import Trie, dict_trie

tests/test_corpus.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
ttc,
2424
wordnet,
2525
)
26+
from pythainlp.corpus.util import revise_newmm_default_wordset
2627
from requests import Response
2728

2829

@@ -133,3 +134,13 @@ def test_wordnet(self):
133134

134135
cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key()
135136
self.assertIsNotNone(wordnet.lemma_from_key(cat_key))
137+
138+
def test_revise_wordset(self):
    """Check that revising the default newmm word set returns a set."""
    # pre-tokenized sentences used as the training set
    tokenized_sentences = [
        ["ถวิล อุดล", " ", "เป็น", "นักการเมือง", "หนึ่ง", "ใน"],
        ["สี่เสืออีสาน", " ", "ซึ่ง", "ประกอบ", "ด้วย", "ตัว", "นายถวิล"],
        ["เอง", " ", "นายทองอินทร์ ภูริพัฒน์", " ", "นายเตียง ศิริขันธ์"],
        [" ", "และ", "นายจำลอง ดาวเรือง", " ", "และ", "เป็น", "รัฐมนตรี"],
        ["ที่", "ถูก", "สังหาร", "เมื่อ", "ปี", " ", "พ.ศ.", " ", "2492"],
    ]
    revised = revise_newmm_default_wordset(tokenized_sentences)
    self.assertIsInstance(revised, set)

0 commit comments

Comments
 (0)