# -*- coding: utf-8 -*-
"""
Tools for creating word lists.
Code is from Korakot Chaovavanich.

:See also:
    * `Facebook post \
        <https://www.facebook.com/groups/colab.thailand/permalink/1667821073393244>`_
    * `Google Colab \
        <https://colab.research.google.com/drive/19kY2jCHONuxmTJM0U8PIE_I5OK1rO-x_>`_
"""

from collections import Counter
from typing import Callable, Iterable, Iterator, List, Set, Tuple

from pythainlp.corpus import thai_words
from pythainlp.tokenize import newmm
from pythainlp.util import Trie


def index_pairs(words: List[str]) -> Iterator[Tuple[int, int]]:
    """
    Return beginning and ending index pairs of words
    """
    i = 0
    for w in words:
        yield i, i + len(w)
        i += len(w)
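
# Illustration (hypothetical input, not part of the original module):
# index_pairs() maps each token to its character offsets in the
# concatenation of all tokens, e.g.
#     list(index_pairs(["กา", "ร", "บ้าน"])) == [(0, 2), (2, 3), (3, 7)]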


def find_badwords(
    tokenize: Callable[[str], List[str]],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Find words that do not work well with the `tokenize` function
    for the provided `training_data`.

    :param Callable[[str], List[str]] tokenize: a tokenization function
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: words that are considered to make `tokenize` perform poorly
    :rtype: Set[str]
    """
    right = Counter()
    wrong = Counter()

    for train_words in training_data:
        train_set = set(index_pairs(train_words))
        test_words = tokenize("".join(train_words))
        test_pairs = index_pairs(test_words)
        for w, p in zip(test_words, test_pairs):
            if p in train_set:
                right[w] += 1
            else:
                wrong[w] += 1

    # a word is bad if it is segmented wrongly more often than rightly
    bad_words = []
    for w, count in wrong.items():
        if count > right[w]:
            bad_words.append(w)

    return set(bad_words)
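
# Minimal usage sketch (hypothetical corpus, not from the original module;
# newmm.segment() is the real PyThaiNLP tokenizer, here with its default
# dictionary):
#
#     training_data = [["แมว", "กิน", "ปลา"], ["คน", "เลี้ยง", "แมว"]]
#     bad = find_badwords(lambda text: newmm.segment(text), training_data)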


def revise_wordset(
    tokenize: Callable[[str], List[str]],
    orig_words: Iterable[str],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words that could improve tokenization performance of
    a dictionary-based `tokenize` function.

    `orig_words` will be used as a base set for the dictionary.
    Words that do not perform well with `training_data` will be removed.
    The remaining words will be returned.

    :param Callable[[str], List[str]] tokenize: a tokenization function; can be\
        any function that takes a string as input and returns a List[str]
    :param Iterable[str] orig_words: words used by the tokenize function,\
        to be used as a base for revision
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: a revised set of words, with words that make `tokenize`\
        perform poorly removed
    :rtype: Set[str]

    :Example:
    ::

        from pythainlp.corpus import thai_words
        from pythainlp.corpus.util import revise_wordset
        from pythainlp.tokenize.longest import segment
        from pythainlp.util import Trie

        base_words = thai_words()
        more_words = {
            "ถวิล อุดล", "ทองอินทร์ ภูริพัฒน์", "เตียง ศิริขันธ์", "จำลอง ดาวเรือง"
        }
        base_words = base_words.union(more_words)
        dict_trie = Trie(base_words)

        tokenize = lambda text: segment(text, dict_trie)

        training_data = [
            [str, str, str, ...],
            [str, str, str, str, ...],
            ...
        ]

        revised_words = revise_wordset(tokenize, base_words, training_data)
    """
    bad_words = find_badwords(tokenize, training_data)
    return set(orig_words) - bad_words


def revise_newmm_default_wordset(
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words that could improve tokenization performance of
    `pythainlp.tokenize.newmm`, a dictionary-based tokenizer and the default
    tokenizer for PyThaiNLP.

    Words from `pythainlp.corpus.thai_words()` will be used as a base set
    for the dictionary. Words that do not perform well with `training_data`
    will be removed. The remaining words will be returned.

    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: a revised set of words, with words that make the tokenizer\
        perform poorly removed
    :rtype: Set[str]
    """
    orig_words = thai_words()
    trie = Trie(orig_words)

    def tokenize(text):
        return newmm.segment(text, custom_dict=trie)

    revised_words = revise_wordset(tokenize, orig_words, training_data)
    return revised_words
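
# Usage sketch (hypothetical, correctly tokenized corpus; not part of the
# original module):
#
#     training_data = [["แมว", "กิน", "ปลา"], ["คน", "เลี้ยง", "แมว"]]
#     revised = revise_newmm_default_wordset(training_data)
#     new_trie = Trie(revised)  # rebuild a dictionary from the revised set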