# -*- coding: utf-8 -*-
"""
Tools for creating word lists.
Code is from Korakot Chaovavanich.

:See also:
    * `Facebook post \
        <https://www.facebook.com/groups/colab.thailand/permalink/1667821073393244>`_
    * `Google Colab \
        <https://colab.research.google.com/drive/19kY2jCHONuxmTJM0U8PIE_I5OK1rO-x_>`_
"""

from collections import Counter
from typing import Callable, Iterable, Iterator, List, Set, Tuple

from pythainlp.corpus import thai_words
from pythainlp.tokenize import newmm
from pythainlp.util import Trie


def index_pairs(words: List[str]) -> Iterator[Tuple[int, int]]:
    """
    Return beginning and ending index pairs of words
    """
    i = 0
    for w in words:
        yield i, i + len(w)
        i += len(w)
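
# Illustration (hypothetical input, not part of the original module):
# index_pairs() maps each token to its character offsets in the
# concatenation of all tokens, e.g.
#     list(index_pairs(["กา", "ร", "บ้าน"])) == [(0, 2), (2, 3), (3, 7)]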


def find_badwords(
    tokenize: Callable[[str], List[str]],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Find words that do not work well with the `tokenize` function
    for the provided `training_data`.

    :param Callable[[str], List[str]] tokenize: a tokenization function
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: words that are considered to make `tokenize` perform poorly
    :rtype: Set[str]
    """
    right = Counter()
    wrong = Counter()

    for train_words in training_data:
        train_set = set(index_pairs(train_words))
        test_words = tokenize("".join(train_words))
        test_pairs = index_pairs(test_words)
        for w, p in zip(test_words, test_pairs):
            if p in train_set:
                right[w] += 1
            else:
                wrong[w] += 1

    # a word is bad if it is segmented wrongly more often than rightly
    bad_words = []
    for w, count in wrong.items():
        if count > right[w]:
            bad_words.append(w)

    return set(bad_words)
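
# Minimal usage sketch (hypothetical corpus, not from the original module;
# newmm.segment() is the real PyThaiNLP tokenizer, here with its default
# dictionary):
#
#     training_data = [["แมว", "กิน", "ปลา"], ["คน", "เลี้ยง", "แมว"]]
#     bad = find_badwords(lambda text: newmm.segment(text), training_data)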


def revise_wordset(
    tokenize: Callable[[str], List[str]],
    orig_words: Iterable[str],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words that could improve tokenization performance of
    a dictionary-based `tokenize` function.

    `orig_words` will be used as a base set for the dictionary.
    Words that do not perform well with `training_data` will be removed.
    The remaining words will be returned.

    :param Callable[[str], List[str]] tokenize: a tokenization function; can be\
        any function that takes a string as input and returns a List[str]
    :param Iterable[str] orig_words: words used by the tokenize function,\
        to be used as a base for revision
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: a revised set of words, with words that make `tokenize`\
        perform poorly removed
    :rtype: Set[str]

    :Example:
    ::

        from pythainlp.corpus import thai_words
        from pythainlp.corpus.util import revise_wordset
        from pythainlp.tokenize.longest import segment
        from pythainlp.util import Trie

        base_words = thai_words()
        more_words = {
            "ถวิล อุดล", "ทองอินทร์ ภูริพัฒน์", "เตียง ศิริขันธ์", "จำลอง ดาวเรือง"
        }
        base_words = base_words.union(more_words)
        dict_trie = Trie(base_words)

        tokenize = lambda text: segment(text, dict_trie)

        training_data = [
            [str, str, str, ...],
            [str, str, str, str, ...],
            ...
        ]

        revised_words = revise_wordset(tokenize, base_words, training_data)
    """
    bad_words = find_badwords(tokenize, training_data)
    return set(orig_words) - bad_words


def revise_newmm_default_wordset(
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words that could improve tokenization performance of
    `pythainlp.tokenize.newmm`, a dictionary-based tokenizer and the default
    tokenizer for PyThaiNLP.

    Words from `pythainlp.corpus.thai_words()` will be used as a base set
    for the dictionary. Words that do not perform well with `training_data`
    will be removed. The remaining words will be returned.

    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: a revised set of words, with words that make the tokenizer\
        perform poorly removed
    :rtype: Set[str]
    """
    orig_words = thai_words()
    trie = Trie(orig_words)

    def tokenize(text):
        return newmm.segment(text, custom_dict=trie)

    revised_words = revise_wordset(tokenize, orig_words, training_data)
    return revised_words
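
# Usage sketch (hypothetical, correctly tokenized corpus; not part of the
# original module):
#
#     training_data = [["แมว", "กิน", "ปลา"], ["คน", "เลี้ยง", "แมว"]]
#     revised = revise_newmm_default_wordset(training_data)
#     new_trie = Trie(revised)  # rebuild a dictionary from the revised set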