|
6 | 6 | https://colab.research.google.com/drive/148WNIeclf0kOU6QxKd6pcfwpSs8l-VKD#scrollTo=EuVDd0nNuI8Q |
7 | 7 | """ |
8 | 8 | import re |
| 9 | +from typing import List |
9 | 10 |
|
10 | 11 | from pythainlp.tokenize import Tokenizer |
| 12 | +from pythainlp.corpus import thai_words |
11 | 13 |
|
12 | 14 | _ptn_digits = r"(|หนึ่ง|เอ็ด|สอง|ยี่|สาม|สี่|ห้า|หก|เจ็ด|แปด|เก้า)" |
13 | 15 | _ptn_six_figures = ( |
|
45 | 47 | _tokenizer = Tokenizer(custom_dict=_valid_tokens) |
46 | 48 |
|
47 | 49 |
|
def _check_is_thainum(word: str):
    """
    Tell whether a token contains a Thai spelled-out numeral component.

    The check is a substring test: the token matches as soon as any key
    of ``_digits`` or any unit word occurs anywhere inside it.

    :param str word: token to inspect
    :return: ``(True, 'num')`` if the token contains a digit word,
        ``(True, 'unit')`` if it contains a unit word (and no digit word),
        ``(False, None)`` otherwise
    :rtype: tuple
    """
    # Digit words take precedence over unit words.
    if any(digit_word in word for digit_word in _digits):
        return (True, 'num')

    unit_words = ("สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด", "ลบ")
    if any(unit_word in word for unit_word in unit_words):
        return (True, 'unit')

    return (False, None)
| 58 | + |
| 59 | + |
# Dictionary for the numeral-aware tokenizer: start from the Thai word list
# with every entry that contains a numeral component removed, then add the
# bare digit and unit words back so they are tokenized individually.
_dict_words = [
    entry for entry in thai_words() if not _check_is_thainum(entry)[0]
]
_dict_words.extend(_digits)
_dict_words.extend(["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด"])

_tokenizer_thaiwords = Tokenizer(_dict_words)
| 65 | + |
| 66 | + |
48 | 67 | def thaiword_to_num(word: str) -> int: |
49 | 68 | """ |
50 | 69 | Converts the spelled-out numerals in Thai scripts into an actual integer. |
@@ -102,3 +121,94 @@ def thaiword_to_num(word: str) -> int: |
102 | 121 | accumulated = -accumulated |
103 | 122 |
|
104 | 123 | return accumulated |
| 124 | + |
| 125 | + |
def _decimal_unit(words: list) -> float:
    """
    Convert the words that follow the decimal point into a fraction.

    The word at position ``i`` (counting from 1) is converted with
    ``thaiword_to_num`` and weighted by ``10 ** -i``.

    :param list words: spelled-out digits after the decimal point
    :return: fractional value; ``0.0`` for an empty list
    :rtype: float
    """
    places = len(words)
    # Accumulate an exact integer numerator and divide once. Adding the
    # per-digit float quotients instead compounds rounding error
    # (0.9 + 0.05 evaluates to 0.9500000000000001 rather than 0.95).
    numerator = 0
    for i, v in enumerate(words):
        numerator += int(thaiword_to_num(v)) * 10 ** (places - 1 - i)
    return numerator / 10 ** places
| 131 | + |
| 132 | + |
def words_to_num(words: list) -> float:
    """
    Thai Words to float

    :param list words: tokens of a spelled-out Thai number; the token
        "จุด" (point), if present, separates the integer part from the
        decimal digits
    :return: value of the words; when there is no "จุด" token the result
        of ``thaiword_to_num`` is returned unchanged
    :rtype: float

    :Example:
    ::

        from pythainlp.util import words_to_num

        words_to_num(["ห้า", "สิบ", "จุด", "เก้า", "ห้า"])
        # output: 50.95

    """
    if "จุด" not in words:
        return thaiword_to_num(''.join(words))

    point_at = words.index("จุด")
    words_int = ''.join(words[:point_at])
    num = thaiword_to_num(words_int)
    fraction = _decimal_unit(words[point_at + 1:])

    # The fraction must move the value away from zero. The sign is read
    # from the spelled-out "ลบ" (minus) prefix as well as from the value,
    # because an integer part equal to zero carries no sign of its own.
    if num < 0 or words_int.startswith("ลบ"):
        return num - fraction
    return num + fraction
| 163 | + |
| 164 | + |
def text_to_num(text: str) -> List[str]:
    """
    Thai text to list thai word with floating point number

    The text is tokenized, every run of consecutive numeral tokens is
    replaced by the string form of its numeric value, and all other
    tokens are kept as they are.

    :param str text: Thai text with the spelled-out numerals
    :return: list of thai words with float value of the input
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import text_to_num

        text_to_num("เก้าร้อยแปดสิบจุดเก้าห้าบาทนี่คือจำนวนทั้งหมด")
        # output: ['980.95', 'บาท', 'นี่', 'คือ', 'จำนวน', 'ทั้งหมด']

        text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้าบาท")
        # output: ['10021889', 'บาท']

    """
    list_word_new = []
    thainum = []  # numeral tokens of the run currently being collected
    for word in _tokenizer_thaiwords.word_tokenize(text):
        if _check_is_thainum(word)[0]:
            thainum.append(word)
            continue
        # A non-numeral token closes the current run, if there is one.
        if thainum:
            list_word_new.append(str(words_to_num(thainum)))
            thainum = []
        list_word_new.append(word)

    # Flush a run that reaches the end of the text; without this a
    # trailing numeral that follows ordinary words is silently dropped.
    if thainum:
        list_word_new.append(str(words_to_num(thainum)))
    return list_word_new
0 commit comments