Skip to content

Commit 05cd82c

Browse files
committed
Merge branch 'dev' of https://github.com/PyThaiNLP/pythainlp into dev
2 parents 82cbf77 + 0bbccc9 commit 05cd82c

File tree

5 files changed

+151
-1
lines changed

5 files changed

+151
-1
lines changed

docs/api/util.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Modules
3232
.. autofunction:: remove_zw
3333
.. autofunction:: reorder_vowels
3434
.. autofunction:: text_to_arabic_digit
35+
.. autofunction:: text_to_num
3536
.. autofunction:: text_to_thai_digit
3637
.. autofunction:: thai_strftime
3738
.. autofunction:: thai_to_eng
@@ -40,5 +41,6 @@ Modules
4041
.. autofunction:: thaiword_to_num
4142
.. autofunction:: thaiword_to_time
4243
.. autofunction:: time_to_thaiword
44+
.. autofunction:: words_to_num
4345
.. autoclass:: Trie
4446
:members:
-433 KB
Binary file not shown.

pythainlp/util/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
"thaiword_to_num",
4242
"thaiword_to_time",
4343
"time_to_thaiword",
44+
"text_to_num",
45+
"words_to_num",
4446
]
4547

4648
from pythainlp.util.collate import collate
@@ -85,4 +87,4 @@
8587
from pythainlp.util.thaiwordcheck import is_native_thai
8688
from pythainlp.util.time import thai_time, thaiword_to_time, time_to_thaiword
8789
from pythainlp.util.trie import Trie, dict_trie
88-
from pythainlp.util.wordtonum import thaiword_to_num
90+
from pythainlp.util.wordtonum import thaiword_to_num, text_to_num, words_to_num

pythainlp/util/wordtonum.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
https://colab.research.google.com/drive/148WNIeclf0kOU6QxKd6pcfwpSs8l-VKD#scrollTo=EuVDd0nNuI8Q
77
"""
88
import re
9+
from typing import List
910

1011
from pythainlp.tokenize import Tokenizer
12+
from pythainlp.corpus import thai_words
1113

1214
_ptn_digits = r"(|หนึ่ง|เอ็ด|สอง|ยี่|สาม|สี่|ห้า|หก|เจ็ด|แปด|เก้า)"
1315
_ptn_six_figures = (
@@ -45,6 +47,23 @@
4547
_tokenizer = Tokenizer(custom_dict=_valid_tokens)
4648

4749

50+
def _check_is_thainum(word: str):
    """
    Tell whether a word contains a Thai numeral component.

    Matching is by substring, so a longer word that merely contains a
    numeral word is also reported as a numeral.

    :param str word: word to inspect
    :return: ``(True, 'num')`` if any key of ``_digits`` occurs in the word;
        otherwise ``(True, 'unit')`` if a magnitude word, the decimal-point
        word "จุด" or the minus word "ลบ" occurs in it;
        otherwise ``(False, None)``
    :rtype: tuple
    """
    # Digit words take precedence over unit words.
    if any(digit in word for digit in _digits):
        return (True, 'num')

    units = ("สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด", "ลบ")
    if any(unit in word for unit in units):
        return (True, 'unit')

    return (False, None)
58+
59+
60+
# Tokenizer dictionary for text_to_num: drop every corpus word that contains
# a numeral component, then add the bare digit and magnitude words back so
# that a spelled-out number is split into its individual numeral words.
_dict_words = [w for w in thai_words() if not _check_is_thainum(w)[0]]
_dict_words.extend(_digits)
_dict_words.extend(["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด"])

_tokenizer_thaiwords = Tokenizer(_dict_words)
65+
66+
4867
def thaiword_to_num(word: str) -> int:
4968
"""
5069
Converts the spelled-out numerals in Thai scripts into an actual integer.
@@ -102,3 +121,94 @@ def thaiword_to_num(word: str) -> int:
102121
accumulated = -accumulated
103122

104123
return accumulated
124+
125+
126+
def _decimal_unit(words: list) -> float:
    """
    Convert the numeral words after the decimal point into a fraction.

    The word at 0-based position ``i`` is weighted by ``10 ** -(i + 1)``,
    so ``["เก้า", "ห้า"]`` stands for 0.95.

    :param list words: Thai numeral words that follow the decimal point,
        most significant first; each one is passed to ``thaiword_to_num``
    :return: the fractional value; 0.0 when ``words`` is empty
    :rtype: float
    """
    # Build an exact integer numerator and divide once. Adding one float
    # term per digit accumulates rounding error (0.9 + 0.05 evaluates to
    # 0.9500000000000001), whereas a single int / int division is correctly
    # rounded. The value is mathematically the same sum of d_i / 10**(i+1).
    numerator = 0
    scale = 1
    for word in words:
        numerator = numerator * 10 + int(thaiword_to_num(word))
        scale *= 10
    return numerator / scale
131+
132+
133+
def words_to_num(words: list) -> float:
    """
    Convert a list of Thai numeral words into a number.

    The word "จุด" marks the decimal point: the words before it are joined
    and converted by ``thaiword_to_num``; the words after it are read as
    successive decimal digits.

    :param list words: Thai numeral words
    :return: numeric value of the words; the plain result of
        ``thaiword_to_num`` when there is no "จุด" in ``words``
    :rtype: float

    :Example:
    ::

        from pythainlp.util import words_to_num

        words_to_num(["ห้า", "สิบ", "จุด", "เก้า", "ห้า"])
        # output: 50.95

    """
    # No decimal point: the whole sequence is one integer numeral.
    if "จุด" not in words:
        return thaiword_to_num(''.join(words))

    point_at = words.index("จุด")
    whole = thaiword_to_num(''.join(words[:point_at]))
    fraction = _decimal_unit(words[point_at + 1:])

    # The fraction extends the magnitude, so it moves a negative integer
    # part further below zero.
    return whole - fraction if whole <= -1 else whole + fraction
163+
164+
165+
def text_to_num(text: str) -> List[str]:
    """
    Tokenize Thai text and replace each spelled-out number with its value.

    The text is split by ``_tokenizer_thaiwords``. Every run of consecutive
    numeral tokens (as judged by ``_check_is_thainum``) is converted with
    ``words_to_num`` and emitted as a single string; all other tokens are
    passed through unchanged.

    :param str text: Thai text with the spelled-out numerals
    :return: list of thai words with float value of the input
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import text_to_num

        text_to_num("เก้าร้อยแปดสิบจุดเก้าห้าบาทนี่คือจำนวนทั้งหมด")
        # output: ['980.95', 'บาท', 'นี่', 'คือ', 'จำนวน', 'ทั้งหมด']

        text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้าบาท")
        # output: ['10021889', 'บาท']

    """
    tokens = _tokenizer_thaiwords.word_tokenize(text)
    converted = []
    pending = []  # consecutive numeral tokens not yet converted

    for token in tokens:
        if _check_is_thainum(token)[0]:
            pending.append(token)
            continue
        # A non-numeral token closes the current numeral run, if any.
        if pending:
            converted.append(str(words_to_num(pending)))
            pending = []
        converted.append(token)

    # Flush a numeral run that reaches the end of the text, whatever its
    # length; a single trailing numeral token after ordinary words used to
    # be dropped.
    if pending:
        converted.append(str(words_to_num(pending)))

    return converted

tests/test_util.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
thai_to_eng,
4646
thaiword_to_num,
4747
thai_keyboard_dist,
48+
text_to_num,
49+
words_to_num,
4850
)
4951

5052

@@ -107,6 +109,40 @@ def test_number(self):
107109
with self.assertRaises(TypeError):
108110
thaiword_to_num(["หนึ่ง"])
109111

112+
self.assertEqual(words_to_num("ศูนย์"), 0)
113+
self.assertEqual(words_to_num("แปด"), 8)
114+
self.assertEqual(words_to_num("ยี่สิบ"), 20)
115+
self.assertEqual(words_to_num("ร้อยสิบสอง"), 112)
116+
self.assertEqual(words_to_num("ลบแปด"), -8)
117+
self.assertEqual(words_to_num("ลบยี่สิบ"), -20)
118+
self.assertEqual(words_to_num("ลบร้อยสิบสอง"), -112)
119+
self.assertEqual(
120+
words_to_num("หกล้านหกแสนหกหมื่นหกพันหกร้อยหกสิบหก"), 6666666
121+
)
122+
self.assertEqual(words_to_num("สองล้านสามแสนหกร้อยสิบสอง"), 2300612)
123+
self.assertEqual(words_to_num("หนึ่งร้อยสิบล้าน"), 110000000)
124+
self.assertEqual(
125+
words_to_num("สิบห้าล้านล้านเจ็ดสิบสอง"), 15000000000072
126+
)
127+
self.assertEqual(words_to_num("หนึ่งล้านล้าน"), 1000000000000)
128+
self.assertEqual(
129+
words_to_num("สองแสนสี่หมื่นสามสิบล้านสี่พันล้าน"),
130+
240030004000000000,
131+
)
132+
self.assertEqual(words_to_num("ร้อยสิบล้านแปดแสนห้าพัน"), 110805000)
133+
self.assertEqual(words_to_num("ลบหนึ่ง"), -1)
134+
text = "ลบหนึ่งร้อยล้านสี่แสนห้าพันยี่สิบเอ็ด"
135+
self.assertEqual(num_to_thaiword(words_to_num(text)), text)
136+
self.assertIsNotNone(
137+
text_to_num("เก้าร้อยแปดสิบจุดเก้าห้าบาทนี่คือจำนวนทั้งหมด")
138+
)
139+
self.assertIsNotNone(
140+
text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้าบาท")
141+
)
142+
self.assertIsNotNone(
143+
text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้า")
144+
)
145+
110146
self.assertEqual(
111147
arabic_digit_to_thai_digit("ไทยแลนด์ 4.0"), "ไทยแลนด์ ๔.๐"
112148
)

0 commit comments

Comments
 (0)