Skip to content

Commit 3103449

Browse files
bact and wannaphong authored
Add thai_keyboard_dist() to calculate Thai keyboard distance (#513)
Calculate Euclidean distance between two characters according to their location on a Thai keyboard layout. Co-authored-by: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com> Co-authored-by: Arthit Suriyawongkul <arthit@gmail.com>
2 parents 3cdd242 + ee11634 commit 3103449

File tree

3 files changed

+107
-10
lines changed

3 files changed

+107
-10
lines changed

pythainlp/util/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"text_to_arabic_digit",
3434
"text_to_thai_digit",
3535
"thai_digit_to_arabic_digit",
36+
"thai_keyboard_dist",
3637
"thai_strftime",
3738
"thai_time",
3839
"thai_to_eng",
@@ -55,6 +56,11 @@
5556
text_to_thai_digit,
5657
thai_digit_to_arabic_digit,
5758
)
59+
from pythainlp.util.keyboard import (
60+
eng_to_thai,
61+
thai_keyboard_dist,
62+
thai_to_eng,
63+
)
5864
from pythainlp.util.emojiconv import emoji_to_thai
5965
from pythainlp.util.keyboard import eng_to_thai, thai_to_eng
6066
from pythainlp.util.keywords import find_keyword, rank

pythainlp/util/keyboard.py

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
Correct text in one language that is incorrectly-typed
4-
with a keyboard layout in another language.
3+
Functions related to keyboard layout.
54
"""
5+
66
EN_TH_KEYB_PAIRS = {
77
"Z": "(",
88
"z": "ผ",
@@ -103,6 +103,19 @@
103103
EN_TH_TRANSLATE_TABLE = str.maketrans(EN_TH_KEYB_PAIRS)
104104
TH_EN_TRANSLATE_TABLE = str.maketrans(TH_EN_KEYB_PAIRS)
105105

106+
# Unshifted layer of the keyboard layout: one inner list per keyboard row.
# thai_keyboard_dist() uses a character's (row index, column index) in
# these lists as its key coordinate.
# NOTE(review): the "" entry in the first row looks like a placeholder for
# a key that has no character in this layer - confirm against the layout.
TIS_820_2531_MOD = [
107+
["-", "ๅ", "/", "", "_", "ภ", "ถ", "ุ", "ึ", "ค", "ต", "จ", "ข", "ช"],
108+
["ๆ", "ไ", "ำ", "พ", "ะ", "ั", "ี", "ร", "น", "ย", "บ", "ล", "ฃ"],
109+
["ฟ", "ห", "ก", "ด", "เ", "้", "่", "า", "ส", "ว", "ง"],
110+
["ผ", "ป", "แ", "อ", "ิ", "ื", "ท", "ม", "ใ", "ฝ"],
111+
]
112+
# Shifted layer, indexed the same way. A character here and the unshifted
# character at the same (row, column) indices get identical coordinates,
# which is how thai_keyboard_dist() recognises a shifted pair.
TIS_820_2531_MOD_SHIFT = [
113+
["%", "+", "๑", "๒", "๓", "๔", "ู", "฿", "๕", "๖", "๗", "๘", "๙"],
114+
["๐", "\"", "ฎ", "ฑ", "ธ", "ํ", "๊", "ณ", "ฯ", "ญ", "ฐ", ",", "ฅ"],
115+
["ฤ", "ฆ", "ฏ", "โ", "ฌ", "็", "๋", "ษ", "ศ", "ซ", "."],
116+
["(", ")", "ฉ", "ฮ", "ฺ", "์", "?", "ฒ", "ฬ", "ฦ"],
117+
]
118+
106119

107120
def eng_to_thai(text: str) -> str:
108121
"""
@@ -148,3 +161,63 @@ def thai_to_eng(text: str) -> str:
148161
# output: 'Bank of Thailand'
149162
"""
150163
return text.translate(TH_EN_TRANSLATE_TABLE)
164+
165+
166+
def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float:
    """
    Calculate the Euclidean distance between two Thai characters
    according to their location on a Thai keyboard layout.

    A modified TIS 820-2531 standard keyboard layout, which is developed
    from Kedmanee layout and is the most commonly used Thai keyboard layout,
    is used in distance calculation.

    The modified TIS 820-2531 is TIS 820-2531 with a few key extensions
    proposed in TIS 820-2536 draft. See Figure 4, notice grey keys, in
    https://www.nectec.or.th/it-standards/keyboard_layout/thai-key.html

    Note that the latest TIS 820-2538 has slight changes in layout from
    TIS 820-2531. See Figure 2, notice the Thai Baht sign and ฅ-ฃ pair, in
    https://www.nectec.or.th/it-standards/std820/std820.html
    Since TIS 820-2538 is not widely adopted by keyboard manufacturers,
    this function uses the de facto standard modified TIS 820-2531 instead.

    :param str c1: first character
    :param str c2: second character
    :param float shift_dist: value returned when the two characters are
        different but sit at the same position in the unshifted and
        shifted layers (i.e. they differ only by the shift key)
    :return: euclidean distance between two characters
    :rtype: float
    :raises ValueError: if a character is not found in the keyboard layout

    :Example:

        from pythainlp.util import thai_keyboard_dist
        thai_keyboard_dist("ด", "ะ")
        # output: 1.4142135623730951
        thai_keyboard_dist("ฟ", "ฤ")
        # output: 0.0
        thai_keyboard_dist("ฟ", "ห")
        # output: 1.0
        thai_keyboard_dist("ฟ", "ก")
        # output: 2.0
        thai_keyboard_dist("ฟ", "ฤ", 0.5)
        # output: 0.5
    """

    def get_char_coord(
        ch: str, layouts=(TIS_820_2531_MOD, TIS_820_2531_MOD_SHIFT)
    ):
        # Return (row, column) of the first key holding `ch`. The unshifted
        # layer is searched before the shifted one, and both layers share
        # the same coordinate system.
        for layout in layouts:
            for r, row in enumerate(layout):
                if ch in row:
                    return (r, row.index(ch))
        raise ValueError(ch + " not found in given keyboard layout")

    coord1 = get_char_coord(c1)
    coord2 = get_char_coord(c2)

    # Different characters at identical coordinates can only come from
    # different layers, so they are told apart by the shift key alone.
    if coord1 == coord2 and c1 != c2:
        return shift_dist

    return (
        (coord1[0] - coord2[0]) ** 2 + (coord1[1] - coord2[1]) ** 2
    ) ** (0.5)

tests/test_util.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
time_to_thaiword,
4545
thai_to_eng,
4646
thaiword_to_num,
47+
thai_keyboard_dist,
4748
)
4849

4950

@@ -157,6 +158,17 @@ def test_rank(self):
157158
rank(["แมว", "คน", "แมว"], exclude_stopwords=True)
158159
)
159160

161+
# ### pythainlp.util.keyboard
162+
163+
def test_thai_keyboard_dist(self):
    """Exercise thai_keyboard_dist() on same-key, same-row and cross-row pairs."""
    # "ฟ" and "ฤ" occupy the same position in the unshifted and shifted
    # layers, so the default shift distance (0.0) is returned.
    same_key = thai_keyboard_dist("ฟ", "ฤ")
    self.assertEqual(same_key, 0.0)

    # Neighbouring keys on one row are one unit apart; two keys over is two.
    one_key_over = thai_keyboard_dist("ฟ", "ห")
    self.assertEqual(one_key_over, 1.0)
    two_keys_over = thai_keyboard_dist("ฟ", "ก")
    self.assertEqual(two_keys_over, 2.0)

    # An explicit shift distance replaces the default for a shifted pair.
    custom_shift = thai_keyboard_dist("ฟ", "ฤ", 0.5)
    self.assertEqual(custom_shift, 0.5)

    # "๘" and "๙" are adjacent in one row, whereas "๙" and "๐" are in
    # different rows, so the two distances must not be equal.
    eight_to_nine = thai_keyboard_dist("๘", "๙")
    nine_to_zero = thai_keyboard_dist("๙", "๐")
    self.assertNotEqual(eight_to_nine, nine_to_zero)
171+
160172
# ### pythainlp.util.date
161173

162174
def test_date(self):
@@ -238,7 +250,8 @@ def test_time_to_thaiword(self):
238250
time_to_thaiword(time(12, 3, 0)), "สิบสองนาฬิกาสามนาที"
239251
)
240252
self.assertEqual(
241-
time_to_thaiword(time(12, 3, 1)), "สิบสองนาฬิกาสามนาทีหนึ่งวินาที",
253+
time_to_thaiword(time(12, 3, 1)),
254+
"สิบสองนาฬิกาสามนาทีหนึ่งวินาที",
242255
)
243256
self.assertEqual(
244257
time_to_thaiword(datetime(2014, 5, 22, 12, 3, 0), precision="s"),
@@ -353,13 +366,16 @@ def test_thaiword_to_date(self):
353366
now + timedelta(days=0), thaiword_to_date("วันนี้", now)
354367
)
355368
self.assertEqual(
356-
now + timedelta(days=1), thaiword_to_date("พรุ่งนี้", now),
369+
now + timedelta(days=1),
370+
thaiword_to_date("พรุ่งนี้", now),
357371
)
358372
self.assertEqual(
359-
now + timedelta(days=2), thaiword_to_date("มะรืนนี้", now),
373+
now + timedelta(days=2),
374+
thaiword_to_date("มะรืนนี้", now),
360375
)
361376
self.assertEqual(
362-
now + timedelta(days=-1), thaiword_to_date("เมื่อวาน", now),
377+
now + timedelta(days=-1),
378+
thaiword_to_date("เมื่อวาน", now),
363379
)
364380
self.assertEqual(
365381
now + timedelta(days=-2), thaiword_to_date("วานซืน", now)
@@ -538,14 +554,16 @@ def test_emoji_to_thai(self):
538554
emoji_to_thai(
539555
"จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀"
540556
),
541-
("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ "
542-
"ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:")
557+
(
558+
"จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ "
559+
"ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:"
560+
),
543561
)
544562
self.assertEqual(
545563
emoji_to_thai("หิวข้าวอยากกินอาหารญี่ปุ่น 🍣"),
546-
"หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ:"
564+
"หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ:",
547565
)
548566
self.assertEqual(
549567
emoji_to_thai("🇹🇭 นี่คิือธงประเทศไทย"),
550-
":ธง_ไทย: นี่คิือธงประเทศไทย"
568+
":ธง_ไทย: นี่คิือธงประเทศไทย",
551569
)

0 commit comments

Comments
 (0)