Skip to content
6 changes: 6 additions & 0 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"text_to_arabic_digit",
"text_to_thai_digit",
"thai_digit_to_arabic_digit",
"thai_keyboard_dist",
"thai_strftime",
"thai_time",
"thai_to_eng",
Expand All @@ -55,6 +56,11 @@
text_to_thai_digit,
thai_digit_to_arabic_digit,
)
from pythainlp.util.keyboard import (
eng_to_thai,
thai_keyboard_dist,
thai_to_eng,
)
from pythainlp.util.emojiconv import emoji_to_thai
from pythainlp.util.keyboard import eng_to_thai, thai_to_eng
from pythainlp.util.keywords import find_keyword, rank
Expand Down
77 changes: 75 additions & 2 deletions pythainlp/util/keyboard.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
"""
Functions related to keyboard layout.

This includes correcting text in one language that was incorrectly typed
with a keyboard layout in another language.
"""

EN_TH_KEYB_PAIRS = {
"Z": "(",
"z": "ผ",
Expand Down Expand Up @@ -103,6 +103,19 @@
# Character translation tables for str.translate(), built once at import
# time from the key-pair dictionaries.
EN_TH_TRANSLATE_TABLE = str.maketrans(EN_TH_KEYB_PAIRS)
TH_EN_TRANSLATE_TABLE = str.maketrans(TH_EN_KEYB_PAIRS)

# Key grid of the modified TIS 820-2531 (Kedmanee-based) layout,
# unshifted layer. Each inner list is one physical key row (top row first),
# with keys listed left to right, so a character's (row, column) index is
# its key position on the keyboard.
TIS_820_2531_MOD = [
    ["_", "ๅ", "/", "-", "ภ", "ถ", "ุ", "ึ", "ค", "ต", "จ", "ข", "ช"],
    ["ๆ", "ไ", "ำ", "พ", "ะ", "ั", "ี", "ร", "น", "ย", "บ", "ล", "ฃ"],
    ["ฟ", "ห", "ก", "ด", "เ", "้", "่", "า", "ส", "ว", "ง"],
    ["ผ", "ป", "แ", "อ", "ิ", "ื", "ท", "ม", "ใ", "ฝ"],
]
# Shifted layer of the same grid: the entry at a given (row, column) is the
# character typed with Shift held on the key at that position. Both tables
# must therefore have identical row lengths and no placeholder entries.
TIS_820_2531_MOD_SHIFT = [
    ["%", "+", "๑", "๒", "๓", "๔", "ู", "฿", "๕", "๖", "๗", "๘", "๙"],
    ["๐", "\"", "ฎ", "ฑ", "ธ", "ํ", "๊", "ณ", "ฯ", "ญ", "ฐ", ",", "ฅ"],
    ["ฤ", "ฆ", "ฏ", "โ", "ฌ", "็", "๋", "ษ", "ศ", "ซ", "."],
    ["(", ")", "ฉ", "ฮ", "ฺ", "์", "?", "ฒ", "ฬ", "ฦ"],
]


def eng_to_thai(text: str) -> str:
"""
Expand Down Expand Up @@ -148,3 +161,63 @@ def thai_to_eng(text: str) -> str:
# output: 'Bank of Thailand'
"""
return text.translate(TH_EN_TRANSLATE_TABLE)


def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float:
    """
    Calculate euclidean distance between two Thai characters
    according to their location on a Thai keyboard layout.

    A modified TIS 820-2531 standard keyboard layout, which is developed
    from Kedmanee layout and is the most commonly used Thai keyboard layout,
    is used in distance calculation.

    The modified TIS 820-2531 is TIS 820-2531 with a few key extensions
    proposed in the TIS 820-2536 draft. See Figure 4, notice grey keys, in
    https://www.nectec.or.th/it-standards/keyboard_layout/thai-key.html

    Note that the latest TIS 820-2538 has slight changes in layout from
    TIS 820-2531. See Figure 2, notice the Thai Baht sign and ฅ-ฃ pair, in
    https://www.nectec.or.th/it-standards/std820/std820.html
    Since TIS 820-2538 is not widely adopted by keyboard manufacturers,
    this function uses the de facto standard modified TIS 820-2531 instead.

    The distance is measured between (row, column) indices of the key grid;
    the shifted and unshifted characters of one key share the same position.

    :param str c1: first character
    :param str c2: second character
    :param float shift_dist: value returned when the two characters are
        different but sit on the same key, i.e. they differ only by Shift
    :return: euclidean distance between two characters
    :rtype: float
    :raises ValueError: if a character is not found in the keyboard layout

    :Example:
    ::

        from pythainlp.util import thai_keyboard_dist
        thai_keyboard_dist("ด", "ะ")
        # output: 1.4142135623730951
        thai_keyboard_dist("ฟ", "ฤ")
        # output: 0.0
        thai_keyboard_dist("ฟ", "ห")
        # output: 1.0
        thai_keyboard_dist("ฟ", "ก")
        # output: 2.0
        thai_keyboard_dist("ฟ", "ฤ", 0.5)
        # output: 0.5
    """

    def get_char_coord(ch: str):
        # Return the (row, column) of ch, searching the unshifted layer
        # first and then the shifted layer.
        # An empty string is never a key, so it is rejected up front rather
        # than being allowed to match a placeholder entry in a layout row.
        if ch:
            for layout in (TIS_820_2531_MOD, TIS_820_2531_MOD_SHIFT):
                for r, row in enumerate(layout):
                    if ch in row:
                        return (r, row.index(ch))
        raise ValueError(ch + " not found in given keyboard layout")

    row1, col1 = get_char_coord(c1)
    row2, col2 = get_char_coord(c2)
    distance = ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5
    # Same key position but different characters: they differ only by the
    # Shift key, so report the caller-supplied shift distance instead of 0.
    if distance == 0 and c1 != c2:
        return shift_dist
    return distance
34 changes: 26 additions & 8 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
time_to_thaiword,
thai_to_eng,
thaiword_to_num,
thai_keyboard_dist,
)


Expand Down Expand Up @@ -157,6 +158,17 @@ def test_rank(self):
rank(["แมว", "คน", "แมว"], exclude_stopwords=True)
)

# ### pythainlp.util.keyboard

def test_thai_keyboard_dist(self):
    # Two characters on the same key: shift_dist (default 0.0) is returned.
    self.assertEqual(thai_keyboard_dist("ฟ", "ฤ"), 0.0)
    self.assertEqual(thai_keyboard_dist("ฟ", "ฤ", 0.5), 0.5)
    # The same character twice is always at distance zero; shift_dist
    # applies only when the characters differ.
    self.assertEqual(thai_keyboard_dist("ฟ", "ฟ", 0.5), 0.0)
    # Horizontal neighbours on the same row.
    self.assertEqual(thai_keyboard_dist("ฟ", "ห"), 1.0)
    self.assertEqual(thai_keyboard_dist("ฟ", "ก"), 2.0)
    # Diagonal neighbours: one row and one column apart.
    self.assertAlmostEqual(thai_keyboard_dist("ด", "ะ"), 2 ** 0.5)
    # Keys across different rows are farther apart than adjacent keys.
    self.assertNotEqual(
        thai_keyboard_dist("๘", "๙"), thai_keyboard_dist("๙", "๐")
    )
    # A character that is not on the Thai layout is rejected.
    with self.assertRaises(ValueError):
        thai_keyboard_dist("a", "ก")

# ### pythainlp.util.date

def test_date(self):
Expand Down Expand Up @@ -238,7 +250,8 @@ def test_time_to_thaiword(self):
time_to_thaiword(time(12, 3, 0)), "สิบสองนาฬิกาสามนาที"
)
self.assertEqual(
time_to_thaiword(time(12, 3, 1)), "สิบสองนาฬิกาสามนาทีหนึ่งวินาที",
time_to_thaiword(time(12, 3, 1)),
"สิบสองนาฬิกาสามนาทีหนึ่งวินาที",
)
self.assertEqual(
time_to_thaiword(datetime(2014, 5, 22, 12, 3, 0), precision="s"),
Expand Down Expand Up @@ -353,13 +366,16 @@ def test_thaiword_to_date(self):
now + timedelta(days=0), thaiword_to_date("วันนี้", now)
)
self.assertEqual(
now + timedelta(days=1), thaiword_to_date("พรุ่งนี้", now),
now + timedelta(days=1),
thaiword_to_date("พรุ่งนี้", now),
)
self.assertEqual(
now + timedelta(days=2), thaiword_to_date("มะรืนนี้", now),
now + timedelta(days=2),
thaiword_to_date("มะรืนนี้", now),
)
self.assertEqual(
now + timedelta(days=-1), thaiword_to_date("เมื่อวาน", now),
now + timedelta(days=-1),
thaiword_to_date("เมื่อวาน", now),
)
self.assertEqual(
now + timedelta(days=-2), thaiword_to_date("วานซืน", now)
Expand Down Expand Up @@ -538,14 +554,16 @@ def test_emoji_to_thai(self):
emoji_to_thai(
"จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀"
),
("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ "
"ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:")
(
"จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ "
"ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:"
),
)
self.assertEqual(
emoji_to_thai("หิวข้าวอยากกินอาหารญี่ปุ่น 🍣"),
"หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ:"
"หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ:",
)
self.assertEqual(
emoji_to_thai("🇹🇭 นี่คิือธงประเทศไทย"),
":ธง_ไทย: นี่คิือธงประเทศไทย"
":ธง_ไทย: นี่คิือธงประเทศไทย",
)