|
1 | 1 | # -*- coding: utf-8 -*- |
2 | 2 | """ |
3 | | -Correct text in one language that is incorrectly-typed |
4 | | -with a keyboard layout in another language. |
| 3 | +Functions related to keyboard layout. |
5 | 4 | """ |
| 5 | + |
6 | 6 | EN_TH_KEYB_PAIRS = { |
7 | 7 | "Z": "(", |
8 | 8 | "z": "ผ", |
|
# Translation tables built once at import time from the key-pair
# dictionaries, in the form expected by ``str.translate``.
EN_TH_TRANSLATE_TABLE = str.maketrans(EN_TH_KEYB_PAIRS)
TH_EN_TRANSLATE_TABLE = str.maketrans(TH_EN_KEYB_PAIRS)
105 | 105 |
|
# Keyboard layout tables used by thai_keyboard_dist().
# Each table is a list of keyboard rows (top row first) and each row is a
# list of the characters on that row, left to right, so a character's
# position is (row index, column index).

# Characters of the layout typed without Shift.
# NOTE(review): the first row has 14 cells (including an empty-string
# placeholder at index 3) while the first row of TIS_820_2531_MOD_SHIFT has
# 13, so from index 4 onward the two layers do not line up column-for-column
# in that row -- confirm this is intended.
TIS_820_2531_MOD = [
    ["-", "ๅ", "/", "", "_", "ภ", "ถ", "ุ", "ึ", "ค", "ต", "จ", "ข", "ช"],
    ["ๆ", "ไ", "ำ", "พ", "ะ", "ั", "ี", "ร", "น", "ย", "บ", "ล", "ฃ"],
    ["ฟ", "ห", "ก", "ด", "เ", "้", "่", "า", "ส", "ว", "ง"],
    ["ผ", "ป", "แ", "อ", "ิ", "ื", "ท", "ม", "ใ", "ฝ"],
]
# Characters of the same layout on the shifted layer (presumably typed with
# Shift held); a character here at the same (row, column) as one in
# TIS_820_2531_MOD is treated as sharing its key.
TIS_820_2531_MOD_SHIFT = [
    ["%", "+", "๑", "๒", "๓", "๔", "ู", "฿", "๕", "๖", "๗", "๘", "๙"],
    ["๐", "\"", "ฎ", "ฑ", "ธ", "ํ", "๊", "ณ", "ฯ", "ญ", "ฐ", ",", "ฅ"],
    ["ฤ", "ฆ", "ฏ", "โ", "ฌ", "็", "๋", "ษ", "ศ", "ซ", "."],
    ["(", ")", "ฉ", "ฮ", "ฺ", "์", "?", "ฒ", "ฬ", "ฦ"],
]
| 118 | + |
106 | 119 |
|
107 | 120 | def eng_to_thai(text: str) -> str: |
108 | 121 | """ |
@@ -148,3 +161,63 @@ def thai_to_eng(text: str) -> str: |
148 | 161 | # output: 'Bank of Thailand' |
149 | 162 | """ |
150 | 163 | return text.translate(TH_EN_TRANSLATE_TABLE) |
| 164 | + |
| 165 | + |
def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float:
    """
    Calculate the Euclidean distance between two Thai characters
    according to their location on a Thai keyboard layout.

    A modified TIS 820-2531 standard keyboard layout, which is developed
    from Kedmanee layout and is the most commonly used Thai keyboard layout,
    is used in distance calculation.

    The modified TIS 820-2531 is TIS 820-2531 with a few key extensions
    proposed in TIS 820-2536 draft. See Figure 4, notice grey keys, in
    https://www.nectec.or.th/it-standards/keyboard_layout/thai-key.html

    Note that the latest TIS 820-2538 has slight changes in layout from
    TIS 820-2531. See Figure 2, notice the Thai Baht sign and ฅ-ฃ pair, in
    https://www.nectec.or.th/it-standards/std820/std820.html
    Since TIS 820-2538 is not widely adopted by keyboard manufacturers,
    this function uses the de facto standard modified TIS 820-2531 instead.

    Each character is looked up first in the unshifted layout and then in
    the shifted layout; its coordinate is the (row, column) index at which
    it is found. The layer itself is not part of the coordinate, so two
    different characters found at the same (row, column) in different
    layers have a geometric distance of zero; in that case ``shift_dist``
    is returned instead.

    :param str c1: first character
    :param str c2: second character
    :param float shift_dist: value returned when the two characters are
        different but are located at the same (row, column) position
    :return: euclidean distance between two characters
    :rtype: float
    :raises ValueError: if a character is empty or is not found in the
        keyboard layout

    :Example:

        from pythainlp.util import thai_keyboard_dist
        thai_keyboard_dist("ด", "ะ")
        # output: 1.4142135623730951
        thai_keyboard_dist("ฟ", "ฤ")
        # output: 0.0
        thai_keyboard_dist("ฟ", "ห")
        # output: 1.0
        thai_keyboard_dist("ฟ", "ก")
        # output: 2.0
        thai_keyboard_dist("ฟ", "ฤ", 0.5)
        # output: 0.5
    """
    # Search order matters only for characters present in both layers:
    # the unshifted layout wins.
    layouts = (TIS_820_2531_MOD, TIS_820_2531_MOD_SHIFT)

    def get_char_coord(ch: str):
        """Return the (row, column) of ``ch`` in the first layout holding it."""
        # The unshifted layout contains "" as a placeholder cell, so an
        # empty argument must be rejected up front or it would "match"
        # that cell and produce a meaningless distance.
        if ch:
            for layout in layouts:
                for row_idx, row in enumerate(layout):
                    if ch in row:
                        return (row_idx, row.index(ch))
        raise ValueError(ch + " not found in given keyboard layout")

    row1, col1 = get_char_coord(c1)
    row2, col2 = get_char_coord(c2)
    distance = ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** (0.5)
    # Same position but different characters: they differ only by layer.
    if distance == 0 and c1 != c2:
        return shift_dist
    return distance
0 commit comments