|
1 | 1 | # -*- coding: utf-8 -*- |
2 | 2 | """ |
3 | | -Functions related to keyboard layout. |
| 3 | +Correct text in one language that is incorrectly-typed |
| 4 | +with a keyboard layout in another language. |
4 | 5 | """ |
5 | | - |
6 | 6 | EN_TH_KEYB_PAIRS = { |
7 | 7 | "Z": "(", |
8 | 8 | "z": "ผ", |
|
# Translation tables for str.translate(), built once at import time from the
# key-pair dictionaries. EN_TH_KEYB_PAIRS maps a character typed on the
# English layout to the character on the same key of the Thai layout.
# TH_EN_KEYB_PAIRS is defined elsewhere in this module (not visible here);
# presumably it is the reverse mapping -- verify against its definition.
EN_TH_TRANSLATE_TABLE = str.maketrans(EN_TH_KEYB_PAIRS)
TH_EN_TRANSLATE_TABLE = str.maketrans(TH_EN_KEYB_PAIRS)
105 | 105 |
|
# Key grid used by thai_keyboard_dist() for the modified TIS 820-2531 Thai
# layout, without Shift. Each inner list is one keyboard row and each element
# is the character produced by one key; a character's coordinate is
# (row index, column index) in this grid.
# Rows are presumably ordered from the top of the keyboard to the bottom and
# keys from left to right -- TODO confirm against the layout figure.
# NOTE(review): the "" entry in the first row looks like a placeholder for a
# key that produces no character in this state -- confirm.
# NOTE(review): the first row has 14 entries here but only 13 in
# TIS_820_2531_MOD_SHIFT, so column indices of the first row do not line up
# one-to-one between the two grids -- confirm this is intended.
TIS_820_2531_MOD = [
    ["-", "ๅ", "/", "", "_", "ภ", "ถ", "ุ", "ึ", "ค", "ต", "จ", "ข", "ช"],
    ["ๆ", "ไ", "ำ", "พ", "ะ", "ั", "ี", "ร", "น", "ย", "บ", "ล", "ฃ"],
    ["ฟ", "ห", "ก", "ด", "เ", "้", "่", "า", "ส", "ว", "ง"],
    ["ผ", "ป", "แ", "อ", "ิ", "ื", "ท", "ม", "ใ", "ฝ"],
]
# The same grid with Shift held. A character here that has the same
# (row, column) as a character in TIS_820_2531_MOD is treated by
# thai_keyboard_dist() as sharing a key with it (for example "ฟ" and "ฤ").
TIS_820_2531_MOD_SHIFT = [
    ["%", "+", "๑", "๒", "๓", "๔", "ู", "฿", "๕", "๖", "๗", "๘", "๙"],
    ["๐", "\"", "ฎ", "ฑ", "ธ", "ํ", "๊", "ณ", "ฯ", "ญ", "ฐ", ",", "ฅ"],
    ["ฤ", "ฆ", "ฏ", "โ", "ฌ", "็", "๋", "ษ", "ศ", "ซ", "."],
    ["(", ")", "ฉ", "ฮ", "ฺ", "์", "?", "ฒ", "ฬ", "ฦ"],
]
118 | | - |
119 | 106 |
|
120 | 107 | def eng_to_thai(text: str) -> str: |
121 | 108 | """ |
@@ -161,63 +148,3 @@ def thai_to_eng(text: str) -> str: |
161 | 148 | # output: 'Bank of Thailand' |
162 | 149 | """ |
163 | 150 | return text.translate(TH_EN_TRANSLATE_TABLE) |
164 | | - |
165 | | - |
def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float:
    """
    Calculate the Euclidean distance between two Thai characters
    according to their location on a Thai keyboard layout.

    A modified TIS 820-2531 standard keyboard layout, which is developed
    from the Kedmanee layout and is the most commonly used Thai keyboard
    layout, is used in the distance calculation.

    The modified TIS 820-2531 is TIS 820-2531 with a few key extensions
    proposed in the TIS 820-2536 draft. See Figure 4, notice grey keys, in
    https://www.nectec.or.th/it-standards/keyboard_layout/thai-key.html

    Note that the latest TIS 820-2538 has slight changes in layout from
    TIS 820-2531. See Figure 2, notice the Thai Baht sign and ฅ-ฃ pair, in
    https://www.nectec.or.th/it-standards/std820/std820.html
    Since TIS 820-2538 is not widely adopted by keyboard manufacturers,
    this function uses the de facto standard modified TIS 820-2531 instead.

    :param str c1: first character
    :param str c2: second character
    :param float shift_dist: value to return when the two characters are
        different but sit on the same key (they differ only by Shift)
    :return: Euclidean distance between the keys of the two characters,
        measured in (row, column) grid units
    :rtype: float
    :raises ValueError: if either argument is not a character found in
        the keyboard layout (this includes the empty string)

    :Example:

        from pythainlp.util import thai_keyboard_dist
        thai_keyboard_dist("ด", "ะ")
        # output: 1.4142135623730951
        thai_keyboard_dist("ฟ", "ฤ")
        # output: 0.0
        thai_keyboard_dist("ฟ", "ห")
        # output: 1.0
        thai_keyboard_dist("ฟ", "ก")
        # output: 2.0
        thai_keyboard_dist("ฟ", "ฤ", 0.5)
        # output: 0.5
    """

    def get_char_coord(ch: str):
        """Return the (row, column) of ``ch``, trying the unshifted grid first."""
        # The layout grid contains "" as a placeholder for an unused key, so
        # an empty argument must be rejected up front instead of being
        # resolved to that placeholder's position.
        if ch:
            for layout in (TIS_820_2531_MOD, TIS_820_2531_MOD_SHIFT):
                for row_idx, row in enumerate(layout):
                    if ch in row:
                        return (row_idx, row.index(ch))
        raise ValueError(ch + " not found in given keyboard layout")

    row1, col1 = get_char_coord(c1)
    row2, col2 = get_char_coord(c2)
    distance = ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** (0.5)
    # Zero distance between two different characters means they share one
    # key and differ only by the Shift state.
    if distance == 0 and c1 != c2:
        return shift_dist
    return distance
0 commit comments