Skip to content

Commit e611cd9

Browse files
committed
Merge branch 'PyThaiNLP:dev' into fix-dict
2 parents 0381aca + 68c3e81 commit e611cd9

File tree

2 files changed

+34
-39
lines changed

2 files changed

+34
-39
lines changed

pythainlp/transliterate/royin.py

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,11 @@ def _replace_consonants(word: str, consonants: str) -> str:
152152

153153
if len(consonants) == 1:
154154
return word.replace(consonants[0], _CONSONANTS[consonants[0]][0])
155+
len_cons = len(consonants)
155156

156157
i = 0
157-
len_cons = len(consonants)
158158
while i < len_cons:
159+
len_word = len(word)
159160
if i == 0:
160161
if consonants[0] == _HO_HIP:
161162
word = word.replace(consonants[0], "")
@@ -166,16 +167,18 @@ def _replace_consonants(word: str, consonants: str) -> str:
166167
consonants[0], _CONSONANTS[consonants[0]][0]
167168
)
168169
i += 1
169-
elif consonants[i] == _RO_RUA and i == len(word) and word[i - 1] == _RO_RUA:
170-
word = word.replace(
171-
consonants[i], _CONSONANTS[consonants[i]][1]
172-
)
173-
elif consonants[i] == _RO_RUA and i < len(word):
174-
if i + 1 == len(word) and word[i] == _RO_RUA:
170+
elif (
171+
i == len_word
172+
and consonants[i] == _RO_RUA
173+
and word[i - 1] == _RO_RUA
174+
):
175+
word = word.replace(consonants[i], _CONSONANTS[consonants[i]][1])
176+
elif i < len_word and consonants[i] == _RO_RUA:
177+
if i + 1 == len_word and word[i] == _RO_RUA:
175178
word = word.replace(
176179
consonants[i], _CONSONANTS[consonants[i]][1]
177180
)
178-
elif word[i] == _RO_RUA and i + 1 < len(word):
181+
elif i + 1 < len_word and word[i] == _RO_RUA:
179182
if word[i + 1] == _RO_RUA:
180183
word = list(word)
181184
del word[i + 1]
@@ -192,19 +195,12 @@ def _replace_consonants(word: str, consonants: str) -> str:
192195
i += 1
193196
else:
194197
word = word.replace(
195-
consonants[i],
196-
_CONSONANTS[consonants[i]][1]
198+
consonants[i], _CONSONANTS[consonants[i]][1]
197199
)
198200
i += 1
199-
elif word[i] == _RO_RUA:
200-
word = word.replace(
201-
consonants[i], _CONSONANTS[consonants[i]][1]
202-
)
203-
i += 1
204201
else:
205202
word = word.replace(
206-
consonants[i],
207-
_CONSONANTS[consonants[i]][1]
203+
consonants[i], _CONSONANTS[consonants[i]][1]
208204
)
209205
i += 1
210206
else:
@@ -216,10 +212,6 @@ def _replace_consonants(word: str, consonants: str) -> str:
216212

217213
# support function for romanize()
218214
def _romanize(word: str) -> str:
219-
"""
220-
:param str word: a Thai word, should have already been tokenized.
221-
:return: Spells out how the Thai word should be pronounced.
222-
"""
223215
word = _replace_vowels(_normalize(word))
224216
consonants = _RE_CONSONANT.findall(word)
225217

@@ -235,13 +227,14 @@ def _romanize(word: str) -> str:
235227

236228

237229
def romanize(text: str) -> str:
238-
"""
239-
Rendering Thai words in the Latin alphabet or "romanization",
240-
using the Royal Thai General System of Transcription (RTGS),
241-
which is the official system published by the Royal Institute of Thailand.
230+
"""Render Thai words in Latin alphabet, using RTGS
231+
232+
The Royal Thai General System of Transcription (RTGS)
233+
is the official system published by the Royal Institute of Thailand.
242234
243-
:param str text: Thai text to be romanized
244-
:return: A string of Thai words rendered in the Latin alphabet.
235+
:param text: Thai text to be romanized
236+
:type text: str
237+
:return: A string of Thai words rendered in the Latin alphabet
245238
:rtype: str
246239
"""
247240
words = word_tokenize(text)

pythainlp/util/collate.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,20 @@ def _thkey(word: str) -> str:
1919

2020
def collate(data: Iterable, reverse: bool = False) -> List[str]:
2121
"""
22-
This function sorts a list of strings according to Thai alphabets.
23-
24-
:param list[str] data: a list of words to be sorted
25-
:param bool reverse: If `reverse` is set to **True** the result will be
26-
sorted in descending order. Otherwise, the result will
27-
be sorted in ascending order.
28-
By default, the parameter `reverse` is set to
29-
**False**, sorting alphabettically in ascending order.
30-
31-
:return: a list of strings, sorted alphabetically, according to
32-
Thai alphabets
33-
:rtype: list[str]
22+
This function sorts strings (almost) according to Thai dictionary order.
23+
24+
Important note: this implementation ignores tone marks and symbols.
25+
26+
:param data: a list of words to be sorted
27+
:type data: Iterable
28+
:param reverse: If `reverse` is set to **True** the result will be
29+
sorted in descending order. Otherwise, the result
30+
will be sorted in ascending order, defaults to False
31+
:type reverse: bool, optional
32+
33+
:return: a list of strings, sorted alphabetically, (almost) according to
34+
Thai dictionary
35+
:rtype: List[str]
3436
3537
:Example:
3638
::

0 commit comments

Comments
 (0)