Skip to content

Commit b7ec3d7

Browse files
authored
Merge pull request #813 from PyThaiNLP/add-encoding
Add pythainlp.util.encoding
2 parents e870247 + f7f762e commit b7ec3d7

File tree

4 files changed

+36
-0
lines changed

4 files changed

+36
-0
lines changed

docs/api/util.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ Modules
4949
.. autofunction:: thaiword_to_num
5050
.. autofunction:: thaiword_to_time
5151
.. autofunction:: time_to_thaiword
52+
.. autofunction:: tis620_to_utf8
5253
.. autofunction:: tone_detector
5354
.. autofunction:: words_to_num
5455
.. autofunction:: nectec_to_ipa

pythainlp/util/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
"nectec_to_ipa",
6666
"ipa_to_rtgs",
6767
"remove_tone_ipa",
68+
"tis620_to_utf8",
6869
]
6970

7071
from pythainlp.util.collate import collate
@@ -121,3 +122,4 @@
121122
syllable_open_close_detector,
122123
)
123124
from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa
125+
from pythainlp.util.encoding import tis620_to_utf8

pythainlp/util/encoding.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (C) 2016-2023 PyThaiNLP Project
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
def tis620_to_utf8(text: str) -> str:
    """
    Repair Thai text whose TIS-620 bytes were wrongly decoded as cp1252.

    The input is re-encoded to bytes with cp1252 and those bytes are then
    decoded as TIS-620. The result is an ordinary Python ``str``.

    The conversion is best-effort in both directions: characters that
    cannot be encoded as cp1252, and bytes that have no TIS-620 mapping,
    are dropped instead of raising an error.

    :param str text: Garbled text (TIS-620 bytes read as cp1252)
    :return: The recovered Thai text
    :rtype: str

    :Example:
    ::

        from pythainlp.util import tis620_to_utf8

        tis620_to_utf8("¡ÃÐ·ÃÇ§ÍØµÊÒË¡ÃÃÁ")
        # output: 'กระทรวงอุตสาหกรรม'
    """
    # Undo the wrong decoding step: recover the original byte values.
    raw_bytes = text.encode("cp1252", "ignore")
    # Use the same lenient policy when decoding so that a stray byte with
    # no TIS-620 mapping (e.g. 0xFF) does not raise UnicodeDecodeError.
    return raw_bytes.decode("tis-620", "ignore")

tests/test_util.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
nectec_to_ipa,
5858
ipa_to_rtgs,
5959
remove_tone_ipa,
60+
tis620_to_utf8,
6061
)
6162

6263

@@ -840,3 +841,6 @@ def test_ipa_to_rtgs(self):
840841

841842
def test_remove_tone_ipa(self):
842843
self.assertEqual(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"), "laː.sa.maj")
844+
845+
def test_tis620_to_utf8(self):
    # The input is Thai text whose TIS-620 bytes were mis-read as cp1252;
    # it must contain 17 characters, one per character of the expected word.
    self.assertEqual(
        tis620_to_utf8("¡ÃÐ·ÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม"
    )

0 commit comments

Comments
 (0)