File tree 4 files changed +36
-0
lines changed
4 files changed +36
-0
lines changed Original file line number Diff line number Diff line change @@ -49,6 +49,7 @@ Modules
49
49
.. autofunction :: thaiword_to_num
50
50
.. autofunction :: thaiword_to_time
51
51
.. autofunction :: time_to_thaiword
52
+ .. autofunction :: tis620_to_utf8
52
53
.. autofunction :: tone_detector
53
54
.. autofunction :: words_to_num
54
55
.. autofunction :: nectec_to_ipa
Original file line number Diff line number Diff line change 65
65
"nectec_to_ipa" ,
66
66
"ipa_to_rtgs" ,
67
67
"remove_tone_ipa" ,
68
+ "tis620_to_utf8" ,
68
69
]
69
70
70
71
from pythainlp .util .collate import collate
121
122
syllable_open_close_detector ,
122
123
)
123
124
from pythainlp .util .phoneme import nectec_to_ipa , ipa_to_rtgs , remove_tone_ipa
125
+ from pythainlp .util .encoding import tis620_to_utf8
Original file line number Diff line number Diff line change
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (C) 2016-2023 PyThaiNLP Project
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
def tis620_to_utf8(text: str) -> str:
    """
    Convert TIS-620 text that was mis-decoded as Windows-1252 back to Thai.

    The input is a ``str`` whose characters are the Windows-1252 (cp1252)
    rendering of bytes that were really TIS-620. Each character is mapped
    back to its cp1252 byte and the resulting bytes are decoded as TIS-620.

    The conversion is best-effort in both directions: characters that
    have no cp1252 byte, and bytes that TIS-620 leaves undefined (for
    example 0xDB), are dropped instead of raising an exception.

    :param str text: Text that uses TIS-620 encoding (read as cp1252)
    :return: Text that uses UTF-8 encoding
    :rtype: str

    :Example:
    ::

        from pythainlp.util import tis620_to_utf8

        tis620_to_utf8("¡ÃÐ·ÃÇ§ÍØµÊÒË¡ÃÃÁ")
        # output: 'กระทรวงอุตสาหกรรม'
    """
    # Recover the original byte values from the mis-decoded characters.
    raw_bytes = text.encode("cp1252", "ignore")
    # Use the same lenient policy on decode so one stray byte cannot make
    # the whole conversion fail after encode already skipped bad input.
    return raw_bytes.decode("tis-620", "ignore")
Original file line number Diff line number Diff line change 57
57
nectec_to_ipa ,
58
58
ipa_to_rtgs ,
59
59
remove_tone_ipa ,
60
+ tis620_to_utf8 ,
60
61
)
61
62
62
63
@@ -840,3 +841,6 @@ def test_ipa_to_rtgs(self):
840
841
841
842
def test_remove_tone_ipa(self):
    """Tone letters are stripped from an IPA transcription."""
    without_tones = remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦")
    self.assertEqual(without_tones, "laː.sa.maj")
844
+
845
def test_tis620_to_utf8(self):
    """TIS-620 text mis-decoded as Windows-1252 is restored to Thai."""
    # The input is the Thai expected value encoded as TIS-620 and then
    # read as cp1252; bytes 0xD0 0xB7 must appear as the two characters
    # "Ð" and "·", not as a single Cyrillic letter.
    self.assertEqual(
        tis620_to_utf8("¡ÃÐ·ÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม"
    )
You can’t perform that action at this time.
0 commit comments