1+ # -*- coding: utf-8 -*-
2+ """
3+ MetaSound - Thai soundex system
4+
5+ References:
6+ Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical
7+ Ontology for Analysing Names Given in Accordance with Thai Astrology.
8+ https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
9+ """
10+
# All Thai consonants plus the thanthakhat mark; every other character
# (vowels, tone marks, digits, non-Thai text) is ignored by metasound().
_CONS_THANTHAKHAT = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ์"
_THANTHAKHAT = "\u0e4c"  # the mark that silences the consonant before it
_C1 = "กขฃคฆฅ"  # sound K -> coded letter 1
_C2 = "จฉชฌซฐทฒดฎตสศษ"  # D -> 2
_C3 = "ฟฝพผภบป"  # B -> 3
_C4 = "ง"  # NG -> 4
_C5 = "ลฬรนณฦญ"  # N -> 5
_C6 = "ม"  # M -> 6
_C7 = "ย"  # Y -> 7
_C8 = "ว"  # W -> 8

# Consonant -> digit lookup, built once from the sound groups above.
# The groups are disjoint, so each consonant maps to exactly one digit.
_SOUND_CODES = {
    consonant: str(code)
    for code, group in enumerate(
        (_C1, _C2, _C3, _C4, _C5, _C6, _C7, _C8), start=1
    )
    for consonant in group
}


def metasound(text: str, length: int = 4) -> str:
    """
    Thai MetaSound

    Only consonants take part in the encoding. A consonant silenced by a
    thanthakhat (a "karan") is dropped together with the mark, so it does
    not occupy a position in the result. The first remaining consonant is
    kept as-is; each following consonant is replaced by the digit of its
    sound group, or "0" when it belongs to no group. The result is
    truncated or right-padded with "0" to exactly ``length`` characters.

    :param str text: Thai text
    :param int length: preferred length of the MetaSound (default is 4)
    :return: MetaSound for the text; "0" * length when the text has no
        consonant, and an empty string when ``length`` is not positive
    :rtype: str

    **Example**::

        from pythainlp.metasound import metasound

        metasound("ลัก")  # 'ล100'
        metasound("รัก")  # 'ร100'
        metasound("รักษ์")  # 'ร100'
        metasound("บูรณการ", 5)  # 'บ5515'
    """
    # A negative length would otherwise slice from the end and skip padding.
    if length <= 0:
        return ""

    # keep only consonants and thanthakhat
    kept = [ch for ch in text if ch in _CONS_THANTHAKHAT]

    # remove karan (thanthakhat and the consonant right before it); the
    # characters are really removed so that they cannot consume a slot
    sounded = []
    for index, ch in enumerate(kept):
        if ch == _THANTHAKHAT:
            continue
        if index + 1 < len(kept) and kept[index + 1] == _THANTHAKHAT:
            continue
        sounded.append(ch)

    # retain first consonant, encode the rest
    code = sounded[:1]
    code.extend(_SOUND_CODES.get(ch, "0") for ch in sounded[1:length])

    # pad short results up to the requested length
    code.extend("0" * (length - len(code)))

    return "".join(code)
80+
81+
if __name__ == "__main__":
    # Demo: each entry holds the positional arguments for one metasound()
    # call; the trailing comment is the value that call prints.
    _demo_calls = [
        ("บูรณะ",),  # บ550 (an example from the original paper [Figure 4])
        ("บูรณการ", 5),  # บ5515
        ("ลักษณะ",),  # ล125
        ("ลัก",),  # ล100
        ("รัก",),  # ร100
        ("รักษ์",),  # ร100
        ("",),  # 0000
        ("คน",),  # ค500
        ("คนA",),  # ค500 (non-Thai characters are ignored)
        ("ดา",),  # ด000
        ("ปา",),  # ป000
        ("งา",),  # ง000
        ("ลา",),  # ล000
        ("มา",),  # ม000
        ("วา",),  # ว000
    ]
    for _call_args in _demo_calls:
        print(metasound(*_call_args))
0 commit comments