1- # -*- coding: utf-8 -*-
1+ # -*- coding: utf-8 -*-
22"""
33Check if it is Thai text
44"""
1010_TH_FIRST_CHAR_ASCII = 3584
1111_TH_LAST_CHAR_ASCII = 3711
1212
13- def isthaichar (ch : str ) -> bool :
14- """
15- This function checks if the input character is a Thai character.
1613
17- :param str ch: input character
14+ def isthaichar (ch : str ) -> bool :
15+ """Check if a character is a Thai character.
1816
19- :return: returns **True** if the input character is a Thai characttr,
20- otherwise returns **False**
17+ :param ch: input character
18+ :type ch: str
19+ :return: True if ch is a Thai character, otherwise False.
2120 :rtype: bool
2221
2322 :Example:
2423 ::
2524
2625 from pythainlp.util import isthaichar
2726
28- isthaichar("ก") # THAI CHARACTER KO KAI
27+ isthaichar("ก") # THAI CHARACTER KO KAI
2928 # output: True
3029
31- isthaichar("๐") # THAI DIGIT ZERO
32- # output: True
33-
34- isthaichar("๕") # THAI DIGIT FIVE
30+ isthaichar("๕") # THAI DIGIT FIVE
3531 # output: True
3632 """
3733 ch_val = ord (ch )
@@ -40,92 +36,66 @@ def isthaichar(ch: str) -> bool:
4036 return False
4137
4238
43- def isthai (word : str , ignore_chars : str = "." ) -> bool :
44- """
45- This function checks if all character in the input string
46- are Thai character.
47-
48- :param str word: input text
49- :param str ignore_chars: string characters to be ignored
50- (i.e. will be considered as Thai)
39+ def isthai (text : str , ignore_chars : str = "." ) -> bool :
40+ """Check if every character in a string is a Thai character.
5141
52- :return: returns **True** if the input text all contains Thai characters,
53- otherwise returns **False**
42+ :param text: input text
43+ :type text: str
44+ :param ignore_chars: characters to be ignored, defaults to "."
45+ :type ignore_chars: str, optional
46+ :return: True if every character in the input string is Thai,
47+ otherwise False.
5448 :rtype: bool
5549
5650 :Example:
5751
58- Check if all character is Thai character. By default,
59- it ignores only full stop (".")::
52+ from pythainlp.util import isthai
6053
61- from pythainlp.util import isthai
54+ isthai("กาลเวลา")
55+ # output: True
6256
63- isthai("กาลเวลา")
64- # output: True
65-
66- isthai("กาลเวลา.")
67- # output: True
57+ isthai("กาลเวลา.")
58+ # output: True
6859
69- Explicitly ignore digits, whitespace, and the following characters
70- ("-", ".", "$", ",")::
60+ isthai("กาล-เวลา")
61+ # output: False
7162
72- from pythainlp.util import isthai
73-
74- isthai("กาลเวลา, การเวลา-ก, 3.75$", ignore_chars="1234567890.-,$ ")
75- # output: True
63+ isthai("กาล-เวลา +66", ignore_chars="01234567890+-., ")
64+ # output: True
7665
7766 """
7867 if not ignore_chars :
7968 ignore_chars = ""
8069
81- for ch in word :
70+ for ch in text :
8271 if ch not in ignore_chars and not isthaichar (ch ):
8372 return False
8473 return True
8574
8675
8776def countthai (text : str , ignore_chars : str = _DEFAULT_IGNORE_CHARS ) -> float :
88- """
89- This function calculates percentage of Thai characters in the text
90- with an option to ignored some characters.
91-
92- :param str text: input text
93- :param str ignore_chars: string of characters to ignore from counting.
94- By default, the ignored characters are whitespace,
95- newline, digits, and punctuation.
96-
97- :return: percentage of Thai characters in the text
77+ """Find the proportion of Thai characters in a given text.
78+
79+ :param text: input text
80+ :type text: str
81+ :param ignore_chars: characters to be ignored, defaults to whitespace,\\
82+ digits, and punctuation.
83+ :type ignore_chars: str, optional
84+ :return: proportion of Thai characters in the text (percent)
9885 :rtype: float
9986
10087 :Example:
10188
102- Find the percentage of Thai characters in the textt with default
103- ignored characters set (whitespace, newline character,
104- punctuation and digits)::
105-
106- from pythainlp.util import countthai
107-
108- countthai("ดอนัลด์ จอห์น ทรัมป์ English: Donald John Trump")
109- # output: 45.0
110-
111- countthai("(English: Donald John Trump)")
112- # output: 0.0
89+ from pythainlp.util import countthai
11390
114- Find the percentage of Thai characters in the text while ignoring
115- only punctuation but not whitespace, newline character and digits::
91+ countthai("PyThaiNLP 2.3")
92+ # output: 0.0
11693
117- import string
94+ countthai("ใช้งาน PyThaiNLP 2.3")
95+ # output: 40.0
11896
119- string.punctuation
120- # output: !"#$%&'()*+,-./:;<=>?@[\\ ]^_`{|}~
121-
122- countthai("ดอนัลด์ จอห์น ทรัมป์ English: Donald John Trump", \\
123- ignore_chars=string.punctuation)
124- # output: 39.130434782608695
125-
126- countthai("ดอนัลด์ จอห์น ทรัมป์ (English: Donald John Trump)", \\
127- ignore_chars=string.punctuation)
128- # output: 0.0
97+ countthai("ใช้งาน PyThaiNLP 2.3", ignore_chars="")
98+ # output: 30.0
12999 """
130100 if not text or not isinstance (text , str ):
131101 return 0.0
@@ -150,27 +120,27 @@ def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
150120 return (num_thai / num_count ) * 100
151121
152122
153- def display_thai_char (char : str ) -> str :
154- """
155- This function adds a underscore (_) prefix to high-position vowels and tone
156- marks to ease readability
157-
158- :param str character:
123+ def display_thai_char (ch : str ) -> str :
124+ """Prefix an underscore (_) to a high-position vowel or a tone mark,
125+ to ease readability.
159126
160- :return: returns **True** if the input text all contains Thai characters,
161- otherwise returns **False**
162- :rtype: bool
127+ :param ch: input character
128+ :type ch: str
129+ :return: "_" + ch if ch is a high-position vowel or a tone mark, otherwise ch
130+ :rtype: str
163131
164132 :Example:
165133
166134 display_thai_char("้")
167135 # output: "_้"
168-
169136 """
170137
171- if char in thai_above_vowels or char in thai_tonemarks \
172- or char in '\u0e33 \u0e4c \u0e4d \u0e4e ' :
138+ if (
139+ ch in thai_above_vowels
140+ or ch in thai_tonemarks
141+ or ch in "\u0e33 \u0e4c \u0e4d \u0e4e "
142+ ):
173143 # last condition is Sra Aum, Thanthakhat, Nikhahit, Yamakkan
174- return "_" + char
144+ return "_" + ch
175145 else :
176- return char
146+ return ch
0 commit comments