Skip to content

Commit 93b9006

Browse files
authored
Fix small typo + improve docstring
1 parent 59d53e6 commit 93b9006

File tree

1 file changed

+55
-85
lines changed

1 file changed

+55
-85
lines changed

pythainlp/util/thai.py

Lines changed: 55 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# -*- coding: utf-8 -*-
1+
# -*- coding: utf-8 -*-
22
"""
33
Check if it is Thai text
44
"""
@@ -10,28 +10,24 @@
1010
_TH_FIRST_CHAR_ASCII = 3584
1111
_TH_LAST_CHAR_ASCII = 3711
1212

13-
def isthaichar(ch: str) -> bool:
14-
"""
15-
This function checks if the input character is a Thai character.
1613

17-
:param str ch: input character
14+
def isthaichar(ch: str) -> bool:
15+
"""Check if a character is a Thai character.
1816
19-
:return: returns **True** if the input character is a Thai characttr,
20-
otherwise returns **False**
17+
:param ch: input character
18+
:type ch: str
19+
:return: True if ch is a Thai character, otherwise False.
2120
:rtype: bool
2221
2322
:Example:
2423
::
2524
2625
from pythainlp.util import isthaichar
2726
28-
isthaichar("ก") # THAI CHARACTER KO KAI
27+
isthaichar("ก") # THAI CHARACTER KO KAI
2928
# output: True
3029
31-
isthaichar("๐") # THAI DIGIT ZERO
32-
# output: True
33-
34-
isthaichar("๕") # THAI DIGIT FIVE
30+
isthaichar("๕") # THAI DIGIT FIVE
3531
# output: True
3632
"""
3733
ch_val = ord(ch)
@@ -40,92 +36,66 @@ def isthaichar(ch: str) -> bool:
4036
return False
4137

4238

43-
def isthai(word: str, ignore_chars: str = ".") -> bool:
44-
"""
45-
This function checks if all character in the input string
46-
are Thai character.
47-
48-
:param str word: input text
49-
:param str ignore_chars: string characters to be ignored
50-
(i.e. will be considered as Thai)
39+
def isthai(text: str, ignore_chars: str = ".") -> bool:
40+
"""Check if every character in a string is a Thai character.
5141
52-
:return: returns **True** if the input text all contains Thai characters,
53-
otherwise returns **False**
42+
:param text: input text
43+
:type text: str
44+
:param ignore_chars: characters to be ignored, defaults to "."
45+
:type ignore_chars: str, optional
46+
:return: True if every character in the input string is Thai,
47+
otherwise False.
5448
:rtype: bool
5549
5650
:Example:
5751
58-
Check if all character is Thai character. By default,
59-
it ignores only full stop (".")::
52+
from pythainlp.util import isthai
6053
61-
from pythainlp.util import isthai
54+
isthai("กาลเวลา")
55+
# output: True
6256
63-
isthai("กาลเวลา")
64-
# output: True
65-
66-
isthai("กาลเวลา.")
67-
# output: True
57+
isthai("กาลเวลา.")
58+
# output: True
6859
69-
Explicitly ignore digits, whitespace, and the following characters
70-
("-", ".", "$", ",")::
60+
isthai("กาล-เวลา")
61+
# output: False
7162
72-
from pythainlp.util import isthai
73-
74-
isthai("กาลเวลา, การเวลา-ก, 3.75$", ignore_chars="1234567890.-,$ ")
75-
# output: True
63+
isthai("กาล-เวลา +66", ignore_chars="01234567890+-., ")
64+
# output: True
7665
7766
"""
7867
if not ignore_chars:
7968
ignore_chars = ""
8069

81-
for ch in word:
70+
for ch in text:
8271
if ch not in ignore_chars and not isthaichar(ch):
8372
return False
8473
return True
8574

8675

8776
def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
88-
"""
89-
This function calculates percentage of Thai characters in the text
90-
with an option to ignored some characters.
91-
92-
:param str text: input text
93-
:param str ignore_chars: string of characters to ignore from counting.
94-
By default, the ignored characters are whitespace,
95-
newline, digits, and punctuation.
96-
97-
:return: percentage of Thai characters in the text
77+
"""Find proportion of Thai characters in a given text
78+
79+
:param text: input text
80+
:type text: str
81+
:param ignore_chars: characters to be ignored, defaults to whitespaces,\\
82+
digits, and punctuation.
83+
:type ignore_chars: str, optional
84+
:return: proportion of Thai characters in the text (percent)
9885
:rtype: float
9986
10087
:Example:
10188
102-
Find the percentage of Thai characters in the textt with default
103-
ignored characters set (whitespace, newline character,
104-
punctuation and digits)::
105-
106-
from pythainlp.util import countthai
107-
108-
countthai("ดอนัลด์ จอห์น ทรัมป์ English: Donald John Trump")
109-
# output: 45.0
110-
111-
countthai("(English: Donald John Trump)")
112-
# output: 0.0
89+
from pythainlp.util import countthai
11390
114-
Find the percentage of Thai characters in the text while ignoring
115-
only punctuation but not whitespace, newline character and digits::
91+
countthai("PyThaiNLP 2.3")
92+
# output: 0.0
11693
117-
import string
94+
countthai("ใช้งาน PyThaiNLP 2.3")
95+
# output: 40.0
11896
119-
string.punctuation
120-
# output: !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~
121-
122-
countthai("ดอนัลด์ จอห์น ทรัมป์ English: Donald John Trump", \\
123-
ignore_chars=string.punctuation)
124-
# output: 39.130434782608695
125-
126-
countthai("ดอนัลด์ จอห์น ทรัมป์ (English: Donald John Trump)", \\
127-
ignore_chars=string.punctuation)
128-
# output: 0.0
97+
countthai("ใช้งาน PyThaiNLP 2.3", ignore_chars="")
98+
# output: 30.0
12999
"""
130100
if not text or not isinstance(text, str):
131101
return 0.0
@@ -150,27 +120,27 @@ def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
150120
return (num_thai / num_count) * 100
151121

152122

153-
def display_thai_char(char: str) -> str:
154-
"""
155-
This function adds a underscore (_) prefix to high-position vowels and tone
156-
marks to ease readability
157-
158-
:param str character:
123+
def display_thai_char(ch: str) -> str:
124+
"""Prefix an underscore (_) to a high-position vowel or a tone mark,
125+
to ease readability.
159126
160-
:return: returns **True** if the input text all contains Thai characters,
161-
otherwise returns **False**
162-
:rtype: bool
127+
:param ch: input character
128+
:type ch: str
129+
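:return: "_" + ch if ch is a high-position vowel, a tone mark, or one of Sara Am, Thanthakhat, Nikhahit, Yamakkan; otherwise ch unchanged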
130+
:rtype: str
163131
164132
:Example:
165133
166134
display_thai_char("้")
167135
# output: "_้"
168-
169136
"""
170137

171-
if char in thai_above_vowels or char in thai_tonemarks \
172-
or char in '\u0e33\u0e4c\u0e4d\u0e4e':
138+
if (
139+
ch in thai_above_vowels
140+
or ch in thai_tonemarks
141+
or ch in "\u0e33\u0e4c\u0e4d\u0e4e"
142+
):
173143
# last condition is Sra Aum, Thanthakhat, Nikhahit, Yamakkan
174-
return "_" + char
144+
return "_" + ch
175145
else:
176-
return char
146+
return ch

0 commit comments

Comments
 (0)