Skip to content

Commit 2a853fa

Browse files
authored
Merge pull request #466 from PyThaiNLP/Add-provinces
provinces(): Fix type hinting, clean code, remove thailand_provinces_th.txt
2 parents 313aa92 + 8b4beac commit 2a853fa

File tree

4 files changed

+72
-126
lines changed

4 files changed

+72
-126
lines changed

pythainlp/corpus/common.py

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,14 @@
1515
]
1616

1717
from pythainlp.corpus import get_corpus
18+
from typing import Union
1819

1920
_THAI_COUNTRIES = set()
2021
_THAI_COUNTRIES_FILENAME = "countries_th.txt"
2122

2223
_THAI_THAILAND_PROVINCES = set()
23-
_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.txt"
24-
25-
_THAI_THAILAND_PROVINCES_DETAILS = dict()
26-
_THAI_THAILAND_PROVINCES_LIST_ALL_FILENAME = "thailand_provinces_th.csv"
24+
_THAI_THAILAND_PROVINCES_DETAILS = list()
25+
_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.csv"
2726

2827
_THAI_SYLLABLES = set()
2928
_THAI_SYLLABLES_FILENAME = "syllables_th.txt"
@@ -60,38 +59,43 @@ def countries() -> frozenset:
6059
return _THAI_COUNTRIES
6160

6261

63-
def provinces(details: bool = False) -> frozenset:
62+
def provinces(details: bool = False) -> Union[frozenset, list]:
6463
"""
6564
Return a frozenset of Thailand province names in Thai such as "กระบี่",
6665
"กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
6766
\n(See: `dev/pythainlp/corpus/thailand_provinces_th.csv\
6867
<https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/thailand_provinces_th.csv>`_)
6968
70-
:param bool details: a details of provinces
69+
:param bool details: return details of provinces or not
7170
72-
:return: :class:`frozenset` containing province names of Thailand (if details is False) or list \
73-
dict of Thailand province names in Thai such as\
74-
[{'provinces_th': 'นนทบุรี', 'abridgement': 'นบ', 'provinces_en': 'Nonthaburi', 'HS': 'NBI'}].
71+
:return: :class:`frozenset` containing province names of Thailand \
72+
(if details is False) or :class:`list` containing :class:`dict` of \
73+
province names and details such as \
74+
[{'name_th': 'นนทบุรี', 'abbr_th': 'นบ', 'name_en': 'Nonthaburi', \
75+
'abbr_en': 'NBI'}].
7576
:rtype: :class:`frozenset` or :class:`list`
7677
"""
7778
global _THAI_THAILAND_PROVINCES, _THAI_THAILAND_PROVINCES_DETAILS
78-
if not _THAI_THAILAND_PROVINCES_DETAILS and not _THAI_THAILAND_PROVINCES:
79-
_THAI_THAILAND_PROVINCES = list()
80-
_TEMP = list(get_corpus(
81-
_THAI_THAILAND_PROVINCES_LIST_ALL_FILENAME
82-
))
83-
_THAI_THAILAND_PROVINCES_DETAILS = list()
84-
for i in _TEMP:
85-
_data = i.split(",")
86-
_dict_data = dict()
87-
_dict_data["provinces_th"] = _data[0]
88-
_THAI_THAILAND_PROVINCES.append(_data[0])
89-
_dict_data["abridgement"] = _data[1]
90-
_dict_data["provinces_en"] = _data[2]
91-
_dict_data["HS"] = _data[3]
92-
_THAI_THAILAND_PROVINCES_DETAILS.append(_dict_data)
93-
94-
_THAI_THAILAND_PROVINCES = frozenset(_THAI_THAILAND_PROVINCES)
79+
80+
if not _THAI_THAILAND_PROVINCES or not _THAI_THAILAND_PROVINCES_DETAILS:
81+
provs = set()
82+
prov_details = list()
83+
84+
for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True):
85+
p = line.split(",")
86+
87+
prov = dict()
88+
prov["name_th"] = p[0]
89+
prov["abbr_th"] = p[1]
90+
prov["name_en"] = p[2]
91+
prov["abbr_en"] = p[3]
92+
93+
provs.add(prov["name_th"])
94+
prov_details.append(prov)
95+
96+
_THAI_THAILAND_PROVINCES = frozenset(provs)
97+
_THAI_THAILAND_PROVINCES_DETAILS = prov_details
98+
9599
if details:
96100
return _THAI_THAILAND_PROVINCES_DETAILS
97101

pythainlp/corpus/core.py

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,26 @@ def get_corpus_db_detail(name: str) -> dict:
5151
return dict()
5252

5353

54-
def get_corpus(filename: str) -> frozenset:
54+
def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
5555
"""
56-
Read corpus data from file and return a frozenset.
56+
Read corpus data from file and return a frozenset or a list.
57+
58+
Each line in the file will be a member of the set or the list.
59+
60+
By default, a frozenset will be returned, with whitespace stripped, and
61+
empty values and duplicates removed.
62+
63+
If as_is is True, a list will be returned, with no modifications
64+
in member values and their order.
5765
5866
(Please see the filename from
5967
`this file
6068
<https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_)
6169
6270
:param str filename: filename of the corpus to be read
6371
64-
:return: :mod:`frozenset` consist of lines in the file
65-
:rtype: :mod:`frozenset`
72+
:return: :class:`frozenset` or :class:`list` consisting of lines in the file
73+
:rtype: :class:`frozenset` or :class:`list`
6674
6775
:Example:
6876
::
@@ -85,7 +93,11 @@ def get_corpus(filename: str) -> frozenset:
8593
with open(path, "r", encoding="utf-8-sig") as fh:
8694
lines = fh.read().splitlines()
8795

88-
return frozenset(lines)
96+
if as_is:
97+
return lines
98+
99+
lines = [line.strip() for line in lines]
100+
return frozenset(filter(None, lines))
89101

90102

91103
def _update_all():
@@ -96,7 +108,9 @@ def _update_all():
96108
for item in item_all:
97109
name = item["name"]
98110
if "file_name" in item.keys():
99-
local_db.update({"filename": item["file_name"]}, query.name == name)
111+
local_db.update(
112+
{"filename": item["file_name"]}, query.name == name
113+
)
100114
elif "file" in item.keys():
101115
local_db.update({"filename": item["file"]}, query.name == name)
102116
local_db.close()
@@ -139,9 +153,15 @@ def get_corpus_path(name: str) -> Union[str, None]:
139153
"""
140154
# check if the corpus is in local catalog, download if not
141155
corpus_db_detail = get_corpus_db_detail(name)
142-
if corpus_db_detail.get("file_name") is not None and corpus_db_detail.get("filename") is None:
156+
if (
157+
corpus_db_detail.get("file_name") is not None
158+
and corpus_db_detail.get("filename") is None
159+
):
143160
_update_all()
144-
elif corpus_db_detail.get("file") is not None and corpus_db_detail.get("filename") is None:
161+
elif (
162+
corpus_db_detail.get("file") is not None
163+
and corpus_db_detail.get("filename") is None
164+
):
145165
_update_all()
146166

147167
if not corpus_db_detail or not corpus_db_detail.get("filename"):
@@ -208,7 +228,9 @@ def _check_hash(dst: str, md5: str) -> None:
208228
raise Exception("Hash does not match expected.")
209229

210230

211-
def download(name: str, force: bool = False, url: str = None, version: str = None) -> bool:
231+
def download(
232+
name: str, force: bool = False, url: str = None, version: str = None
233+
) -> bool:
212234
"""
213235
Download corpus.
214236
@@ -256,34 +278,28 @@ def download(name: str, force: bool = False, url: str = None, version: str = Non
256278
corpus = corpus_db[name.lower()]
257279
print("Corpus:", name)
258280
if version is None:
259-
version = corpus['latest_version']
281+
version = corpus["latest_version"]
260282
corpus_versions = corpus["versions"][version]
261283
file_name = corpus_versions["filename"]
262-
found = local_db.search((query.name == name) & (query.version == version))
284+
found = local_db.search(
285+
(query.name == name) & (query.version == version)
286+
)
263287

264288
# If not found in local, download
265289
if force or not found:
266290
print(f"- Downloading: {name} {version}")
267291
_download(
268-
corpus_versions["download_url"],
269-
file_name,
292+
corpus_versions["download_url"], file_name,
270293
)
271294
_check_hash(
272-
file_name,
273-
corpus_versions["md5"],
295+
file_name, corpus_versions["md5"],
274296
)
275297

276298
if found:
277-
local_db.update(
278-
{"version": version}, query.name == name
279-
)
299+
local_db.update({"version": version}, query.name == name)
280300
else:
281301
local_db.insert(
282-
{
283-
"name": name,
284-
"version": version,
285-
"filename": file_name,
286-
}
302+
{"name": name, "version": version, "filename": file_name}
287303
)
288304
else:
289305
if local_db.search(

pythainlp/corpus/thailand_provinces_th.txt

Lines changed: 0 additions & 77 deletions
This file was deleted.

tests/test_corpus.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ def test_corpus(self):
3535
self.assertIsInstance(countries(), frozenset)
3636
self.assertIsInstance(provinces(), frozenset)
3737
self.assertIsInstance(provinces(details=True), list)
38+
self.assertEqual(
39+
len(provinces(details=False)), len(provinces(details=True))
40+
)
3841
self.assertIsInstance(thai_female_names(), frozenset)
3942
self.assertIsInstance(thai_male_names(), frozenset)
4043

0 commit comments

Comments
 (0)