Skip to content

Commit c638e95

Browse files
authored
Merge branch 'dev' into add-textaugment
2 parents 7e097ef + ad20a1e commit c638e95

File tree

22 files changed

+821
-195
lines changed

22 files changed

+821
-195
lines changed

.github/workflows/pypi-publish.yml

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,8 @@ jobs:
2222
run: |
2323
python -m pip install --upgrade pip
2424
pip install setuptools wheel twine
25-
- name: Build and publish
26-
env:
27-
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28-
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29-
run: |
30-
python setup.py sdist bdist_wheel
31-
twine upload dist/*
25+
- name: Publish a Python distribution to PyPI
26+
uses: pypa/gh-action-pypi-publish@release/v1
27+
with:
28+
user: __token__
29+
password: ${{ secrets.PYPI_API_TOKEN }}

docs/api/translate.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,9 @@ Modules
1212
:members: translate
1313
.. autoclass:: ThEnTranslator
1414
:members: translate
15+
.. autoclass:: ThZhTranslator
16+
:members: translate
17+
.. autoclass:: ZhThTranslator
18+
:members: translate
19+
.. autoclass:: Translate
20+
:members:

docs/api/word_vector.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ Modules
1616
.. autofunction:: most_similar_cosmul
1717
.. autofunction:: sentence_vectorizer
1818
.. autofunction:: similarity
19+
.. autoclass:: WordVector
20+
:members:
1921

2022
References
2123
----------

pythainlp/corpus/default_db.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"thainer": {
33
"name": "thainer",
4-
"latest_version": "1.5",
4+
"latest_version": "1.5.1",
55
"description": "Thai Named Entity Recognition",
66
"long_description": "Thai Named Entity Recognition",
77
"url": "https://github.com/wannaphong/thai-ner/",
@@ -11,8 +11,8 @@
1111
"author_email": "wannaphong@kkumail.com",
1212
"license": "cc-by-4.0",
1313
"versions": {
14-
"1.5": {
15-
"filename": "thainer_crf_1_5.model",
14+
"1.5.1": {
15+
"filename": "thainer_crf_1_5_1.model",
1616
"download_url": "https://github.com/wannaphong/thai-ner/releases/download/1.5/thai-ner-1-5-newmm-lst20.crfsuite",
1717
"md5": "-",
1818
"pythainlp_version": ">=2.2.7"
-1.56 MB
Binary file not shown.
1.57 MB
Binary file not shown.

pythainlp/tokenize/core.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,15 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
3030
3131
:Example:
3232
33+
Clause tokenizer::
34+
3335
from pythainlp.tokenize import clause_tokenize
3436
3537
clause_tokenize(["ฉัน","นอน","และ","คุณ","เล่น","มือถือ","ส่วน","น้อง","เขียน","โปรแกรม"])
36-
[['ฉัน', 'นอน'],
37-
['และ', 'คุณ', 'เล่น', 'มือถือ'],
38-
['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
38+
# [['ฉัน', 'นอน'],
39+
# ['และ', 'คุณ', 'เล่น', 'มือถือ'],
40+
# ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
41+
3942
"""
4043
if not doc or not isinstance(doc, str):
4144
return []
@@ -81,6 +84,8 @@ def word_tokenize(
8184
* *nercut* - Dictionary-based maximal matching word segmentation,
8285
constrained with Thai Character Cluster (TCC) boundaries,
8386
and combining tokens that are parts of the same named-entity.
87+
* *sefr_cut* - wrapper for
88+
`SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.,
8489
8590
:Note:
8691
- The parameter **custom_dict** can be provided as an argument \
@@ -173,6 +178,10 @@ def word_tokenize(
173178
elif engine == "nercut":
174179
from pythainlp.tokenize.nercut import segment
175180

181+
segments = segment(text)
182+
elif engine == "sefr_cut":
183+
from pythainlp.tokenize.sefr_cut import segment
184+
176185
segments = segment(text)
177186
else:
178187
raise ValueError(

pythainlp/tokenize/newmm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,9 @@ def segment(
181181
else:
182182
tokens = list(_onecut(sample, custom_dict))
183183
token_max_idx = 0
184+
token_max_len = 0
184185
for i, token in enumerate(tokens):
185-
token_max_len = 0
186-
if len(token) > token_max_len:
186+
if len(token) >= token_max_len:
187187
token_max_len = len(token)
188188
token_max_idx = i
189189

pythainlp/tokenize/sefr_cut.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Wrapper for SEFR CUT Thai word segmentation. SEFR CUT is a
4+
Thai Word Segmentation Models using Stacked Ensemble.
5+
6+
:See Also:
7+
* `GitHub repository <https://github.com/mrpeerat/SEFR_CUT>`_
8+
"""
9+
from typing import List
10+
11+
import sefr_cut
12+
13+
DEFAULT_ENGINE = 'ws1000'
14+
sefr_cut.load_model(engine=DEFAULT_ENGINE)
15+
16+
17+
def segment(text: str, engine: str = 'ws1000') -> List[str]:
18+
global DEFAULT_ENGINE
19+
if not text or not isinstance(text, str):
20+
return []
21+
if engine != DEFAULT_ENGINE:
22+
DEFAULT_ENGINE = engine
23+
sefr_cut.load_model(engine=DEFAULT_ENGINE)
24+
return sefr_cut.tokenize(text)[0]

pythainlp/translate/__init__.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,20 @@
66
__all__ = [
77
"EnThTranslator",
88
"ThEnTranslator",
9-
"download_model_all"
9+
"download_model_all",
10+
"ThZhTranslator",
11+
"ZhThTranslator",
12+
"Translate"
1013
]
1114

12-
from pythainlp.translate.core import (
15+
from pythainlp.translate.core import Translate
16+
17+
from pythainlp.translate.en_th import (
1318
EnThTranslator,
1419
ThEnTranslator,
1520
download_model_all,
1621
)
22+
from pythainlp.translate.zh_th import (
23+
ThZhTranslator,
24+
ZhThTranslator,
25+
)

0 commit comments

Comments (0)