Skip to content

Commit d5220d9

Browse files
wannaphong and bact authored
Add pythainlp.translate (#439)
Machine translation using model from VISTEC-depa Thailand Artificial Intelligence Research Institute Co-authored-by: Arthit Suriyawongkul <arthit@gmail.com>
1 parent 51f144a commit d5220d9

File tree

8 files changed

+230
-53
lines changed

8 files changed

+230
-53
lines changed

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
pip install pytest coverage coveralls
3535
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
3636
pip install "h5py>=2.10.0,<3" "tensorflow>=2.3.1,<3"
37-
pip install torch==1.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
37+
pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
3838
pip install deepcut
3939
pip install .[full]
4040
- name: Test

appveyor.yml

Lines changed: 44 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
# environment configuration #
1010
#---------------------------------#
1111

12-
image: Visual Studio 2017
12+
image: Visual Studio 2019
1313

1414
# scripts that are called at very beginning, before repo cloning
1515
init:
@@ -32,17 +32,17 @@ init:
3232
- "ECHO Python %PYTHON_VERSION% (%PYTHON_ARCH%bit) from %PYTHON%"
3333
- ECHO %PYTHONIOENCODING%
3434
- ECHO %ICU_VERSION%
35-
# - ECHO "Installed SDKs:"
36-
# - ps: "ls C:/Python*"
37-
# - ps: "ls \"C:/Program Files (x86)/Microsoft SDKs/Windows\""
35+
- ECHO "Installed SDKs:"
36+
- ps: "ls C:/Python*"
37+
- ps: "ls \"C:/Program Files (x86)/Microsoft SDKs/Windows\""
3838

3939
# fetch repository as zip archive
4040
# https://www.appveyor.com/docs/how-to/repository-shallow-clone/
4141
shallow_clone: true
4242

4343
environment:
4444
global:
45-
APPVEYOR_SAVE_CACHE_ON_ERROR: true
45+
APPVEYOR_SAVE_CACHE_ON_ERROR: false
4646
APPVEYOR_SKIP_FINALIZE_ON_EXIT: true
4747
CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\run_with_env.cmd"
4848
PYTHONIOENCODING: "utf-8"
@@ -56,7 +56,7 @@ environment:
5656
# PYTHON_ARCH: "32"
5757
# PYICU_PKG: "https://www.dropbox.com/s/pahorbq29y9cura/PyICU-2.3.1-cp36-cp36m-win32.whl?dl=1"
5858

59-
- PYTHON: "C:/Python36-x64"
59+
- PYTHON: "C:\\Miniconda36-x64"
6060
PYTHON_VERSION: "3.6"
6161
PYTHON_ARCH: "64"
6262
PYICU_PKG: "https://www.dropbox.com/s/7t0rrxwckqbgivi/PyICU-2.3.1-cp36-cp36m-win_amd64.whl?dl=1"
@@ -66,39 +66,42 @@ environment:
6666
# PYTHON_ARCH: "32"
6767
# PYICU_PKG: "https://www.dropbox.com/s/3xwdnwhdcu619x4/PyICU-2.3.1-cp37-cp37m-win32.whl?dl=1"
6868

69-
- PYTHON: "C:/Python37-x64"
70-
PYTHON_VERSION: "3.7"
71-
PYTHON_ARCH: "64"
72-
PYICU_PKG: "https://www.dropbox.com/s/le5dckc3231opqt/PyICU-2.3.1-cp37-cp37m-win_amd64.whl?dl=1"
69+
# - PYTHON: "C:/Python37-x64"
70+
# PYTHON_VERSION: "3.7"
71+
# PYTHON_ARCH: "64"
72+
# PYICU_PKG: "https://www.dropbox.com/s/le5dckc3231opqt/PyICU-2.3.1-cp37-cp37m-win_amd64.whl?dl=1"
7373

74-
# - PYTHON: "C:/Python38-x64"
74+
# - PYTHON: "C:\\Miniconda38-x64"
7575
# PYTHON_VERSION: "3.8"
7676
# PYTHON_ARCH: "64"
77-
# PYICU_PKG: "https://www.dropbox.com/s/o6p2sj5z50iim1e/PyICU-2.3.1-cp38-cp38-win_amd64.whl?dl=0"
77+
# PYICU_PKG: "https://www.dropbox.com/s/o6p2sj5z50iim1e/PyICU-2.3.1-cp38-cp38-win_amd64.whl?dl=1"
7878

7979
matrix:
8080
fast_finish: true
8181

82-
cache:
83-
- "%LOCALAPPDATA%/pip/Cache"
84-
- "%APPDATA%/nltk_data"
82+
#cache:
83+
# - "%LOCALAPPDATA%/pip/Cache"
84+
# - "%APPDATA%/nltk_data"
8585
# - "%LOCALAPPDATA%/pythainlp-data"
8686

8787
install:
8888
- chcp 65001
89+
- "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
8990
# - '"C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" %PLATFORM%'
90-
- '"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" %PLATFORM%'
91-
# - '"C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" %PLATFORM%'
91+
# - '"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" %PLATFORM%'
92+
- '"C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" %PLATFORM%'
9293
- ps: if (-not(Test-Path($env:PYTHON))) { & appveyor\install.ps1 }
93-
- SET PATH=%PYTHON%;%PYTHON%/Scripts;%PATH%
94-
# - ECHO %PATH%
94+
- ECHO %PATH%
9595
- python --version
9696
- python -m pip install --disable-pip-version-check --user --upgrade pip setuptools
97-
- pip --version
98-
- pip install -U "h5py>=2.10.0,<3" "tensorflow>=2.3.1,<3" deepcut
99-
- pip install torch==1.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
100-
- pip install %PYICU_PKG%
101-
- pip install -e .[full]
97+
- python -m pip --version
98+
- python -m pip install pyyaml
99+
- python -m pip install -U "h5py>=2.10.0,<3" "tensorflow>=2.3.1,<3" deepcut
100+
- python -m pip install %PYICU_PKG%
101+
- conda install -y -c conda-forge fairseq
102+
- conda remove --force -y pytorch
103+
- python -m pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
104+
- python -m pip install -e .[full]
102105

103106
#---------------------------------#
104107
# build configuration #
@@ -121,20 +124,20 @@ test_script:
121124
# global handlers #
122125
#---------------------------------#
123126

124-
on_success:
125-
# Remove old or huge cache files to hopefully not exceed the 1GB cache limit.
126-
#
127-
# If the cache limit is reached, the cache will not be updated (of not even
128-
# created in the first run). So this is a trade of between keeping the cache
129-
# current and having a cache at all.
130-
# NB: This is done only `on_success` since the cache in uploaded only on
131-
# success anyway.
132-
# Note: Cygwin is not available on Visual Studio 2019, can try Msys2.
133-
- "ECHO Remove old or huge cache"
134-
- C:\cygwin\bin\find "%LOCALAPPDATA%/pip" -type f -mtime +360 -delete
135-
- C:\cygwin\bin\find "%LOCALAPPDATA%/pip" -type f -size +50M -delete
136-
- C:\cygwin\bin\find "%LOCALAPPDATA%/pip" -empty -delete
137-
# Show size of cache
138-
- C:\cygwin\bin\du -hs "%LOCALAPPDATA%/pip/Cache"
139-
- C:\cygwin\bin\du -hs "%APPDATA%/nltk_data"
140-
- C:\cygwin\bin\du -hs "%LOCALAPPDATA%/pythainlp-data"
127+
#on_success:
128+
# # Remove old or huge cache files to hopefully not exceed the 1GB cache limit.
129+
# #
130+
# # If the cache limit is reached, the cache will not be updated (or not even
131+
# # created in the first run). So this is a trade-off between keeping the cache
132+
# # current and having a cache at all.
133+
# # NB: This is done only `on_success` since the cache is uploaded only on
134+
# # success anyway.
135+
# # Note: Cygwin is not available on Visual Studio 2019, can try Msys2.
136+
# - "ECHO Remove old or huge cache"
137+
# - C:\cygwin\bin\find "%LOCALAPPDATA%/pip" -type f -mtime +360 -delete
138+
# - C:\cygwin\bin\find "%LOCALAPPDATA%/pip" -type f -size +50M -delete
139+
# - C:\cygwin\bin\find "%LOCALAPPDATA%/pip" -empty -delete
140+
# # Show size of cache
141+
# - C:\cygwin\bin\du -hs "%LOCALAPPDATA%/pip/Cache"
142+
# - C:\cygwin\bin\du -hs "%APPDATA%/nltk_data"
143+
# - C:\cygwin\bin\du -hs "%LOCALAPPDATA%/pythainlp-data"

docs/api/translate.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
.. currentmodule:: pythainlp.translate
2+
3+
pythainlp.translate
4+
===================
5+
The :mod:`pythainlp.translate` module provides language translation.
6+
7+
Modules
8+
-------
9+
10+
.. autofunction:: translate

pythainlp/corpus/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def download(
316316
local_db = TinyDB(corpus_db_path())
317317
query = Query()
318318

319-
corpus = corpus_db[name.lower()]
319+
corpus = corpus_db[name]
320320
print("Corpus:", name)
321321
if version is None:
322322
for v in corpus["versions"]:

pythainlp/translate/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Language translation.
4+
"""
5+
6+
__all__ = [
7+
"translate",
8+
"download_model_all"
9+
]
10+
11+
from pythainlp.translate.core import translate, download_model_all

pythainlp/translate/core.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# -*- coding: utf-8 -*-
2+
import os
3+
import tarfile
4+
from collections import defaultdict
5+
6+
from pythainlp.corpus import download, get_corpus_path
7+
from pythainlp.tools import get_full_data_path, get_pythainlp_data_path
8+
9+
from fairseq.models.transformer import TransformerModel
10+
from sacremoses import MosesTokenizer
11+
12+
_en_tokenizer = MosesTokenizer("en")
13+
14+
_model = None
15+
_model_name = None
16+
17+
# SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0.tar.gz
18+
_EN_TH_FILE_NAME = (
19+
"SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0"
20+
)
21+
# SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz
22+
_TH_EN_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0"
23+
24+
25+
def _download_install(name):
    """
    Make sure the model archive ``name`` is downloaded and unpacked.

    The archive is downloaded (version "1.0") when ``get_corpus_path(name)``
    returns ``None``. It is unpacked into ``get_full_data_path(name)`` when
    that directory does not exist yet; an existing directory is left
    untouched.

    :param str name: corpus name of the model archive
    """
    if get_corpus_path(name) is None:
        download(name, force=True, version="1.0")

    target_dir = get_full_data_path(name)
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
        # Unpack only into the data directory: the model loaders resolve
        # their files through get_full_data_path(), so nothing should be
        # written to the caller's current working directory.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive -- confirm the archive source is trusted.
        with tarfile.open(get_corpus_path(name)) as tar:
            tar.extractall(path=target_dir)
35+
36+
37+
def download_model_all() -> None:
    """
    Download and unpack every translation model.

    Fetches the Thai-to-English model first, then the English-to-Thai one.
    """
    for model_name in ("scb_1m_th-en_spm", "scb_1m_en-th_moses"):
        _download_install(model_name)
43+
44+
45+
def _get_translate_path(model: str, *path: str) -> str:
    """
    Build a path below the data directory of ``model``.

    :param str model: name passed to ``get_full_data_path``
    :param str path: further path components to append
    :return: the joined path
    :rtype: str
    """
    model_root = get_full_data_path(model)
    return os.path.join(model_root, *path)
47+
48+
49+
def _scb_en_th_model_init():
    """
    Load the English-to-Thai model into the module-level cache.

    Returns immediately when the cached model is already the
    English-to-Thai one. Otherwise the previous model is released, the
    archive is downloaded/unpacked if needed, and the new model is loaded.
    """
    global _model, _model_name

    name = "scb_1m_en-th_moses"
    if _model_name == name:
        return

    # Release the previous model by rebinding rather than ``del``: the
    # global stays defined even if the download or load below fails.
    _model = None
    _model_name = None

    _download_install(name)
    _model = TransformerModel.from_pretrained(
        model_name_or_path=_get_translate_path(
            name, _EN_TH_FILE_NAME, "models",
        ),
        checkpoint_file="checkpoint.pt",
        data_name_or_path=_get_translate_path(
            name, _EN_TH_FILE_NAME, "vocab",
        ),
    )
    # Record the name only after a successful load, so that a failed
    # attempt is retried on the next call instead of being skipped.
    _model_name = name
65+
66+
67+
def _scb_en_th_translate(text: str) -> str:
    """
    Translate English text to Thai.

    The input is tokenized with the module's English Moses tokenizer and
    passed to the model as one space-joined string. In the model output,
    every space is removed and every "▁" is turned into a space
    (presumably the subword word-boundary marker -- confirm against the
    model's vocabulary).

    :param str text: English input text
    :return: translated text with surrounding whitespace stripped
    :rtype: str
    """
    _scb_en_th_model_init()

    moses_tokens = _en_tokenizer.tokenize(text)
    raw_output = _model.translate(" ".join(moses_tokens))
    without_spaces = raw_output.replace(' ', '')
    return without_spaces.replace('▁', ' ').strip()
75+
76+
77+
def _scb_th_en_model_init():
    """
    Load the Thai-to-English model into the module-level cache.

    Returns immediately when the cached model is already the
    Thai-to-English one. Otherwise the previous model is released, the
    archive is downloaded/unpacked if needed, and the new model is loaded
    with its SentencePiece model for the Thai side.
    """
    global _model, _model_name

    name = "scb_1m_th-en_spm"
    if _model_name == name:
        return

    # Release the previous model by rebinding rather than ``del``: the
    # global stays defined even if the download or load below fails.
    _model = None
    _model_name = None

    _download_install(name)
    _model = TransformerModel.from_pretrained(
        model_name_or_path=_get_translate_path(
            name, _TH_EN_FILE_NAME, "models",
        ),
        checkpoint_file="checkpoint.pt",
        data_name_or_path=_get_translate_path(
            name, _TH_EN_FILE_NAME, "vocab",
        ),
        bpe="sentencepiece",
        sentencepiece_model=_get_translate_path(
            name, _TH_EN_FILE_NAME, "bpe", "spm.th.model",
        ),
    )
    # Record the name only after a successful load, so that a failed
    # attempt is retried on the next call instead of being skipped.
    _model_name = name
97+
98+
99+
def _scb_th_en_translate(text: str) -> str:
    """
    Translate Thai text to English.

    Makes sure the Thai-to-English model is the cached one, then passes
    the text to it unchanged.

    :param str text: Thai input text
    :return: the model's translation
    :rtype: str
    """
    _scb_th_en_model_init()

    english = _model.translate(text)
    return english
105+
106+
107+
def translate(text: str, source: str, target: str) -> str:
    """
    Translate text between Thai and English.

    :param str text: input text in source language
    :param str source: source language ("en" or "th")
    :param str target: target language ("en" or "th")

    :return: translated text in target language
    :rtype: str
    :raises ValueError: if the source/target pair is neither
        "th" to "en" nor "en" to "th"
    """
    if source == "th" and target == "en":
        return _scb_th_en_translate(text)
    if source == "en" and target == "th":
        return _scb_en_th_translate(text)

    # Raise instead of returning the exception object: a returned
    # ValueError would reach the caller disguised as a translation result.
    raise ValueError("The combination of the arguments isn't allowed.")

setup.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
1212
PyThaiNLP is a Python library for Thai natural language processing.
1313
The library provides functions like word tokenization, part-of-speech tagging,
14-
transliteration, soundex generation, and spell checking.
14+
transliteration, soundex generation, spell checking, and
15+
date and time parsing/formatting.
1516
1617
# Install
1718
@@ -29,13 +30,6 @@
2930
3031
Some functionalities, like named-entity recognition, required extra packages.
3132
See https://github.com/PyThaiNLP/pythainlp for installation options.
32-
33-
34-
Made with ❤️
35-
36-
PyThaiNLP Team
37-
38-
"We build Thai NLP"
3933
"""
4034

4135
requirements = [
@@ -46,24 +40,33 @@
4640

4741
extras = {
4842
"attacut": ["attacut>=1.0.6"],
49-
"benchmarks": ["numpy>=1.16.1", "pandas>=0.24", "PyYAML>=5.3.1"],
43+
"benchmarks": ["PyYAML>=5.3.1", "numpy>=1.16.1", "pandas>=0.24"],
5044
"icu": ["pyicu>=2.3"],
5145
"ipa": ["epitran>=1.1"],
5246
"ml": ["numpy>=1.16", "torch>=1.0.0"],
5347
"ssg": ["ssg>=0.0.6"],
5448
"thai2fit": ["emoji>=0.5.1", "gensim>=3.2.0", "numpy>=1.16.1"],
55-
"thai2rom": ["torch>=1.0.0", "numpy>=1.16.1"],
49+
"thai2rom": ["numpy>=1.16.1", "torch>=1.0.0"],
50+
"translate": [
51+
"fairseq>=0.10.0",
52+
"sacremoses>=0.0.41",
53+
"sentencepiece>=0.1.91",
54+
"torch>=1.0.0",
55+
],
5656
"wordnet": ["nltk>=3.3.*"],
5757
"full": [
5858
"PyYAML>=5.3.1",
5959
"attacut>=1.0.4",
6060
"emoji>=0.5.1",
6161
"epitran>=1.1",
62+
"fairseq>=0.10.0",
6263
"gensim>=3.2.0",
6364
"nltk>=3.3.*",
6465
"numpy>=1.16.1",
6566
"pandas>=0.24",
6667
"pyicu>=2.3",
68+
"sacremoses>=0.0.41",
69+
"sentencepiece>=0.1.91",
6770
"ssg>=0.0.6",
6871
"torch>=1.0.0",
6972
],

tests/test_translate.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import unittest
4+
5+
from pythainlp.translate import translate
6+
7+
8+
class TestTranslatePackage(unittest.TestCase):
    """Smoke tests for ``pythainlp.translate.translate``."""

    def test_translate(self):
        """Both supported directions return something other than None."""
        thai_to_english = translate("แมวกินปลา", source="th", target="en")
        self.assertIsNotNone(thai_to_english)

        english_to_thai = translate(
            "the cat eats fish.", source="en", target="th"
        )
        self.assertIsNotNone(english_to_thai)

0 commit comments

Comments
 (0)