Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/api/tools.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

pythainlp.tools
====================================
The :class:`pythainlp.tools` is tool for pythainlp.
The :class:`pythainlp.tools` contains miscellaneous functions for PyThaiNLP internal use.

Modules
-------

.. autofunction:: get_full_data_path
.. autofunction:: get_pythainlp_data_path
.. autofunction:: get_pythainlp_path
.. autofunction:: get_pythainlp_path
8 changes: 6 additions & 2 deletions docs/api/translate.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@

pythainlp.translate
===================
The :class:`pythainlp.translate` for language translation.
The :class:`pythainlp.translate` is for machine translation.

Modules
-------

.. autofunction:: translate
.. autofunction:: download_model_all
.. autoclass:: EnThTranslator
:members: translate
.. autoclass:: ThEnTranslator
:members: translate
11 changes: 6 additions & 5 deletions pythainlp/translate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
Language translation.
"""

__all__ = [
"translate",
"download_model_all"
]
__all__ = ["EnThTranslator", "ThEnTranslator", "download_model_all"]

from pythainlp.translate.core import translate, download_model_all
from pythainlp.translate.core import (
EnThTranslator,
ThEnTranslator,
download_model_all,
)
127 changes: 57 additions & 70 deletions pythainlp/translate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,21 @@
from fairseq.models.transformer import TransformerModel
from sacremoses import MosesTokenizer

_en_tokenizer = MosesTokenizer("en")

_model = None
_model_name = None

_EN_TH_MODEL_NAME = "scb_1m_en-th_moses"
# SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0.tar.gz
_EN_TH_FILE_NAME = (
"SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0"
)
_EN_TH_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0"

_TH_EN_MODEL_NAME = "scb_1m_th-en_spm"
# SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz
_TH_EN_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0"


def _download_install(name):
def _get_translate_path(model: str, *path: str) -> str:
return os.path.join(get_full_data_path(model), *path)


def _download_install(name: str) -> None:
if get_corpus_path(name) is None:
download(name, force=True, version="1.0")
tar = tarfile.open(get_corpus_path(name), "r:gz")
Expand All @@ -36,92 +37,78 @@ def _download_install(name):

def download_model_all() -> None:
"""
Download Model
Download all translation models in advance
"""
_download_install("scb_1m_th-en_spm")
_download_install("scb_1m_en-th_moses")
_download_install(_EN_TH_MODEL_NAME)
_download_install(_TH_EN_MODEL_NAME)


def _get_translate_path(model: str, *path: str) -> str:
return os.path.join(get_full_data_path(model), *path)

class EnThTranslator:
def __init__(self):
self._tokenizer = MosesTokenizer("en")

def _scb_en_th_model_init():
global _model, _model_name
self._model_name = _EN_TH_MODEL_NAME

if _model_name != "scb_1m_en-th_moses":
del _model
_model_name = "scb_1m_en-th_moses"
_download_install(_model_name)
_model = TransformerModel.from_pretrained(
_download_install(self._model_name)
self._model = TransformerModel.from_pretrained(
model_name_or_path=_get_translate_path(
_model_name, _EN_TH_FILE_NAME, "models",
self._model_name,
_EN_TH_FILE_NAME,
"models",
),
checkpoint_file="checkpoint.pt",
data_name_or_path=_get_translate_path(
_model_name, _EN_TH_FILE_NAME, "vocab",
self._model_name,
_EN_TH_FILE_NAME,
"vocab",
),
)

def translate(self, text: str) -> str:
"""
Translate text from English to Thai

def _scb_en_th_translate(text: str) -> str:
global _model, _model_name

_scb_en_th_model_init()

tokens = " ".join(_en_tokenizer.tokenize(text))
translated = _model.translate(tokens)
return translated.replace(' ', '').replace('▁', ' ').strip()
:param str text: input text in source language
:return: translated text in target language
:rtype: str
"""
tokens = " ".join(self._tokenizer.tokenize(text))
translated = self._model.translate(tokens)
return translated.replace(" ", "").replace("▁", " ").strip()


def _scb_th_en_model_init():
global _model, _model_name
class ThEnTranslator:
def __init__(self):
self._model_name = _TH_EN_MODEL_NAME

if _model_name != "scb_1m_th-en_spm":
del _model
_model_name = "scb_1m_th-en_spm"
_download_install(_model_name)
_model = TransformerModel.from_pretrained(
_download_install(self._model_name)
self._model = TransformerModel.from_pretrained(
model_name_or_path=_get_translate_path(
_model_name, _TH_EN_FILE_NAME, "models",
self._model_name,
_TH_EN_FILE_NAME,
"models",
),
checkpoint_file="checkpoint.pt",
data_name_or_path=_get_translate_path(
_model_name, _TH_EN_FILE_NAME, "vocab",
self._model_name,
_TH_EN_FILE_NAME,
"vocab",
),
bpe="sentencepiece",
sentencepiece_model=_get_translate_path(
_model_name, _TH_EN_FILE_NAME, "bpe", "spm.th.model",
self._model_name,
_TH_EN_FILE_NAME,
"bpe",
"spm.th.model",
),
)

def translate(self, text: str) -> str:
"""
Translate text from Thai to English

def _scb_th_en_translate(text: str) -> str:
global _model, _model_name

_scb_th_en_model_init()

return _model.translate(text)


def translate(text: str, source: str, target: str) -> str:
"""
Translate Language

:param str text: input text in source language
:param str source: source language ("en" or "th")
:param str target: target language ("en" or "th")

:return: translated text in target language
:rtype: str
"""
translated = None

if source == "th" and target == "en":
translated = _scb_th_en_translate(text)
elif source == "en" and target == "th":
translated = _scb_en_th_translate(text)
else:
return ValueError("The combination of the arguments isn't allowed.")

return translated
:param str text: input text in source language
:return: translated text in target language
:rtype: str
"""
return self._model.translate(text)
12 changes: 5 additions & 7 deletions tests/test_translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,20 @@

import unittest

from pythainlp.translate import translate
from pythainlp.translate import EnThTranslator, ThEnTranslator


class TestTranslatePackage(unittest.TestCase):
def test_translate(self):
self.th_en_translator = ThEnTranslator()
self.assertIsNotNone(
translate(
self.th_en_translator.translate(
"แมวกินปลา",
source="th",
target="en"
)
)
self.en_th_translator = EnThTranslator()
self.assertIsNotNone(
translate(
self.en_th_translator.translate(
"the cat eats fish.",
source="en",
target="th"
)
)