Skip to content
Merged
40 changes: 27 additions & 13 deletions .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Unit test and coverage
name: Unit test

on:
push:
Expand All @@ -23,9 +23,10 @@ jobs:
runs-on: ${{ matrix.os }}
env:
PYICU_WIN_VER: 2.14
INSTALL_PYICU_WIN: false
INSTALL_TORCH: false
INSTALL_FULL_DEPS: false
PYTHON_VERSION_LATEST: "3.13"
PYTHON_VERSION_LATEST_2: "3.12"

steps:
- name: Checkout
Expand All @@ -52,7 +53,7 @@ jobs:
echo "ICU_VER=${ICU_VER}"
echo "ICU_VER=${ICU_VER}" >> "${GITHUB_ENV}"
- name: Install PyICU (Windows)
if: startsWith(matrix.os, 'windows-') && env.INSTALL_PYICU_WIN == 'true'
if: startsWith(matrix.os, 'windows-') && (matrix.python-version == '3.12' || matrix.python-version == '3.13')
shell: powershell
run: |
$PYTHON_WIN_VER = "${{ matrix.python-version }}"
Expand All @@ -66,29 +67,42 @@ jobs:
# If torch for the platform is not available in PyPI, use this command:
# pip install "<torch_wheel_url>"
# Get wheel URL from http://download.pytorch.org/whl/torch/
- name: Install dependencies
- name: Install dependencies from docker_requirements.txt
if: env.INSTALL_FULL_DEPS == 'true'
env:
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
run: pip install -r docker_requirements.txt
- name: Install PyThaiNLP
env:
PYTHONIOENCODING: utf-8
- name: Install PyThaiNLP + dependencies (minimum)
if: matrix.python-version != env.PYTHON_VERSION_LATEST && matrix.python-version != env.PYTHON_VERSION_LATEST_2
run: pip install .
- name: Install PyThaiNLP + dependencies (compact)
if: matrix.python-version == env.PYTHON_VERSION_LATEST || matrix.python-version == env.PYTHON_VERSION_LATEST_2
run: pip install ".[compact]"
# If you want to install a safe small set of optional dependencies, use:
# pip install .[compact]
  # "compact" includes PyYAML, numpy, pyicu, and python-crfsuite.
# pip install ".[compact]"
# We can gradually run more test cases by installing more optional
      # dependencies. But we should also consider reducing the number
      # of dependencies to avoid conflicts between them.
# See: https://github.com/PyThaiNLP/pythainlp/issues/935
- name: Unit test and code coverage
run: coverage run -m unittest tests
# Use 'unittest tests' instead of 'unittest discover' to avoid loading
# tests with external imports.
- name: Unit test (core)
if: matrix.python-version != env.PYTHON_VERSION_LATEST && matrix.python-version != env.PYTHON_VERSION_LATEST_2
env:
PYTHONIOENCODING: utf-8
run: coverage run -m unittest tests.core
- name: Unit test (core + compact)
if: matrix.python-version == env.PYTHON_VERSION_LATEST || matrix.python-version == env.PYTHON_VERSION_LATEST_2
env:
PYTHONIOENCODING: utf-8
run: coverage run -m unittest tests.core tests.compact
# Only test "compact" set with the latest two stable Python versions.
# Use 'unittest <test_module>' instead of 'unittest discover' to avoid
# loading tests with dependencies more than expected.
      # The test cases to load are defined in __init__.py in the tests directory.
# See also tests/README.md
- name: Coverage report
if: matrix.python-version == env.PYTHON_VERSION_LATEST
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COVERALLS_SERVICE_NAME: github
run: coveralls
# Only submit a report from the latest Python version
6 changes: 4 additions & 2 deletions pythainlp/util/strftime.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""
Thai date/time formatting.
"""

import warnings
from datetime import datetime
from string import digits
Expand Down Expand Up @@ -44,7 +45,7 @@ def _std_strftime(dt_obj: datetime, fmt_char: str) -> str:
# in that case just use the fmt_char
warnings.warn(
(
f"String format directive unknown/not support: %{fmt_char}"
f"String format directive unknown/not support: %{fmt_char}\n"
f"The system raises this ValueError: {err}"
),
UserWarning,
Expand Down Expand Up @@ -145,7 +146,8 @@ def _thai_strftime(dt_obj: datetime, fmt_char: str) -> str:
)
else:
# No known localization available, use Python's default
str_ = _std_strftime(dt_obj, fmt_char)
# With a good _NEED_L10N and _EXTENSIONS, this should not happen
str_ = _std_strftime(dt_obj, fmt_char) # pragma: no cover

return str_

Expand Down
7 changes: 6 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,12 @@
"wtp": ["transformers>=4.6.0", "wtpsplit>=1.0.1"],
"wunsen": ["wunsen>=0.0.1"],
# Compact dependencies, this one matches requirements.txt
"compact": ["PyYAML>=5.4.1", "numpy>=1.22", "pyicu>=2.3", "python-crfsuite>=0.9.7"],
"compact": [
"PyYAML>=5.4.1",
"numpy>=1.22",
"pyicu>=2.3",
"python-crfsuite>=0.9.7",
],
# Full dependencies
"full": [
"PyYAML>=5.4.1",
Expand Down
23 changes: 17 additions & 6 deletions tests/README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,30 @@
# Test cases

Tests are categorized into two groups: fundamental and extra.
Tests are categorized into three groups: core, compact, and extra.

## Fundamental Tests (test_*.py)
## Core Tests (test_*.py)

- Focus on core functionalities.
- Do not rely on additional dependencies beyond those listed in the
`requirements` section of `setup.py`.
- Do not rely on external dependencies beyond the standard library,
except for `requests` which is used for corpus downloading.
- Test with all officially supported Python versions
(currently 3.9, 3.10, 3.11, 3.12, and 3.13).

## Compact Tests (testc_*.py)

- Test a limited set of additional functionalities that rely on optional
dependencies specified in `requirements.txt`.
- These dependencies are `PyYAML`, `numpy`, `pyicu`, `python-crfsuite`, and
`requests`.
- Test with the latest two stable Python versions.

## Extra Tests (testx_*.py)

- Explore functionalities that rely on optional dependencies specified in the
`extras` section of `setup.py`.
- These dependencies might include libraries like `nltk`, `pycrfsuite`, or
`torch`.
- These dependencies might include libraries like `gensim`, `nltk`, or `torch`.
- Due to dependency complexities, these functionalities are not part of the
automated test suite and will not be tested in the CI/CD pipeline.

## Default Test Suite

Expand Down
14 changes: 2 additions & 12 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,8 @@

# Names of modules to be tested
test_packages: list[str] = [
"tests.test_ancient",
"tests.test_cli",
"tests.test_corpus",
"tests.test_khavee",
"tests.test_morpheme",
"tests.test_soundex",
"tests.test_spell",
"tests.test_tag",
"tests.test_tokenize",
"tests.test_tools",
"tests.test_transliterate",
"tests.test_util",
"tests.core",
"tests.compact",
]


Expand Down
36 changes: 36 additions & 0 deletions tests/compact/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Unit tests, compact set.

Covers functions that require the "compact" optional dependencies
(see the ``compact`` extras in setup.py).
"""

from unittest import TestLoader, TestSuite

# Modules whose test cases make up this suite
test_packages: list[str] = [
    "tests.compact.testc_tag",
    "tests.compact.testc_tokenize",
    "tests.compact.testc_util",
]


def load_tests(
    loader: TestLoader, standard_tests: TestSuite, pattern: str
) -> TestSuite:
    """unittest ``load_tests`` protocol hook.

    Builds the suite from the modules listed in ``test_packages``.
    See: https://docs.python.org/3/library/unittest.html#id1
    """
    suite = TestSuite()
    for module_name in test_packages:
        suite.addTests(loader.loadTestsFromName(module_name))
    return suite


if __name__ == "__main__":
    import unittest

    unittest.main()
15 changes: 15 additions & 0 deletions tests/compact/testc_tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest

from pythainlp.tag import chunk_parse, pos_tag


class TagTestCase(unittest.TestCase):
    """Tagger tests that require "compact" optional dependencies."""

    def test_chunk_parse(self):
        """POS-tag a short sentence, then chunk-parse the tagged pairs."""
        words = ["ผม", "รัก", "คุณ"]
        tagged = pos_tag(words, engine="perceptron", corpus="orchid")
        # chunk_parse consumes (word, POS) pairs; just check it returns
        # a non-None result for valid input.
        self.assertIsNotNone(chunk_parse(tagged))
85 changes: 85 additions & 0 deletions tests/compact/testc_tokenize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest

from pythainlp.tokenize import (
pyicu,
sent_tokenize,
subword_tokenize,
word_tokenize,
)

from ..core.test_tokenize import (
SENT_1,
SENT_1_TOKS,
SENT_2,
SENT_2_TOKS,
SENT_3,
SENT_3_TOKS,
SENT_4,
TEXT_1,
)


# Tests for functions that need "compact" dependencies
class TokenizeTestCaseCompact(unittest.TestCase):
    """Tokenizer tests that require "compact" optional dependencies."""

    def test_icu(self):
        """Word segmentation with the ICU engine.

        None/empty input yields an empty list; a Thai sentence is split
        into the expected word list.
        """
        self.assertEqual(pyicu.segment(None), [])
        self.assertEqual(pyicu.segment(""), [])
        self.assertEqual(
            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
            ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
        )

    def test_sent_tokenize(self):
        """Sentence tokenization, default engine and explicit "crfcut".

        Both call forms should produce identical results for the shared
        fixtures SENT_1..SENT_3 (imported from tests.core.test_tokenize).
        """
        # Use default engine (crfcut)
        self.assertEqual(sent_tokenize(None), [])
        self.assertEqual(sent_tokenize(""), [])
        self.assertEqual(
            sent_tokenize(SENT_1),
            SENT_1_TOKS,
        )
        self.assertEqual(
            sent_tokenize(SENT_2),
            SENT_2_TOKS,
        )
        self.assertEqual(
            sent_tokenize(SENT_3),
            SENT_3_TOKS,
        )

        # Same inputs with the engine named explicitly
        self.assertEqual(
            sent_tokenize(SENT_1, engine="crfcut"),
            SENT_1_TOKS,
        )
        self.assertEqual(
            sent_tokenize(SENT_2, engine="crfcut"),
            SENT_2_TOKS,
        )
        self.assertEqual(
            sent_tokenize(SENT_3, engine="crfcut"),
            SENT_3_TOKS,
        )
        self.assertEqual(
            sent_tokenize(SENT_4, engine="crfcut"),
            [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
        )

    def test_subword_tokenize(self):
        """Subword tokenization with the "han_solo" engine.

        None input yields an empty list; output units should be whole
        syllable-like chunks, not lone characters.
        """
        self.assertEqual(subword_tokenize(None, engine="han_solo"), [])
        self.assertEqual(
            subword_tokenize("แมวกินปลา", engine="han_solo"),
            ["แมว", "กิน", "ปลา"],
        )
        self.assertIn(
            "ดาว", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
        )

        # A single dependent vowel must not appear as its own subword
        self.assertNotIn(
            "า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
        )

    def test_word_tokenize_icu(self):
        """ICU word tokenization on a longer text returns a result."""
        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
11 changes: 0 additions & 11 deletions tests/testx_util.py → tests/compact/testc_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,14 @@

import unittest

from pythainlp.util import rhyme, thai_word_tone_detector
from pythainlp.util.spell_words import spell_word


class UtilTestCaseX(unittest.TestCase):
def test_rhyme(self):
self.assertIsInstance(rhyme("แมว"), list)
self.assertTrue(len(rhyme("แมว")) > 2)

def test_spell_word(self):
self.assertEqual(spell_word("เสือ"), ["สอ", "เอือ", "เสือ"])
self.assertEqual(spell_word("เสื้อ"), ["สอ", "เอือ", "ไม้โท", "เสื้อ"])
self.assertEqual(spell_word("คน"), ["คอ", "นอ", "คน"])
self.assertEqual(
spell_word("คนดี"), ["คอ", "นอ", "คน", "ดอ", "อี", "ดี", "คนดี"]
)

def test_thai_word_tone_detector(self):
self.assertIsNotNone(thai_word_tone_detector("คนดี"))
self.assertEqual(
thai_word_tone_detector("ราคา"), [("รา", "m"), ("คา", "m")]
)
45 changes: 45 additions & 0 deletions tests/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Unit tests, core set.

One test module per main package under tests/.
"""

from unittest import TestLoader, TestSuite

# Modules whose test cases make up this suite
test_packages: list[str] = [
    "tests.core.test_ancient",
    "tests.core.test_cli",
    "tests.core.test_corpus",
    "tests.core.test_khavee",
    "tests.core.test_morpheme",
    "tests.core.test_soundex",
    "tests.core.test_spell",
    "tests.core.test_tag",
    "tests.core.test_tokenize",
    "tests.core.test_tools",
    "tests.core.test_transliterate",
    "tests.core.test_util",
]


def load_tests(
    loader: TestLoader, standard_tests: TestSuite, pattern: str
) -> TestSuite:
    """unittest ``load_tests`` protocol hook.

    Builds the suite from the modules listed in ``test_packages``.
    See: https://docs.python.org/3/library/unittest.html#id1
    """
    suite = TestSuite()
    for module_name in test_packages:
        suite.addTests(loader.loadTestsFromName(module_name))
    return suite


if __name__ == "__main__":
    import unittest

    unittest.main()
File renamed without changes.
File renamed without changes.
Loading