5 changes: 3 additions & 2 deletions docs/api/tokenize.rst
@@ -8,10 +8,11 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
Modules
-------

.. autofunction:: clause_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: word_tokenize
.. autofunction:: syllable_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: syllable_tokenize
.. autofunction:: word_tokenize
.. autoclass:: Tokenizer
:members:

2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
@@ -7,6 +7,7 @@
"THAI2FIT_TOKENIZER",
"Tokenizer",
"Trie",
"clause_tokenize",
"sent_tokenize",
"subword_tokenize",
"syllable_tokenize",
@@ -27,6 +28,7 @@

from pythainlp.tokenize.core import (
Tokenizer,
clause_tokenize,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
32 changes: 27 additions & 5 deletions pythainlp/tokenize/core.py
@@ -8,14 +8,39 @@
from pythainlp.tokenize import (
DEFAULT_SENT_TOKENIZE_ENGINE,
DEFAULT_SUBWORD_TOKENIZE_ENGINE,
DEFAULT_SYLLABLE_DICT_TRIE,
DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
DEFAULT_WORD_TOKENIZE_ENGINE,
DEFAULT_WORD_DICT_TRIE,
DEFAULT_SYLLABLE_DICT_TRIE,
DEFAULT_WORD_TOKENIZE_ENGINE,
)
from pythainlp.util.trie import Trie, dict_trie


def clause_tokenize(doc: List[str]) -> List[List[str]]:
"""
Clause tokenizer (or clause segmentation).

Tokenizes a running word list into a list of clauses (a list of
lists of strings), split by a CRF model trained on the LST20 corpus.

:param list[str] doc: word list to be segmented into clauses
:return: list of clauses
:rtype: list[list[str]]

:Example:

from pythainlp.tokenize import clause_tokenize

clause_tokenize(["ฉัน","นอน","และ","คุณ","เล่น","มือถือ","ส่วน","น้อง","เขียน","โปรแกรม"])
[['ฉัน', 'นอน'],
['และ', 'คุณ', 'เล่น', 'มือถือ'],
['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
"""
from .crfcls import segment

return segment(doc)


def word_tokenize(
text: str,
custom_dict: Trie = None,
@@ -50,9 +75,6 @@ def word_tokenize(
`DeepCut <https://github.com/rkcosmos/deepcut>`_,
learning-based approach

.. warning::
* the option for engine named *ulmfit* has been deprecated since \
PyThaiNLP version 2.1
:Note:
- The parameter **custom_dict** can be provided as an argument \
only for the *newmm*, *longest*, and *attacut* engines.
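As an aside on the **custom_dict** note above, here is a minimal sketch of passing a custom dictionary to word_tokenize; the example words and text are illustrative and not part of this changeset:

from pythainlp.tokenize import word_tokenize
from pythainlp.util.trie import dict_trie

# Build a Trie from a custom word set; dict_trie also accepts a file path.
custom_words = {"ประมวลผล", "ภาษาไทย"}
trie = dict_trie(dict_source=custom_words)

# custom_dict is honored only by the newmm, longest, and attacut engines.
print(word_tokenize("ประมวลผลภาษาไทย", custom_dict=trie, engine="newmm"))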
74 changes: 74 additions & 0 deletions pythainlp/tokenize/crfcls.py
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
"""
Clause segmenter
"""
from typing import List

import pycrfsuite
from pythainlp.corpus import get_corpus_path
from pythainlp.tag import pos_tag


def _doc2features(doc, i):
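# Build the CRF feature dict for token i from the current (word, POS) pair,
# plus the previous/next pairs when available; BOS/EOS flag the boundaries.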
# features from current word
curr_word = doc[i][0]
curr_pos = doc[i][1]
features = {
"word.curr_word": curr_word,
"word.curr_isspace": curr_word.isspace(),
"word.curr_isdigit": curr_word.isdigit(),
"word.curr_postag": curr_pos,
}

# features from previous word
if i > 0:
prev_word = doc[i - 1][0]
prev_pos = doc[i - 1][1]
features["word.prev_word"] = prev_word
features["word.prev_isspace"] = prev_word.isspace()
features["word.prev_isdigit"] = prev_word.isdigit()
features["word.prev_postag"] = prev_pos
else:
features["BOS"] = True # Beginning of Sequence

# features from next word
if i < len(doc) - 1:
next_word = doc[i + 1][0]
next_pos = doc[i + 1][1]
features["word.next_word"] = next_word
features["word.next_isspace"] = next_word.isspace()
features["word.next_isdigit"] = next_word.isdigit()
features["word.next_postag"] = next_pos
else:
features["EOS"] = True # End of Sequence

return features


def _extract_features(doc):
return [_doc2features(doc, i) for i in range(len(doc))]
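# For illustration (not part of this change): for a two-token input such as
# [("ฉัน", "PR"), ("นอน", "VV")] (the POS tags here are hypothetical), the
# first feature dict carries "word.curr_*"/"word.next_*" keys plus "BOS": True,
# and the second carries "word.prev_*"/"word.curr_*" keys plus "EOS": True.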


_CORPUS_NAME = "lst20-cls"
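# Load the pre-trained CRF clause-segmentation model once at import time.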
tagger = pycrfsuite.Tagger()
tagger.open(get_corpus_path(_CORPUS_NAME))


def segment(doc: List[str]) -> List[List[str]]:
word_tags = pos_tag(doc, corpus="lst20")
features = _extract_features(word_tags)
word_markers = list(zip(doc, tagger.tag(features)))
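# Pair each word with its predicted clause marker; in the loop below,
# "E_CLS" marks the last word of a clause, and any remaining words are
# flushed into a final clause at the end of the input.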

clauses = []
temp = []
len_doc = len(doc) - 1
for i, word_marker in enumerate(word_markers):
word, marker = word_marker
if marker == "E_CLS" or i == len_doc:
temp.append(word)
clauses.append(temp)
temp = []
else:
temp.append(word)

return clauses
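For orientation, a sketch of the end-to-end flow the new module enables; this is not part of the diff, and it assumes the "lst20-cls" model can be fetched with pythainlp.corpus.download before the first call:

from pythainlp.corpus import download
from pythainlp.tokenize import clause_tokenize, word_tokenize

# Fetch the CRF clause-segmentation model if it is not cached yet
# (corpus name taken from _CORPUS_NAME above).
download("lst20-cls")

# clause_tokenize expects an already-tokenized word list.
words = word_tokenize("ฉันนอนและคุณเล่นมือถือ")
# Clause boundaries depend on the model and on the word segmentation.
print(clause_tokenize(words))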
2 changes: 1 addition & 1 deletion pythainlp/tokenize/crfcut.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
"""
CRFCut - Thai sentence segmentor.
CRFCut - Thai sentence segmenter.

Thai sentence segmentation using conditional random field,
default model trained on TED dataset
11 changes: 10 additions & 1 deletion tests/test_tokenize.py
@@ -2,7 +2,12 @@

import unittest

from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE, Tokenizer, attacut
from pythainlp.tokenize import (
DEFAULT_WORD_DICT_TRIE,
Tokenizer,
attacut,
clause_tokenize,
)
from pythainlp.tokenize import deepcut as tokenize_deepcut
from pythainlp.tokenize import etcc, longest, multi_cut, newmm
from pythainlp.tokenize import pyicu as tokenize_pyicu
@@ -184,6 +189,10 @@ def setUp(self):
"กกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก"
)

def test_clause_tokenize(self):
self.assertIsNotNone(clause_tokenize(["ฉัน", "ทดสอบ"]))
self.assertIsInstance(clause_tokenize(["ฉัน", "ทดสอบ"]), list)

def test_Tokenizer(self):
t_test = Tokenizer(DEFAULT_WORD_DICT_TRIE)
self.assertEqual(t_test.word_tokenize(""), [])