Fix/broken numeric data format (#652) #723

Merged · 15 commits · Oct 12, 2022
Changes from all commits
87 changes: 87 additions & 0 deletions pythainlp/tokenize/_utils.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
Utility functions for the tokenize module.
"""

import re
from typing import List, Callable

# Digits separated by ".", ",", or ":" (e.g. "12:00", "1,234.5", "127.0.0.1")
_DIGITS_WITH_SEPARATOR = re.compile(r"(\d+[\.\,:])+\d+")


def apply_postprocessors(
segments: List[str], postprocessors: List[Callable[[List[str]], List[str]]]
) -> List[str]:
"""
Apply a list of post-processing functions to a raw segmentation result.
"""
for func in postprocessors:
segments = func(segments)

return segments


def rejoin_formatted_num(segments: List[str]) -> List[str]:
"""
Rejoin well-known formatted numeric strings that have been over-tokenized.
These are digit sequences separated by ":", ",", or ".",
such as times, decimal numbers, comma-separated numbers, and IP addresses.

:param List[str] segments: result from word tokenizer
:return: a list of fixed tokens
:rtype: List[str]

:Example:
tokens = ['ขณะ', 'นี้', 'เวลา', ' ', '12', ':', '00น', ' ', 'อัตรา',
'แลกเปลี่ยน', ' ', '1', ',', '234', '.', '5', ' ', 'baht/zeny']
rejoin_formatted_num(tokens)
# output:
# ['ขณะ', 'นี้', 'เวลา', ' ', '12:00น', ' ', 'อัตรา', 'แลกเปลี่ยน', ' ', '1,234.5', ' ', 'baht/zeny']

tokens = ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127', '.', '0', '.', '0', '.', '1', ' ', 'ครับ']
rejoin_formatted_num(tokens)
# output:
# ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127.0.0.1', ' ', 'ครับ']
"""
original = "".join(segments)
matching_results = _DIGITS_WITH_SEPARATOR.finditer(original)
tokens_joined = []
pos = 0
segment_idx = 0

match = next(matching_results, None)
while segment_idx < len(segments) and match:
is_span_beginning = pos >= match.start()
token = segments[segment_idx]
if is_span_beginning:
connected_token = ""
while pos < match.end() and segment_idx < len(segments):
connected_token += segments[segment_idx]
pos += len(segments[segment_idx])
segment_idx += 1

tokens_joined.append(connected_token)
match = next(matching_results, None)
else:
tokens_joined.append(token)
segment_idx += 1
pos += len(token)
tokens_joined += segments[segment_idx:]
return tokens_joined


def strip_whitespace(segments: List[str]) -> List[str]:
"""
Strip whitespace from each token and remove tokens that consist only of whitespace.
:param List[str] segments: result from word tokenizer
:return: a list of tokens
:rtype: List[str]

:Example:
tokens = [" ", "วันนี้ ", "เวลา ", "19.00น"]
strip_whitespace(tokens)
# ["วันนี้", "เวลา", "19.00น"]

"""
segments = [token.strip(" ") for token in segments if token.strip(" ")]
return segments
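
The three helpers above are designed to compose: apply_postprocessors simply threads a token list through each callable in turn, so rejoin_formatted_num and strip_whitespace can be chained in one call. A minimal sketch of that pipeline, assuming the pythainlp.tokenize._utils module added in this PR is importable (the token list is an illustrative over-tokenized result):

    from pythainlp.tokenize._utils import (
        apply_postprocessors,
        rejoin_formatted_num,
        strip_whitespace,
    )

    # An over-tokenized result, as a word tokenizer might emit it
    tokens = ["เวลา", " ", "12", ":", "00น", " ", "ราคา", " ", "1", ",", "234", ".", "5"]

    # Rejoin formatted numbers first, then drop whitespace-only tokens
    fixed = apply_postprocessors(tokens, [rejoin_formatted_num, strip_whitespace])
    print(fixed)  # ['เวลา', '12:00น', 'ราคา', '1,234.5']
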
115 changes: 80 additions & 35 deletions pythainlp/tokenize/core.py
@@ -13,6 +13,11 @@
DEFAULT_WORD_DICT_TRIE,
DEFAULT_WORD_TOKENIZE_ENGINE,
)
from pythainlp.tokenize._utils import (
apply_postprocessors,
rejoin_formatted_num,
strip_whitespace,
)
from pythainlp.util.trie import Trie, dict_trie


@@ -47,7 +52,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
return segment(doc)


def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "str") -> Union[str, List[str]]:
def word_detokenize(
segments: Union[List[List[str]], List[str]], output: str = "str"
) -> Union[str, List[str]]:
"""
Word detokenizer.

@@ -62,6 +69,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
if isinstance(segments[0], str):
segments = [segments]
from pythainlp import thai_characters

for i, s in enumerate(segments):
_list_sents = []
_add_index = []
@@ -70,7 +78,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
for j, w in enumerate(s):
if j > 0:
# previous word
p_w = s[j-1]
p_w = s[j - 1]
# if w is a number or non-Thai text, and is not a space
if (
w[0] not in thai_characters
@@ -88,9 +96,9 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
if not p_w.isspace():
_list_sents.append(" ")
_mark_index.append(j)
elif w.isspace() and j-1 not in _space_index:
elif w.isspace() and j - 1 not in _space_index:
_space_index.append(j)
elif j-1 in _mark_index:
elif j - 1 in _mark_index:
_list_sents.append(" ")
_list_sents.append(w)
_list_all.append(_list_sents)
@@ -103,14 +111,15 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
for j in i:
_temp += j
_text.append(_temp)
return ' '.join(_text)
return " ".join(_text)


def word_tokenize(
text: str,
custom_dict: Trie = None,
engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
keep_whitespace: bool = True,
join_broken_num: bool = True,
) -> List[str]:
"""
Word tokenizer.
@@ -123,37 +132,47 @@ def word_tokenize(
:param bool keep_whitespace: True to keep whitespaces, a common mark
for end of phrase in Thai.
Otherwise, whitespaces are omitted.
:param bool join_broken_num: True to rejoin formatted numeric strings (e.g. times, decimals, IP addresses)
that may have been wrongly split apart. Otherwise, they are left as separate tokens.

:return: list of words
:rtype: List[str]
**Options for engine**
* *attacut* - wrapper for
`AttaCut <https://github.com/PyThaiNLP/attacut>`_.,
learning-based approach
* *deepcut* - wrapper for
`DeepCut <https://github.com/rkcosmos/deepcut>`_,
learning-based approach
* *icu* - wrapper for a word tokenizer in
`PyICU <https://gitlab.pyicu.org/main/pyicu>`_.,
from ICU (International Components for Unicode),
dictionary-based
* *longest* - dictionary-based, longest matching
* *mm* - "multi-cut", dictionary-based, maximum matching
* *nercut* - dictionary-based, maximal matching,
constrained with Thai Character Cluster (TCC) boundaries,
combining tokens that are parts of the same named-entity
* *newmm* (default) - "new multi-cut",
dictionary-based, maximum matching,
constrained with Thai Character Cluster (TCC) boundaries
* *newmm-safe* - newmm, with a mechanism to avoid long
processing time for text with continuous ambiguous breaking points
* *nlpo3* - wrapper for a word tokenizer in
`nlpO3 <https://github.com/PyThaiNLP/nlpo3>`_.,
newmm adaptation in Rust (2.5x faster)
* *oskut* - wrapper for
`OSKut <https://github.com/mrpeerat/OSKut>`_.,
Out-of-domain StacKed cut for Word Segmentation
* *sefr_cut* - wrapper for
`SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.,
Stacked Ensemble Filter and Refine for Word Segmentation
* *tltk* - wrapper for
`TLTK <https://pypi.org/project/tltk/>`_.,
maximum collocation approach
:Note:
- The **custom_dict** parameter only works for \
*deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
:Example:

Tokenize text with different tokenizers::
@@ -178,6 +197,19 @@ def word_tokenize(

word_tokenize(text, engine="newmm", keep_whitespace=False)
# output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']

Join broken formatted numbers (e.g. times, decimals, IP addresses)::

text = "เงิน1,234บาท19:32น 127.0.0.1"

word_tokenize(text, engine="attacut", join_broken_num=False)
# output:
# ['เงิน', '1', ',', '234', 'บาท', '19', ':', '32น', ' ',
# '127', '.', '0', '.', '0', '.', '1']

word_tokenize(text, engine="attacut", join_broken_num=True)
# output:
# ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1']

Tokenize with default and custom dictionary::

@@ -199,8 +231,8 @@ def word_tokenize(

word_tokenize(text, engine="newmm", custom_dict=trie))
# output:
# ['ชินโซ', ' ', 'อาเบะ',
# ' ', 'เกิด', ' ', '21', ' ', 'กันยายน']
# ['ชินโซ', ' ', 'อาเบะ', ' ',
# 'เกิด', ' ', '21', ' ', 'กันยายน']
"""
if not text or not isinstance(text, str):
return []
@@ -257,6 +289,7 @@ def word_tokenize(
segments = segment(text)
elif engine == "nlpo3":
from pythainlp.tokenize.nlpo3 import segment

if isinstance(custom_dict, str):
segments = segment(text, custom_dict=custom_dict)
elif not isinstance(custom_dict, str) and custom_dict is not None:
@@ -274,8 +307,14 @@ def word_tokenize(
It might be a typo; if not, please consult our documentation."""
)

postprocessors = []
if join_broken_num:
postprocessors.append(rejoin_formatted_num)

if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
postprocessors.append(strip_whitespace)

segments = apply_postprocessors(segments, postprocessors)

return segments

@@ -297,12 +336,12 @@ def sent_tokenize(
:rtype: list[str]
**Options for engine**
* *crfcut* - (default) split by CRF trained on TED dataset
* *thaisum* - The implementation of sentence segmentator from \
Nakhun Chumpolsathien, 2020
* *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.,
* *whitespace+newline* - split by whitespaces and newline.
* *whitespace* - split by whitespaces. Specifically, with \
:class:`regex` pattern ``r" +"``
:Example:

Split the text based on *whitespace*::
@@ -364,7 +403,10 @@ def sent_tokenize(

segments = segment(text)
elif engine == "thaisum":
from pythainlp.tokenize.thaisumcut import ThaiSentenceSegmentor as segmentor
from pythainlp.tokenize.thaisumcut import (
ThaiSentenceSegmentor as segmentor,
)

segment = segmentor()
segments = segment.split_into_sentences(text)
else:
Expand All @@ -374,7 +416,7 @@ def sent_tokenize(
)

if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
segments = strip_whitespace(segments)

return segments

@@ -405,13 +447,12 @@ def subword_tokenize(
:return: list of subwords
:rtype: list[str]
**Options for engine**
* *dict* - newmm word tokenizer with a syllable dictionary
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *ssg* - CRF syllable segmenter for Thai
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *tltk* - syllable tokenizer from tltk
* *wangchanberta* - SentencePiece from wangchanberta model
:Example:

Tokenize text into subword based on *tcc*::
@@ -485,7 +526,7 @@ def subword_tokenize(
segments = segment(text)

if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
segments = strip_whitespace(segments)

return segments

@@ -562,6 +603,7 @@ def __init__(
custom_dict: Union[Trie, Iterable[str], str] = None,
engine: str = "newmm",
keep_whitespace: bool = True,
join_broken_num: bool = True,
):
"""
Initialize tokenizer object.
@@ -584,9 +626,11 @@ def __init__(
raise NotImplementedError(
"""
The Tokenizer class does not support %s for a custom tokenizer
""" % self.__engine
"""
% self.__engine
)
self.__keep_whitespace = keep_whitespace
self.__join_broken_num = join_broken_num

def word_tokenize(self, text: str) -> List[str]:
"""
@@ -601,6 +645,7 @@ def word_tokenize(self, text: str) -> List[str]:
custom_dict=self.__trie_dict,
engine=self.__engine,
keep_whitespace=self.__keep_whitespace,
join_broken_num=self.__join_broken_num,
)

def set_tokenize_engine(self, engine: str) -> None:
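
Taken together, the change threads a new join_broken_num flag (default True) from word_tokenize and the Tokenizer class down to the post-processors in _utils.py. A minimal usage sketch, assuming a PyThaiNLP build that includes this PR; the attacut engine, used here because the PR's docstring example shows it over-splitting numbers, requires the attacut package, and exact token boundaries depend on the engine and dictionary version:

    from pythainlp.tokenize import Tokenizer, word_tokenize

    text = "เงิน1,234บาท19:32น 127.0.0.1"

    # The flag controls whether over-split numbers are glued back together
    print(word_tokenize(text, engine="attacut", join_broken_num=True))
    print(word_tokenize(text, engine="attacut", join_broken_num=False))

    # The Tokenizer class forwards the same flag to word_tokenize()
    tokenizer = Tokenizer(engine="newmm", keep_whitespace=False, join_broken_num=True)
    print(tokenizer.word_tokenize(text))
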