Commit cbf9360

Merge pull request #562 from PyThaiNLP/fix-461
Tokenize repeating dots and commas from numbers (fix #461)
2 parents 3e4b585 + 86eae1c commit cbf9360

File tree

  pythainlp/tokenize/multi_cut.py
  pythainlp/tokenize/newmm.py
  tests/test_tokenize.py

3 files changed: +94 -42 lines changed

pythainlp/tokenize/multi_cut.py

Lines changed: 33 additions & 28 deletions
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Multi cut -- Thai word segmentation with maximum matching. The original source
-code is from Korakot Chaovavanich.
+Multi cut -- Thai word segmentation with maximum matching.
+Original code from Korakot Chaovavanich.

 :See Also:
     * `Facebook post \
@@ -12,16 +12,14 @@

 import re
 from collections import defaultdict
-from typing import List
+from typing import Iterator, List

 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
 from pythainlp.util import Trie


 class LatticeString(str):
-    """
-    String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
-    """
+    """String that keeps possible tokenizations"""

     def __new__(cls, value, multi=None, in_dict=True):
         return str.__new__(cls, value)
@@ -34,22 +32,22 @@ def __init__(self, value, multi=None, in_dict=True):
             self.unique = False
         else:
             self.multi = [value]
-        self.in_dict = in_dict  # บอกว่าเป็นคำมีในดิกหรือเปล่า
+        self.in_dict = in_dict  # if in dictionary


 _RE_NONTHAI = r"""(?x)
-[-a-zA-Z]+| # Latin
-\d[\d,\.]*| # number
-[ \t]+| # space
-\r?\n # newline
+[-a-zA-Z]+|     # Latin characters
+\d+([,\.]\d+)*| # number
+[ \t]+|         # space
+\r?\n           # newline
 """
 _PAT_NONTHAI = re.compile(_RE_NONTHAI)


-def _multicut(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE):
-    """
-    ส่งคืน LatticeString คืนมาเป็นก้อนๆ
-    """
+def _multicut(
+    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
+) -> Iterator[LatticeString]:
+    """Return LatticeString"""
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE
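
The heart of the fix is the number alternative: the old pattern \d[\d,\.]* lets dots and commas trail the digits, while the new \d+([,\.]\d+)* accepts a separator only when more digits follow it. A minimal standalone sketch (not part of the commit) comparing the two patterns:

import re

# Old and new "number" alternatives from _RE_NONTHAI (verbose-mode regexes).
OLD_NUMBER = re.compile(r"(?x) \d[\d,\.]*")      # dots/commas may trail the digits
NEW_NUMBER = re.compile(r"(?x) \d+([,\.]\d+)*")  # separator must be followed by digits

for s in ["19...", "19.", "19.84", "127.0.0.1", "1,984.42"]:
    print(s, "->", OLD_NUMBER.match(s).group(), "vs", NEW_NUMBER.match(s).group())

# The old pattern swallows trailing punctuation ("19..." -> "19..."),
# the new one stops at the last digit ("19..." -> "19"), while decimals,
# IP-like strings, and thousand separators still match in full.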

@@ -100,15 +98,15 @@ def serialize(p, p2):  # helper function
             q.add(i)


-def mmcut(text: str):
+def mmcut(text: str) -> List[str]:
     res = []
     for w in _multicut(text):
         mm = min(w.multi, key=lambda x: x.count("/"))
         res.extend(mm.split("/"))
     return res


-def _combine(ww: str):
+def _combine(ww: List[LatticeString]) -> Iterator[str]:
     if ww == []:
         yield ""
     else:
@@ -124,12 +122,15 @@ def _combine(ww: str):
 def segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Dictionary-based maximum matching word segmentation.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :return: list of words, tokenized from the text
+    """Dictionary-based maximum matching word segmentation.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segmented tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
@@ -140,11 +141,15 @@ def segment(
 def find_all_segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Get all possible segment variations
-
-    :param str text: input string to be tokenized
-    :return: returns list of segment variations
+    """Get all possible segment variations.
+
+    :param text: input string to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segment variations
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
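
For reference, a short usage sketch of the three public functions touched in this file (illustrative, not part of the commit; actual output depends on the dictionary in use):

from pythainlp.tokenize import multi_cut

text = "ทดสอบ19.84"
print(multi_cut.segment(text))           # maximum-matching segmentation
print(multi_cut.mmcut(text))             # variant that keeps the fewest tokens per chunk
print(multi_cut.find_all_segment(text))  # all segmentation variations the lattice allows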

pythainlp/tokenize/newmm.py

Lines changed: 21 additions & 14 deletions
@@ -25,10 +25,10 @@
 # match non-Thai tokens
 _PAT_NONTHAI = re.compile(
     r"""(?x)
-    [-a-zA-Z]+| # Latin characters
-    \d[\d,\.]*| # number
-    [ \t]+| # space
-    \r?\n # newline
+    [-a-zA-Z]+|     # Latin characters
+    \d+([,\.]\d+)*| # number
+    [ \t]+|         # space
+    \r?\n           # newline
     """
 )

@@ -138,16 +138,23 @@ def segment(
     custom_dict: Trie = DEFAULT_WORD_DICT_TRIE,
     safe_mode: bool = False,
 ) -> List[str]:
-    """
-    Dictionary-based maximal matching word segmentation, constrained with
-    Thai Character Cluster boundaries.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :param bool safe_mode: True to help avoid long wait for text with long\
-        and continuous ambiguous breaking points. Long wait may still able\
-        to occur. Default is False.
-    :return: list of words, tokenized from the text
+    """Maximal-matching word segmentation, Thai Character Cluster constrained.
+
+    A dictionary-based word segmentation using maximal matching algorithm,
+    constrained to Thai Character Cluster boundaries.
+
+    A custom dictionary can be supplied.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :param safe_mode: reduce chance for long processing time in long text\
+        with many ambiguous breaking points, defaults to False
+    :type safe_mode: bool, optional
+    :return: list of tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
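
The newmm engine shares the same number pattern, so the behaviour change carries over. A usage sketch of segment (illustrative, not part of the commit; expected outputs taken from the new tests below):

from pythainlp.tokenize import newmm

print(newmm.segment("19..."))        # expected: ['19', '...']
print(newmm.segment("USD1,984.42"))  # expected: ['USD', '1,984.42']

# safe_mode=True reduces the chance of long processing time on long text
# with many ambiguous break points, as described in the docstring above.
print(newmm.segment("ทดสอบ" * 1000, safe_mode=True))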

tests/test_tokenize.py

Lines changed: 40 additions & 0 deletions
@@ -450,6 +450,26 @@ def test_mm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="mm"),
+            ['19', '...'],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="mm"),
+            ['19', '.'],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="mm"),
+            ['19.84'],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="mm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="mm"),
+            ['USD', '1,984.42'],
+        )

         self.assertIsNotNone(multi_cut.mmcut("ทดสอบ"))

@@ -465,6 +485,26 @@ def test_newmm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="newmm"),
+            ['19', '...'],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="newmm"),
+            ['19', '.'],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="newmm"),
+            ['19.84'],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="newmm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="newmm"),
+            ['USD', '1,984.42'],
+        )
         self.assertEqual(
             word_tokenize(
                 "สวัสดีครับ สบายดีไหมครับ",
