 # -*- coding: utf-8 -*-
 """
-Multi cut -- Thai word segmentation with maximum matching. The original source
-code is from Korakot Chaovavanich.
+Multi cut -- Thai word segmentation with maximum matching.
+Original code from Korakot Chaovavanich.

 :See Also:
     * `Facebook post \

 import re
 from collections import defaultdict
-from typing import List
+from typing import Iterator, List

 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
 from pythainlp.util import Trie


 class LatticeString(str):
-    """
-    String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
-    """
+    """String that keeps possible tokenizations"""

     def __new__(cls, value, multi=None, in_dict=True):
         return str.__new__(cls, value)
@@ -34,22 +32,22 @@ def __init__(self, value, multi=None, in_dict=True):
                 self.unique = False
         else:
             self.multi = [value]
-        self.in_dict = in_dict  # บอกว่าเป็นคำมีในดิกหรือเปล่า
+        self.in_dict = in_dict  # if in dictionary


 _RE_NONTHAI = r"""(?x)
-[-a-zA-Z]+|  # Latin
-\d[\d,\.]*|  # number
-[ \t]+|  # space
-\r?\n  # newline
+[-a-zA-Z]+|  # Latin characters
+\d+([,\.]\d+)*|  # number
+[ \t]+|  # space
+\r?\n  # newline
 """
 _PAT_NONTHAI = re.compile(_RE_NONTHAI)


-def _multicut(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE):
-    """
-    ส่งคืน LatticeString คืนมาเป็นก้อนๆ
-    """
+def _multicut(
+    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
+) -> Iterator[LatticeString]:
+    """Return LatticeString"""
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE

@@ -100,15 +98,15 @@ def serialize(p, p2): # helper function
             q.add(i)


-def mmcut(text: str):
+def mmcut(text: str) -> List[str]:
     res = []
     for w in _multicut(text):
         mm = min(w.multi, key=lambda x: x.count("/"))
         res.extend(mm.split("/"))
     return res


-def _combine(ww: str):
+def _combine(ww: List[LatticeString]) -> Iterator[str]:
     if ww == []:
         yield ""
     else:
@@ -124,12 +122,15 @@ def _combine(ww: str):
 def segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Dictionary-based maximum matching word segmentation.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :return: list of words, tokenized from the text
+    """Dictionary-based maximum matching word segmentation.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segmented tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
@@ -140,11 +141,15 @@ def segment(
 def find_all_segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Get all possible segment variations
-
-    :param str text: input string to be tokenized
-    :return: returns list of segment variations
+    """Get all possible segment variations.
+
+    :param text: input string to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segment variations
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
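
Aside (not part of the diff): the tightened number branch in _RE_NONTHAI changes what a numeric run matches. The old pattern \d[\d,\.]* keeps consuming commas and dots even when no digit follows, while the new \d+([,\.]\d+)* only takes a separator that is followed by digits. A minimal sketch of the difference using plain re:

import re

# Old vs. new number alternatives from _RE_NONTHAI (illustrative comparison only).
OLD_NUM = re.compile(r"\d[\d,\.]*")
NEW_NUM = re.compile(r"\d+([,\.]\d+)*")

print(OLD_NUM.match("1,234,").group())  # '1,234,' -- trailing comma is swallowed
print(NEW_NUM.match("1,234,").group())  # '1,234'  -- separator must be followed by digits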
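
For reference, a minimal usage sketch of the two public functions whose docstrings change above, assuming the module path pythainlp.tokenize.multi_cut from the PyThaiNLP source tree (the sample text and outputs are illustrative, not taken from this commit):

from pythainlp.tokenize.multi_cut import find_all_segment, segment

text = "ผมรักคุณ"
print(segment(text))           # one maximum-matching split, e.g. ['ผม', 'รัก', 'คุณ']
print(find_all_segment(text))  # all dictionary-consistent splits, each joined with "/"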