317 changes: 166 additions & 151 deletions pythainlp/tokenize/__init__.py
@@ -1,17 +1,21 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import,unicode_literals
import nltk
import re
from __future__ import absolute_import, unicode_literals

import codecs
import re

import nltk
from pythainlp.corpus.thaisyllable import get_data as syllable_dict
from pythainlp.corpus.thaiword import get_data as word_dict
from six.moves import zip
from pythainlp.corpus.thaisyllable import get_data
from pythainlp.corpus.thaiword import get_data as get_dict

from marisa_trie import Trie

DEFAULT_DICT_TRIE = Trie(get_dict())
DEFAULT_DICT_TRIE = Trie(word_dict())


def word_tokenize(text, engine='newmm',whitespaces=True):
"""
def word_tokenize(text, engine="newmm", whitespaces=True):
"""
:param str text: the text to be tokenized
:param str engine: the engine to tokenize text
:param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai.
@@ -36,166 +40,177 @@ def word_tokenize(text, engine='newmm',whitespaces=True):
e=word_tokenize(text,engine='newmm') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']
g=word_tokenize(text,engine='wordcutpy') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คน', 'ไทย', 'รัก', 'ภาษา', 'ไทย', 'ภาษา', 'บ้านเกิด']
"""
if engine=='icu':
from .pyicu import segment
elif engine=='multi_cut' or engine=='mm':
from .multi_cut import segment
elif engine=='newmm' or engine=='onecut':
from .newmm import mmcut as segment
elif engine=='longest-matching':
from .longest import segment
elif engine=='pylexto':
from .pylexto import segment
elif engine=='deepcut':
from .deepcut import segment
elif engine=='wordcutpy':
from .wordcutpy import segment
else:
raise Exception("error no have engine.")
if whitespaces==False:
return [i.strip(' ') for i in segment(text) if i.strip(' ')!='']
return segment(text)
def dict_word_tokenize(text, custom_dict_trie, engine='newmm'):
'''
:meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure.

:param str text: the text to be tokenized
:param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie
:param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest-matching)
if engine == "icu":
from .pyicu import segment
elif engine == "multi_cut" or engine == "mm":
from .multi_cut import segment
elif engine == "newmm" or engine == "onecut":
from .newmm import mmcut as segment
elif engine == "longest-matching":
from .longest import segment
elif engine == "pylexto":
from .pylexto import segment
elif engine == "deepcut":
from .deepcut import segment
elif engine == "wordcutpy":
from .wordcutpy import segment
else:
raise Exception("Error: Unknown engine: {}".format(engine))

if not whitespaces:
return [i.strip(" ") for i in segment(text) if i.strip(" ")]

return segment(text)


def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
"""
:meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure.

:param str text: the text to be tokenized
:param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie
:param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest-matching)
:return: A list of words, tokenized from a text.
**Example**::
>>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
>>> listword=['แมว',"ดี"]
>>> data_dict=create_custom_dict_trie(listword)
>>> dict_word_tokenize("แมวดีดีแมว",data_dict)
['แมว', 'ดี', 'ดี', 'แมว']
'''
if engine=="newmm" or engine=="onecut":
from .newmm import mmcut as segment
elif engine=="mm" or engine=="multi_cut":
from .multi_cut import segment
elif engine=='longest-matching':
from .longest import segment
elif engine=='wordcutpy':
from .wordcutpy import segment
return segment(text, custom_dict_trie.keys())
else:
raise Exception("error no have engine.")
return segment(text, custom_dict_trie)
def sent_tokenize(text,engine='whitespace+newline'):
'''
This function does not yet automatically recognize when a sentence actually ends. Rather it helps split text where white space and a new line is found.

:param str text: the text to be tokenized
:param str engine: choose between 'whitespace' or 'whitespace+newline'

:return: a list of text, split by whitespace or new line.
'''
if engine=='whitespace':
data=nltk.tokenize.WhitespaceTokenizer().tokenize(text)
elif engine=='whitespace+newline':
data=re.sub(r'\n+|\s+','|',text,re.U).split('|')
return data

def subword_tokenize(text, engine='tcc'):
"""
if engine == "newmm" or engine == "onecut":
from .newmm import mmcut as segment
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
elif engine == "longest-matching":
from .longest import segment
elif engine == "wordcutpy":
from .wordcutpy import segment

return segment(text, custom_dict_trie.keys())
else:
raise Exception("Error: Unknown engine: {}".format(engine))

return segment(text, custom_dict_trie)


def sent_tokenize(text, engine="whitespace+newline"):
"""
This function does not yet automatically recognize when a sentence actually ends. Rather it helps split text where white space and a new line is found.

:param str text: the text to be tokenized
:param str engine: choose between 'whitespace' or 'whitespace+newline'

:return: a list of text, split by whitespace or new line.
"""
if engine == "whitespace":
sentences = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
else:
sentences = re.sub(r"\n+|\s+", "|", text, re.U).split("|")

return sentences


def subword_tokenize(text, engine="tcc"):
"""
:param str text: text to be tokenized
:param str engine: choosing 'tcc' uses the Thai Character Cluster rule to segment words into the smallest unique units.
:return: a list of tokenized strings.
"""
if engine == 'tcc':
from .tcc import tcc
from .tcc import tcc

return tcc(text)

def isthai(text,check_all=False):
"""
:param str text: input string or list of strings
:param bool check_all: checks all character or not

:return: A dictionary with the first value as proportional of text that is Thai, and the second value being a tuple of all characters, along with true or false.
"""
listext=list(text)
i=0
num_isthai=0
if check_all==True:
listthai=[]
while i<len(listext):
cVal = ord(listext[i])
if(cVal >= 3584 and cVal <= 3711):
num_isthai+=1
if check_all==True:
listthai.append(True)
else:
if check_all==True:
listthai.append(False)
i+=1
thai=(num_isthai/len(listext))*100
if check_all==True:
dictthai=tuple(zip(listext,listthai))
data= {'thai':thai,'check_all':dictthai}
else:
data= {'thai':thai}
return data

def isthai(text, check_all=False):
"""
:param str text: input string or list of strings
:param bool check_all: checks all character or not

:return: A dictionary with the first value as proportional of text that is Thai, and the second value being a tuple of all characters, along with true or false.
"""
isthais = []
num_isthai = 0

for ch in text:
ch_val = ord(ch)
if ch_val >= 3584 and ch_val <= 3711:
num_isthai += 1
if check_all:
isthais.append(True)
else:
if check_all:
isthais.append(False)
thai_percent = (num_isthai / len(text)) * 100

if check_all:
chars = list(text)
isthai_pairs = tuple(zip(chars, isthais))
data = {"thai": thai_percent, "check_all": isthai_pairs}
else:
data = {"thai": thai_percent}

return data


def syllable_tokenize(text):
"""
:param str text: input string to be tokenized

:return: returns list of strings of syllables
"""
text1=word_tokenize(text)
data=[]
trie = create_custom_dict_trie(custom_dict_source=get_data())
if len(text1)>1:
i=0
while i<len(text1):
data.extend(dict_word_tokenize(text=text1[i], custom_dict_trie=trie))
i+=1
else:
data=dict_word_tokenize(text=text, custom_dict_trie=trie)
return data
"""
:param str text: input string to be tokenized

:return: returns list of strings of syllables
"""
syllables = []
if text:
words = word_tokenize(text)
trie = create_custom_dict_trie(custom_dict_source=syllable_dict())
for word in words:
syllables.extend(dict_word_tokenize(text=word, custom_dict_trie=trie))

return syllables


def create_custom_dict_trie(custom_dict_source):
"""The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html

:param string/list custom_dict_source: a list of vocaburaries or a path to source file

:return: A trie created from custom dict input
"""

if type(custom_dict_source) is str:
# Receive a file path of the custom dict to read
with codecs.open(custom_dict_source, 'r',encoding='utf8') as f:
_vocabs = f.read().splitlines()
return Trie(_vocabs)
elif isinstance(custom_dict_source, (list, tuple, set)):
# Received a sequence type object of vocabs
return Trie(custom_dict_source)
else:
raise TypeError(
'Type of custom_dict_source must be either str (path to source file) or collections'
)
"""The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html

class Tokenizer:
def __init__(self, custom_dict=None):
"""
Initialize tokenizer object

:param str custom_dict: a file path or a list of vocaburaies to be used to create a trie (default - original lexitron)

:return: trie_dict - a dictionary in the form of trie data for tokenizing engines
"""
if custom_dict:
if type(custom_dict) is list:
self.trie_dict = Trie(custom_dict)
elif type(custom_dict) is str:
with codecs.open(custom_dict, 'r',encoding='utf8') as f:
vocabs = f.read().splitlines()
self.trie_dict = Trie(vocabs)
else:
self.trie_dict = Trie(get_dict())

def word_tokenize(self, text, engine='newmm'):
from .newmm import mmcut as segment
return segment(text, self.trie_dict)
:param string/list custom_dict_source: a list of vocaburaries or a path to source file

:return: A trie created from custom dict input
"""

if type(custom_dict_source) is str:
# Receive a file path of the custom dict to read
with codecs.open(custom_dict_source, "r", encoding="utf8") as f:
_vocabs = f.read().splitlines()
return Trie(_vocabs)
elif isinstance(custom_dict_source, (list, tuple, set)):
# Received a sequence type object of vocabs
return Trie(custom_dict_source)
else:
raise TypeError(
"Type of custom_dict_source must be either str (path to source file) or collections"
)


class Tokenizer:
def __init__(self, custom_dict=None):
"""
Initialize tokenizer object

:param str custom_dict: a file path or a list of vocaburaies to be used to create a trie (default - original lexitron)

:return: trie_dict - a dictionary in the form of trie data for tokenizing engines
"""
if custom_dict:
if type(custom_dict) is list:
self.trie_dict = Trie(custom_dict)
elif type(custom_dict) is str:
with codecs.open(custom_dict, "r", encoding="utf8") as f:
vocabs = f.read().splitlines()
self.trie_dict = Trie(vocabs)
else:
self.trie_dict = Trie(word_dict())

def word_tokenize(self, text, engine="newmm"):
from .newmm import mmcut as segment

return segment(text, self.trie_dict)
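
The __init__.py changes above are largely mechanical: consistent double quoting, aliased corpus imports (word_dict, syllable_dict), clearer error messages for unknown engines, and a Tokenizer class that holds a custom trie for repeated use. Below is a minimal usage sketch of the refactored API, assuming this branch of pythainlp is installed and the default newmm engine is available; the dict_word_tokenize output is the one given in its docstring above.

    # -*- coding: utf-8 -*-
    from pythainlp.tokenize import (
        Tokenizer,
        create_custom_dict_trie,
        dict_word_tokenize,
        word_tokenize,
    )

    # Default engine (newmm); whitespaces=False drops whitespace tokens.
    print(word_tokenize("ผมรักคุณนะครับ", engine="newmm", whitespaces=False))

    # Tokenize against a user-supplied dictionary: build a trie first.
    trie = create_custom_dict_trie(["แมว", "ดี"])
    print(dict_word_tokenize("แมวดีดีแมว", custom_dict_trie=trie))  # ['แมว', 'ดี', 'ดี', 'แมว']

    # The Tokenizer class keeps a custom trie around for repeated newmm cuts.
    tokenizer = Tokenizer(custom_dict=["แมว", "ดี"])
    print(tokenizer.word_tokenize("แมวดีดีแมว"))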
11 changes: 1 addition & 10 deletions pythainlp/tokenize/newmm.py
@@ -14,7 +14,7 @@

from pythainlp.tokenize import DEFAULT_DICT_TRIE

from .tcc import tcc_gen
from .tcc import tcc_pos

# ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
PAT_ENG = re.compile(
@@ -29,15 +29,6 @@
PAT_TWOCHARS = re.compile("[ก-ฮ]{,2}$")


def tcc_pos(text):
p_set = set()
p = 0
for w in tcc_gen(text):
p += len(w)
p_set.add(p)
return p_set


def bfs_paths_graph(graph, start, goal):
queue = [(start, [start])]
while queue:
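
The newmm.py change is purely a relocation: the local tcc_pos helper is deleted and imported from .tcc instead, so the segmentation entry point behaves as before. A hedged sketch (module path assumed from this branch; mmcut falls back to the bundled DEFAULT_DICT_TRIE when no trie is passed, which is how word_tokenize above calls it):

    from pythainlp.tokenize.newmm import mmcut

    # Maximal-matching cut against the default dictionary trie.
    print(mmcut("ผมรักภาษาไทย"))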
9 changes: 9 additions & 0 deletions pythainlp/tokenize/tcc.py
@@ -62,6 +62,15 @@ def tcc_gen(w):
p += n


def tcc_pos(text):
p_set = set()
p = 0
for w in tcc_gen(text):
p += len(w)
p_set.add(p)
return p_set


def tcc(w, sep="/"):
return sep.join(tcc_gen(w))

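
With this move, tcc_pos sits next to tcc_gen and tcc in tcc.py: it accumulates the length of each generated Thai Character Cluster and returns the set of end positions, which newmm.py now imports to keep word cuts aligned with cluster boundaries. A small sketch of the relocated helpers (import path assumed from this branch):

    from pythainlp.tokenize.tcc import tcc, tcc_pos

    word = "ประเทศไทย"
    print(tcc(word))      # clusters joined by the default "/" separator
    print(tcc_pos(word))  # set of cumulative cluster end positions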