|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +from sys import stderr |
| 3 | +from typing import List |
| 4 | + |
| 5 | +from nlpo3 import segment as nlpo3_segment |
| 6 | +from nlpo3 import load_dict as nlpo3_load_dict |
| 7 | +from pythainlp.corpus.common import _THAI_WORDS_FILENAME |
| 8 | +from pythainlp.corpus import path_pythainlp_corpus |
| 9 | + |
# Private name under which the default word list is registered in nlpo3;
# segment() uses it as the default value of its custom_dict argument.
_NLPO3_DEFAULT_DICT_NAME = "_67a47bf9"

# Register PyThaiNLP's Thai word list with nlpo3 once, at import time.
# NOTE(review): this holds whatever nlpo3's load_dict returns; load_dict()
# in this module unpacks the same call as (msg, success), so this is
# presumably that pair rather than a dictionary object -- confirm.
_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
    file_path=path_pythainlp_corpus(_THAI_WORDS_FILENAME),
    dict_name=_NLPO3_DEFAULT_DICT_NAME,
)
| 15 | + |
| 16 | + |
def load_dict(file_path: str, dict_name: str) -> bool:
    """Load a dictionary file into an in-memory dictionary collection.

    The loaded dictionary will be accessible through the assigned dict_name.
    *** This function does not override an existing dict name. ***

    If loading fails, the message returned by nlpo3 is printed to stderr.

    :param file_path: Path to a dictionary file
    :type file_path: str
    :param dict_name: A unique dictionary name, use for reference.
    :type dict_name: str
    :return: True if nlpo3 reports the dictionary as loaded, False otherwise
    :rtype: bool

    :See Also:
        * \
            https://github.com/PyThaiNLP/nlpo3
    """
    msg, success = nlpo3_load_dict(file_path=file_path, dict_name=dict_name)
    # Test the returned flag. The previous check compared the builtin type
    # ``bool`` to False, which is never true, so failures were silent.
    if not success:
        print(msg, file=stderr)
    return success
| 37 | + |
| 38 | + |
def segment(
    text: str,
    custom_dict: str = _NLPO3_DEFAULT_DICT_NAME,
    safe_mode: bool = False,
    parallel_mode: bool = False
) -> List[str]:
    """Tokenize text with nlpO3.

    Thin wrapper over the Python binding for nlpO3, the newmm engine
    implemented in Rust.

    :param str text: text to be tokenized
    :param str custom_dict: name of a dictionary previously registered with\
        load_dict(); defaults to the dictionary this module registers at\
        import time from pythainlp/corpus/common/words_th.txt
    :param bool safe_mode: reduce chance for long processing time in long\
        text with many ambiguous breaking points, defaults to False
    :param bool parallel_mode: use multithread mode, defaults to False

    :return: list of tokens
    :rtype: List[str]

    :See Also:
        * \
            https://github.com/PyThaiNLP/nlpo3
    """
    # Map this wrapper's parameter names onto the keyword names that
    # nlpo3's segment() takes.
    engine_options = {
        "dict_name": custom_dict,
        "safe": safe_mode,
        "parallel": parallel_mode,
    }
    return nlpo3_segment(text=text, **engine_options)
0 commit comments