PyThaiNLP · wannaphong · Jul 30, 2021 · Jul 30, 2021 · Jul 30, 2021 · Jul 30, 2021
diff --git a/docs/api/tag.rst b/docs/api/tag.rst
@@ -230,7 +230,9 @@ Modules
 .. autofunction:: pos_tag_sents
 .. autofunction:: tag_provinces
 .. autofunction:: chunk_parse
-.. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger
+.. autoclass:: pythainlp.tag.named_entity.NER
+   :members:
+.. autoclass:: pythainlp.tag.thainer.ThaiNameTagger
    :members: get_ner
 
 Tagger Engines

diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
@@ -11,9 +11,12 @@
     "pos_tag",
     "pos_tag_sents",
     "tag_provinces",
+    "chunk_parse",
+    "NER",
 ]
 
 from pythainlp.tag.locations import tag_provinces
 from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
 from pythainlp.tag._tag_perceptron import PerceptronTagger
 from pythainlp.tag.chunk import chunk_parse
+from pythainlp.tag.named_entity import NER
diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
@@ -2,205 +2,88 @@
 """
 Named-entity recognizer
 """
+import warnings
+from typing import List, Tuple, Union
+from pythainlp.tag.thainer import ThaiNameTagger
 
-__all__ = ["ThaiNameTagger"]
 
-from typing import Dict, List, Tuple, Union
-
-from pycrfsuite import Tagger as CRFTagger
-from pythainlp.corpus import get_corpus_path, thai_stopwords
-from pythainlp.tag import pos_tag
-from pythainlp.tokenize import word_tokenize
-from pythainlp.util import isthai
-
-_CORPUS_NAME = "thainer"
-_TOKENIZER_ENGINE = "newmm"  # should be the same as one used in training data
-
-
-def _is_stopword(word: str) -> bool:  # เช็คว่าเป็นคำฟุ่มเฟือย
-    return word in thai_stopwords()
-
-
-def _doc2features(doc, i) -> Dict:
-    word = doc[i][0]
-    postag = doc[i][1]
-
-    # Features from current word
-    features = {
-        "word.word": word,
-        "word.stopword": _is_stopword(word),
-        "word.isthai": isthai(word),
-        "word.isspace": word.isspace(),
-        "postag": postag,
-        "word.isdigit": word.isdigit(),
-    }
-    if word.isdigit() and len(word) == 5:
-        features["word.islen5"] = True
-
-    # Features from previous word
-    if i > 0:
-        prevword = doc[i - 1][0]
-        prevpostag = doc[i - 1][1]
-        prev_features = {
-            "word.prevword": prevword,
-            "word.previsspace": prevword.isspace(),
-            "word.previsthai": isthai(prevword),
-            "word.prevstopword": _is_stopword(prevword),
-            "word.prevpostag": prevpostag,
-            "word.prevwordisdigit": prevword.isdigit(),
-        }
-        features.update(prev_features)
-    else:
-        features["BOS"] = True  # Special "Beginning of Sequence" tag
-
-    # Features from next word
-    if i < len(doc) - 1:
-        nextword = doc[i + 1][0]
-        nextpostag = doc[i + 1][1]
-        next_features = {
-            "word.nextword": nextword,
-            "word.nextisspace": nextword.isspace(),
-            "word.nextpostag": nextpostag,
-            "word.nextisthai": isthai(nextword),
-            "word.nextstopword": _is_stopword(nextword),
-            "word.nextwordisdigit": nextword.isdigit(),
-        }
-        features.update(next_features)
-    else:
-        features["EOS"] = True  # Special "End of Sequence" tag
-
-    return features
-
-
-class ThaiNameTagger:
+class NER:
     """
-    Thai named-entity recognizer.
-    :param str version: Thai NER version.
-        It's support Thai NER 1.4 & 1.5.
-        The defualt value is `1.5`
-
-    :Example:
-    ::
+    Named-entity recognizer class
 
-        from pythainlp.tag.named_entity import ThaiNameTagger
+    :param str engine: Named-entity recognizer engine
+    :param str corpus: corpus name (the default is `thainer`)
 
-        thainer15 = ThaiNameTagger(version="1.5")
-        thainer15.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
+    **Options for engine**
+        * *thainer* - Thai NER engine
+        * *wangchanberta* - wangchanberta model
 
-        thainer14 = ThaiNameTagger(version="1.4")
-        thainer14.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
+    **Options for corpus**
+        * *thainer* - Thai NER corpus
+        * *lst20* - lst20 corpus (wangchanberta only)
     """
-    def __init__(self, version: str = "1.5") -> None:
-        """
-        Thai named-entity recognizer.
-
-        :param str version: Thai NER version.
-                            It's support Thai NER 1.4 & 1.5.
-                            The defualt value is `1.5`
-        """
-        self.crf = CRFTagger()
-
-        if version == "1.4":
-            self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
-            self.pos_tag_name = "orchid_ud"
+    def __init__(self, engine: str, corpus: str = "thainer") -> None:
+        self.load_engine(engine=engine, corpus=corpus)
+
+    def load_engine(self, engine: str, corpus: str) -> None:
+        self.name_engine = engine
+        self.engine = None
+        if engine == "thainer" and corpus == "thainer":
+            from pythainlp.tag.thainer import ThaiNameTagger
+            self.engine = ThaiNameTagger()
+        elif engine == "wangchanberta":
+            from pythainlp.wangchanberta import ThaiNameTagger
+            self.engine = ThaiNameTagger(dataset_name=corpus)
         else:
-            self.crf.open(get_corpus_path(_CORPUS_NAME, version="1.5"))
-            self.pos_tag_name = "lst20"
-
-    def get_ner(
-        self, text: str, pos: bool = True, tag: bool = False
-    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
+            raise ValueError(
+                "NER class not support {0} engine or {1} corpus.".format(
+                    engine,
+                    corpus
+                )
+            )
+
+    def tag(
+        self,
+        text,
+        pos=True,
+        tag=False
+    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
         """
         This function tags named-entitiy from text in IOB format.
 
         :param str text: text in Thai to be tagged
-        :param bool pos: To include POS tags in the results (`True`) or
-                            exclude (`False`). The defualt value is `True`
+        :param bool pos: include part-of-speech tags in the output.\
+            (not supported by the wangchanberta engine)
         :param bool tag: output like html tag.
         :return: a list of tuple associated with tokenized word, NER tag,
                  POS tag (if the parameter `pos` is specified as `True`),
                  and output like html tag (if the parameter `tag` is
                  specified as `True`).
                  Otherwise, return a list of tuple associated with tokenized
                  word and NER tag
-        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str
-
-        :Note:
-            * For the POS tags to be included in the results, this function
-              uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron`
-              and corpus as orchid_ud`.
-
+        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]
         :Example:
 
-            >>> from pythainlp.tag.named_entity import ThaiNameTagger
-            >>>
-            >>> ner = ThaiNameTagger()
-            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
-            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
-            ('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
-            ('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
-            ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
-            ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
-            ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
-            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
-            ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
-            ('น.', 'NOUN', 'I-TIME')]
+            >>> from pythainlp.tag import NER
             >>>
-            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
-                            pos=False)
-            [('วันที่', 'O'), (' ', 'O'),
-            ('15', 'B-DATE'), (' ', 'I-DATE'),
-            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
-            ('61', 'I-DATE'), (' ', 'O'),
-            ('ทดสอบ', 'O'), ('ระบบ', 'O'),
-            ('เวลา', 'O'), (' ', 'O'),
-            ('14', 'B-TIME'), (':', 'I-TIME'),
-            ('49', 'I-TIME'), (' ', 'I-TIME'),
-            ('น.', 'I-TIME')]
-            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
-                            tag=True)
-            'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
+            >>> ner = NER("thainer")
+            >>> ner.tag("ทดสอบยนายวรรณพงษ์ ภัททิยไพบูลย์")
+            [('ทดสอบ', 'VV', 'O'),
+            ('นาย', 'NN', 'B-PERSON'),
+            ('วรรณ', 'NN', 'I-PERSON'),
+            ('พงษ์', 'NN', 'I-PERSON'),
+            (' ', 'PU', 'I-PERSON'),
+            ('ภัททิย', 'NN', 'I-PERSON'),
+            ('ไพบูลย์', 'NN', 'I-PERSON')]
+            >>> ner.tag("ทดสอบยนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
+            'ทดสอบย<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
         """
-        tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE)
-        pos_tags = pos_tag(
-            tokens,
-            engine="perceptron",
-            corpus=self.pos_tag_name
-        )
-        x_test = ThaiNameTagger.__extract_features(pos_tags)
-        y = self.crf.tag(x_test)
-
-        sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]
-
-        if tag:
-            temp = ""
-            sent = ""
-            for idx, (word, ner) in enumerate(sent_ner):
-                if ner.startswith("B-") and temp != "":
-                    sent += "</" + temp + ">"
-                    temp = ner[2:]
-                    sent += "<" + temp + ">"
-                elif ner.startswith("B-"):
-                    temp = ner[2:]
-                    sent += "<" + temp + ">"
-                elif ner == "O" and temp != "":
-                    sent += "</" + temp + ">"
-                    temp = ""
-                sent += word
-
-                if idx == len(sent_ner) - 1 and temp != "":
-                    sent += "</" + temp + ">"
-
-            return sent
-
-        if pos:
-            return [
-                (pos_tags[i][0], pos_tags[i][1], data)
-                for i, data in enumerate(y)
-            ]
-
-        return sent_ner
-
-    @staticmethod
-    def __extract_features(doc):
-        return [_doc2features(doc, i) for i in range(len(doc))]
+        if pos and self.name_engine == "wangchanberta":
+            warnings.warn(
+                """wangchanberta is not support part-of-speech tag.
+                It have not part-of-speech tag in output."""
+            )
+        if self.name_engine == "wangchanberta":
+            return self.engine.get_ner(text, tag=tag)
+        else:
+            return self.engine.get_ner(text, tag=tag, pos=pos)