Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/api/tag.rst
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,9 @@ Modules
.. autofunction:: pos_tag_sents
.. autofunction:: tag_provinces
.. autofunction:: chunk_parse
.. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger
.. autoclass:: pythainlp.tag.named_entity.NER
:members:
.. autoclass:: pythainlp.tag.thainer.ThaiNameTagger
:members: get_ner

Tagger Engines
Expand Down
3 changes: 3 additions & 0 deletions pythainlp/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
"pos_tag",
"pos_tag_sents",
"tag_provinces",
"chunk_parse",
"NER",
]

from pythainlp.tag.locations import tag_provinces
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
from pythainlp.tag._tag_perceptron import PerceptronTagger
from pythainlp.tag.chunk import chunk_parse
from pythainlp.tag.named_entity import NER
241 changes: 62 additions & 179 deletions pythainlp/tag/named_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,205 +2,88 @@
"""
Named-entity recognizer
"""
import warnings
from typing import List, Tuple, Union
from pythainlp.tag.thainer import ThaiNameTagger

__all__ = ["ThaiNameTagger"]

from typing import Dict, List, Tuple, Union

from pycrfsuite import Tagger as CRFTagger
from pythainlp.corpus import get_corpus_path, thai_stopwords
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize
from pythainlp.util import isthai

# Corpus name passed to get_corpus_path() when loading the version 1.5 model.
_CORPUS_NAME = "thainer"
_TOKENIZER_ENGINE = "newmm" # should be the same as one used in training data


def _is_stopword(word: str) -> bool:
    """
    Check whether a word is a Thai stopword.

    :param str word: word to look up
    :return: `True` if the word is in the collection returned by
             :func:`thai_stopwords`, otherwise `False`
    :rtype: bool
    """
    stopwords = thai_stopwords()
    return word in stopwords


def _doc2features(doc, i) -> Dict:
    """
    Build the feature dictionary for the token at position ``i``.

    :param doc: sequence of items whose element ``0`` is the word and
                whose element ``1`` is its part-of-speech tag
    :param i: index in ``doc`` of the token to describe
    :return: features of the token itself, followed by features of the
             previous and next tokens; the ``BOS`` / ``EOS`` flag is set
             instead when there is no previous / next token
    :rtype: Dict
    """
    current = doc[i]
    token, token_pos = current[0], current[1]

    # The token itself
    features = {
        "word.word": token,
        "word.stopword": _is_stopword(token),
        "word.isthai": isthai(token),
        "word.isspace": token.isspace(),
        "postag": token_pos,
        "word.isdigit": token.isdigit(),
    }
    if features["word.isdigit"] and len(token) == 5:
        features["word.islen5"] = True

    # Left neighbour, or the "Beginning of Sequence" marker
    if i <= 0:
        features["BOS"] = True
    else:
        before = doc[i - 1]
        left_token, left_pos = before[0], before[1]
        features["word.prevword"] = left_token
        features["word.previsspace"] = left_token.isspace()
        features["word.previsthai"] = isthai(left_token)
        features["word.prevstopword"] = _is_stopword(left_token)
        features["word.prevpostag"] = left_pos
        features["word.prevwordisdigit"] = left_token.isdigit()

    # Right neighbour, or the "End of Sequence" marker
    if i >= len(doc) - 1:
        features["EOS"] = True
    else:
        after = doc[i + 1]
        right_token, right_pos = after[0], after[1]
        features["word.nextword"] = right_token
        features["word.nextisspace"] = right_token.isspace()
        features["word.nextpostag"] = right_pos
        features["word.nextisthai"] = isthai(right_token)
        features["word.nextstopword"] = _is_stopword(right_token)
        features["word.nextwordisdigit"] = right_token.isdigit()

    return features


class ThaiNameTagger:
class NER:
"""
Thai named-entity recognizer.
:param str version: Thai NER version.
It supports Thai NER 1.4 & 1.5.
The default value is `1.5`

:Example:
::
Named-entity recognizer class

from pythainlp.tag.named_entity import ThaiNameTagger
:param str engine: Named-entity recognizer engine
:param str corpus: corpus

thainer15 = ThaiNameTagger(version="1.5")
thainer15.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
**Options for engine**
* *thainer* - Thai NER engine
* *wangchanberta* - wangchanberta model

thainer14 = ThaiNameTagger(version="1.4")
thainer14.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
**Options for corpus**
* *thainer* - Thai NER corpus
* *lst20* - lst20 corpus (wangchanberta only)
"""
def __init__(self, version: str = "1.5") -> None:
"""
Thai named-entity recognizer.

:param str version: Thai NER version.
It supports Thai NER 1.4 & 1.5.
The default value is `1.5`
"""
self.crf = CRFTagger()

if version == "1.4":
self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
self.pos_tag_name = "orchid_ud"
def __init__(self, engine: str, corpus: str = "thainer") -> None:
self.load_engine(engine=engine, corpus=corpus)

def load_engine(self, engine: str, corpus: str) -> None:
self.name_engine = engine
self.engine = None
if engine == "thainer" and corpus == "thainer":
from pythainlp.tag.thainer import ThaiNameTagger
self.engine = ThaiNameTagger()
elif engine == "wangchanberta":
from pythainlp.wangchanberta import ThaiNameTagger
self.engine = ThaiNameTagger(dataset_name=corpus)
else:
self.crf.open(get_corpus_path(_CORPUS_NAME, version="1.5"))
self.pos_tag_name = "lst20"

def get_ner(
self, text: str, pos: bool = True, tag: bool = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
raise ValueError(
"NER class not support {0} engine or {1} corpus.".format(
engine,
corpus
)
)

def tag(
self,
text,
pos=True,
tag=False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
"""
This function tags named entities from text in IOB format.

:param str text: text in Thai to be tagged
:param bool pos: To include POS tags in the results (`True`) or
exclude (`False`). The default value is `True`
:param bool pos: output with part-of-speech tag.\
(not supported by wangchanberta)
:param bool tag: output like html tag.
:return: a list of tuple associated with tokenized word, NER tag,
POS tag (if the parameter `pos` is specified as `True`),
and output like html tag (if the parameter `tag` is
specified as `True`).
Otherwise, return a list of tuple associated with tokenized
word and NER tag
:rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str

:Note:
* For the POS tags to be included in the results, this function
uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron`
and corpus as `orchid_ud`.

:rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]
:Example:

>>> from pythainlp.tag.named_entity import ThaiNameTagger
>>>
>>> ner = ThaiNameTagger()
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
[('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
('น.', 'NOUN', 'I-TIME')]
>>> from pythainlp.tag import NER
>>>
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
pos=False)
[('วันที่', 'O'), (' ', 'O'),
('15', 'B-DATE'), (' ', 'I-DATE'),
('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
('61', 'I-DATE'), (' ', 'O'),
('ทดสอบ', 'O'), ('ระบบ', 'O'),
('เวลา', 'O'), (' ', 'O'),
('14', 'B-TIME'), (':', 'I-TIME'),
('49', 'I-TIME'), (' ', 'I-TIME'),
('น.', 'I-TIME')]
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
tag=True)
'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
>>> ner = NER("thainer")
>>> ner.tag("ทดสอบยนายวรรณพงษ์ ภัททิยไพบูลย์")
[('ทดสอบ', 'VV', 'O'),
('นาย', 'NN', 'B-PERSON'),
('วรรณ', 'NN', 'I-PERSON'),
('พงษ์', 'NN', 'I-PERSON'),
(' ', 'PU', 'I-PERSON'),
('ภัททิย', 'NN', 'I-PERSON'),
('ไพบูลย์', 'NN', 'I-PERSON')]
>>> ner.tag("ทดสอบยนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
'ทดสอบย<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
"""
tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE)
pos_tags = pos_tag(
tokens,
engine="perceptron",
corpus=self.pos_tag_name
)
x_test = ThaiNameTagger.__extract_features(pos_tags)
y = self.crf.tag(x_test)

sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]

if tag:
temp = ""
sent = ""
for idx, (word, ner) in enumerate(sent_ner):
if ner.startswith("B-") and temp != "":
sent += "</" + temp + ">"
temp = ner[2:]
sent += "<" + temp + ">"
elif ner.startswith("B-"):
temp = ner[2:]
sent += "<" + temp + ">"
elif ner == "O" and temp != "":
sent += "</" + temp + ">"
temp = ""
sent += word

if idx == len(sent_ner) - 1 and temp != "":
sent += "</" + temp + ">"

return sent

if pos:
return [
(pos_tags[i][0], pos_tags[i][1], data)
for i, data in enumerate(y)
]

return sent_ner

@staticmethod
def __extract_features(doc):
return [_doc2features(doc, i) for i in range(len(doc))]
if pos and self.name_engine == "wangchanberta":
warnings.warn(
"""wangchanberta is not support part-of-speech tag.
It have not part-of-speech tag in output."""
)
if self.name_engine == "wangchanberta":
return self.engine.get_ner(text, tag=tag)
else:
return self.engine.get_ner(text, tag=tag, pos=pos)
Loading