PyThaiNLP · bact · Aug 25, 2020 · Aug 21, 2020 · Aug 21, 2020 · Aug 21, 2020
diff --git a/pythainlp/corpus/corpus_license.md b/pythainlp/corpus/corpus_license.md
@@ -18,7 +18,6 @@ negations_th.txt | Negation word list
 stopwords_th.txt | Stop word list
 syllables_th.txt | List of Thai syllables
 thailand_provinces_th.csv | List of Thailand provinces in Thai
-thailand_provinces_th.txt | List of Thailand provinces in Thai
 tnc_freq.txt | Words and their frequencies, from Thai National Corpus
 ttc_freq.txt | Words and their frequencies, from Thai Textbook Corpus
 words_th.txt | List of Thai words
@@ -46,9 +45,11 @@ https://creativecommons.org/licenses/by/4.0/
 
 Filename | Description
 ---------|------------
-sentenceseg_crfcut.model | Sentence segmentation model
-pos_ud_perceptron.pkl | Part-of-speech tagging model
-pos_ud_unigram.json | Part-of-speech tagging model
+pos_orchid_perceptron.pkl | Part-of-speech tagging model, trained from ORCHID data, using perceptron
+pos_orchid_unigram.json | Part-of-speech tagging model, trained from ORCHID data, using unigram
+pos_ud_perceptron.pkl | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron
+pos_ud_unigram.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram
+sentenceseg_crfcut.model | Sentence segmentation model, trained from TED subtitles, using CRF
 
 
 ## Thai WordNet

diff --git a/pythainlp/corpus/pos_orchid_perceptron.pkl b/pythainlp/corpus/pos_orchid_perceptron.pkl
diff --git a/pythainlp/corpus/pos_ud_perceptron.pkl b/pythainlp/corpus/pos_ud_perceptron.pkl
diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
@@ -6,8 +6,13 @@
 such as its part-of-speech (POS) tag, and named entity (NE) tag.
 """
 
-__all__ = ["pos_tag", "pos_tag_sents", "tag_provinces"]
+__all__ = [
+    "PerceptronTagger",
+    "pos_tag",
+    "pos_tag_sents",
+    "tag_provinces",
+]
 
 from pythainlp.tag.locations import tag_provinces
-from pythainlp.tag.pos_tag import pos_tag
-from pythainlp.tag.pos_tag import pos_tag_sents
+from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
+from pythainlp.tag._tag_perceptron import PerceptronTagger
diff --git a/pythainlp/tag/_tag_perceptron.py b/pythainlp/tag/_tag_perceptron.py
@@ -0,0 +1,284 @@
+# -*- coding: utf-8 -*-
+"""
+Perceptron Tagger.
+
+This tagger is a port of the Textblob Averaged Perceptron Tagger
+Author: Matthew Honnibal <honnibal+gh@gmail.com>,
+        Long Duong <longdt219@gmail.com> (NLTK port)
+        Wannaphong Phatthiyaphaibun <wannaphong@kkumail.com> (PyThaiNLP port)
+URL: <https://github.com/sloria/textblob-aptagger>
+     <https://nltk.org/>
+Copyright 2013 Matthew Honnibal
+NLTK modifications Copyright 2015 The NLTK Project
+PyThaiNLP modifications Copyright 2020 PyThaiNLP Project
+
+This tagger is provided under the terms of the MIT License.
+"""
+
+from __future__ import absolute_import
+
+import os
+import pickle
+import random
+from collections import defaultdict
+from typing import Dict, Iterable, List, Tuple, Union
+
+
+class AveragedPerceptron(object):
+    """
+    Averaged perceptron classifier, after Matthew Honnibal's implementation.
+
+    Weights are kept per feature and per class. During training, a running
+    total and a last-change timestamp are tracked for every (feature, class)
+    pair so that the weights can be averaged over all instances afterwards.
+
+    Implementation notes:
+        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
+    """
+
+    def __init__(self) -> None:
+        # Number of training instances seen so far
+        self.i = 0
+        # Known class labels
+        self.classes = set()
+        # feature -> {class: weight}
+        self.weights = {}
+        # (feature, class) -> accumulated weight, used for the averaging
+        self._totals = defaultdict(int)
+        # (feature, class) -> value of ``self.i`` when the weight last changed
+        self._tstamps = defaultdict(int)
+
+    def predict(self, features: Dict):
+        """
+        Score every class by the dot product of ``features`` with the
+        current weights and return the best-scoring class.
+
+        Ties on the score are broken by taking the greatest label, which
+        keeps the result stable.
+        """
+        tally = defaultdict(float)
+        for name, amount in features.items():
+            if name in self.weights and amount != 0:
+                for klass, weight in self.weights[name].items():
+                    tally[klass] += amount * weight
+
+        def rank(klass):
+            # Score first, then the label itself as the tie-breaker
+            return tally[klass], klass
+
+        return max(self.classes, key=rank)
+
+    def update(self, truth, guess, features: Dict) -> None:
+        """
+        Count one more training instance and, when ``guess`` differs from
+        ``truth``, move the weights of every feature in ``features``:
+        +1 for the true class and -1 for the guessed class.
+        """
+        self.i += 1
+        if truth == guess:
+            return
+
+        now = self.i
+        for name in features:
+            per_class = self.weights.setdefault(name, {})
+            for klass, step in ((truth, 1.0), (guess, -1.0)):
+                key = (name, klass)
+                current = per_class.get(klass, 0.0)
+                # Credit the old weight for the instances it was in effect
+                self._totals[key] += (now - self._tstamps[key]) * current
+                self._tstamps[key] = now
+                per_class[klass] = current + step
+
+    def average_weights(self) -> None:
+        """
+        Replace every weight by its average over all instances seen,
+        rounded to 3 decimals. Weights that average to zero are dropped.
+        """
+        seen = float(self.i)
+        for name, per_class in self.weights.items():
+            kept = {}
+            for klass, weight in per_class.items():
+                key = (name, klass)
+                # Add the share of the weight since its last change
+                pending = (self.i - self._tstamps[key]) * weight
+                mean = round((self._totals[key] + pending) / seen, 3)
+                if mean:
+                    kept[klass] = mean
+            self.weights[name] = kept
+
+
+class PerceptronTagger:
+    """
+    Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
+
+    See more implementation details here:
+        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
+
+    >>> from pythainlp.tag import PerceptronTagger
+    >>> tagger = PerceptronTagger()
+    >>> data = [
+    ...     [("คน", "N"), ("เดิน", "V")],
+    ...     [("แมว", "N"), ("เดิน", "V")],
+    ...     [("คน", "N"), ("วิ่ง", "V")],
+    ...     [("ปลา", "N"), ("ว่าย", "V")],
+    ...     [("นก", "N"), ("บิน", "V")],
+    ... ]
+    >>> tagger.train(data)
+    >>> tagger.tag(["นก", "เดิน"])
+    [('นก', 'N'), ('เดิน', 'V')]
+
+    """
+
+    # Pseudo-tokens padding both ends of a sentence, so that the
+    # "i-2" .. "i+2" context features exist for every position
+    START = ["-START-", "-START2-"]
+    END = ["-END-", "-END2-"]
+    # Path of the loaded model; stays empty when no model was loaded
+    AP_MODEL_LOC = ""
+
+    def __init__(self, path: str = "") -> None:
+        """
+        :param str path: model path; when empty, no model is loaded and
+            the tagger has to be trained with :meth:`train` before use
+        """
+        self.model = AveragedPerceptron()
+        self.tagdict = {}
+        self.classes = set()
+        if path:
+            self.AP_MODEL_LOC = path
+            self.load(path)
+
+    def tag(self, tokens: Iterable[str]) -> List[Tuple[str, str]]:
+        """
+        Tag a sequence of word tokens.
+
+        :param tokens: words of one sentence, in order
+        :return: list of (word, tag) tuples, one per token
+        """
+        # The tokens are walked twice (context, then tagging), so a
+        # one-shot iterable such as a generator has to be materialised
+        tokens = list(tokens)
+        prev, prev2 = self.START
+        output = []
+
+        context = self.START + [self._normalize(w) for w in tokens] + self.END
+        for i, word in enumerate(tokens):
+            # Words in the tag dictionary skip the model altogether
+            tag = self.tagdict.get(word)
+            if not tag:
+                features = self._get_features(i, word, context, prev, prev2)
+                tag = self.model.predict(features)
+            output.append((word, tag))
+            prev2 = prev
+            prev = tag
+        return output
+
+    def train(
+        self,
+        sentences: Iterable[Iterable[Tuple[str, str]]],
+        save_loc: Union[str, None] = None,
+        nr_iter: int = 5,
+    ) -> None:
+        """
+        Train a model from sentences, and save it at ``save_loc``.
+        ``nr_iter`` controls the number of Perceptron training iterations.
+
+        :param sentences: Sentences, each one a sequence of (word, tag) \
+            tuples. Empty sentences are ignored. The argument itself is \
+            not modified.
+        :param save_loc: If not ``None``, saves a pickled model in this \
+            location.
+        :param nr_iter: Number of training iterations.
+        """
+        # Work on a private copy: the data is walked once per iteration
+        # and shuffled in between, which must neither exhaust a generator
+        # nor reorder (or fail on) the caller's own sequence.
+        # Empty sentences cannot be unzipped into words and tags.
+        sentences = [list(sentence) for sentence in sentences]
+        sentences = [sentence for sentence in sentences if sentence]
+
+        self._make_tagdict(sentences)
+        self.model.classes = self.classes
+        for _ in range(nr_iter):
+            for sentence in sentences:
+                words, tags = zip(*sentence)
+
+                prev, prev2 = self.START
+                context = (
+                    self.START + [self._normalize(w) for w in words] + self.END
+                )
+                for i, word in enumerate(words):
+                    guess = self.tagdict.get(word)
+                    if not guess:
+                        # Only words outside the tag dictionary
+                        # are learnt by the model
+                        feats = self._get_features(
+                            i, word, context, prev, prev2
+                        )
+                        guess = self.model.predict(feats)
+                        self.model.update(tags[i], guess, feats)
+                    prev2 = prev
+                    prev = guess
+            random.shuffle(sentences)
+        self.model.average_weights()
+
+        # save the model
+        if save_loc is not None:
+            data = {
+                "weights": self.model.weights,
+                "tagdict": self.tagdict,
+                "classes": self.classes,
+            }
+            with open(save_loc, "wb") as f:
+                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
+
+    def load(self, loc: str) -> None:
+        """
+        Load a pickled model.
+
+        :param str loc: model path
+        :raises IOError: if the file cannot be opened or read
+        """
+        # NOTE(security): unpickling can execute arbitrary code;
+        # only load model files that come from a trusted source.
+        try:
+            with open(loc, "rb") as f:
+                w_td_c = pickle.load(f)
+        except IOError as err:
+            # Name the file that actually failed and keep the cause
+            msg = "Cannot load perceptron model file: {}".format(loc)
+            raise IOError(msg) from err
+        self.model.weights = w_td_c["weights"]
+        self.tagdict = w_td_c["tagdict"]
+        self.classes = w_td_c["classes"]
+        self.model.classes = self.classes
+
+    def _normalize(self, word: str) -> str:
+        """
+        Normalization used in pre-processing.
+
+        - Words containing a hyphen, but not starting with one,
+          are represented as !HYPHEN
+        - Words made of exactly four digits are represented as !YEAR
+        - Other words starting with a digit are represented as !DIGITS
+        - All other words are lower cased (an empty word stays empty)
+
+        :rtype: str
+        """
+        if "-" in word and not word.startswith("-"):
+            return "!HYPHEN"
+        elif word.isdigit() and len(word) == 4:
+            return "!YEAR"
+        elif word[:1].isdigit():  # slice, so an empty word does not raise
+            return "!DIGITS"
+        else:
+            return word.lower()
+
+    def _get_features(
+        self, i: int, word: str, context: List[str], prev: str, prev2: str
+    ) -> Dict:
+        """
+        Map tokens into a feature representation, implemented as a
+        {hashable: float} dict. If the features change, a new model must be
+        trained.
+
+        :param i: position of ``word`` in the sentence, without padding
+        :param word: the word as given (not normalized)
+        :param context: normalized words, padded with START and END
+        :param prev: tag of the previous word
+        :param prev2: tag of the word before the previous word
+        """
+
+        def add(name: str, *args):
+            features[" ".join((name,) + tuple(args))] += 1
+
+        # Shift the index past the START padding of ``context``
+        i += len(self.START)
+        features = defaultdict(int)
+        # It's useful to have a constant feature,
+        # which acts sort of like a prior
+        add("bias")
+        add("i suffix", word[-3:])
+        add("i pref1", word[:1])  # slice, so an empty word does not raise
+        add("i-1 tag", prev)
+        add("i-2 tag", prev2)
+        # Feature names are part of a trained model: keep them as they are
+        add("i tag+i-2 tag", prev, prev2)
+        add("i word", context[i])
+        add("i-1 tag+i word", prev, context[i])
+        add("i-1 word", context[i - 1])
+        add("i-1 suffix", context[i - 1][-3:])
+        add("i-2 word", context[i - 2])
+        add("i+1 word", context[i + 1])
+        add("i+1 suffix", context[i + 1][-3:])
+        add("i+2 word", context[i + 2])
+        return features
+
+    def _make_tagdict(
+        self, sentences: Iterable[Iterable[Tuple[str, str]]]
+    ) -> None:
+        """
+        Make a tag dictionary for single-tag words.
+
+        Every tag seen is added to ``self.classes``. A word enters
+        ``self.tagdict`` with its most frequent tag when it is both
+        frequent and nearly always given that tag.
+        """
+        counts = defaultdict(lambda: defaultdict(int))
+        for sentence in sentences:
+            for word, tag in sentence:
+                counts[word][tag] += 1
+                self.classes.add(tag)
+        freq_thresh = 20
+        ambiguity_thresh = 0.97
+        for word, tag_freqs in counts.items():
+            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
+            n = sum(tag_freqs.values())
+            # Don't add rare words to the tag dictionary
+            # Only add quite unambiguous words
+            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
+                self.tagdict[word] = tag
diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
@@ -5,7 +5,7 @@
 
 __all__ = ["ThaiNameTagger"]
 
-from typing import List, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 from pycrfsuite import Tagger as CRFTagger
 from pythainlp.corpus import get_corpus_path, thai_stopwords
@@ -21,7 +21,7 @@ def _is_stopword(word: str) -> bool:  # เช็คว่าเป็นคำ
     return word in thai_stopwords()
 
 
-def _doc2features(doc, i) -> dict:
+def _doc2features(doc, i) -> Dict:
     word = doc[i][0]
     postag = doc[i][1]
 
@@ -73,7 +73,7 @@ def _doc2features(doc, i) -> dict:
 
 
 class ThaiNameTagger:
-    def __init__(self):
+    def __init__(self) -> None:
         """
         Thai named-entity recognizer.
         """

diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py
@@ -7,8 +7,7 @@
 from typing import List, Tuple
 
 from pythainlp.corpus import corpus_path, get_corpus_path
-from pythainlp.tag import lst20, orchid
-
+from pythainlp.tag import PerceptronTagger, lst20, orchid
 
 _ORCHID_FILENAME = "pos_orchid_perceptron.pkl"
 _ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
@@ -26,16 +25,14 @@
 def _orchid_tagger():
     global _ORCHID_TAGGER
     if not _ORCHID_TAGGER:
-        with open(_ORCHID_PATH, "rb") as fh:
-            _ORCHID_TAGGER = pickle.load(fh)
+        _ORCHID_TAGGER = PerceptronTagger(path=_ORCHID_PATH)
     return _ORCHID_TAGGER
 
 
 def _pud_tagger():
     global _PUD_TAGGER
     if not _PUD_TAGGER:
-        with open(_PUD_PATH, "rb") as fh:
-            _PUD_TAGGER = pickle.load(fh)
+        _PUD_TAGGER = PerceptronTagger(path=_PUD_PATH)
     return _PUD_TAGGER
 
 

diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py
@@ -9,7 +9,6 @@
 from pythainlp.corpus import corpus_path, get_corpus_path
 from pythainlp.tag import lst20, orchid
 
-
 _ORCHID_FILENAME = "pos_orchid_unigram.json"
 _ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)