Skip to content

Commit 738ae3b

Browse files
committed
Add pythainlp.textaugment
1 parent 59d53e6 commit 738ae3b

File tree

12 files changed

+571
-0
lines changed

12 files changed

+571
-0
lines changed

docs/api/textaugment.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
.. currentmodule:: pythainlp.textaugment

pythainlp.textaugment
=====================

The :mod:`pythainlp.textaugment` module provides Thai text augmentation.

Modules
-------

.. autoclass:: WordNetAug
    :members:
.. autofunction:: postype2wordnet
.. autoclass:: pythainlp.textaugment.word2vec.Word2VecAug
    :members:
.. autoclass:: pythainlp.textaugment.word2vec.BPEmbAug
    :members:
.. autoclass:: pythainlp.textaugment.word2vec.Thai2fitAug
    :members:
.. autoclass:: pythainlp.textaugment.lm.FastTextAug
    :members:
.. autoclass:: pythainlp.textaugment.lm.Thai2transformersAug
    :members:

pythainlp/textaugment/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -*- coding: utf-8 -*-
"""
Thai text augmentation.
"""

__all__ = ["WordNetAug"]

# Re-export the WordNet-based augmenter as the package's default public API.
from pythainlp.textaugment.wordnet import WordNetAug
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# -*- coding: utf-8 -*-
"""
Language-model-based text augmenters.
"""

__all__ = [
    "FastTextAug",
    "Thai2transformersAug",
]

# fastText word-embedding augmenter
from pythainlp.textaugment.lm.fasttext import FastTextAug
# WangchanBERTa fill-mask augmenter
from pythainlp.textaugment.lm.wangchanberta import Thai2transformersAug
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# -*- coding: utf-8 -*-
2+
from typing import List, Tuple
3+
from gensim.models.fasttext import FastText as FastText_gensim
4+
from pythainlp.tokenize import word_tokenize
5+
from gensim.models.keyedvectors import KeyedVectors
6+
import itertools
7+
8+
9+
class FastTextAug:
    """
    Text augmenter that substitutes words with fastText nearest neighbours.
    """

    def __init__(self, model_path: str):
        """
        :param str model_path: path of the model file — ``.bin`` (facebook
            binary), ``.vec`` (word2vec text), or a gensim-saved model
        """
        if model_path.endswith('.bin'):
            # load_facebook_vectors is a module-level function in
            # gensim.models.fasttext, not a classmethod on FastText.
            from gensim.models.fasttext import load_facebook_vectors
            self.model = load_facebook_vectors(model_path)
        elif model_path.endswith('.vec'):
            self.model = KeyedVectors.load_word2vec_format(model_path)
        else:
            # A gensim-saved full FastText model exposes its vectors on .wv;
            # normalise so self.model is always a KeyedVectors-like object
            # that supports key_to_index and most_similar.
            self.model = FastText_gensim.load(model_path).wv
        self.dict_wv = list(self.model.key_to_index.keys())

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize Thai text with the ICU engine.

        :param str text: thai text

        :return: list of words
        :rtype: List[str]
        """
        return word_tokenize(text, engine='icu')

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        For each token, collect similar words scoring >= ``p``; keep the
        original token when it is out of vocabulary or has no such neighbour.

        :param List[str] sent: tokenized sentence
        :param float p: minimum similarity score
        :rtype: List[List[str]]
        """
        list_sent_new = []
        for word in sent:
            if word in self.dict_wv:
                similar = [w for w, score in self.model.most_similar(word) if score >= p]
                list_sent_new.append(similar if similar else [word])
            else:
                list_sent_new.append([word])
        return list_sent_new

    def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[str]]:
        """
        Text Augment from FastText

        You can download a Thai model from
        https://fasttext.cc/docs/en/crawl-vectors.html.

        :param str sentence: thai sentence
        :param int n_sent: number of augmented sentences
        :param float p: minimum similarity score of replacement words

        :return: list of augmented sentences (tuples of tokens)
        :rtype: List[Tuple[str]]
        """
        self.sentence = self.tokenize(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        # islice takes only the first n_sent combinations without
        # materialising the full cartesian product, which grows
        # exponentially with sentence length.
        return list(itertools.islice(itertools.product(*self.list_synonym), n_sent))
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# -*- coding: utf-8 -*-
2+
from datasets import load_dataset
3+
4+
# transformers
5+
from transformers import (
6+
CamembertTokenizer,
7+
AutoTokenizer,
8+
AutoModel,
9+
AutoModelForMaskedLM,
10+
AutoModelForSequenceClassification,
11+
AutoModelForTokenClassification,
12+
TrainingArguments,
13+
Trainer,
14+
pipeline,
15+
)
16+
import random
17+
from typing import List
18+
import thai2transformers
19+
from thai2transformers.preprocess import process_transformers
20+
21+
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
22+
23+
24+
class Thai2transformersAug:
    """
    Text augmenter using the WangchanBERTa fill-mask pipeline.

    Randomly masks tokens in the input sentence, one at a time, and collects
    the model's suggested completions as augmented sentences.
    """

    def __init__(self):
        self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
        self.target_tokenizer = CamembertTokenizer
        self.tokenizer = CamembertTokenizer.from_pretrained(
            self.model_name,
            revision='main')
        self.tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
        self.fill_mask = pipeline(
            task='fill-mask',
            tokenizer=self.tokenizer,
            model=f'{self.model_name}',
            revision='main',)

    def generate(self, sentence: str, num_replace_tokens: int = 3) -> List[str]:
        """
        Mask up to ``num_replace_tokens`` randomly chosen tokens (one per
        pass) and collect the fill-mask suggestions.

        :param str sentence: thai sentence
        :param int num_replace_tokens: number of tokens to replace
        :return: list of generated sentences
        :rtype: List[str]
        """
        self.sent2 = []
        self.input_text = process_transformers(sentence)
        sent = [i for i in self.tokenizer.tokenize(self.input_text) if i != '▁']
        # Cannot replace more tokens than the sentence contains.
        if len(sent) < num_replace_tokens:
            num_replace_tokens = len(sent)
        masked_text = self.input_text
        for _ in range(num_replace_tokens):
            replace_token = sent.pop(random.randrange(len(sent)))
            masked_text = masked_text.replace(
                replace_token, f"{self.fill_mask.tokenizer.mask_token}", 1)
            self.sent2 += [
                str(j['sequence']).replace('<s> ', '').replace('</s>', '')
                for j in self.fill_mask(masked_text + '<pad>')
                if j['sequence'] not in self.sent2
            ]
            # Reset so the next iteration masks a fresh copy of the input.
            masked_text = self.input_text
        return self.sent2

    def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]:
        """
        Text Augment from wangchanberta

        :param str sentence: thai sentence
        :param int num_replace_tokens: number replace tokens

        :return: list of text augment
        :rtype: List[str]
        """
        self.sent2 = []
        try:
            self.sent2 = self.generate(sentence, num_replace_tokens)
            if self.sent2 == []:
                # The model occasionally yields nothing; retry once.
                self.sent2 = self.generate(sentence, num_replace_tokens)
        except Exception:
            # Narrowed from a bare except; best-effort: return whatever was
            # produced before the failure (possibly an empty list).
            pass
        return self.sent2
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# -*- coding: utf-8 -*-
2+
from pythainlp.textaugment.word2vec.core import Word2VecAug
3+
from bpemb import BPEmb
4+
from typing import List, Tuple
5+
6+
7+
class BPEmbAug:
    """
    Thai Text Augment using word2vec from BPEmb

    BPEmb: `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
    """

    def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
        """
        :param str lang: BPEmb language code
        :param int vs: vocabulary size
        :param int dim: embedding dimensionality
        """
        self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
        self.model = self.bpemb_temp.emb
        self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        Encode text into BPEmb subword tokens.

        :param str text: thai text
        :rtype: List[str]
        """
        return self.bpemb_temp.encode(text)

    def load_w2v(self):
        """
        Load BPEmb model
        """
        self.aug = Word2VecAug(self.model, tokenize=self.tokenizer, type="model")

    def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[str]:
        """
        Text Augment using word2vec from BPEmb

        :param str sentence: thai sentence
        :param int n_sent: number of augmented sentences
        :param float p: minimum similarity score of replacement words

        :return: list of augmented sentences (annotation fixed: the method
            joins subwords back into plain strings, not tuples)
        :rtype: List[str]
        """
        # BPEmb marks word boundaries with '▁'; map spaces to it before encoding.
        self.sentence = sentence.replace(" ", "▁")
        self.temp = self.aug.augment(self.sentence, n_sent, p=p)
        # Join subword tokens back into plain strings, dropping the markers.
        # (Scratch state is kept in locals instead of instance attributes.)
        return ["".join(tok.replace("▁", "") for tok in cand) for cand in self.temp]
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -*- coding: utf-8 -*-
"""
Word2vec-based text augmenters.
"""

__all__ = [
    "Word2VecAug",
    "BPEmbAug",
    "Thai2fitAug"
]

# Generic word2vec augmenter plus two pretrained-embedding front-ends.
from pythainlp.textaugment.word2vec.core import Word2VecAug
from pythainlp.textaugment.word2vec.BPEmb_core import BPEmbAug
from pythainlp.textaugment.word2vec.thai2fit import Thai2fitAug
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# -*- coding: utf-8 -*-
2+
from typing import List, Tuple
3+
import gensim.models.keyedvectors as word2vec
4+
import itertools
5+
6+
7+
class Word2VecAug:
    """
    Word-substitution text augmenter backed by a word2vec model.
    """

    def __init__(self, model: str, tokenize: object, type: str = "file") -> None:
        """
        :param str model: path to a model file, or a preloaded
            KeyedVectors-like object when ``type="model"``
        :param object tokenize: tokenize function
        :param str type: model type ("file" for word2vec text format,
            "binary" for word2vec binary format, anything else for a
            preloaded object)
        """
        self.tokenizer = tokenize
        if type == "file":
            self.model = word2vec.KeyedVectors.load_word2vec_format(model)
        elif type == "binary":
            self.model = word2vec.KeyedVectors.load_word2vec_format(model, binary=True)
        else:
            self.model = model
        self.dict_wv = list(self.model.key_to_index.keys())

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        For each token, collect similar words scoring >= ``p``; keep the
        original token when it is out of vocabulary or has no such neighbour.

        :param List[str] sent: tokenized sentence
        :param float p: minimum similarity score
        :rtype: List[List[str]]
        """
        list_sent_new = []
        for word in sent:
            if word in self.dict_wv:
                similar = [w for w, score in self.model.most_similar(word) if score >= p]
                list_sent_new.append(similar if similar else [word])
            else:
                list_sent_new.append([word])
        return list_sent_new

    def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[str]]:
        """
        :param str sentence: text sentence
        :param int n_sent: max number of synonym sentences
        :param float p: minimum similarity score

        :return: list of synonym sentences (tuples of tokens)
        :rtype: List[Tuple[str]]
        """
        self.sentence = self.tokenizer(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        # islice takes only the first n_sent combinations without
        # materialising the full cartesian product, which grows
        # exponentially with sentence length.
        return list(itertools.islice(itertools.product(*self.list_synonym), n_sent))
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# -*- coding: utf-8 -*-
2+
from pythainlp.textaugment.word2vec.core import Word2VecAug
3+
from pythainlp.corpus import get_corpus_path
4+
from pythainlp.tokenize import THAI2FIT_TOKENIZER
5+
from typing import List, Tuple
6+
7+
8+
class Thai2fitAug:
    """
    Thai text augmenter built on the Thai2Fit word2vec embeddings.

    Thai2Fit: `github.com/cstorm125/thai2fit <https://github.com/cstorm125/thai2fit>`_
    """

    def __init__(self):
        # Path to the pretrained thai2fit word-vector corpus file.
        self.thai2fit_wv = get_corpus_path('thai2fit_wv')
        self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        Tokenize text with the thai2fit tokenizer.

        :param str text: thai text
        :rtype: List[str]
        """
        return THAI2FIT_TOKENIZER.word_tokenize(text)

    def load_w2v(self):
        """Load the thai2fit word2vec model into a :class:`Word2VecAug`."""
        self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")

    def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[str]]:
        """
        Produce augmented sentences by word2vec synonym substitution.

        :param str sentence: thai sentence
        :param int n_sent: number of sentences to generate
        :param float p: minimum similarity score for a replacement word

        :return: list of augmented sentences
        :rtype: List[Tuple[str]]
        """
        return self.aug.augment(sentence, n_sent, p)

0 commit comments

Comments
 (0)