Skip to content

Commit 738ae3b

Browse files
committed
Add pythainlp.textaugment
1 parent 59d53e6 commit 738ae3b

File tree

12 files changed

+571
-0
lines changed

12 files changed

+571
-0
lines changed

docs/api/textaugment.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
.. currentmodule:: pythainlp.textaugment

pythainlp.textaugment
=====================

The :mod:`pythainlp.textaugment` module provides Thai text augmentation.

Modules
-------

.. autoclass:: WordNetAug
    :members:
.. autofunction:: postype2wordnet
.. autoclass:: pythainlp.textaugment.word2vec.Word2VecAug
    :members:
.. autoclass:: pythainlp.textaugment.word2vec.BPEmbAug
    :members:
.. autoclass:: pythainlp.textaugment.word2vec.Thai2fitAug
    :members:
.. autoclass:: pythainlp.textaugment.lm.FastTextAug
    :members:
.. autoclass:: pythainlp.textaugment.lm.Thai2transformersAug
    :members:

pythainlp/textaugment/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -*- coding: utf-8 -*-
"""
Thai text augmentation.
"""

__all__ = ["WordNetAug"]

# Re-export the WordNet-based augmenter as the package's default public API.
from pythainlp.textaugment.wordnet import WordNetAug
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# -*- coding: utf-8 -*-
"""
Language-model-based text augmenters.
"""

__all__ = [
    "FastTextAug",
    "Thai2transformersAug",
]

# fastText word-embedding augmenter
from pythainlp.textaugment.lm.fasttext import FastTextAug
# WangchanBERTa fill-mask augmenter
from pythainlp.textaugment.lm.wangchanberta import Thai2transformersAug
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# -*- coding: utf-8 -*-
2+
from typing import List, Tuple
3+
from gensim.models.fasttext import FastText as FastText_gensim
4+
from pythainlp.tokenize import word_tokenize
5+
from gensim.models.keyedvectors import KeyedVectors
6+
import itertools
7+
8+
9+
class FastTextAug:
    """
    Text augmenter that substitutes words with fastText nearest neighbours.
    """

    def __init__(self, model_path: str):
        """
        :param str model_path: path of the model file — ``.bin`` (facebook
            binary), ``.vec`` (word2vec text), or a gensim-saved model
        """
        if model_path.endswith('.bin'):
            # load_facebook_vectors is a module-level function in
            # gensim.models.fasttext, not a classmethod on FastText.
            from gensim.models.fasttext import load_facebook_vectors
            self.model = load_facebook_vectors(model_path)
        elif model_path.endswith('.vec'):
            self.model = KeyedVectors.load_word2vec_format(model_path)
        else:
            # A gensim-saved full FastText model exposes its vectors on .wv;
            # normalise so self.model is always a KeyedVectors-like object
            # that supports key_to_index and most_similar.
            self.model = FastText_gensim.load(model_path).wv
        self.dict_wv = list(self.model.key_to_index.keys())

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize Thai text with the ICU engine.

        :param str text: thai text

        :return: list of words
        :rtype: List[str]
        """
        return word_tokenize(text, engine='icu')

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        For each token, collect similar words scoring >= ``p``; keep the
        original token when it is out of vocabulary or has no such neighbour.

        :param List[str] sent: tokenized sentence
        :param float p: minimum similarity score
        :rtype: List[List[str]]
        """
        list_sent_new = []
        for word in sent:
            if word in self.dict_wv:
                similar = [w for w, score in self.model.most_similar(word) if score >= p]
                list_sent_new.append(similar if similar else [word])
            else:
                list_sent_new.append([word])
        return list_sent_new

    def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[str]]:
        """
        Text Augment from FastText

        You can download a Thai model from
        https://fasttext.cc/docs/en/crawl-vectors.html.

        :param str sentence: thai sentence
        :param int n_sent: number of augmented sentences
        :param float p: minimum similarity score of replacement words

        :return: list of augmented sentences (tuples of tokens)
        :rtype: List[Tuple[str]]
        """
        self.sentence = self.tokenize(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        # islice takes only the first n_sent combinations without
        # materialising the full cartesian product, which grows
        # exponentially with sentence length.
        return list(itertools.islice(itertools.product(*self.list_synonym), n_sent))
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# -*- coding: utf-8 -*-
2+
from datasets import load_dataset
3+
4+
# transformers
5+
from transformers import (
6+
CamembertTokenizer,
7+
AutoTokenizer,
8+
AutoModel,
9+
AutoModelForMaskedLM,
10+
AutoModelForSequenceClassification,
11+
AutoModelForTokenClassification,
12+
TrainingArguments,
13+
Trainer,
14+
pipeline,
15+
)
16+
import random
17+
from typing import List
18+
import thai2transformers
19+
from thai2transformers.preprocess import process_transformers
20+
21+
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
22+
23+
24+
class Thai2transformersAug:
    """
    Text augmenter using the WangchanBERTa fill-mask pipeline.

    Randomly masks tokens in the input sentence, one at a time, and collects
    the model's suggested completions as augmented sentences.
    """

    def __init__(self):
        self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
        self.target_tokenizer = CamembertTokenizer
        self.tokenizer = CamembertTokenizer.from_pretrained(
            self.model_name,
            revision='main')
        self.tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
        self.fill_mask = pipeline(
            task='fill-mask',
            tokenizer=self.tokenizer,
            model=f'{self.model_name}',
            revision='main',)

    def generate(self, sentence: str, num_replace_tokens: int = 3) -> List[str]:
        """
        Mask up to ``num_replace_tokens`` randomly chosen tokens (one per
        pass) and collect the fill-mask suggestions.

        :param str sentence: thai sentence
        :param int num_replace_tokens: number of tokens to replace
        :return: list of generated sentences
        :rtype: List[str]
        """
        self.sent2 = []
        self.input_text = process_transformers(sentence)
        sent = [i for i in self.tokenizer.tokenize(self.input_text) if i != '▁']
        # Cannot replace more tokens than the sentence contains.
        if len(sent) < num_replace_tokens:
            num_replace_tokens = len(sent)
        masked_text = self.input_text
        for _ in range(num_replace_tokens):
            replace_token = sent.pop(random.randrange(len(sent)))
            masked_text = masked_text.replace(
                replace_token, f"{self.fill_mask.tokenizer.mask_token}", 1)
            self.sent2 += [
                str(j['sequence']).replace('<s> ', '').replace('</s>', '')
                for j in self.fill_mask(masked_text + '<pad>')
                if j['sequence'] not in self.sent2
            ]
            # Reset so the next iteration masks a fresh copy of the input.
            masked_text = self.input_text
        return self.sent2

    def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]:
        """
        Text Augment from wangchanberta

        :param str sentence: thai sentence
        :param int num_replace_tokens: number replace tokens

        :return: list of text augment
        :rtype: List[str]
        """
        self.sent2 = []
        try:
            self.sent2 = self.generate(sentence, num_replace_tokens)
            if self.sent2 == []:
                # The model occasionally yields nothing; retry once.
                self.sent2 = self.generate(sentence, num_replace_tokens)
        except Exception:
            # Narrowed from a bare except; best-effort: return whatever was
            # produced before the failure (possibly an empty list).
            pass
        return self.sent2
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# -*- coding: utf-8 -*-
2+
from pythainlp.textaugment.word2vec.core import Word2VecAug
3+
from bpemb import BPEmb
4+
from typing import List, Tuple
5+
6+
7+
class BPEmbAug:
    """
    Thai Text Augment using word2vec from BPEmb

    BPEmb: `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
    """

    def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
        """
        :param str lang: BPEmb language code
        :param int vs: vocabulary size
        :param int dim: embedding dimensionality
        """
        self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
        self.model = self.bpemb_temp.emb
        self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        Encode text into BPEmb subword tokens.

        :param str text: thai text
        :rtype: List[str]
        """
        return self.bpemb_temp.encode(text)

    def load_w2v(self):
        """
        Load BPEmb model
        """
        self.aug = Word2VecAug(self.model, tokenize=self.tokenizer, type="model")

    def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[str]:
        """
        Text Augment using word2vec from BPEmb

        :param str sentence: thai sentence
        :param int n_sent: number of augmented sentences
        :param float p: minimum similarity score of replacement words

        :return: list of augmented sentences (annotation fixed: the method
            joins subwords back into plain strings, not tuples)
        :rtype: List[str]
        """
        # BPEmb marks word boundaries with '▁'; map spaces to it before encoding.
        self.sentence = sentence.replace(" ", "▁")
        self.temp = self.aug.augment(self.sentence, n_sent, p=p)
        # Join subword tokens back into plain strings, dropping the markers.
        # (Scratch state is kept in locals instead of instance attributes.)
        return ["".join(tok.replace("▁", "") for tok in cand) for cand in self.temp]
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -*- coding: utf-8 -*-
"""
Word2vec-based text augmenters.
"""

__all__ = [
    "Word2VecAug",
    "BPEmbAug",
    "Thai2fitAug"
]

# Generic word2vec augmenter plus two pretrained-embedding front-ends.
from pythainlp.textaugment.word2vec.core import Word2VecAug
from pythainlp.textaugment.word2vec.BPEmb_core import BPEmbAug
from pythainlp.textaugment.word2vec.thai2fit import Thai2fitAug
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# -*- coding: utf-8 -*-
2+
from typing import List, Tuple
3+
import gensim.models.keyedvectors as word2vec
4+
import itertools
5+
6+
7+
class Word2VecAug:
    """
    Word-substitution text augmenter backed by a word2vec model.
    """

    def __init__(self, model: str, tokenize: object, type: str = "file") -> None:
        """
        :param str model: path to a model file, or a preloaded
            KeyedVectors-like object when ``type="model"``
        :param object tokenize: tokenize function
        :param str type: model type ("file" for word2vec text format,
            "binary" for word2vec binary format, anything else for a
            preloaded object)
        """
        self.tokenizer = tokenize
        if type == "file":
            self.model = word2vec.KeyedVectors.load_word2vec_format(model)
        elif type == "binary":
            self.model = word2vec.KeyedVectors.load_word2vec_format(model, binary=True)
        else:
            self.model = model
        self.dict_wv = list(self.model.key_to_index.keys())

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        For each token, collect similar words scoring >= ``p``; keep the
        original token when it is out of vocabulary or has no such neighbour.

        :param List[str] sent: tokenized sentence
        :param float p: minimum similarity score
        :rtype: List[List[str]]
        """
        list_sent_new = []
        for word in sent:
            if word in self.dict_wv:
                similar = [w for w, score in self.model.most_similar(word) if score >= p]
                list_sent_new.append(similar if similar else [word])
            else:
                list_sent_new.append([word])
        return list_sent_new

    def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[str]]:
        """
        :param str sentence: text sentence
        :param int n_sent: max number of synonym sentences
        :param float p: minimum similarity score

        :return: list of synonym sentences (tuples of tokens)
        :rtype: List[Tuple[str]]
        """
        self.sentence = self.tokenizer(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        # islice takes only the first n_sent combinations without
        # materialising the full cartesian product, which grows
        # exponentially with sentence length.
        return list(itertools.islice(itertools.product(*self.list_synonym), n_sent))
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# -*- coding: utf-8 -*-
2+
from pythainlp.textaugment.word2vec.core import Word2VecAug
3+
from pythainlp.corpus import get_corpus_path
4+
from pythainlp.tokenize import THAI2FIT_TOKENIZER
5+
from typing import List, Tuple
6+
7+
8+
class Thai2fitAug:
    """
    Thai text augmenter built on the Thai2Fit word2vec embeddings.

    Thai2Fit: `github.com/cstorm125/thai2fit <https://github.com/cstorm125/thai2fit>`_
    """

    def __init__(self):
        # Path to the pretrained thai2fit word-vector corpus file.
        self.thai2fit_wv = get_corpus_path('thai2fit_wv')
        self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        Tokenize text with the thai2fit tokenizer.

        :param str text: thai text
        :rtype: List[str]
        """
        return THAI2FIT_TOKENIZER.word_tokenize(text)

    def load_w2v(self):
        """Load the thai2fit word2vec model into a :class:`Word2VecAug`."""
        self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")

    def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[str]]:
        """
        Produce augmented sentences by word2vec synonym substitution.

        :param str sentence: thai sentence
        :param int n_sent: number of sentences to generate
        :param float p: minimum similarity score for a replacement word

        :return: list of augmented sentences
        :rtype: List[Tuple[str]]
        """
        return self.aug.augment(sentence, n_sent, p)

0 commit comments

Comments
 (0)