Skip to content

Commit aa09960

Browse files
committed
del BPEmbAug
1 parent 738ae3b commit aa09960

File tree

8 files changed

+14
-62
lines changed

8 files changed

+14
-62
lines changed

pythainlp/textaugment/lm/fasttext.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ def __init__(self, model_path: str):
2121
else:
2222
self.model = FastText_gensim.load(model_path)
2323
self.dict_wv = list(self.model.key_to_index.keys())
24+
2425
def tokenize(self, text: str)-> List[str]:
2526
"""
2627
Thai text tokenize for fasttext
@@ -31,6 +32,7 @@ def tokenize(self, text: str)-> List[str]:
3132
:rtype: List[str]
3233
"""
3334
return word_tokenize(text, engine='icu')
35+
3436
def modify_sent(self,sent, p = 0.7) -> List[List[str]]:
3537
"""
3638
:param str sent: text sentence
@@ -48,6 +50,7 @@ def modify_sent(self,sent, p = 0.7) -> List[List[str]]:
4850
else:
4951
list_sent_new.append([i])
5052
return list_sent_new
53+
5154
def augment(self, sentence: str, n_sent: int = 1, p:float = 0.7) -> List[Tuple[str]]:
5255
"""
5356
Text Augment from FastText

pythainlp/textaugment/lm/wangchanberta.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ def __init__(self):
3434
tokenizer=self.tokenizer,
3535
model = f'{self.model_name}',
3636
revision = 'main',)
37+
3738
def generate(self, sentence: str, num_replace_tokens: int=3):
3839
self.sent2 = []
3940
self.input_text = process_transformers(sentence)
@@ -68,4 +69,4 @@ def augment(self, sentence: str, num_replace_tokens: int=3) -> List[str]:
6869
if len(self.sent2) > 0:
6970
return self.sent2
7071
else:
71-
return self.sent2
72+
return self.sent2

pythainlp/textaugment/word2vec/BPEmb_core.py

Lines changed: 0 additions & 47 deletions
This file was deleted.

pythainlp/textaugment/word2vec/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,10 +5,8 @@
55

66
__all__ = [
77
"Word2VecAug",
8-
"BPEmbAug",
98
"Thai2fitAug"
109
]
1110

1211
from pythainlp.textaugment.word2vec.core import Word2VecAug
13-
from pythainlp.textaugment.word2vec.BPEmb_core import BPEmbAug
1412
from pythainlp.textaugment.word2vec.thai2fit import Thai2fitAug

pythainlp/textaugment/word2vec/core.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ def __init__(self, model: str, tokenize: object, type: str = "file") -> None:
1919
else:
2020
self.model = model
2121
self.dict_wv = list(self.model.key_to_index.keys())
22+
2223
def modify_sent(self,sent, p = 0.7) -> List[List[str]]:
2324
"""
2425
:param str sent: text sentence
@@ -36,6 +37,7 @@ def modify_sent(self,sent, p = 0.7) -> List[List[str]]:
3637
else:
3738
list_sent_new.append([i])
3839
return list_sent_new
40+
3941
def augment(self, sentence: str, n_sent: int = 1, p:float = 0.7) -> List[Tuple[str]]:
4042
"""
4143
:param str sentence: text sentence
@@ -50,4 +52,4 @@ def augment(self, sentence: str, n_sent: int = 1, p:float = 0.7) -> List[Tuple[s
5052
new_sentences = []
5153
for x in list(itertools.product(*self.list_synonym))[0:n_sent]:
5254
new_sentences.append(x)
53-
return new_sentences
55+
return new_sentences

pythainlp/textaugment/word2vec/thai2fit.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,17 +14,20 @@ class Thai2fitAug:
1414
def __init__(self):
1515
self.thai2fit_wv = get_corpus_path('thai2fit_wv')
1616
self.load_w2v()
17+
1718
def tokenizer(self, text: str) -> List[str]:
1819
"""
1920
:param str text: thai text
2021
:rtype: List[str]
2122
"""
2223
return THAI2FIT_TOKENIZER.word_tokenize(text)
24+
2325
def load_w2v(self): # insert substitute
2426
"""
2527
Load thai2fit word2vec model
2628
"""
2729
self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")
30+
2831
def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[str]]:
2932
"""
3033
Text Augment using word2vec from Thai2Fit
@@ -36,4 +39,4 @@ def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[
3639
:return: list of text augment
3740
:rtype: List[Tuple[str]]
3841
"""
39-
return self.aug.augment(sentence, n_sent, p)
42+
return self.aug.augment(sentence, n_sent, p)

setup.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@
4747
"ipa": ["epitran>=1.1"],
4848
"ml": ["numpy>=1.16", "torch>=1.0.0"],
4949
"ssg": ["ssg>=0.0.6"],
50-
"thai2fit": ["emoji>=0.5.1", "gensim>=3.2.0", "numpy>=1.16.1"],
50+
"thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", "numpy>=1.16.1"],
5151
"thai2rom": ["numpy>=1.16.1", "torch>=1.0.0"],
5252
"translate": [
5353
"fairseq>=0.10.0",
@@ -56,7 +56,6 @@
5656
"torch>=1.0.0",
5757
],
5858
"textaugment": [
59-
"bpemb",
6059
"gensim>=4.0.0",
6160
"thai2transformers"
6261
],
@@ -69,7 +68,6 @@
6968
"emoji>=0.5.1",
7069
"epitran>=1.1",
7170
"fairseq>=0.10.0",
72-
"gensim>=3.2.0",
7371
"nltk>=3.3.*",
7472
"numpy>=1.16.1",
7573
"pandas>=0.24",
@@ -79,7 +77,6 @@
7977
"ssg>=0.0.6",
8078
"torch>=1.0.0",
8179
"transformers>=4.1.1",
82-
"bpemb",
8380
"gensim>=4.0.0",
8481
"thai2transformers"
8582
],

tests/test_textaugment.py

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -21,12 +21,7 @@ def test_Thai2fitAug(self):
2121
_aug = Thai2fitAug()
2222
self.assertIsNotNone(_aug.tokenizer(self.text))
2323
self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p = 0.5))
24-
25-
def test_BPEmbAug(self):
26-
_aug = BPEmbAug()
27-
self.assertIsNotNone(_aug.tokenizer(self.text))
28-
self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p = 0.5))
29-
24+
3025
def test_Thai2transformersAug(self):
3126
_aug = Thai2transformersAug()
3227
self.assertIsNotNone(_aug.augment(self.text2, num_replace_tokens=1))

0 commit comments

Comments
 (0)