Remove BPEmbAug (and the bpemb dependency)

wannaphong · wannaphong · commit aa099608e9e7 · 2021-06-12T15:16:06.000+07:00
diff --git a/pythainlp/textaugment/lm/fasttext.py b/pythainlp/textaugment/lm/fasttext.py
@@ -21,6 +21,7 @@ def __init__(self, model_path: str):
         else:
             self.model = FastText_gensim.load(model_path)
         self.dict_wv = list(self.model.key_to_index.keys())
+
     def tokenize(self, text: str)-> List[str]:
         """
         Thai text tokenize for fasttext
@@ -31,6 +32,7 @@ def tokenize(self, text: str)-> List[str]:
         :rtype: List[str]
         """
         return word_tokenize(text, engine='icu')
+
     def modify_sent(self,sent, p = 0.7) -> List[List[str]]:
         """
         :param str sent: text sentence
@@ -48,6 +50,7 @@ def modify_sent(self,sent, p = 0.7) -> List[List[str]]:
             else:
                 list_sent_new.append([i])
         return list_sent_new
+
     def augment(self, sentence: str, n_sent: int = 1, p:float = 0.7) -> List[Tuple[str]]:
         """
         Text Augment from FastText
diff --git a/pythainlp/textaugment/lm/wangchanberta.py b/pythainlp/textaugment/lm/wangchanberta.py
@@ -34,6 +34,7 @@ def __init__(self):
             tokenizer=self.tokenizer,
             model = f'{self.model_name}',
             revision = 'main',)
+
     def generate(self, sentence: str, num_replace_tokens: int=3):
         self.sent2 = []
         self.input_text = process_transformers(sentence)
@@ -68,4 +69,4 @@ def augment(self, sentence: str, num_replace_tokens: int=3) -> List[str]:
             if len(self.sent2) > 0:
                 return self.sent2
             else:
-                return self.sent2
+                return self.sent2
diff --git a/pythainlp/textaugment/word2vec/BPEmb_core.py b/pythainlp/textaugment/word2vec/BPEmb_core.py
diff --git a/pythainlp/textaugment/word2vec/__init__.py b/pythainlp/textaugment/word2vec/__init__.py
@@ -5,10 +5,8 @@
 
 __all__ = [
     "Word2VecAug",
-    "BPEmbAug",
     "Thai2fitAug"
 ]
 
 from pythainlp.textaugment.word2vec.core import Word2VecAug
-from pythainlp.textaugment.word2vec.BPEmb_core import BPEmbAug
 from pythainlp.textaugment.word2vec.thai2fit import Thai2fitAug
diff --git a/pythainlp/textaugment/word2vec/core.py b/pythainlp/textaugment/word2vec/core.py
@@ -19,6 +19,7 @@ def __init__(self, model: str, tokenize: object, type: str = "file") -> None:
         else:
             self.model = model
         self.dict_wv = list(self.model.key_to_index.keys())
+
     def modify_sent(self,sent, p = 0.7) -> List[List[str]]:
         """
         :param str sent: text sentence
@@ -36,6 +37,7 @@ def modify_sent(self,sent, p = 0.7) -> List[List[str]]:
             else:
                 list_sent_new.append([i])
         return list_sent_new
+
     def augment(self, sentence: str, n_sent: int = 1, p:float = 0.7) -> List[Tuple[str]]:
         """
         :param str sentence: text sentence
@@ -50,4 +52,4 @@ def augment(self, sentence: str, n_sent: int = 1, p:float = 0.7) -> List[Tuple[s
         new_sentences = []
         for x in list(itertools.product(*self.list_synonym))[0:n_sent]:
             new_sentences.append(x)
-        return new_sentences
+        return new_sentences
diff --git a/pythainlp/textaugment/word2vec/thai2fit.py b/pythainlp/textaugment/word2vec/thai2fit.py
@@ -14,17 +14,20 @@ class Thai2fitAug:
     def __init__(self):
         self.thai2fit_wv = get_corpus_path('thai2fit_wv')
         self.load_w2v()
+
     def tokenizer(self, text: str) -> List[str]:
         """
         :param str text: thai text
         :rtype: List[str]
         """
         return THAI2FIT_TOKENIZER.word_tokenize(text)
+
     def load_w2v(self): # insert substitute
         """
         Load thai2fit word2vec model
         """
         self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")
+
     def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[str]]:
         """
         Text Augment using word2vec from Thai2Fit
@@ -36,4 +39,4 @@ def augment(self, sentence: str, n_sent: int = 1, p: float = 0.7) -> List[Tuple[
         :return: list of text augment
         :rtype: List[Tuple[str]]
         """
-        return self.aug.augment(sentence, n_sent, p)
+        return self.aug.augment(sentence, n_sent, p)
diff --git a/setup.py b/setup.py
@@ -47,7 +47,7 @@
     "ipa": ["epitran>=1.1"],
     "ml": ["numpy>=1.16", "torch>=1.0.0"],
     "ssg": ["ssg>=0.0.6"],
-    "thai2fit": ["emoji>=0.5.1", "gensim>=3.2.0", "numpy>=1.16.1"],
+    "thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", "numpy>=1.16.1"],
     "thai2rom": ["numpy>=1.16.1", "torch>=1.0.0"],
     "translate": [
         "fairseq>=0.10.0",
@@ -56,7 +56,6 @@
         "torch>=1.0.0",
     ],
     "textaugment": [
-        "bpemb",
         "gensim>=4.0.0",
         "thai2transformers"
     ],
@@ -69,7 +68,6 @@
         "emoji>=0.5.1",
         "epitran>=1.1",
         "fairseq>=0.10.0",
-        "gensim>=3.2.0",
         "nltk>=3.3.*",
         "numpy>=1.16.1",
         "pandas>=0.24",
@@ -79,7 +77,6 @@
         "ssg>=0.0.6",
         "torch>=1.0.0",
         "transformers>=4.1.1",
-        "bpemb",
         "gensim>=4.0.0",
         "thai2transformers"
     ],
diff --git a/tests/test_textaugment.py b/tests/test_textaugment.py
@@ -21,12 +21,7 @@ def test_Thai2fitAug(self):
         _aug = Thai2fitAug()
         self.assertIsNotNone(_aug.tokenizer(self.text))
         self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p = 0.5))
-    
-    def test_BPEmbAug(self):
-        _aug = BPEmbAug()
-        self.assertIsNotNone(_aug.tokenizer(self.text))
-        self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p = 0.5))
-    
+
     def test_Thai2transformersAug(self):
         _aug = Thai2transformersAug()
         self.assertIsNotNone(_aug.augment(self.text2, num_replace_tokens=1))

Original file line number	Diff line number	Diff line change
`@@ -5,10 +5,8 @@`
`5`	`5`
`6`	`6`	`__all__ = [`
`7`	`7`	`"Word2VecAug",`
`8`		`- "BPEmbAug",`
`9`	`8`	`"Thai2fitAug"`
`10`	`9`	`]`
`11`	`10`
`12`	`11`	`from pythainlp.textaugment.word2vec.core import Word2VecAug`
`13`		`-from pythainlp.textaugment.word2vec.BPEmb_core import BPEmbAug`
`14`	`12`	`from pythainlp.textaugment.word2vec.thai2fit import Thai2fitAug`