Update README, fix the transform method in DeepcutTokenizer close #57
titipata committed Sep 17, 2019
1 parent 3645820 commit b8459df
Showing 2 changed files with 30 additions and 21 deletions.
7 changes: 5 additions & 2 deletions README.md
@@ -103,8 +103,11 @@ Here is an example usage:
from deepcut import DeepcutTokenizer
tokenizer = DeepcutTokenizer(ngram_range=(1,1),
max_df=1.0, min_df=0.0)
X = tokenizer.fit_tranform(['ฉันบินได้', 'ฉันกินข้าว', 'ฉันอยากบิน']) # 3 x 4 CSR sparse matrix
print(tokenizer.vocabulary_) # {'กิน': 0, 'ข้าว': 3, 'อยาก': 1, 'ได้': 2}
X = tokenizer.fit_tranform(['ฉันบินได้', 'ฉันกินข้าว', 'ฉันอยากบิน']) # 3 x 6 CSR sparse matrix
print(tokenizer.vocabulary_) # {'บิน': 0, 'ได้': 1, 'ฉัน': 2, 'อยาก': 3, 'ข้าว': 4, 'กิน': 5}

X_test = tokenizer.transform(['ฉันกิน', 'ฉันไม่อยากบิน']) # use built tokenizer to transform new text
print(X_test.shape)
```
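
For readers trying the snippet above, a minimal sketch of inspecting the returned CSR matrices — illustrative only, assuming scipy is installed; the shapes in the comments follow the README example:

```python
from deepcut import DeepcutTokenizer

tokenizer = DeepcutTokenizer(ngram_range=(1, 1), max_df=1.0, min_df=0.0)
X = tokenizer.fit_tranform(['ฉันบินได้', 'ฉันกินข้าว', 'ฉันอยากบิน'])

# Columns are ordered by tokenizer.vocabulary_ (term -> column index).
print(X.shape)       # expected (3, 6) per the example above
print(X.toarray())   # dense view of the document-term matrix

# transform() reuses the fitted vocabulary, so new text maps onto the same columns.
X_test = tokenizer.transform(['ฉันกิน', 'ฉันไม่อยากบิน'])
print(X_test.shape)  # expected to have the same number of columns as X
```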


44 changes: 25 additions & 19 deletions deepcut/deepcut.py
Expand Up @@ -139,6 +139,8 @@ def __init__(self, ngram_range=(1, 1), stop_words=None,
self.dtype = dtype
self.max_df = max_df
self.min_df = min_df
if max_df < 0 or min_df < 0:
raise ValueError("negative value for max_df or min_df")
self.max_features = max_features
self.stop_words = _check_stop_list(stop_words)
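
To make the new guard at the top of `__init__` concrete, a quick sketch with hypothetical values of what callers now see:

```python
from deepcut import DeepcutTokenizer

# Fractional or integer document-frequency bounds are accepted as before ...
tokenizer = DeepcutTokenizer(ngram_range=(1, 1), max_df=0.9, min_df=2)

# ... but negative bounds are now rejected at construction time.
try:
    DeepcutTokenizer(max_df=-0.5)
except ValueError as err:
    print(err)  # negative value for max_df or min_df
```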

@@ -190,13 +192,13 @@ def _limit_features(self, X, vocabulary,

# Calculate a mask based on document frequencies
dfs = _document_frequency(X)
tfs = np.asarray(X.sum(axis=0)).ravel()
mask = np.ones(len(dfs), dtype=bool)
if high is not None:
mask &= dfs <= high
if low is not None:
mask &= dfs >= low
if limit is not None and mask.sum() > limit:
tfs = np.asarray(X.sum(axis=0)).ravel()
mask_inds = (-tfs[mask]).argsort()[:limit]
new_mask = np.zeros(len(dfs), dtype=bool)
new_mask[np.where(mask)[0][mask_inds]] = True
@@ -214,10 +216,15 @@ def _limit_features(self, X, vocabulary,
if len(kept_indices) == 0:
raise ValueError("After pruning, no terms remain. Try a lower"
" min_df or a higher max_df.")
return X[:, kept_indices], vocabulary, removed_terms
return X[:, kept_indices], removed_terms
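
For context, a standalone sketch of the document-frequency pruning that `_limit_features` performs — illustrative only, not part of this commit, with a toy CSR matrix and a hypothetical `document_frequency` helper standing in for `_document_frequency`:

```python
import numpy as np
from scipy.sparse import csr_matrix

def document_frequency(X):
    # Number of documents each term (column) appears in.
    return np.bincount(X.indices, minlength=X.shape[1])

X = csr_matrix(np.array([[1, 0, 2],
                         [1, 1, 0],
                         [1, 0, 0]]))
dfs = document_frequency(X)            # [3, 1, 1]
high, low, limit = 2, 1, None          # hypothetical max/min document counts

mask = np.ones(len(dfs), dtype=bool)
mask &= dfs <= high                    # drop terms that are too common
mask &= dfs >= low                     # drop terms that are too rare
if limit is not None and mask.sum() > limit:
    tfs = np.asarray(X.sum(axis=0)).ravel()   # total term frequencies
    top = (-tfs[mask]).argsort()[:limit]      # keep only the most frequent survivors
    new_mask = np.zeros(len(dfs), dtype=bool)
    new_mask[np.where(mask)[0][top]] = True
    mask = new_mask

kept_indices = np.where(mask)[0]
print(kept_indices)                    # [1 2] -- the first column is too common
print(X[:, kept_indices].toarray())    # pruned document-term matrix
```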


def transform(self, raw_documents, new_document=False):
"""
raw_documents: list, list of new documents to be transformed
new_document: bool, if True, treat the documents as new and build a new self.vocabulary_,
if False, reuse the existing self.vocabulary_
"""
n_doc = len(raw_documents)
tokenized_documents = []
for doc in raw_documents:
@@ -250,22 +257,22 @@ def transform(self, raw_documents, new_document=False):
dtype=self.dtype)

# truncate vocabulary by max_df and min_df
max_df = self.max_df
min_df = self.min_df
max_doc_count = (max_df
if isinstance(max_df, numbers.Integral)
else max_df * n_doc)
min_doc_count = (min_df
if isinstance(min_df, numbers.Integral)
else min_df * n_doc)
if max_doc_count < min_doc_count:
raise ValueError(
"max_df corresponds to < documents than min_df")
X, vocabulary, _ = self._limit_features(X, self.vocabulary_,
max_doc_count,
min_doc_count,
self.max_features)
self.vocabulary_ = vocabulary
if new_document:
max_df = self.max_df
min_df = self.min_df
max_doc_count = (max_df
if isinstance(max_df, numbers.Integral)
else max_df * n_doc)
min_doc_count = (min_df
if isinstance(min_df, numbers.Integral)
else min_df * n_doc)
if max_doc_count < min_doc_count:
raise ValueError(
"max_df corresponds to < documents than min_df")
X, _ = self._limit_features(X, self.vocabulary_,
max_doc_count,
min_doc_count,
self.max_features)

return X
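
To make the `new_document` branch above concrete: fractional `max_df`/`min_df` values are interpreted as a share of the corpus and integer values as absolute document counts before `_limit_features` is applied. A small sketch with hypothetical numbers:

```python
import numbers

def to_doc_count(bound, n_doc):
    # Floats are a fraction of the corpus; integers are absolute document counts.
    return bound if isinstance(bound, numbers.Integral) else bound * n_doc

n_doc = 200                                  # hypothetical corpus size
max_doc_count = to_doc_count(0.95, n_doc)    # 190.0 -> terms in more docs are dropped
min_doc_count = to_doc_count(2, n_doc)       # 2     -> terms in fewer docs are dropped
if max_doc_count < min_doc_count:
    raise ValueError("max_df corresponds to < documents than min_df")
print(max_doc_count, min_doc_count)
```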

@@ -280,7 +287,6 @@ def fit_tranform(self, raw_documents):

def tokenize(self, text, custom_dict=None):
n_pad = 21
n_pad_2 = int((n_pad - 1)/2)

if not text:
return [''] # case of empty string
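
The `n_pad = 21` constant in `tokenize` suggests a fixed character window of 10 characters of context on each side of the character being labelled (the removed `n_pad_2` line computed exactly that half-width). A rough, purely illustrative sketch of that kind of windowing — the helper below is hypothetical and not deepcut's actual feature code:

```python
def char_windows(text, n_pad=21, pad=' '):
    # Pad both ends so every character has a full window of context.
    half = (n_pad - 1) // 2                       # 10 characters on each side
    padded = pad * half + text + pad * half
    return [padded[i:i + n_pad] for i in range(len(text))]

for window in char_windows('ฉันกินข้าว'):
    print(repr(window))                           # one 21-character window per character
```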
