Update README, fix the transform method in DeepcutTokenizer close #57
titipata committed Sep 17, 2019
1 parent 3645820 commit b8459df
Showing 2 changed files with 30 additions and 21 deletions.
7 changes: 5 additions & 2 deletions README.md
@@ -103,8 +103,11 @@ Here is an example usage:
from deepcut import DeepcutTokenizer
tokenizer = DeepcutTokenizer(ngram_range=(1,1),
max_df=1.0, min_df=0.0)
X = tokenizer.fit_tranform(['ฉันบินได้', 'ฉันกินข้าว', 'ฉันอยากบิน']) # 3 x 4 CSR sparse matrix
print(tokenizer.vocabulary_) # {'กิน': 0, 'ข้าว': 3, 'อยาก': 1, 'ได้': 2}
X = tokenizer.fit_tranform(['ฉันบินได้', 'ฉันกินข้าว', 'ฉันอยากบิน']) # 3 x 6 CSR sparse matrix
print(tokenizer.vocabulary_) # {'บิน': 0, 'ได้': 1, 'ฉัน': 2, 'อยาก': 3, 'ข้าว': 4, 'กิน': 5}

X_test = tokenizer.transform(['ฉันกิน', 'ฉันไม่อยากบิน']) # use built tokenizer to transform new text
print(X_test.shape)
```
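
For readers trying the snippet above, a minimal sketch of inspecting the returned CSR matrices — illustrative only, assuming scipy is installed; the shapes in the comments follow the README example:

```python
from deepcut import DeepcutTokenizer

tokenizer = DeepcutTokenizer(ngram_range=(1, 1), max_df=1.0, min_df=0.0)
X = tokenizer.fit_tranform(['ฉันบินได้', 'ฉันกินข้าว', 'ฉันอยากบิน'])

# Columns are ordered by tokenizer.vocabulary_ (term -> column index).
print(X.shape)       # expected (3, 6) per the example above
print(X.toarray())   # dense view of the document-term matrix

# transform() reuses the fitted vocabulary, so new text maps onto the same columns.
X_test = tokenizer.transform(['ฉันกิน', 'ฉันไม่อยากบิน'])
print(X_test.shape)  # expected to have the same number of columns as X
```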


44 changes: 25 additions & 19 deletions deepcut/deepcut.py
Expand Up @@ -139,6 +139,8 @@ def __init__(self, ngram_range=(1, 1), stop_words=None,
self.dtype = dtype
self.max_df = max_df
self.min_df = min_df
if max_df < 0 or min_df < 0:
raise ValueError("negative value for max_df or min_df")
self.max_features = max_features
self.stop_words = _check_stop_list(stop_words)
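
To make the new guard at the top of `__init__` concrete, a quick sketch with hypothetical values of what callers now see:

```python
from deepcut import DeepcutTokenizer

# Fractional or integer document-frequency bounds are accepted as before ...
tokenizer = DeepcutTokenizer(ngram_range=(1, 1), max_df=0.9, min_df=2)

# ... but negative bounds are now rejected at construction time.
try:
    DeepcutTokenizer(max_df=-0.5)
except ValueError as err:
    print(err)  # negative value for max_df or min_df
```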

@@ -190,13 +192,13 @@ def _limit_features(self, X, vocabulary,

# Calculate a mask based on document frequencies
dfs = _document_frequency(X)
tfs = np.asarray(X.sum(axis=0)).ravel()
mask = np.ones(len(dfs), dtype=bool)
if high is not None:
mask &= dfs <= high
if low is not None:
mask &= dfs >= low
if limit is not None and mask.sum() > limit:
tfs = np.asarray(X.sum(axis=0)).ravel()
mask_inds = (-tfs[mask]).argsort()[:limit]
new_mask = np.zeros(len(dfs), dtype=bool)
new_mask[np.where(mask)[0][mask_inds]] = True
@@ -214,10 +216,15 @@ def _limit_features(self, X, vocabulary,
if len(kept_indices) == 0:
raise ValueError("After pruning, no terms remain. Try a lower"
" min_df or a higher max_df.")
return X[:, kept_indices], vocabulary, removed_terms
return X[:, kept_indices], removed_terms
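
For context, a standalone sketch of the document-frequency pruning that `_limit_features` performs — illustrative only, not part of this commit, with a toy CSR matrix and a hypothetical `document_frequency` helper standing in for `_document_frequency`:

```python
import numpy as np
from scipy.sparse import csr_matrix

def document_frequency(X):
    # Number of documents each term (column) appears in.
    return np.bincount(X.indices, minlength=X.shape[1])

X = csr_matrix(np.array([[1, 0, 2],
                         [1, 1, 0],
                         [1, 0, 0]]))
dfs = document_frequency(X)            # [3, 1, 1]
high, low, limit = 2, 1, None          # hypothetical max/min document counts

mask = np.ones(len(dfs), dtype=bool)
mask &= dfs <= high                    # drop terms that are too common
mask &= dfs >= low                     # drop terms that are too rare
if limit is not None and mask.sum() > limit:
    tfs = np.asarray(X.sum(axis=0)).ravel()   # total term frequencies
    top = (-tfs[mask]).argsort()[:limit]      # keep only the most frequent survivors
    new_mask = np.zeros(len(dfs), dtype=bool)
    new_mask[np.where(mask)[0][top]] = True
    mask = new_mask

kept_indices = np.where(mask)[0]
print(kept_indices)                    # [1 2] -- the first column is too common
print(X[:, kept_indices].toarray())    # pruned document-term matrix
```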


def transform(self, raw_documents, new_document=False):
"""
raw_documents: list, list of new documents to be transformed
new_document: bool, if True, treat the documents as new and build a new self.vocabulary_,
if False, reuse the existing self.vocabulary_
"""
n_doc = len(raw_documents)
tokenized_documents = []
for doc in raw_documents:
@@ -250,22 +257,22 @@ def transform(self, raw_documents, new_document=False):
dtype=self.dtype)

# truncate vocabulary by max_df and min_df
max_df = self.max_df
min_df = self.min_df
max_doc_count = (max_df
if isinstance(max_df, numbers.Integral)
else max_df * n_doc)
min_doc_count = (min_df
if isinstance(min_df, numbers.Integral)
else min_df * n_doc)
if max_doc_count < min_doc_count:
raise ValueError(
"max_df corresponds to < documents than min_df")
X, vocabulary, _ = self._limit_features(X, self.vocabulary_,
max_doc_count,
min_doc_count,
self.max_features)
self.vocabulary_ = vocabulary
if new_document:
max_df = self.max_df
min_df = self.min_df
max_doc_count = (max_df
if isinstance(max_df, numbers.Integral)
else max_df * n_doc)
min_doc_count = (min_df
if isinstance(min_df, numbers.Integral)
else min_df * n_doc)
if max_doc_count < min_doc_count:
raise ValueError(
"max_df corresponds to < documents than min_df")
X, _ = self._limit_features(X, self.vocabulary_,
max_doc_count,
min_doc_count,
self.max_features)

return X
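
To make the `new_document` branch above concrete: fractional `max_df`/`min_df` values are interpreted as a share of the corpus and integer values as absolute document counts before `_limit_features` is applied. A small sketch with hypothetical numbers:

```python
import numbers

def to_doc_count(bound, n_doc):
    # Floats are a fraction of the corpus; integers are absolute document counts.
    return bound if isinstance(bound, numbers.Integral) else bound * n_doc

n_doc = 200                                  # hypothetical corpus size
max_doc_count = to_doc_count(0.95, n_doc)    # 190.0 -> terms in more docs are dropped
min_doc_count = to_doc_count(2, n_doc)       # 2     -> terms in fewer docs are dropped
if max_doc_count < min_doc_count:
    raise ValueError("max_df corresponds to < documents than min_df")
print(max_doc_count, min_doc_count)
```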

@@ -280,7 +287,6 @@ def fit_tranform(self, raw_documents):

def tokenize(self, text, custom_dict=None):
n_pad = 21
n_pad_2 = int((n_pad - 1)/2)

if not text:
return [''] # case of empty string
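
The `n_pad = 21` constant in `tokenize` suggests a fixed character window of 10 characters of context on each side of the character being labelled (the removed `n_pad_2` line computed exactly that half-width). A rough, purely illustrative sketch of that kind of windowing — the helper below is hypothetical and not deepcut's actual feature code:

```python
def char_windows(text, n_pad=21, pad=' '):
    # Pad both ends so every character has a full window of context.
    half = (n_pad - 1) // 2                       # 10 characters on each side
    padded = pad * half + text + pad * half
    return [padded[i:i + n_pad] for i in range(len(text))]

for window in char_windows('ฉันกินข้าว'):
    print(repr(window))                           # one 21-character window per character
```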
