Rename variables and handle the engine-not-found case

bact · bact · commit 04b22d88ac63 · 2018-10-20T09:33:28.000+07:00
- change the prefix of private class members from a single underscore (_) to a double underscore (__)
- summarize_text() now always returns a list of sentences (if the summarization engine is not found, it returns the first n sentences)
diff --git a/pythainlp/summarize/__init__.py b/pythainlp/summarize/__init__.py
@@ -1,51 +1,68 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
-from pythainlp.corpus import stopwords
-from string import punctuation
+
+from __future__ import absolute_import, unicode_literals
+
 from collections import defaultdict
-from pythainlp.tokenize import sent_tokenize, word_tokenize
 from heapq import nlargest
+from string import punctuation
+
+from pythainlp.corpus import stopwords
+from pythainlp.tokenize import sent_tokenize, word_tokenize
+
+
 class FrequencySummarizer:
     def __init__(self, min_cut=0.1, max_cut=0.9):
-        self._min_cut = min_cut
-        self._max_cut = max_cut
-        self._stopwords = set(stopwords.words('thai') + list(punctuation))
+        self.__min_cut = min_cut
+        self.__max_cut = max_cut
+        self.__stopwords = set(stopwords.words("thai") + list(punctuation))
 
-    def _compute_frequencies(self, word_sent):
+    def __compute_frequencies(self, word_sent):
         freq = defaultdict(int)
         for s in word_sent:
             for word in s:
-                if word not in self._stopwords:
+                if word not in self.__stopwords:
                     freq[word] += 1
+
         m = float(max(freq.values()))
         for w in list(freq):
-            freq[w] = freq[w]/m
-            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
+            freq[w] = freq[w] / m
+            if freq[w] >= self.__max_cut or freq[w] <= self.__min_cut:
                 del freq[w]
+
         return freq
 
-    def _rank(self, ranking, n):
+    def __rank(self, ranking, n):
         return nlargest(n, ranking, key=ranking.get)
 
-    def summarize(self, text, n,tokenize):
+    def summarize(self, text, n, tokenizer):
         sents = sent_tokenize(text)
-        word_sent = [word_tokenize(s,tokenize) for s in sents]
-        self._freq = self._compute_frequencies(word_sent)
+        word_sent = [word_tokenize(s, tokenizer) for s in sents]
+        self.__freq = self.__compute_frequencies(word_sent)
         ranking = defaultdict(int)
+
         for i, sent in enumerate(word_sent):
             for w in sent:
-                if w in self._freq:
-                    ranking[i] += self._freq[w]
-        sents_idx = self._rank(ranking,n)
-        return [sents[j] for j in sents_idx]
-def summarize_text(text,n,engine='frequency',tokenize='newmm'):
-    '''
-    Thai text summarize.
-    :param str text: thai text
-    :param int n: sent number
-    :param str engine: Thai text summarize engine.
-    :param str tokenize: thai word tokenize.
-    '''
-    if engine=='frequency':
-        data=FrequencySummarizer().summarize(text,n,tokenize)
-    return data
+                if w in self.__freq:
+                    ranking[i] += self.__freq[w]
+        summaries_idx = self.__rank(ranking, n)
+
+        return [sents[j] for j in summaries_idx]
+
+
+def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
+    """
+    Thai text summarization
+    :param str text: text to be summarized
+    :param int n: number of sentences to be included in the summary
+    :param str engine: text summarization engine
+    :param str tokenizer: word tokenizer
+    :return List[str] summary: list of selected sentences
+    """
+    sents = []
+
+    if engine == "frequency":
+        sents = FrequencySummarizer().summarize(text, n, tokenizer)
+    else:  # if engine not found, return first n sentences
+        sents = sent_tokenize(text)[:n]
+
+    return sents