Skip to content

Commit 04b22d8

Browse files
committed
Small variable renames; handle the engine-not-found case
- Add a `__` prefix to private class members.
- summarize_text() now always returns something: if the summarization engine is not found, it returns the first n sentences.
1 parent e6ef72a commit 04b22d8

File tree

1 file changed

+47
-30
lines changed

1 file changed

+47
-30
lines changed

pythainlp/summarize/__init__.py

Lines changed: 47 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,68 @@
11
# -*- coding: utf-8 -*-
2-
from __future__ import absolute_import,unicode_literals
3-
from pythainlp.corpus import stopwords
4-
from string import punctuation
2+
3+
from __future__ import absolute_import, unicode_literals
4+
55
from collections import defaultdict
6-
from pythainlp.tokenize import sent_tokenize, word_tokenize
76
from heapq import nlargest
7+
from string import punctuation
8+
9+
from pythainlp.corpus import stopwords
10+
from pythainlp.tokenize import sent_tokenize, word_tokenize
11+
12+
813
class FrequencySummarizer:
    """
    Extractive summarizer that ranks sentences by word frequency.

    Each sentence is scored by summing the normalized frequencies of the
    words it contains; the highest-scoring sentences form the summary.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        :param float min_cut: words whose normalized frequency is less than
            or equal to this value are discarded
        :param float max_cut: words whose normalized frequency is greater
            than or equal to this value are discarded
        """
        self.__min_cut = min_cut
        self.__max_cut = max_cut
        # Thai stop words plus ASCII punctuation are never counted.
        self.__stopwords = set(stopwords.words("thai") + list(punctuation))

    def __compute_frequencies(self, word_sent):
        """
        Build a table of normalized word frequencies.

        :param word_sent: iterable of sentences, each an iterable of words
        :return: mapping of word to frequency divided by the highest
            frequency, restricted to values strictly between min_cut and
            max_cut; empty when no countable word is present
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self.__stopwords:
                    freq[word] += 1

        # Nothing countable (empty text, or only stop words/punctuation):
        # max() on an empty sequence would raise ValueError.
        if not freq:
            return freq

        m = float(max(freq.values()))
        # Iterate over a snapshot of the keys because entries are deleted
        # while looping.
        for w in list(freq):
            freq[w] = freq[w] / m
            if freq[w] >= self.__max_cut or freq[w] <= self.__min_cut:
                del freq[w]

        return freq

    def __rank(self, ranking, n):
        """
        Return the keys of the n highest-valued entries of ``ranking``,
        ordered from highest value to lowest.
        """
        return nlargest(n, ranking, key=ranking.get)

    def summarize(self, text, n, tokenizer):
        """
        Select the highest-scoring sentences of ``text``.

        :param str text: text to be summarized
        :param int n: maximum number of sentences to return
        :param str tokenizer: word tokenizer engine passed to word_tokenize
        :return: list of selected sentences, ordered from highest score to
            lowest; sentences containing no scored word are never selected,
            so fewer than n sentences may be returned
        """
        sents = sent_tokenize(text)
        word_sent = [word_tokenize(s, tokenizer) for s in sents]
        self.__freq = self.__compute_frequencies(word_sent)
        ranking = defaultdict(int)

        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self.__freq:
                    ranking[i] += self.__freq[w]
        summaries_idx = self.__rank(ranking, n)

        return [sents[j] for j in summaries_idx]
50+
51+
52+
def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
    """
    Thai text summarization

    :param str text: text to be summarized
    :param int n: number of sentences to be included in the summary
    :param str engine: text summarization engine; "frequency" is the only
        engine handled here, any other value triggers the fallback
    :param str tokenizer: word tokenizer
    :return: list of selected sentences
    :rtype: list[str]
    """
    if engine == "frequency":
        return FrequencySummarizer().summarize(text, n, tokenizer)

    # Engine not found: fall back to the first n sentences. The bound is
    # clamped at 0 so that a negative n yields an empty list instead of
    # slicing from the end of the sentence list.
    return sent_tokenize(text)[: max(n, 0)]

0 commit comments

Comments
 (0)