text_script.py
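"""Frequency-based extractive text summarizer.

Scores each sentence by the normalized frequencies of its non-stopword
words and writes the top-ranked sentences to result.txt.
"""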
from collections import defaultdict
from heapq import nlargest
from string import punctuation

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
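# The tokenizers and stopword list depend on NLTK data packages; on a fresh
# environment they can be fetched once with:
#   python -m nltk.downloader punkt stopwords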

class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initialize the text summarizer.
        Words whose normalized frequency is lower than min_cut
        or higher than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        # Stopwords and punctuation never count toward word frequencies.
        self._stopwords = set(stopwords.words('english') + list(punctuation))
    def _compute_frequencies(self, word_sent):
        """
        Compute the frequency of each word.
        Input:
          word_sent, a list of sentences already tokenized.
        Output:
          freq, a dictionary where freq[w] is the normalized frequency
          of w, with words outside (min_cut, max_cut) filtered out.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # Normalize by the highest count and drop words that are too
        # frequent or too rare to be informative.
        m = float(max(freq.values()))
        return {w: f / m for w, f in freq.items()
                if self._min_cut < f / m < self._max_cut}
    def summarize(self, text, n):
        """
        Return a list of n sentences
        which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents), "summary cannot be longer than the text"
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        # Score each sentence by the summed frequencies of its words.
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]
    def _rank(self, ranking, n):
        """Return the indices of the n highest-ranked sentences."""
        return nlargest(n, ranking, key=ranking.get)

def get_text(file_name):
    """Summarize the text in file_name, writing the result to result.txt."""
    fs = FrequencySummarizer()
    with open(file_name, 'r') as file:
        text = file.read()
    with open('result.txt', 'w') as f:
        for s in fs.summarize(text, 4):
            f.write('*' + s + '\n')
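
# A minimal entry point sketch; 'article.txt' is a hypothetical input file
# holding the plain text to summarize (not part of the original script).
if __name__ == '__main__':
    get_text('article.txt')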