Showing 83 changed files with 8,211 additions and 52 deletions.
@@ -1,52 +1,9 @@
-# Prerequisites
-*.d
-
-# Object files
-*.o
-*.ko
-*.obj
-*.elf
-
-# Linker output
-*.ilk
-*.map
-*.exp
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Libraries
-*.lib
-*.a
-*.la
-*.lo
-
-# Shared objects (inc. Windows DLLs)
-*.dll
-*.so
-*.so.*
-*.dylib
-
-# Executables
-*.exe
-*.out
-*.app
-*.i*86
-*.x86_64
-*.hex
-
-# Debug files
-*.dSYM/
-*.su
-*.idb
-*.pdb
-
-# Kernel Module Compile Results
-*.mod*
-*.cmd
-.tmp_versions/
-modules.order
-Module.symvers
-Mkfile.old
-dkms.conf
+.idea
+*.pyc
+.DS_Store
+*/.DS_Store
+.ftpconfig
+data/
+baseline/
+mate-tool/
+word2vec/
@@ -0,0 +1,118 @@
import sys
import os
import argparse

'''
Standard ROUGE
guardian(L)
UB1      Rouge-1: 0.498439 Rouge-2: 0.216667 Rouge-l: 0.324901 Rouge-SU*: 0.216997
UB2      Rouge-1: 0.469815 Rouge-2: 0.278474 Rouge-l: 0.344528 Rouge-SU*: 0.208485
LexRank  Rouge-1: 0.210933 Rouge-2: 0.037603 Rouge-l: 0.131110 Rouge-SU*: 0.046715
TextRank Rouge-1: 0.184086 Rouge-2: 0.029617 Rouge-l: 0.117287 Rouge-SU*: 0.037783
ICSI     Rouge-1: 0.257562 Rouge-2: 0.060022 Rouge-l: 0.157313 Rouge-SU*: 0.065799
Luhn     Rouge-1: 0.154681 Rouge-2: 0.022884 Rouge-l: 0.100451 Rouge-SU*: 0.027575
bbc(L)
UB1      Rouge-1: 0.464780 Rouge-2: 0.195108 Rouge-l: 0.272242 Rouge-SU4: 0.197798
UB2      Rouge-1: 0.413318 Rouge-2: 0.227026 Rouge-l: 0.268316 Rouge-SU4: 0.193755
LexRank  Rouge-1: 0.160842 Rouge-2: 0.024327 Rouge-l: 0.097632 Rouge-SU4: 0.042892
TextRank Rouge-1: 0.139200 Rouge-2: 0.021073 Rouge-l: 0.093124 Rouge-SU4: 0.037206
Luhn     Rouge-1: 0.141699 Rouge-2: 0.023175 Rouge-l: 0.091994 Rouge-SU4: 0.038216
ICSI     Rouge-1: 0.209584 Rouge-2: 0.046293 Rouge-l: 0.135454 Rouge-SU4: 0.063704
'''

sys.path.append('../')

from utils.data_helpers import load_data
from tqdm import tqdm
from myrouge.rouge import get_rouge_score

from summarize.upper_bound import ExtractiveUpperbound
from summarize.sume_wrap import SumeWrap
from summarize.sumy.nlp.tokenizers import Tokenizer
from summarize.sumy.parsers.plaintext import PlaintextParser
from summarize.sumy.summarizers.lsa import LsaSummarizer
from summarize.sumy.summarizers.kl import KLSummarizer
from summarize.sumy.summarizers.luhn import LuhnSummarizer
from summarize.sumy.summarizers.lex_rank import LexRankSummarizer
from summarize.sumy.summarizers.text_rank import TextRankSummarizer
from summarize.sumy.nlp.stemmers import Stemmer
from nltk.corpus import stopwords

# Python 2 only: force UTF-8 as the default string encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

parser = argparse.ArgumentParser(description='LiveBlogSum Baseline')
parser.add_argument('-corpus', type=str, default='bbc')
parser.add_argument('-path', type=str, default='../data/')
parser.add_argument('-sum_len', type=int, default=1)

args = parser.parse_args()
args.path = args.path + args.corpus + '/test/'


def get_summary_scores(algo, docs, refs, summary_size):
    """Summarize docs with the given algorithm and score against refs."""
    language = 'english'
    summary = ''
    if algo == 'UB1':
        summarizer = ExtractiveUpperbound(language)
        summary = summarizer(docs, refs, summary_size, ngram_type=1)
    elif algo == 'UB2':
        summarizer = ExtractiveUpperbound(language)
        summary = summarizer(docs, refs, summary_size, ngram_type=2)
    elif algo == 'ICSI':
        summarizer = SumeWrap(language)
        summary = summarizer(docs, summary_size)
    else:
        doc_string = u'\n'.join([u'\n'.join(doc_sents) for doc_sents in docs])
        parser = PlaintextParser.from_string(doc_string, Tokenizer(language))
        stemmer = Stemmer(language)
        if algo == 'LSA':
            summarizer = LsaSummarizer(stemmer)
        if algo == 'KL':
            summarizer = KLSummarizer(stemmer)
        if algo == 'Luhn':
            summarizer = LuhnSummarizer(stemmer)
        if algo == 'LexRank':
            summarizer = LexRankSummarizer(stemmer)
        if algo == 'TextRank':
            summarizer = TextRankSummarizer(stemmer)

        summarizer.stop_words = frozenset(stopwords.words(language))
        summary = summarizer(parser.document, summary_size)
    hyps, refs = map(list, zip(*[[' '.join(summary), ' '.join(model)] for model in refs]))
    # Truncate the hypothesis to the summary budget before scoring.
    hyp = str(hyps[0]).split()
    hyp = ' '.join(hyp[:summary_size])
    ref = str(refs[0])
    score = get_rouge_score(hyp, ref)
    return score['ROUGE-1']['r'], score['ROUGE-2']['r'], score['ROUGE-L']['r'], score['ROUGE-SU4']['r']


if __name__ == '__main__':
    file_names = os.listdir(args.path)
    algos = ['UB1', 'UB2', 'LexRank', 'TextRank', 'Luhn', 'ICSI']
    R1 = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    R2 = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    Rl = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    Rsu = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    for filename in tqdm(file_names):
        data_file = os.path.join(args.path, filename)
        docs, refs = load_data(data_file)
        # Summary budget: sum_len times the length of the first reference.
        sum_len = len(' '.join(refs[0]).split(' ')) * args.sum_len
        print '####', filename, '####'
        for algo in algos:
            r1, r2, rl, rsu = get_summary_scores(algo, docs, refs, sum_len)
            print algo, r1, r2, rl, rsu
            R1[algo] += r1
            R2[algo] += r2
            Rl[algo] += rl
            Rsu[algo] += rsu
    print 'Final Results'
    for algo in algos:
        R1[algo] /= len(file_names)
        R2[algo] /= len(file_names)
        Rl[algo] /= len(file_names)
        Rsu[algo] /= len(file_names)
        print '%s Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' % (algo, R1[algo], R2[algo], Rl[algo], Rsu[algo])
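
To make the expected data shapes concrete, here is a minimal, hypothetical call to get_summary_scores. The toy docs/refs values are invented; in the script they come from load_data (a list of documents, each a list of tokenized sentence strings, and reference summaries in the same shape). It also assumes the vendored sumy/sume dependencies are importable, as the script itself does.

# Hypothetical usage sketch, not part of the commit:
docs = [[u'first sentence of post one .', u'second sentence .'],
        [u'first sentence of post two .']]
refs = [[u'a short reference summary .']]
r1, r2, rl, rsu = get_summary_scores('LexRank', docs, refs, 10)
print 'LexRank', r1, r2, rl, rsu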
Empty file.
@@ -0,0 +1,2 @@
from base import *
from models import *
@@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-

""" Base structures and functions for the sume module.
Base contains the Sentence, LoadFile and State classes.
author: florian boudin (florian.boudin@univ-nantes.fr)
version: 0.1
date: Nov. 2014
"""

import re
import os
import codecs
from collections import Counter


class State:
    """ State class
    Internal class used as a structure to keep track of the search state in
    the tabu_search method.
    Args:
        subset (set): a subset of sentences
        concepts (Counter): a set of concepts for the subset
        length (int): the length in words
        score (int): the score for the subset
    """
    def __init__(self):
        self.subset = set()
        self.concepts = Counter()
        self.length = 0
        self.score = 0

class Sentence:
    """The sentence data structure.
    Args:
        tokens (list of str): the list of word tokens.
        doc_id (str): the identifier of the document from which the sentence
          comes.
        position (int): the position of the sentence in the source document.
    """
    def __init__(self, tokens, doc_id, position, phrases=None, dict_tokens_pos=None):

        # Use None defaults instead of mutable default arguments, which
        # would be shared across all Sentence instances.
        if phrases is None:
            phrases = []
        if dict_tokens_pos is None:
            dict_tokens_pos = {}

        self.tokens = tokens
        """ tokens as a list. """

        self.doc_id = doc_id
        """ document identifier of the sentence. """

        self.position = position
        """ position of the sentence within the document. """

        self.concepts = []
        """ concepts of the sentence. """

        self.untokenized_form = ''
        """ untokenized form of the sentence. """

        self.length = 0
        """ length of the untokenized sentence. """

        self.phrases = phrases
        """ phrases of the sentence. """

        self.tokens_pos = dict_tokens_pos

class LoadFile(object):
    """Objects which inherit from this class have read file functions.
    """

    def __init__(self, input_directory):
        """
        Args:
            input_directory (str): the path of the directory containing the
              input files.
        """
        self.input_directory = input_directory
        self.sentences = []

    def read_documents(self, file_extension="txt"):
        """Read the input files in the given directory.
        Load the input files and populate the sentence list. Input files are
        expected to be in one tokenized sentence per line format.
        Args:
            file_extension (str): the file extension for input documents,
              defaults to txt.
        """
        for infile in os.listdir(self.input_directory):

            # skip files with the wrong extension
            if not infile.endswith(file_extension):
                continue

            with codecs.open(self.input_directory + '/' + infile,
                             'r',
                             'utf-8') as f:

                # load the sentences
                lines = f.readlines()

                # loop over sentences
                for i in range(len(lines)):

                    # split the sentence into tokens
                    tokens = lines[i].strip().split(' ')

                    # add the sentence
                    if len(tokens) > 0:
                        sentence = Sentence(tokens, infile, i)
                        untokenized_form = untokenize(tokens)
                        sentence.untokenized_form = untokenized_form
                        sentence.length = len(untokenized_form.split(' '))
                        self.sentences.append(sentence)

def untokenize(tokens):
    """Untokenize a list of tokens into a readable string.
    Args:
        tokens (list of str): the list of tokens to untokenize.
    Returns:
        a string
    """
    text = u' '.join(tokens)
    text = re.sub(u"\s+", u" ", text.strip())
    text = re.sub(u" ('[a-z]) ", u"\g<1> ", text)
    text = re.sub(u" ([\.;,-]) ", u"\g<1> ", text)
    text = re.sub(u" ([\.;,-?!])$", u"\g<1>", text)
    text = re.sub(u" _ (.+) _ ", u" _\g<1>_ ", text)
    text = re.sub(u" \$ ([\d\.]+) ", u" $\g<1> ", text)
    text = text.replace(u" ' ", u"' ")
    text = re.sub(u"([\W\s])\( ", u"\g<1>(", text)
    text = re.sub(u" \)([\W\s])", u")\g<1>", text)
    text = text.replace(u"`` ", u"``")
    text = text.replace(u" ''", u"''")
    text = text.replace(u" n't", u"n't")
    text = re.sub(u'(^| )" ([^"]+) "( |$)', u'\g<1>"\g<2>"\g<3>', text)
    # Penn Treebank bracket tokens
    text = re.sub(u' -RRB-', u')', text)
    text = re.sub(u'-LRB- ', u'(', text)
    # times
    text = re.sub('(\d+) : (\d+ [ap]\.m\.)', '\g<1>:\g<2>', text)

    text = re.sub('^" ', '"', text)
    text = re.sub(' "$', '"', text)
    text = re.sub(u"\s+", u" ", text.strip())

    return text
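
To make the pieces above concrete, here is a minimal usage sketch. It is hypothetical: the token list and document id are invented, and it assumes Sentence and untokenize from this module are in scope.

# Hypothetical example (Python 2, matching this module):
tokens = [u'it', u'is', u"n't", u'$', u'3.50', u'today', u'.']
sent = Sentence(tokens, 'doc1.txt', 0)
sent.untokenized_form = untokenize(tokens)
sent.length = len(sent.untokenized_form.split(' '))
print sent.untokenized_form   # -> it isn't $3.50 today.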
@@ -0,0 +1 @@
from concept_based import *