diff --git a/.gitignore b/.gitignore index c6127b3..f5f261c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,52 +1,9 @@ -# Prerequisites -*.d - -# Object files -*.o -*.ko -*.obj -*.elf - -# Linker output -*.ilk -*.map -*.exp - -# Precompiled Headers -*.gch -*.pch - -# Libraries -*.lib -*.a -*.la -*.lo - -# Shared objects (inc. Windows DLLs) -*.dll -*.so -*.so.* -*.dylib - -# Executables -*.exe -*.out -*.app -*.i*86 -*.x86_64 -*.hex - -# Debug files -*.dSYM/ -*.su -*.idb -*.pdb - -# Kernel Module Compile Results -*.mod* -*.cmd -.tmp_versions/ -modules.order -Module.symvers -Mkfile.old -dkms.conf +.idea +*.pyc +.DS_Store +*/.DS_Store +.ftpconfig +data/ +baseline/ +mate-tool/ +word2vec/ diff --git a/baselines/baseline2/baseline.py b/baselines/baseline2/baseline.py new file mode 100755 index 0000000..c316351 --- /dev/null +++ b/baselines/baseline2/baseline.py @@ -0,0 +1,118 @@ +import sys +import os +import argparse + +''' +Standard ROUGE + +guardian(L) +UB1 Rouge-1: 0.498439 Rouge-2: 0.216667 Rouge-l: 0.324901 Rouge-SU*: 0.216997 +UB2 Rouge-1: 0.469815 Rouge-2: 0.278474 Rouge-l: 0.344528 Rouge-SU*: 0.208485 +LexRank Rouge-1: 0.210933 Rouge-2: 0.037603 Rouge-l: 0.131110 Rouge-SU*: 0.046715 +TextRank Rouge-1: 0.184086 Rouge-2: 0.029617 Rouge-l: 0.117287 Rouge-SU*: 0.037783 +ICSI Rouge-1: 0.257562 Rouge-2: 0.060022 Rouge-l: 0.157313 Rouge-SU*: 0.065799 +Luhn Rouge-1: 0.154681 Rouge-2: 0.022884 Rouge-l: 0.100451 Rouge-SU*: 0.027575 + +bbc(L) +UB1 Rouge-1: 0.464780 Rouge-2: 0.195108 Rouge-l: 0.272242 Rouge-SU4: 0.197798 +UB2 Rouge-1: 0.413318 Rouge-2: 0.227026 Rouge-l: 0.268316 Rouge-SU4: 0.193755 +LexRank Rouge-1: 0.160842 Rouge-2: 0.024327 Rouge-l: 0.097632 Rouge-SU4: 0.042892 +TextRank Rouge-1: 0.139200 Rouge-2: 0.021073 Rouge-l: 0.093124 Rouge-SU4: 0.037206 +Luhn Rouge-1: 0.141699 Rouge-2: 0.023175 Rouge-l: 0.091994 Rouge-SU4: 0.038216 +ICSI Rouge-1: 0.209584 Rouge-2: 0.046293 Rouge-l: 0.135454 Rouge-SU4: 0.063704 + +''' + +sys.path.append('../') + +from utils.data_helpers import load_data +from tqdm import tqdm +from myrouge.rouge import get_rouge_score + +from summarize.upper_bound import ExtractiveUpperbound +from summarize.sume_wrap import SumeWrap +from summarize.sumy.nlp.tokenizers import Tokenizer +from summarize.sumy.parsers.plaintext import PlaintextParser +from summarize.sumy.summarizers.lsa import LsaSummarizer +from summarize.sumy.summarizers.kl import KLSummarizer +from summarize.sumy.summarizers.luhn import LuhnSummarizer +from summarize.sumy.summarizers.lex_rank import LexRankSummarizer +from summarize.sumy.summarizers.text_rank import TextRankSummarizer +from summarize.sumy.nlp.stemmers import Stemmer +from nltk.corpus import stopwords +import sys + +reload(sys) +sys.setdefaultencoding('utf-8') +parser = argparse.ArgumentParser(description='LiveBlogSum Baseline') +parser.add_argument('-corpus', type=str, default='bbc') +parser.add_argument('-path', type=str, default='../data/') +parser.add_argument('-sum_len', type=int, default=1) + +args = parser.parse_args() +args.path = args.path + args.corpus + '/test/' + + +def get_summary_scores(algo, docs, refs, summary_size): + language = 'english' + summary = '' + if algo == 'UB1': + summarizer = ExtractiveUpperbound(language) + summary = summarizer(docs, refs, summary_size, ngram_type=1) + elif algo == 'UB2': + summarizer = ExtractiveUpperbound(language) + summary = summarizer(docs, refs, summary_size, ngram_type=2) + elif algo == 'ICSI': + summarizer = SumeWrap(language) + summary = summarizer(docs, summary_size) + else: + doc_string = 
u'\n'.join([u'\n'.join(doc_sents) for doc_sents in docs]) + parser = PlaintextParser.from_string(doc_string, Tokenizer(language)) + stemmer = Stemmer(language) + if algo == 'LSA': + summarizer = LsaSummarizer(stemmer) + if algo == 'KL': + summarizer = KLSummarizer(stemmer) + if algo == 'Luhn': + summarizer = LuhnSummarizer(stemmer) + if algo == 'LexRank': + summarizer = LexRankSummarizer(stemmer) + if algo == 'TextRank': + summarizer = TextRankSummarizer(stemmer) + + summarizer.stop_words = frozenset(stopwords.words(language)) + summary = summarizer(parser.document, summary_size) + hyps, refs = map(list, zip(*[[' '.join(summary), ' '.join(model)] for model in refs])) + hyp = str(hyps[0]).split() + hyp = ' '.join(hyp[:summary_size]) + ref = str(refs[0]) + score = get_rouge_score(hyp, ref) + return score['ROUGE-1']['r'], score['ROUGE-2']['r'], score['ROUGE-L']['r'], score['ROUGE-SU4']['r'] + + +if __name__ == '__main__': + file_names = os.listdir(args.path) + algos = ['UB1', 'UB2', 'LexRank', 'TextRank', 'Luhn', 'ICSI'] + R1 = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0} + R2 = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0} + Rl = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0} + Rsu = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0} + for filename in tqdm(file_names): + data_file = os.path.join(args.path, filename) + docs, refs = load_data(data_file) + sum_len = len(' '.join(refs[0]).split(' ')) * args.sum_len + print('####', filename, '####') + for algo in algos: + r1, r2, rl, rsu = get_summary_scores(algo, docs, refs, sum_len) + print algo, r1, r2, rl, rsu + R1[algo] += r1 + R2[algo] += r2 + Rl[algo] += rl + Rsu[algo] += rsu + print('Final Results') + for algo in algos: + R1[algo] /= len(file_names) + R2[algo] /= len(file_names) + Rl[algo] /= len(file_names) + Rsu[algo] /= len(file_names) + print('%s Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' % (algo, R1[algo], R2[algo], Rl[algo], Rsu[algo])) diff --git a/baselines/baseline2/summarize/__init__.py b/baselines/baseline2/summarize/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/baselines/baseline2/summarize/sume/__init__.py b/baselines/baseline2/summarize/sume/__init__.py new file mode 100755 index 0000000..8c41bc6 --- /dev/null +++ b/baselines/baseline2/summarize/sume/__init__.py @@ -0,0 +1,2 @@ +from base import * +from models import * diff --git a/baselines/baseline2/summarize/sume/base.py b/baselines/baseline2/summarize/sume/base.py new file mode 100755 index 0000000..40548a6 --- /dev/null +++ b/baselines/baseline2/summarize/sume/base.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +""" Base structures and functions for the sume module. + + Base contains the Sentence, LoadFile and State classes. + + + author: florian boudin (florian.boudin@univ-nantes.fr) + version: 0.1 + date: Nov. 2014 +""" + +import re +import os +import codecs +from collections import Counter + +class State: + """ State class + + Internal class used as a structure to keep track of the search state in + the tabu_search method. 
+ + Args: + subset (set): a subset of sentences + concepts (Counter): a set of concepts for the subset + length (int): the length in words + score (int): the score for the subset + """ + def __init__(self): + self.subset = set() + self.concepts = Counter() + self.length = 0 + self.score = 0 + +class Sentence: + """The sentence data structure. + + Args: + tokens (list of str): the list of word tokens. + doc_id (str): the identifier of the document from which the sentence + comes from. + position (int): the position of the sentence in the source document. + """ + def __init__(self, tokens, doc_id, position, phrases = [], dict_tokens_pos={}): + + self.tokens = tokens + """ tokens as a list. """ + + self.doc_id = doc_id + """ document identifier of the sentence. """ + + self.position = position + """ position of the sentence within the document. """ + + self.concepts = [] + """ concepts of the sentence. """ + + self.untokenized_form = '' + """ untokenized form of the sentence. """ + + self.length = 0 + """ length of the untokenized sentence. """ + + self.phrases = phrases + """ phrases of the sentence. """ + + self.tokens_pos = dict_tokens_pos + +class LoadFile(object): + """Objects which inherit from this class have read file functions. + + """ + + def __init__(self, input_directory): + """ + Args: + input_file (str): the path of the input file. + use_stems (bool): whether stems should be used instead of words, + defaults to False. + + """ + self.input_directory = input_directory + self.sentences = [] + + def read_documents(self, file_extension="txt"): + """Read the input files in the given directory. + + Load the input files and populate the sentence list. Input files are + expected to be in one tokenized sentence per line format. + + Args: + file_extension (str): the file extension for input documents, + defaults to txt. + """ + for infile in os.listdir(self.input_directory): + + # skip files with wrong extension + if not infile.endswith(file_extension): + continue + + with codecs.open(self.input_directory + '/' + infile, + 'r', + 'utf-8') as f: + + # load the sentences + lines = f.readlines() + + # loop over sentences + for i in range(len(lines)): + + # split the sentence into tokens + tokens = lines[i].strip().split(' ') + + # add the sentence + if len(tokens) > 0: + sentence = Sentence(tokens, infile, i) + untokenized_form = untokenize(tokens) + sentence.untokenized_form = untokenized_form + sentence.length = len(untokenized_form.split(' ')) + self.sentences.append(sentence) + +def untokenize(tokens): + """Untokenizing a list of tokens. + + Args: + tokens (list of str): the list of tokens to untokenize. 
+ + Returns: + a string + + """ + text = u' '.join(tokens) + text = re.sub(u"\s+", u" ", text.strip()) + text = re.sub(u" ('[a-z]) ", u"\g<1> ", text) + text = re.sub(u" ([\.;,-]) ", u"\g<1> ", text) + text = re.sub(u" ([\.;,-?!])$", u"\g<1>", text) + text = re.sub(u" _ (.+) _ ", u" _\g<1>_ ", text) + text = re.sub(u" \$ ([\d\.]+) ", u" $\g<1> ", text) + text = text.replace(u" ' ", u"' ") + text = re.sub(u"([\W\s])\( ", u"\g<1>(", text) + text = re.sub(u" \)([\W\s])", u")\g<1>", text) + text = text.replace(u"`` ", u"``") + text = text.replace(u" ''", u"''") + text = text.replace(u" n't", u"n't") + text = re.sub(u'(^| )" ([^"]+) "( |$)', u'\g<1>"\g<2>"\g<3>', text) + text = re.sub(u' -RRB-', u')', text) + text = re.sub(u'-LRB- ', u'(', text) + # times + text = re.sub('(\d+) : (\d+ [ap]\.m\.)', '\g<1>:\g<2>', text) + + text = re.sub('^" ', '"', text) + text = re.sub(' "$', '"', text) + text = re.sub(u"\s+", u" ", text.strip()) + + return text diff --git a/baselines/baseline2/summarize/sume/models/__init__.py b/baselines/baseline2/summarize/sume/models/__init__.py new file mode 100755 index 0000000..8dadd6f --- /dev/null +++ b/baselines/baseline2/summarize/sume/models/__init__.py @@ -0,0 +1 @@ +from concept_based import * \ No newline at end of file diff --git a/baselines/baseline2/summarize/sume/models/concept_based.py b/baselines/baseline2/summarize/sume/models/concept_based.py new file mode 100755 index 0000000..baff7fb --- /dev/null +++ b/baselines/baseline2/summarize/sume/models/concept_based.py @@ -0,0 +1,632 @@ +# -*- coding: utf-8 -*- + +""" Concept-based ILP summarization methods. + + authors: Florian Boudin (florian.boudin@univ-nantes.fr) + Hugo Mougard (hugo.mougard@univ-nantes.fr) + version: 0.2 + date: May 2015 +""" + +from collections import defaultdict, deque +import re +import random +import sys +import pulp +from nltk.corpus import stopwords +from nltk.stem.snowball import SnowballStemmer + +from summarize.sume.base import LoadFile, State + +class ConceptBasedILPSummarizer(LoadFile): + """Implementation of the concept-based ILP model for summarization. + + The original algorithm was published and described in: + + * Dan Gillick and Benoit Favre, A Scalable Global Model for Summarization, + *Proceedings of the NAACL HLT Workshop on Integer Linear Programming for + Natural Language Processing*, pages 10–18, 2009. + """ + def __init__(self, input_directory, language): + """ + Args: + input_directory (str): the directory from which text documents to + be summarized are loaded. + + @type language: str + + """ + self.input_directory = input_directory + self.sentences = [] + self.weights = {} + self.c2s = defaultdict(set) + self.concept_sets = defaultdict(frozenset) + self.LANGUAGE = language + # type: str + + self.stoplist = set(stopwords.words(self.LANGUAGE)) + self.stemmer = SnowballStemmer(self.LANGUAGE) + + self.word_frequencies = defaultdict(int) + self.w2s = defaultdict(set) + + + def extract_ngrams(self, n=2): + """Extract the ngrams of words from the input sentences. 
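The extraction here reduces to collecting stemmed word bigrams while discarding n-grams that contain punctuation-only tokens or consist entirely of stopwords. A minimal self-contained sketch of that step (a hypothetical standalone helper, not the class method; it assumes NLTK's English stopword list has been downloaded):

import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

def bigram_concepts(tokens, stoplist, stemmer):
    # mirrors the filtering applied for n=2
    concepts = []
    for j in range(len(tokens) - 1):
        ngram = [tokens[j].lower(), tokens[j + 1].lower()]
        if any(not re.search('[a-zA-Z0-9]', t) for t in ngram):
            continue  # skip n-grams containing punctuation-only tokens
        if all(t in stoplist for t in ngram):
            continue  # skip n-grams made of stopwords only
        concepts.append(' '.join(stemmer.stem(t) for t in ngram))
    return concepts

print(bigram_concepts(['The', 'cat', 'sat', 'on', 'the', 'mat', '.'],
                      set(stopwords.words('english')),
                      SnowballStemmer('english')))
# -> ['the cat', 'cat sat', 'sat on', 'the mat']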
+ + Args: + n (int): the number of words for ngrams, defaults to 2 + """ + for i, sentence in enumerate(self.sentences): + + # for each ngram of words + for j in range(len(sentence.tokens)-(n-1)): + + # initialize ngram container + ngram = [] + + # for each token of the ngram + for k in range(j, j+n): + ngram.append(sentence.tokens[k].lower()) + + # do not consider ngrams containing punctuation marks + marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)] + if len(marks) > 0: + continue + + # do not consider ngrams composed of only stopwords + stops = [t for t in ngram if t in self.stoplist] + if len(stops) == len(ngram): + continue + + # stem the ngram + ngram = [self.stemmer.stem(t) for t in ngram] + + # add the ngram to the concepts + self.sentences[i].concepts.append(' '.join(ngram)) + + + def compute_document_frequency(self): + """Compute the document frequency of each concept. + + """ + for i in range(len(self.sentences)): + + # for each concept + for concept in self.sentences[i].concepts: + + # add the document id to the concept weight container + if concept not in self.weights: + self.weights[concept] = set([]) + self.weights[concept].add(self.sentences[i].doc_id) + + # loop over the concepts and compute the document frequency + for concept in self.weights: + self.weights[concept] = len(self.weights[concept]) + + def compute_word_frequency(self): + """Compute the frequency of each word in the set of documents. """ + + for i, sentence in enumerate(self.sentences): + for token in sentence.tokens: + t = token.lower() + if not re.search('[a-zA-Z0-9]', t) or t in self.stoplist: + continue + t = self.stemmer.stem(t) + self.w2s[t].add(i) + self.word_frequencies[t] += 1 + + def prune_sentences(self, + mininum_sentence_length=5, + remove_citations=True, + remove_redundancy=True, + imp_list=[]): + """Prune the sentences. + + Remove the sentences that are shorter than a given length, redundant + sentences and citations from entering the summary. 
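For orientation, compute_document_frequency above turns each concept's weight into the number of distinct source documents it occurs in. A toy illustration with invented concepts and document ids (not data from the corpus):

from collections import defaultdict

doc_concepts = {'doc1': ['oil pric', 'pric rise'],   # hypothetical stemmed bigrams
                'doc2': ['pric rise', 'sharp rise'],
                'doc3': ['pric rise']}
weights = defaultdict(set)
for doc_id, concepts in doc_concepts.items():
    for concept in concepts:
        weights[concept].add(doc_id)
weights = {concept: len(doc_ids) for concept, doc_ids in weights.items()}
print(weights)  # {'oil pric': 1, 'pric rise': 3, 'sharp rise': 1}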
+ + Args: + mininum_sentence_length (int): the minimum number of words for a + sentence to enter the summary, defaults to 5 + remove_citations (bool): indicates that citations are pruned, + defaults to True + remove_redundancy (bool): indicates that redundant sentences are + pruned, defaults to True + + """ + pruned_sentences = [] + + # loop over the sentences + for i, sentence in enumerate(self.sentences): + if imp_list: + if imp_list[i] == 0: + continue + # prune short sentences + if sentence.length < mininum_sentence_length: + continue + + # prune citations + first_token, last_token = sentence.tokens[0], sentence.tokens[-1] + + if remove_citations and \ + (first_token == u"``" or first_token == u'"' \ + or last_token == u"''" or first_token == u'"' \ + or last_token== u"'" or first_token==u"'") \ + or last_token == u'"': + continue + + # prune ___ said citations + # if remove_citations and \ + # (sentence.tokens[0]==u"``" or sentence.tokens[0]==u'"') and \ + # re.search('(?i)(''|") \w{,30} (said|reported|told)\.$', + # sentence.untokenized_form): + # continue + + # prune identical and almost identical sentences + if remove_redundancy: + is_redundant = False + for prev_sentence in pruned_sentences: + if sentence.tokens == prev_sentence.tokens: + is_redundant = True + break + + if is_redundant: + continue + + # otherwise add the sentence to the pruned sentence container + pruned_sentences.append(sentence) + + self.sentences = pruned_sentences + + def prune_concepts(self, method="threshold", value=3, rejected_list=[]): + """Prune the concepts for efficient summarization. + + Args: + method (str): the method for pruning concepts that can be whether + by using a minimal value for concept scores (threshold) or using + the top-N highest scoring concepts (top-n), defaults to + threshold. + value (int): the value used for pruning concepts, defaults to 3. + + """ + if method == 'stopwords': + concepts = self.weights.keys() + for concept in concepts: + pruned_list = prune_ngrams(concept, self.stoplist, 1) + if not pruned_list: + #print concept, self.weights[concept] + del self.weights[concept] + + if method == "list": + concepts = self.weights.keys() + for concept in concepts: + if concept in rejected_list: + #print concept, self.weights[concept] + del self.weights[concept] + + # 'threshold' pruning method + if method == "threshold": + + # iterates over the concept weights + concepts = self.weights.keys() + for concept in concepts: + if self.weights[concept] < value: + del self.weights[concept] + + # 'top-n' pruning method + elif method == "top-n": + + # sort concepts by scores + sorted_concepts = sorted(self.weights, + key=lambda x: self.weights[x], + reverse=True) + + # iterates over the concept weights + concepts = self.weights.keys() + for concept in concepts: + if concept not in sorted_concepts[:value]: + del self.weights[concept] + + # iterates over the sentences + for i in range(len(self.sentences)): + + # current sentence concepts + concepts = self.sentences[i].concepts + + # prune concepts + self.sentences[i].concepts = [c for c in concepts + if c in self.weights] + + + def compute_c2s(self): + """Compute the inverted concept to sentences dictionary. 
""" + + for i, sentence in enumerate(self.sentences): + for concept in sentence.concepts: + self.c2s[concept].add(i) + + def compute_concept_sets(self): + """Compute the concept sets for each sentence.""" + + for i, sentence in enumerate(self.sentences): + for concept in sentence.concepts: + self.concept_sets[i] |= {concept} + + def greedy_approximation(self, summary_size=100): + """Greedy approximation of the ILP model. + + Args: + summary_size (int): the maximum size in words of the summary, + defaults to 100. + + Returns: + (value, set) tuple (int, list): the value of the approximated + objective function and the set of selected sentences as a tuple. + + """ + # initialize the inverted c2s dictionary if not already created + if not self.c2s: + self.compute_c2s() + + # initialize weights + weights = {} + + # initialize the score of the best singleton + best_singleton_score = 0 + + # compute indices of our sentences + sentences = range(len(self.sentences)) + + # compute initial weights and fill the reverse index + # while keeping track of the best singleton solution + for i, sentence in enumerate(self.sentences): + weights[i] = sum(self.weights[c] for c in set(sentence.concepts)) + if sentence.length <= summary_size\ + and weights[i] > best_singleton_score: + best_singleton_score = weights[i] + best_singleton = i + + # initialize the selected solution properties + sel_subset, sel_concepts, sel_length, sel_score = set(), set(), 0, 0 + + # greedily select a sentence + while True: + + ################################################################### + # RETRIEVE THE BEST SENTENCE + ################################################################### + + # sort the sentences by gain and reverse length + sort_sent = sorted(((weights[i] / float(self.sentences[i].length), + -self.sentences[i].length, + i) + for i in sentences), + reverse=True) + + # select the first sentence that fits in the length limit + for sentence_gain, rev_length, sentence_index in sort_sent: + if sel_length - rev_length <= summary_size: + break + # if we don't find a sentence, break out of the main while loop + else: + break + + # if the gain is null, break out of the main while loop + if not weights[sentence_index]: + break + + # update the selected subset properties + sel_subset.add(sentence_index) + sel_score += weights[sentence_index] + sel_length -= rev_length + + # update sentence weights with the reverse index + for concept in set(self.sentences[sentence_index].concepts): + if concept not in sel_concepts: + for sentence in self.c2s[concept]: + weights[sentence] -= self.weights[concept] + + # update the last selected subset property + sel_concepts.update(self.sentences[sentence_index].concepts) + + # check if a singleton has a better score than our greedy solution + if best_singleton_score > sel_score: + return best_singleton_score, set([best_singleton]) + + # returns the (objective function value, solution) tuple + return sel_score, sel_subset + + def tabu_search(self, + summary_size=100, + memory_size=10, + iterations=100, + mutation_size=2, + mutation_group=True): + """Greedy approximation of the ILP model with a tabu search + meta-heuristic. + + Args: + summary_size (int): the maximum size in words of the summary, + defaults to 100. + memory_size (int): the maximum size of the pool of sentences + to ban at a given time, defaults at 5. + iterations (int): the number of iterations to run, defaults at + 30. + mutation_size (int): number of sentences to unselect and add to + the tabu list at each iteration. 
+ mutation_group (boolean): flag to consider the mutations as a + group: we'll check sentence combinations in the tabu list, not + sentences alone. + Returns: + (value, set) tuple (int, list): the value of the approximated + objective function and the set of selected sentences as a tuple. + + """ + # compute concept to sentences and concept sets for each sentence + if not self.c2s: + self.compute_c2s() + if not self.concept_sets: + self.compute_concept_sets() + + # initialize weights + weights = {} + + # initialize the score of the best singleton + best_singleton_score = 0 + + # compute initial weights and fill the reverse index + # while keeping track of the best singleton solution + for i, sentence in enumerate(self.sentences): + weights[i] = sum(self.weights[c] for c in set(sentence.concepts)) + if sentence.length <= summary_size\ + and weights[i] > best_singleton_score: + best_singleton_score = weights[i] + best_singleton = i + + best_subset, best_score = None, 0 + state = State() + for i in xrange(iterations): + queue = deque([], memory_size) + # greedily select sentences + state = self.select_sentences(summary_size, + weights, + state, + queue, + mutation_group) + if state.score > best_score: + best_subset = state.subset.copy() + best_score = state.score + to_tabu = set(random.sample(state.subset, mutation_size)) + state = self.unselect_sentences(weights, state, to_tabu) + queue.extend(to_tabu) + + # check if a singleton has a better score than our greedy solution + if best_singleton_score > best_score: + return best_singleton_score, set([best_singleton]) + + # returns the (objective function value, solution) tuple + return best_score, best_subset + + def select_sentences(self, + summary_size, + weights, + state, + tabu_set, + mutation_group): + """Greedy sentence selector. + + Args: + summary_size (int): the maximum size in words of the summary, + defaults to 100. + weights (dictionary): the sentence weights dictionary. This + dictionnary is updated during this method call (in-place). + state (State): the state of the tabu search from which to start + selecting sentences. + tabu_set (iterable): set of sentences that are tabu: this + selector will not consider them. + mutation_group (boolean): flag to consider the mutations as a + group: we'll check sentence combinations in the tabu list, not + sentences alone. + + Returns: + state (State): the new state of the search. Also note that + weights is modified in-place. 
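Both greedy_approximation and this selector follow the same core rule: rank candidate sentences by concept-weight gain per word, take the best one that still fits the budget, then discount the concepts it covers. A stripped-down sketch of that rule on hypothetical data (no tabu list, no singleton fallback, no in-place weight updates):

def greedy_select(sentences, weights, budget):
    # sentences: list of (length, set_of_concepts); weights: concept -> weight
    covered, selected, total_len = set(), set(), 0
    while True:
        best, best_gain = None, 0.0
        for i, (length, concepts) in enumerate(sentences):
            if i in selected or total_len + length > budget:
                continue
            gain = sum(weights[c] for c in concepts - covered) / float(length)
            if gain > best_gain:
                best, best_gain = i, gain
        if best is None:  # nothing fits or no remaining gain
            return selected
        selected.add(best)
        total_len += sentences[best][0]
        covered |= sentences[best][1]

sents = [(10, {'a', 'b'}), (5, {'b'}), (7, {'c'})]
print(greedy_select(sents, {'a': 3, 'b': 2, 'c': 1}, budget=20))  # -> {0, 2}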
+ + """ + # greedily select a sentence while respecting the tabu + while True: + + ################################################################### + # RETRIEVE THE BEST SENTENCE + ################################################################### + + # sort the sentences by gain and reverse length + sort_sent = sorted(((weights[i] / float(self.sentences[i].length), + -self.sentences[i].length, + i) + for i in range(len(self.sentences)) + if self.sentences[i].length + state.length <= + summary_size), + reverse=True) + + # select the first sentence that fits in the length limit + for sentence_gain, rev_length, sentence_index in sort_sent: + if mutation_group: + subset = state.subset | {sentence_index} + for tabu in tabu_set: + if tabu <= subset: + break + else: + break + else: + if sentence_index not in tabu_set: + break + # if we don't find a sentence, break out of the main while loop + else: + break + + # if the gain is null, break out of the main while loop + if not weights[sentence_index]: + break + + # update state + state.subset |= {sentence_index} + state.concepts.update(self.concept_sets[sentence_index]) + state.length -= rev_length + state.score += weights[sentence_index] + + # update sentence weights with the reverse index + for concept in set(self.concept_sets[sentence_index]): + if state.concepts[concept] == 1: + for sentence in self.c2s[concept]: + weights[sentence] -= self.weights[concept] + return state + + def unselect_sentences(self, weights, state, to_remove): + """Sentence ``un-selector'' (reverse operation of the + select_sentences method). + + Args: + weights (dictionary): the sentence weights dictionary. This + dictionnary is updated during this method call (in-place). + state (State): the state of the tabu search from which to start + un-selecting sentences. + to_remove (iterable): set of sentences to unselect. + + Returns: + state (State): the new state of the search. Also note that + weights is modified in-place. + + """ + # remove the sentence indices from the solution subset + state.subset -= to_remove + for sentence_index in to_remove: + # update state + state.concepts.subtract(self.concept_sets[sentence_index]) + state.length -= self.sentences[sentence_index].length + # update sentence weights with the reverse index + for concept in set(self.concept_sets[sentence_index]): + if not state.concepts[concept]: + for sentence in self.c2s[concept]: + weights[sentence] += self.weights[concept] + state.score -= weights[sentence_index] + return state + + def solve_ilp_problem(self, + summary_size=100, units="WORDS", + solver='glpk', + excluded_solutions=[], + unique=False): + """Solve the ILP formulation of the concept-based model. + + :param summary_size: the maximum size in words of the summary, defaults to 100. + :param units: defaults to "WORDS" + :param solver: the solver used, defaults to glpk + :param excluded_solutions: (list of list): a list of subsets of sentences that are to be excluded, + defaults to [] + :param unique: (bool): modify the model so that it produces only one optimal solution, defaults to False + + :return: (value, set) tuple (int, list): the value of the objective function + and the set of selected sentences as a tuple. 
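Concretely, the model maximises the total weight of covered concepts under a length budget, with consistency constraints tying sentence and concept variables together. A toy instance with invented weights and lengths, solved with pulp's default bundled CBC solver rather than GLPK/CPLEX/Gurobi:

import pulp

weights = {'oil pric': 2, 'pric rise': 2, 'sharp rise': 1}             # concept -> doc freq (made up)
sent_concepts = [{'oil pric', 'pric rise'}, {'pric rise'}, {'sharp rise'}]
sent_lengths = [12, 6, 9]
L = 25                                                                 # word budget

concepts = sorted(weights)
c = pulp.LpVariable.dicts('c', range(len(concepts)), cat='Binary')
s = pulp.LpVariable.dicts('s', range(len(sent_lengths)), cat='Binary')

prob = pulp.LpProblem('toy_summary', pulp.LpMaximize)
prob += pulp.lpSum(weights[concepts[i]] * c[i] for i in range(len(concepts)))
prob += pulp.lpSum(s[j] * sent_lengths[j] for j in range(len(sent_lengths))) <= L
for i, concept in enumerate(concepts):
    holders = [j for j, cs in enumerate(sent_concepts) if concept in cs]
    for j in holders:
        prob += s[j] <= c[i]                              # a selected sentence marks its concepts covered
    prob += pulp.lpSum(s[j] for j in holders) >= c[i]     # a concept counts only if some selected sentence carries it

prob.solve()
print(pulp.value(prob.objective),
      [j for j in range(len(sent_lengths)) if s[j].varValue == 1])     # 5.0 [0, 2]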
+ + """ + # initialize container shortcuts + concepts = self.weights.keys() + + w = self.weights + L = summary_size + C = len(concepts) + S = len(self.sentences) + + if not self.word_frequencies: + self.compute_word_frequency() + + tokens = self.word_frequencies.keys() + f = self.word_frequencies + T = len(tokens) + + # HACK Sort keys + concepts = sorted(self.weights, key=self.weights.get, reverse=True) + + # formulation of the ILP problem + prob = pulp.LpProblem(self.input_directory, pulp.LpMaximize) + + # initialize the concepts binary variables + c = pulp.LpVariable.dicts(name='c', + indexs=range(C), + lowBound=0, + upBound=1, + cat='Integer') + + # initialize the sentences binary variables + s = pulp.LpVariable.dicts(name='s', + indexs=range(S), + lowBound=0, + upBound=1, + cat='Integer') + + # initialize the word binary variables + t = pulp.LpVariable.dicts(name='t', + indexs=range(T), + lowBound=0, + upBound=1, + cat='Integer') + + # OBJECTIVE FUNCTION + prob += sum(w[concepts[i]] * c[i] for i in range(C)) + + if unique: + prob += sum(w[concepts[i]] * c[i] for i in range(C)) + \ + 10e-6 * sum(f[tokens[k]] * t[k] for k in range(T)) + + # CONSTRAINT FOR SUMMARY SIZE + if units == "WORDS": + prob += sum(s[j] * self.sentences[j].length for j in range(S)) <= L + if units == "CHARACTERS": + prob += sum(s[j] * len(self.sentences[j].untokenized_form) for j in range(S)) <= L + + + # INTEGRITY CONSTRAINTS + for i in range(C): + for j in range(S): + if concepts[i] in self.sentences[j].concepts: + prob += s[j] <= c[i] + + for i in range(C): + prob += sum(s[j] for j in range(S) + if concepts[i] in self.sentences[j].concepts) >= c[i] + + # WORD INTEGRITY CONSTRAINTS + if unique: + for k in range(T): + for j in self.w2s[tokens[k]]: + prob += s[j] <= t[k] + + for k in range(T): + prob += sum(s[j] for j in self.w2s[tokens[k]]) >= t[k] + + # CONSTRAINTS FOR FINDING OPTIMAL SOLUTIONS + for sentence_set in excluded_solutions: + prob += sum([s[j] for j in sentence_set]) <= len(sentence_set)-1 + + # prob.writeLP('test.lp') + + # solving the ilp problem + try: + prob.solve(pulp.CPLEX(msg=0)) + except: + if solver == 'gurobi': + prob.solve(pulp.GUROBI(msg=0)) + elif solver == 'glpk': + prob.solve(pulp.GLPK(msg=0)) + elif solver == 'cplex': + prob.solve(pulp.CPLEX(msg=0)) + else: + sys.exit('no solver specified') + + # retreive the optimal subset of sentences + solution = set([j for j in range(S) if s[j].varValue == 1]) + + # returns the (objective function value, solution) tuple + return (pulp.value(prob.objective), solution) diff --git a/baselines/baseline2/summarize/sume/utils/__init__.py b/baselines/baseline2/summarize/sume/utils/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/baselines/baseline2/summarize/sume/utils/extract_text.py b/baselines/baseline2/summarize/sume/utils/extract_text.py new file mode 100755 index 0000000..85f81d4 --- /dev/null +++ b/baselines/baseline2/summarize/sume/utils/extract_text.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import re +import sys +import codecs + +""" Extract the textual content from the DUC/TAC files. + + author: florian boudin (florian.boudin@univ-nantes.fr) +""" + +def remove_byline(text): + """ Remove the newswire byline from the textual content. + + Examples of headers are: + WASHINGTON _ + NEW YORK _ + AMHERST, N.Y. 
_ + DAR ES SALAAM, Tanzania _ + LAUSANNE, Switzerland (AP) _ + SEOUL, South Korea (AP) _ + BRUSSELS, Belgium (AP) - + INNSBRUCK, Austria (AP) -- + PORT-AU-PRINCE, Haiti (AP) _ + BEIJING &UR; &LR; _ + """ + text = re.sub(u'^[A-Z][\-\,\.\w\s]+ (\([A-Z]+\) )?(_|-|--) ', '', text) + + return text + +# open the input file +with codecs.open(sys.argv[1], 'r', 'utf-8') as f: + + # read the entire file + content = f.read() + + # extract the textual content + m = re.search(u'(?is)(.+)', content) + content = m.group(1) + + # remove the paragraph tags + content = re.sub(u'(?i)', '', content) + + # remove annotation tags + content = re.sub(u'(?i)[^<]+', '', content) + + # remove the HTML entities + content = re.sub(u'(?i)&', '&', content) + content = re.sub(u'(?i)"', '"', content) + content = re.sub(u'(?i)'', "'", content) + content = re.sub(u'(?i)<', "<", content) + content = re.sub(u'(?i)>', ">", content) + content = re.sub(u'&\w+;', "", content) + + # remove extra spacing + content = re.sub(u'\s+', ' ', content.strip()) + + # remove byline from the first 80 characters + header = remove_byline(content[:80]) + content = header + content[80:] + + prev_content = content + + # normalize the quotation marks + content = re.sub(u'```', '"`', content) + content = re.sub(u'``', '"', content) + content = re.sub(u"'''", '\'"', content) + content = re.sub(u"''", '"', content) + content = re.sub(u"[”“]", '"', content) + content = re.sub(u'(^|[ :;()])\"([^\"])', '\g<1>``\g<2>', content) + content = re.sub(u'([^\"])\"($|[ :;()])', '\g<1>\'\'\g<2>', content) + + # count the quotation marks + # opening_quotation_marks = re.findall(u'``', content) + # ending_quotation_marks = re.findall(u"''", content) + + # write the extracted textual content into a file + with codecs.open(sys.argv[2], 'w', 'utf-8') as w: + w.write(content) + + + + diff --git a/baselines/baseline2/summarize/sume/utils/stack_citations.py b/baselines/baseline2/summarize/sume/utils/stack_citations.py new file mode 100755 index 0000000..89d7e64 --- /dev/null +++ b/baselines/baseline2/summarize/sume/utils/stack_citations.py @@ -0,0 +1,66 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import re +import sys +import codecs + +""" Reconstruct the citations from the DUC/TAC files. 
+ + author: florian boudin (florian.boudin@univ-nantes.fr) +""" + +# open the input file +with codecs.open(sys.argv[1], 'r', 'utf-8') as f: + + # read the lines from the input file + lines = f.readlines() + + stacked_lines = [] + in_citation = False + + openings = [] + endings = [] + + # for each line + for line in lines: + + line = line.strip() + tokens = line.split(' ') + + for i in range(len(tokens)): + token = tokens[i] + if token == u"``": + openings.append(token) + if token == u"''": + endings.append(token) + if token == u'"': + remp_char = u'``' + if len(openings) > len(endings): + remp_char = u"''" + endings.append(remp_char) + else: + openings.append(remp_char) + print 'info - error with quotation marks at', sys.argv[1] + print 'info - correcting, modifying with ', remp_char + tokens[i] = remp_char + line = ' '.join(tokens) + + + if len(openings) == len(endings): + if in_citation: + stacked_lines[-1] = stacked_lines[-1] + ' ' + line + else: + stacked_lines.append(line) + in_citation = False + + else: + if in_citation: + stacked_lines[-1] = stacked_lines[-1] + ' ' + line + else: + stacked_lines.append(line) + in_citation = True + + # write the reconstructed file + with codecs.open(sys.argv[2], 'w', 'utf-8') as w: + w.write('\n'.join(stacked_lines)) \ No newline at end of file diff --git a/baselines/baseline2/summarize/sume_wrap.py b/baselines/baseline2/summarize/sume_wrap.py new file mode 100755 index 0000000..4d16b02 --- /dev/null +++ b/baselines/baseline2/summarize/sume_wrap.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os, sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) +from nltk.tokenize import word_tokenize +import summarize.sume as sume +from sume import Sentence, untokenize + +from nltk.corpus import stopwords +from nltk.stem.snowball import SnowballStemmer + +class SumeWrap(): + def __init__(self, language): + self.s = sume.ConceptBasedILPSummarizer(" ", language) + self.LANGUAGE = language + # self.stoplist = set(stopwords.words(self.LANGUAGE)) + # self.stemmer = SnowballStemmer(self.LANGUAGE) + + def load_sume_sentences(self, docs, parse_type=None, parse_info=[]): + """ + + :param docs: the documents to load + :param parse_type: + :param parse_info: + :return: list[Sentence] + + @type docs: list[tuple] + @type parse_type: str + @type parse_info: list + """ + self.docs = docs + self.sentences = [] + self.doc_sent_dict = {} + + + doc_id = 0 + for doc_id, doc in enumerate(docs): + doc_sents = doc + total = len(self.sentences) + for sent_id, sentence in enumerate(doc_sents): + token_sentence = word_tokenize(sentence, self.LANGUAGE) + sentence_s = Sentence(token_sentence, doc_id, sent_id+1) + untokenized_form = untokenize(token_sentence) + sentence_s.untokenized_form = untokenized_form + sentence_s.length = len(untokenized_form.split(' ')) + self.doc_sent_dict[total+sent_id] = "%s_%s" % (str(doc_id), str(sent_id)) + self.sentences.append(sentence_s) + return self.sentences + + def __call__(self, docs, length=100, units="WORDS", rejected_list=[], imp_list=[], parser_type=None): + try: + length = int(length) + except: + raise TypeError("argument 'length' could not be converted to int. 
It is of type '%s' and has value '%s'" % (type(length), length)) + # load documents with extension 'txt' + self.s.sentences = self.load_sume_sentences(docs, parser_type) + + # compute the parameters needed by the model + # extract bigrams as concepts + self.s.extract_ngrams() + + # compute document frequency as concept weights + self.s.compute_document_frequency() + + # solve the ilp model + value, subset = self.s.solve_ilp_problem(summary_size=length, units=units) + + return [self.s.sentences[j].untokenized_form for j in subset] diff --git a/baselines/baseline2/summarize/sumy/.travis.yml b/baselines/baseline2/summarize/sumy/.travis.yml new file mode 100755 index 0000000..f3ecb51 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/.travis.yml @@ -0,0 +1,22 @@ +language: + - python +python: + # https://github.com/travis-ci/travis-ci/issues/2219#issuecomment-41804942 + - "2.7" + - "3.3" + - "3.4" + - "3.5" +before_install: + # install dependencies for NumPy + - sudo apt-get update -qq + - sudo apt-get install -qq gfortran libatlas-base-dev + - sudo apt-get install -qq python-numpy + - sudo apt-get install -qq pandoc +install: + - pandoc --from=markdown --to=rst README.md -o README.rst + - python setup.py install + - pip install -U pip wheel + - pip install -U --use-wheel pytest pytest-cov + - python -c "import nltk; nltk.download('punkt')" +script: + - py.test tests diff --git a/baselines/baseline2/summarize/sumy/__init__.py b/baselines/baseline2/summarize/sumy/__init__.py new file mode 100755 index 0000000..e879e32 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + + +__author__ = "Michal Belica" +__version__ = "0.4.1" diff --git a/baselines/baseline2/summarize/sumy/__main__.py b/baselines/baseline2/summarize/sumy/__main__.py new file mode 100755 index 0000000..47cba4d --- /dev/null +++ b/baselines/baseline2/summarize/sumy/__main__.py @@ -0,0 +1,132 @@ +# -*- coding: utf8 -*- + +""" +Sumy - automatic text summarizer. + +Usage: + sumy (luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] [--stopwords=] [--format=] + sumy (luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] [--stopwords=] [--format=] --url= + sumy (luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] [--stopwords=] [--format=] --file= + sumy --version + sumy --help + +Options: + --length= Length of summarized text. It may be count of sentences + or percentage of input text. [default: 20%] + --language= Natural language of summarized text. [default: english] + --stopwords= Path to a file containing a list of stopwords. One word per line in UTF-8 encoding. + If it's not provided default list of stop-words is used according to chosen language. + --format= Format of input document. Possible values: html, plaintext + --url= URL address of the web page to summarize. + --file= Path to the text file to summarize. + --version Displays current application version. + --help Displays this text. 
+ +""" + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +from os import sys, path +sys.path.append(path.dirname(path.abspath(__file__))) + +from docopt import docopt +from .utils import ItemsCount, get_stop_words, read_stop_words, fetch_url +from ._compat import to_string, to_unicode, to_bytes, PY3 +from .nlp.tokenizers import Tokenizer +from .parsers.html import HtmlParser +from .parsers.plaintext import PlaintextParser +from .summarizers.luhn import LuhnSummarizer +from .summarizers.edmundson import EdmundsonSummarizer +from .summarizers.lsa import LsaSummarizer +from .summarizers.text_rank import TextRankSummarizer +from .summarizers.lex_rank import LexRankSummarizer +from .summarizers.sum_basic import SumBasicSummarizer +from .summarizers.kl import KLSummarizer +from .nlp.stemmers import Stemmer + +PARSERS = { + "html": HtmlParser, + "plaintext": PlaintextParser, +} + +AVAILABLE_METHODS = { + "luhn": LuhnSummarizer, + "edmundson": EdmundsonSummarizer, + "lsa": LsaSummarizer, + "text-rank": TextRankSummarizer, + "lex-rank": LexRankSummarizer, + "sum-basic": SumBasicSummarizer, + "kl": KLSummarizer, +} + + +def main(args=None): + args = docopt(to_string(__doc__), args, version=__version__) + summarizer, parser, items_count = handle_arguments(args) + + for sentence in summarizer(parser.document, items_count): + if PY3: + print(to_unicode(sentence)) + else: + print(to_bytes(sentence)) + + return 0 + + +def handle_arguments(args, default_input_stream=sys.stdin): + document_format = args['--format'] + if document_format is not None and document_format not in PARSERS: + raise ValueError("Unsupported format of input document. Possible values are: %s. Given: %s." % ( + ", ".join(PARSERS.keys()), + document_format, + )) + + if args["--url"] is not None: + parser = PARSERS[document_format or "html"] + document_content = fetch_url(args["--url"]) + elif args["--file"] is not None: + parser = PARSERS[document_format or "plaintext"] + with open(args["--file"], "rb") as file: + document_content = file.read() + else: + parser = PARSERS[document_format or "plaintext"] + document_content = default_input_stream.read() + + items_count = ItemsCount(args["--length"]) + + language = args["--language"] + if args['--stopwords']: + stop_words = read_stop_words(args['--stopwords']) + else: + stop_words = get_stop_words(language) + + parser = parser(document_content, Tokenizer(language)) + stemmer = Stemmer(language) + + summarizer_class = next(cls for name, cls in AVAILABLE_METHODS.items() if args[name]) + summarizer = build_summarizer(summarizer_class, stop_words, stemmer, parser) + + return summarizer, parser, items_count + + +def build_summarizer(summarizer_class, stop_words, stemmer, parser): + summarizer = summarizer_class(stemmer) + if summarizer_class is EdmundsonSummarizer: + summarizer.null_words = stop_words + summarizer.bonus_words = parser.significant_words + summarizer.stigma_words = parser.stigma_words + else: + summarizer.stop_words = stop_words + return summarizer + + +if __name__ == "__main__": + try: + exit_code = main() + exit(exit_code) + except KeyboardInterrupt: + exit(1) + except Exception as e: + print(e) + exit(1) diff --git a/baselines/baseline2/summarize/sumy/_compat.py b/baselines/baseline2/summarize/sumy/_compat.py new file mode 100755 index 0000000..367224a --- /dev/null +++ b/baselines/baseline2/summarize/sumy/_compat.py @@ -0,0 +1,103 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ 
import division, print_function, unicode_literals + +from sys import version_info + + +PY3 = version_info[0] == 3 + + +if PY3: + bytes = bytes + unicode = str +else: + bytes = str + unicode = unicode +string_types = (bytes, unicode,) + + +try: + from itertools import ifilterfalse as ffilter +except ImportError: + from itertools import filterfalse as ffilter + + +try: + from collections import Counter +except ImportError: + # Python < 2.7 + from itertools import groupby + + def Counter(iterable): + iterable = sorted(iterable) + return dict((key, len(tuple(group))) for key, group in groupby(iterable)) + + +def unicode_compatible(cls): + """ + Decorator for unicode compatible classes. Method ``__unicode__`` + has to be implemented to work decorator as expected. + """ + if PY3: + cls.__str__ = cls.__unicode__ + cls.__bytes__ = lambda self: self.__str__().encode("utf8") + else: + cls.__str__ = lambda self: self.__unicode__().encode("utf8") + + return cls + + +def to_string(object): + return to_unicode(object) if PY3 else to_bytes(object) + + +def to_bytes(object): + if isinstance(object, bytes): + return object + elif isinstance(object, unicode): + return object.encode("utf8") + else: + # try encode instance to bytes + return instance_to_bytes(object) + + +def to_unicode(object): + if isinstance(object, unicode): + return object + elif isinstance(object, bytes): + return object.decode("utf8") + else: + # try decode instance to unicode + return instance_to_unicode(object) + + +def instance_to_bytes(instance): + if PY3: + if hasattr(instance, "__bytes__"): + return bytes(instance) + elif hasattr(instance, "__str__"): + return unicode(instance).encode("utf8") + else: + if hasattr(instance, "__str__"): + return bytes(instance) + elif hasattr(instance, "__unicode__"): + return unicode(instance).encode("utf8") + + return to_bytes(repr(instance)) + + +def instance_to_unicode(instance): + if PY3: + if hasattr(instance, "__str__"): + return unicode(instance) + elif hasattr(instance, "__bytes__"): + return bytes(instance).decode("utf8") + else: + if hasattr(instance, "__unicode__"): + return unicode(instance) + elif hasattr(instance, "__str__"): + return bytes(instance).decode("utf8") + + return to_unicode(repr(instance)) diff --git a/baselines/baseline2/summarize/sumy/evaluation/__init__.py b/baselines/baseline2/summarize/sumy/evaluation/__init__.py new file mode 100755 index 0000000..a60fe9c --- /dev/null +++ b/baselines/baseline2/summarize/sumy/evaluation/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + + +from .coselection import f_score, precision, recall +from .content_based import cosine_similarity, unit_overlap +from .rouge import rouge_n, rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level diff --git a/baselines/baseline2/summarize/sumy/evaluation/__main__.py b/baselines/baseline2/summarize/sumy/evaluation/__main__.py new file mode 100755 index 0000000..99fac25 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/evaluation/__main__.py @@ -0,0 +1,220 @@ +# -*- coding: utf8 -*- + +""" +Sumy - evaluation of automatic text summary. 
+ +Usage: + sumy_eval (random | luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] + sumy_eval (random | luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] --url= + sumy_eval (random | luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] --file= --format= + sumy_eval --version + sumy_eval --help + +Options: + Path to the file with reference summary. + --url= URL address of summarizied message. + --file= Path to file with summarizied text. + --format= Format of input file. [default: plaintext] + --length= Length of summarizied text. It may be count of sentences + or percentage of input text. [default: 20%] + --language= Natural language of summarizied text. [default: english] + --version Displays version of application. + --help Displays this text. + +""" + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +import sys + +from itertools import chain +from docopt import docopt +from .. import __version__ +from ..utils import ItemsCount, get_stop_words, fetch_url +from ..models import TfDocumentModel +from .._compat import to_string +from ..nlp.tokenizers import Tokenizer +from ..parsers.html import HtmlParser +from ..parsers.plaintext import PlaintextParser +from ..summarizers.random import RandomSummarizer +from ..summarizers.luhn import LuhnSummarizer +from ..summarizers.edmundson import EdmundsonSummarizer +from ..summarizers.lsa import LsaSummarizer +from ..summarizers.text_rank import TextRankSummarizer +from ..summarizers.lex_rank import LexRankSummarizer +from ..summarizers.sum_basic import SumBasicSummarizer +from ..summarizers.kl import KLSummarizer +from ..nlp.stemmers import Stemmer +from . import precision, recall, f_score, cosine_similarity, unit_overlap +from . 
import rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level + + +PARSERS = { + "html": HtmlParser, + "plaintext": PlaintextParser, +} + + +def build_random(parser, language): + return RandomSummarizer() + + +def build_luhn(parser, language): + summarizer = LuhnSummarizer(Stemmer(language)) + summarizer.stop_words = get_stop_words(language) + + return summarizer + + +def build_edmundson(parser, language): + summarizer = EdmundsonSummarizer(Stemmer(language)) + summarizer.null_words = get_stop_words(language) + summarizer.bonus_words = parser.significant_words + summarizer.stigma_words = parser.stigma_words + + return summarizer + + +def build_lsa(parser, language): + summarizer = LsaSummarizer(Stemmer(language)) + summarizer.stop_words = get_stop_words(language) + + return summarizer + + +def build_text_rank(parser, language): + summarizer = TextRankSummarizer(Stemmer(language)) + summarizer.stop_words = get_stop_words(language) + + return summarizer + + +def build_lex_rank(parser, language): + summarizer = LexRankSummarizer(Stemmer(language)) + summarizer.stop_words = get_stop_words(language) + + return summarizer + + +def build_sum_basic(parser, language): + summarizer = SumBasicSummarizer(Stemmer(language)) + summarizer.stop_words = get_stop_words(language) + + return summarizer + + +def build_kl(parser, language): + summarizer = KLSummarizer(Stemmer(language)) + summarizer.stop_words = get_stop_words(language) + + return summarizer + + +def evaluate_cosine_similarity(evaluated_sentences, reference_sentences): + evaluated_words = tuple(chain(*(s.words for s in evaluated_sentences))) + reference_words = tuple(chain(*(s.words for s in reference_sentences))) + evaluated_model = TfDocumentModel(evaluated_words) + reference_model = TfDocumentModel(reference_words) + + return cosine_similarity(evaluated_model, reference_model) + + +def evaluate_unit_overlap(evaluated_sentences, reference_sentences): + evaluated_words = tuple(chain(*(s.words for s in evaluated_sentences))) + reference_words = tuple(chain(*(s.words for s in reference_sentences))) + evaluated_model = TfDocumentModel(evaluated_words) + reference_model = TfDocumentModel(reference_words) + + return unit_overlap(evaluated_model, reference_model) + + +AVAILABLE_METHODS = { + "random": build_random, + "luhn": build_luhn, + "edmundson": build_edmundson, + "lsa": build_lsa, + "text-rank": build_text_rank, + "lex-rank": build_lex_rank, + "sum-basic": build_sum_basic, + "kl": build_kl, +} + +AVAILABLE_EVALUATIONS = ( + ("Precision", False, precision), + ("Recall", False, recall), + ("F-score", False, f_score), + ("Cosine similarity", False, evaluate_cosine_similarity), + ("Cosine similarity (document)", True, evaluate_cosine_similarity), + ("Unit overlap", False, evaluate_unit_overlap), + ("Unit overlap (document)", True, evaluate_unit_overlap), + ("Rouge-1", False, rouge_1), + ("Rouge-2", False, rouge_2), + ("Rouge-L (Sentence Level)", False, rouge_l_sentence_level), + ("Rouge-L (Summary Level)", False, rouge_l_summary_level) +) + + +def main(args=None): + args = docopt(to_string(__doc__), args, version=__version__) + summarizer, document, items_count, reference_summary = handle_arguments(args) + + evaluated_sentences = summarizer(document, items_count) + reference_document = PlaintextParser.from_string(reference_summary, + Tokenizer(args["--language"])) + reference_sentences = reference_document.document.sentences + + for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS: + if evaluate_document: + result = 
evaluate(evaluated_sentences, document.sentences) + else: + result = evaluate(evaluated_sentences, reference_sentences) + print("%s: %f" % (name, result)) + + return 0 + + +def handle_arguments(args): + document_format = args["--format"] + if document_format is not None and document_format not in PARSERS: + raise ValueError("Unsupported format of input document. Possible values are: %s. Given: %s." % ( + ", ".join(PARSERS.keys()), + document_format, + )) + + if args["--url"] is not None: + parser = PARSERS["html"] + document_content = fetch_url(args["--url"]) + elif args["--file"] is not None: + parser = PARSERS.get(document_format, PlaintextParser) + with open(args["--file"], "rb") as file: + document_content = file.read() + else: + parser = PARSERS["plaintext"] + document_content = sys.stdin.read() + + summarizer_builder = AVAILABLE_METHODS["luhn"] + for method, builder in AVAILABLE_METHODS.items(): + if args[method]: + summarizer_builder = builder + break + + items_count = ItemsCount(args["--length"]) + + parser = parser(document_content, Tokenizer(args["--language"])) + + with open(args[""], "rb") as file: + reference_summmary = file.read().decode("utf8") + + return summarizer_builder(parser, args["--language"]), parser.document, items_count, reference_summmary + + +if __name__ == "__main__": + try: + exit_code = main() + exit(exit_code) + except KeyboardInterrupt: + exit(1) + except Exception as e: + print(e) + exit(1) diff --git a/baselines/baseline2/summarize/sumy/evaluation/content_based.py b/baselines/baseline2/summarize/sumy/evaluation/content_based.py new file mode 100755 index 0000000..f8e0ac7 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/evaluation/content_based.py @@ -0,0 +1,57 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +from ..models import TfDocumentModel as TfModel + + +def cosine_similarity(evaluated_model, reference_model): + """ + Computes cosine similarity of two text documents. Each document + has to be represented as TF model of non-empty document. + + :returns float: + 0 <= cos <= 1, where 0 means independence and 1 means + exactly the same. + """ + if not (isinstance(evaluated_model, TfModel) and isinstance(reference_model, TfModel)): + raise ValueError( + "Arguments has to be instances of 'sumy.models.TfDocumentModel'") + + terms = frozenset(evaluated_model.terms) | frozenset(reference_model.terms) + + numerator = 0.0 + for term in terms: + numerator += evaluated_model.term_frequency(term) * reference_model.term_frequency(term) + + denominator = evaluated_model.magnitude * reference_model.magnitude + if denominator == 0.0: + raise ValueError("Document model can't be empty. Given %r & %r" % ( + evaluated_model, reference_model)) + + return numerator / denominator + + +def unit_overlap(evaluated_model, reference_model): + """ + Computes unit overlap of two text documents. Documents + has to be represented as TF models of non-empty document. + + :returns float: + 0 <= overlap <= 1, where 0 means no match and 1 means + exactly the same. + """ + if not (isinstance(evaluated_model, TfModel) and isinstance(reference_model, TfModel)): + raise ValueError( + "Arguments has to be instances of 'sumy.models.TfDocumentModel'") + + terms1 = frozenset(evaluated_model.terms) + terms2 = frozenset(reference_model.terms) + + if not terms1 and not terms2: + raise ValueError( + "Documents can't be empty. 
Please pass the valid documents.") + + common_terms_count = len(terms1 & terms2) + return common_terms_count / (len(terms1) + len(terms2) - common_terms_count) diff --git a/baselines/baseline2/summarize/sumy/evaluation/coselection.py b/baselines/baseline2/summarize/sumy/evaluation/coselection.py new file mode 100755 index 0000000..f785b37 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/evaluation/coselection.py @@ -0,0 +1,85 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + + +def f_score(evaluated_sentences, reference_sentences, weight=1.0): + """ + Computation of F-Score measure. It is computed as + F(E) = ( (W^2 + 1) * P(E) * R(E) ) / ( W^2 * P(E) + R(E) ), where: + + - P(E) is precision metrics of extract E. + - R(E) is recall metrics of extract E. + - W is a weighting factor that favours P(E) metrics + when W > 1 and favours R(E) metrics when W < 1. + + If W = 1.0 (default value) basic F-Score is computed. + It is equivalent to F(E) = (2 * P(E) * R(E)) / (P(E) + R(E)). + + :parameter iterable evaluated_sentences: + Sentences of evaluated extract. + :parameter iterable reference_sentences: + Sentences of reference extract. + :returns float: + Returns 0.0 <= P(E) <= 1.0 + """ + p = precision(evaluated_sentences, reference_sentences) + r = recall(evaluated_sentences, reference_sentences) + + weight **= 2 # weight = weight^2 + denominator = weight * p + r + if denominator == 0.0: + return 0.0 + else: + return ((weight + 1) * p * r) / denominator + + +def precision(evaluated_sentences, reference_sentences): + """ + Intrinsic method of evaluation for extracts. It is computed as + P(E) = A / B, where: + + - A is count of common sentences occurring in both extracts. + - B is count of sentences in evaluated extract. + + :parameter iterable evaluated_sentences: + Sentences of evaluated extract. + :parameter iterable reference_sentences: + Sentences of reference extract. + :returns float: + Returns 0.0 <= P(E) <= 1.0 + """ + return _divide_evaluation(reference_sentences, evaluated_sentences) + + +def recall(evaluated_sentences, reference_sentences): + """ + Intrinsic method of evaluation for extracts. It is computed as + R(E) = A / C, where: + + - A is count of common sentences in both extracts. + - C is count of sentences in reference extract. + + :parameter iterable evaluated_sentences: + Sentences of evaluated extract. + :parameter iterable reference_sentences: + Sentences of reference extract. 
+ :returns float: + Returns 0.0 <= R(E) <= 1.0 + """ + return _divide_evaluation(evaluated_sentences, reference_sentences) + + +def _divide_evaluation(numerator_sentences, denominator_sentences): + denominator_sentences = frozenset(denominator_sentences) + numerator_sentences = frozenset(numerator_sentences) + + if len(numerator_sentences) == 0 or len(denominator_sentences) == 0: + raise ValueError("Both collections have to contain at least 1 sentence.") + + common_count = len(denominator_sentences & numerator_sentences) + choosen_count = len(denominator_sentences) + + assert choosen_count != 0 + return common_count / choosen_count diff --git a/baselines/baseline2/summarize/sumy/evaluation/rouge.py b/baselines/baseline2/summarize/sumy/evaluation/rouge.py new file mode 100755 index 0000000..9f2c50d --- /dev/null +++ b/baselines/baseline2/summarize/sumy/evaluation/rouge.py @@ -0,0 +1,287 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +from ..models.dom import Sentence + + +def _get_ngrams(n, text): + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range (max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i+n])) + return ngram_set + + +def _split_into_words(sentences): + fullTextWords = [] + for s in sentences: + if not isinstance(s, Sentence): + raise (ValueError("Object in collection must be of type Sentence")) + fullTextWords.extend(s.words) + return fullTextWords + + +def _get_word_ngrams(n, sentences): + assert (len(sentences) > 0) + assert (n > 0) + + words = _split_into_words(sentences) + return _get_ngrams(n, words) + + +def _get_index_of_lcs(x, y): + return len(x), len(y) + + +def _len_lcs(x, y): + ''' + Returns the length of the Longest Common Subsequence between sequences x + and y. + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + :param x: sequence of words + :param y: sequence of words + :returns integer: Length of LCS between x and y + ''' + table = _lcs(x, y) + n, m = _get_index_of_lcs(x, y) + return table[n, m] + + +def _lcs (x, y): + ''' + Computes the length of the longest common subsequence (lcs) between two + strings. The implementation below uses a DP programming algorithm and runs + in O(nm) time where n = len(x) and m = len(y). + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + :param x: collection of words + :param y: collection of words + :returns table: dictionary of coord and len lcs + ''' + n, m = _get_index_of_lcs(x, y) + table = dict() + for i in range(n + 1): + for j in range (m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i-1] == y[j-1]: + table[i, j] = table[i-1, j-1] + 1 + else: + table[i, j] = max(table[i-1, j], table[i, j-1]) + return table + + +def _recon_lcs(x, y): + ''' + Returns the Longest Subsequence between x and y. 
+ Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + :param x: sequence of words + :param y: sequence of words + :returns sequence: LCS of x and y + ''' + i, j = _get_index_of_lcs(x, y) + table = _lcs(x, y) + def _recon (i, j): + if i == 0 or j == 0: + return [] + elif x[i-1] == y[j-1]: + return _recon(i-1, j-1) + [(x[i-1], i)] + elif table[i-1, j] > table[i, j-1]: + return _recon(i-1, j) + else: + return _recon(i, j-1) + recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) + return recon_tuple + + +def rouge_n(evaluated_sentences, reference_sentences, n=2): + """ + Computes ROUGE-N of two text collections of sentences. + Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ + papers/rouge-working-note-v1.3.1.pdf + + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :param n: Size of ngram. Defaults to 2. + :returns: + float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means + exactly the same. + :raises ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise (ValueError("Collections must contain at least 1 sentence.")) + + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + reference_count = len(reference_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count = len(overlapping_ngrams) + + return overlapping_count / reference_count + + +def rouge_1(evaluated_sentences, reference_sentences): + ''' + Rouge-N where N=1. This is a commonly used metric. + + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :returns: + float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means + exactly the same. + ''' + return rouge_n(evaluated_sentences, reference_sentences, 1) + + +def rouge_2(evaluated_sentences, reference_sentences): + ''' + Rouge-N where N=2. This is a commonly used metric. + + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :returns: + float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means + exactly the same. + ''' + return rouge_n(evaluated_sentences, reference_sentences, 2) + + +def _f_lcs(llcs, m, n): + ''' + Computes the LCS-based F-measure score + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + :param llcs: Length of LCS + :param m: number of words in reference summary + :param n: number of words in candidate summary + :returns float: LCS-based F-measure score + ''' + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / r_lcs + num = (1 + (beta ** 2)) * r_lcs * p_lcs + denom = r_lcs + ((beta ** 2) * p_lcs) + return num / denom + + +def rouge_l_sentence_level(evaluated_sentences, reference_sentences): + """ + Computes ROUGE-L (sentence level) of two text collections of sentences. 
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :returns float: F_lcs + :raises ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise (ValueError("Collections must contain at least 1 sentence.")) + reference_words = _split_into_words(reference_sentences) + evaluated_words = _split_into_words(evaluated_sentences) + m = len(reference_words) + n = len(evaluated_words) + lcs = _len_lcs(evaluated_words, reference_words) + return _f_lcs(lcs, m, n) + + +def _union_lcs(evaluated_sentences, reference_sentence): + ''' + Returns LCS_u(r_i, C) which is the LCS score of the union longest common subsequence + between reference sentence ri and candidate summary C. For example, if + r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and + c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is + “w1 w2” and the longest common subsequence of r_i and c2 is “w1 w3 w5”. The + union longest common subsequence of r_i, c1, and c2 is “w1 w2 w3 w5” and + LCS_u(r_i, C) = 4/5. + + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentence: + One of the sentences in the reference summaries + :returns float: LCS_u(r_i, C) + :raises ValueError: raises exception if a param has len <= 0 + ''' + if len(evaluated_sentences) <= 0: + raise (ValueError("Collections must contain at least 1 sentence.")) + + lcs_union = set() + reference_words = _split_into_words([reference_sentence]) + combined_lcs_length = 0 + for eval_s in evaluated_sentences: + evaluated_words = _split_into_words([eval_s]) + lcs = set(_recon_lcs(reference_words, evaluated_words)) + combined_lcs_length += len(lcs) + lcs_union = lcs_union.union(lcs) + + union_lcs_count = len(lcs_union) + union_lcs_value = union_lcs_count / combined_lcs_length + return union_lcs_value + + +def rouge_l_summary_level(evaluated_sentences, reference_sentences): + """ + Computes ROUGE-L (summary level) of two text collections of sentences. 
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Calculated according to: + R_lcs = SUM(1, u)[LCS(r_i,C)]/m + P_lcs = SUM(1, u)[LCS(r_i,C)]/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + SUM(i,u) = SUM from i through u + u = number of sentences in reference summary + C = Candidate summary made up of v sentences + m = number of words in reference summary + n = number of words in candidate summary + + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :returns float: F_lcs + :raises ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise (ValueError("Collections must contain at least 1 sentence.")) + + # total number of words in reference sentences + m = len(_split_into_words(reference_sentences)) + + # total number of words in evaluated sentences + n = len(_split_into_words(evaluated_sentences)) + + union_lcs_sum_across_all_references = 0 + for ref_s in reference_sentences: + union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences, ref_s) + return _f_lcs(union_lcs_sum_across_all_references, m, n) + diff --git a/baselines/baseline2/summarize/sumy/models/__init__.py b/baselines/baseline2/summarize/sumy/models/__init__.py new file mode 100755 index 0000000..f158453 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/models/__init__.py @@ -0,0 +1,7 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + + +from .tf import TfDocumentModel diff --git a/baselines/baseline2/summarize/sumy/models/dom/__init__.py b/baselines/baseline2/summarize/sumy/models/dom/__init__.py new file mode 100755 index 0000000..5bfe104 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/models/dom/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +from ._document import ObjectDocumentModel +from ._paragraph import Paragraph +from ._sentence import Sentence diff --git a/baselines/baseline2/summarize/sumy/models/dom/_document.py b/baselines/baseline2/summarize/sumy/models/dom/_document.py new file mode 100755 index 0000000..4420b94 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/models/dom/_document.py @@ -0,0 +1,39 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +from itertools import chain +from ...utils import cached_property +from ..._compat import unicode_compatible + + +@unicode_compatible +class ObjectDocumentModel(object): + def __init__(self, paragraphs): + self._paragraphs = tuple(paragraphs) + + @property + def paragraphs(self): + return self._paragraphs + + @cached_property + def sentences(self): + sentences = (p.sentences for p in self._paragraphs) + return tuple(chain(*sentences)) + + @cached_property + def headings(self): + headings = (p.headings for p in self._paragraphs) + return tuple(chain(*headings)) + + @cached_property + def words(self): + words = (p.words for p in self._paragraphs) + return tuple(chain(*words)) + + def __unicode__(self): + return "" % len(self.paragraphs) + + def __repr__(self): + return self.__str__() diff --git a/baselines/baseline2/summarize/sumy/models/dom/_paragraph.py 
b/baselines/baseline2/summarize/sumy/models/dom/_paragraph.py new file mode 100755 index 0000000..8cab505 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/models/dom/_paragraph.py @@ -0,0 +1,48 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +from itertools import chain +from ..._compat import unicode_compatible +from ...utils import cached_property +from ._sentence import Sentence + + +@unicode_compatible +class Paragraph(object): + __slots__ = ( + "_sentences", + "_cached_property_sentences", + "_cached_property_headings", + "_cached_property_words", + ) + + def __init__(self, sentences): + sentences = tuple(sentences) + for sentence in sentences: + if not isinstance(sentence, Sentence): + raise TypeError("Only instances of class 'Sentence' are allowed.") + + self._sentences = sentences + + @cached_property + def sentences(self): + return tuple(s for s in self._sentences if not s.is_heading) + + @cached_property + def headings(self): + return tuple(s for s in self._sentences if s.is_heading) + + @cached_property + def words(self): + return tuple(chain(*(s.words for s in self._sentences))) + + def __unicode__(self): + return "" % ( + len(self.headings), + len(self.sentences), + ) + + def __repr__(self): + return self.__str__() diff --git a/baselines/baseline2/summarize/sumy/models/dom/_sentence.py b/baselines/baseline2/summarize/sumy/models/dom/_sentence.py new file mode 100755 index 0000000..6d2ea40 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/models/dom/_sentence.py @@ -0,0 +1,44 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +from ...utils import cached_property +from ..._compat import to_unicode, to_string, unicode_compatible + + +@unicode_compatible +class Sentence(object): + __slots__ = ("_text", "_cached_property_words", "_tokenizer", "_is_heading",) + + def __init__(self, text, tokenizer, is_heading=False): + self._text = to_unicode(text).strip() + self._tokenizer = tokenizer + self._is_heading = bool(is_heading) + + @cached_property + def words(self): + return self._tokenizer.to_words(self._text) + + @property + def is_heading(self): + return self._is_heading + + def __eq__(self, sentence): + assert isinstance(sentence, Sentence) + return self._is_heading is sentence._is_heading and self._text == sentence._text + + def __ne__(self, sentence): + return not self.__eq__(sentence) + + def __hash__(self): + return hash((self._is_heading, self._text)) + + def __unicode__(self): + return self._text + + def __repr__(self): + return to_string("<%s: %s>") % ( + "Heading" if self._is_heading else "Sentence", + self.__str__() + ) diff --git a/baselines/baseline2/summarize/sumy/models/tf.py b/baselines/baseline2/summarize/sumy/models/tf.py new file mode 100755 index 0000000..6fd913b --- /dev/null +++ b/baselines/baseline2/summarize/sumy/models/tf.py @@ -0,0 +1,88 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +import math + +from pprint import pformat +from collections import Sequence +from .._compat import to_unicode, unicode, string_types, Counter + + +class TfDocumentModel(object): + """Term-Frequency document model (term = word).""" + def __init__(self, words, tokenizer=None): + if isinstance(words, string_types) and tokenizer is None: + raise ValueError( + "Tokenizer has to be given if ``words`` is not a 
sequence.") + elif isinstance(words, string_types): + words = tokenizer.to_words(to_unicode(words)) + elif not isinstance(words, Sequence): + raise ValueError( + "Parameter ``words`` has to be sequence or string with tokenizer given.") + + self._terms = Counter(map(unicode.lower, words)) + self._max_frequency = max(self._terms.values()) if self._terms else 1 + + @property + def magnitude(self): + """ + Lenght/norm/magnitude of vector representation of document. + This is usually denoted by ||d||. + """ + return math.sqrt(sum(t**2 for t in self._terms.values())) + + @property + def terms(self): + return self._terms.keys() + + def most_frequent_terms(self, count=0): + """ + Returns ``count`` of terms sorted by their frequency + in descending order. + + :parameter int count: + Max. number of returned terms. Value 0 means no limit (default). + """ + # sort terms by number of occurrences in descending order + terms = sorted(self._terms.items(), key=lambda i: -i[1]) + + terms = tuple(i[0] for i in terms) + if count == 0: + return terms + elif count > 0: + return terms[:count] + else: + raise ValueError( + "Only non-negative values are allowed for count of terms.") + + def term_frequency(self, term): + """ + Returns frequency of term in document. + + :returns int: + Returns count of words in document. + """ + return self._terms.get(term, 0) + + def normalized_term_frequency(self, term, smooth=0.0): + """ + Returns normalized frequency of term in document. + http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html + + :parameter float smooth: + 0.0 <= smooth <= 1.0, generally set to 0.4, although some + early work used the value 0.5. The term is a smoothing term + whose role is to damp the contribution of the second term. + It may be viewed as a scaling down of TF by the largest TF + value in document. + :returns float: + 0.0 <= frequency <= 1.0, where 0 means no occurence in document + and 1 the most frequent term in document. + """ + frequency = self.term_frequency(term) / self._max_frequency + return smooth + (1.0 - smooth)*frequency + + def __repr__(self): + return "" % pformat(self._terms) diff --git a/baselines/baseline2/summarize/sumy/nlp/__init__.py b/baselines/baseline2/summarize/sumy/nlp/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/baselines/baseline2/summarize/sumy/nlp/stemmers/__init__.py b/baselines/baseline2/summarize/sumy/nlp/stemmers/__init__.py new file mode 100755 index 0000000..621d01a --- /dev/null +++ b/baselines/baseline2/summarize/sumy/nlp/stemmers/__init__.py @@ -0,0 +1,32 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +import nltk.stem.snowball as nltk_stemmers_module + +from .czech import stem_word as czech_stemmer + +from ..._compat import to_unicode + + +def null_stemmer(object): + "Converts given object to unicode with lower letters." + return to_unicode(object).lower() + + +class Stemmer(object): + def __init__(self, language): + self._stemmer = null_stemmer + if language.lower() in ('czech', 'slovak'): + self._stemmer = czech_stemmer + return + stemmer_classname = language.capitalize() + 'Stemmer' + try: + stemmer_class = getattr(nltk_stemmers_module, stemmer_classname) + except AttributeError: + raise LookupError("Stemmer is not available for language %s." 
% language) + self._stemmer = stemmer_class().stem + + def __call__(self, word): + return self._stemmer(word) diff --git a/baselines/baseline2/summarize/sumy/nlp/stemmers/czech.py b/baselines/baseline2/summarize/sumy/nlp/stemmers/czech.py new file mode 100755 index 0000000..d3be172 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/nlp/stemmers/czech.py @@ -0,0 +1,204 @@ +# -*- coding: utf8 -*- + +""" +Czech stemmer +Copyright © 2010 Luís Gomes . + +Ported from the Java implementation available at: + http://members.unine.ch/jacques.savoy/clef/index.html + +Usage: + czech_stemmer.py light|aggressive +""" + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +import re +import sys + +from warnings import warn +from ..._compat import unicode + + +WORD_PATTERN = re.compile(r"^\w+$", re.UNICODE) + + +def stem_word(word, aggressive=False): + if not isinstance(word, unicode): + word = word.decode("utf8") + + if not WORD_PATTERN.match(word): + return word + + if not word.islower() and not word.istitle() and not word.isupper(): + warn("skipping word with mixed case: " + word) + return word + + stem = word.lower() + stem = _remove_case(stem) + stem = _remove_possessives(stem) + + if aggressive: + stem = _remove_comparative(stem) + stem = _remove_diminutive(stem) + stem = _remove_augmentative(stem) + stem = _remove_derivational(stem) + + if word.isupper(): + return stem.upper() + if word.istitle(): + return stem.title() + + return stem + + +def _remove_case(word): + if len(word) > 7 and word.endswith("atech"): + return word[:-5] + + if len(word) > 6: + if word.endswith("ětem"): + return _palatalize(word[:-3]) + if word.endswith("atům"): + return word[:-4] + + if len(word) > 5: + if word[-3:] in ("ech", "ich", "ích", "ého", "ěmi", "emi", "ému", + "ete", "eti", "iho", "ího", "ími", "imu"): + return _palatalize(word[:-2]) + if word[-3:] in ("ách", "ata", "aty", "ých", "ama", "ami", + "ové", "ovi", "ými"): + return word[:-3] + + if len(word) > 4: + if word.endswith("em"): + return _palatalize(word[:-1]) + if word[-2:] in ("es", "ém", "ím"): + return _palatalize(word[:-2]) + if word[-2:] in ("ům", "at", "ám", "os", "us", "ým", "mi", "ou"): + return word[:-2] + + if len(word) > 3: + if word[-1] in "eiíě": + return _palatalize(word) + if word[-1] in "uyůaoáéý": + return word[:-1] + + return word + + +def _remove_possessives(word): + if len(word) > 5: + if word[-2:] in ("ov", "ův"): + return word[:-2] + if word.endswith("in"): + return _palatalize(word[:-1]) + return word + + +def _remove_comparative(word): + if len(word) > 5: + if word[-3:] in ("ejš", "ějš"): + return _palatalize(word[:-2]) + return word + + +def _remove_diminutive(word): + if len(word) > 7 and word.endswith("oušek"): + return word[:-5] + if len(word) > 6: + if word[-4:] in ("eček", "éček", "iček", "íček", "enek", "ének", + "inek", "ínek"): + return _palatalize(word[:-3]) + if word[-4:] in ("áček", "aček", "oček", "uček", "anek", "onek", + "unek", "ánek"): + return _palatalize(word[:-4]) + if len(word) > 5: + if word[-3:] in ("ečk", "éčk", "ičk", "íčk", "enk", "énk", + "ink", "ínk"): + return _palatalize(word[:-3]) + if word[-3:] in ("áčk", "ačk", "očk", "učk", "ank", "onk", + "unk", "átk", "ánk", "ušk"): + return word[:-3] + if len(word) > 4: + if word[-2:] in ("ek", "ék", "ík", "ik"): + return _palatalize(word[:-1]) + if word[-2:] in ("ák", "ak", "ok", "uk"): + return word[:-1] + if len(word) > 3 and word[-1] == "k": + return word[:-1] + return word + + +def 
_remove_augmentative(word): + if len(word) > 6 and word.endswith("ajzn"): + return word[:-4] + if len(word) > 5 and word[-3:] in ("izn", "isk"): + return _palatalize(word[:-2]) + if len(word) > 4 and word.endswith("ák"): + return word[:-2] + return word + + +def _remove_derivational(word): + if len(word) > 8 and word.endswith("obinec"): + return word[:-6] + if len(word) > 7: + if word.endswith("ionář"): + return _palatalize(word[:-4]) + if word[-5:] in ("ovisk", "ovstv", "ovišt", "ovník"): + return word[:-5] + if len(word) > 6: + if word[-4:] in ("ásek", "loun", "nost", "teln", "ovec", "ovík", + "ovtv", "ovin", "štin"): + return word[:-4] + if word[-4:] in ("enic", "inec", "itel"): + return _palatalize(word[:-3]) + if len(word) > 5: + if word.endswith("árn"): + return word[:-3] + if word[-3:] in ("ěnk", "ián", "ist", "isk", "išt", "itb", "írn"): + return _palatalize(word[:-2]) + if word[-3:] in ("och", "ost", "ovn", "oun", "out", "ouš", + "ušk", "kyn", "čan", "kář", "néř", "ník", + "ctv", "stv"): + return word[:-3] + if len(word) > 4: + if word[-2:] in ("áč", "ač", "án", "an", "ář", "as"): + return word[:-2] + if word[-2:] in ("ec", "en", "ěn", "éř", "íř", "ic", "in", "ín", + "it", "iv"): + return _palatalize(word[:-1]) + if word[-2:] in ("ob", "ot", "ov", "oň", "ul", "yn", "čk", "čn", + "dl", "nk", "tv", "tk", "vk"): + return word[:-2] + if len(word) > 3 and word[-1] in "cčklnt": + return word[:-1] + return word + + +def _palatalize(word): + if word[-2:] in ("ci", "ce", "či", "če"): + return word[:-2] + "k" + + if word[-2:] in ("zi", "ze", "ži", "že"): + return word[:-2] + "h" + + if word[-3:] in ("čtě", "čti", "čtí"): + return word[:-3] + "ck" + + if word[-3:] in ("ště", "šti", "ští"): + return word[:-3] + "sk" + + return word[:-1] + + +if __name__ == '__main__': + if len(sys.argv) != 2 or sys.argv[1] not in ("light", "aggressive"): + sys.exit(__doc__.encode("utf8")) + + aggressive_stemming = bool(sys.argv[1] == "aggressive") + for line in sys.stdin: + words = tuple(w.decode("utf8") + " " + stem_word(w, aggressive_stemming) for w in line.split()) + print(*map(lambda s: s.encode("utf8"), words)) diff --git a/baselines/baseline2/summarize/sumy/nlp/tokenizers.py b/baselines/baseline2/summarize/sumy/nlp/tokenizers.py new file mode 100755 index 0000000..016a61e --- /dev/null +++ b/baselines/baseline2/summarize/sumy/nlp/tokenizers.py @@ -0,0 +1,60 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +import re +import zipfile +import nltk + +from .._compat import to_string, to_unicode, unicode + + +class Tokenizer(object): + """Language dependent tokenizer of text document.""" + + _WORD_PATTERN = re.compile(r"^[^\W\d_]+$", re.UNICODE) + # feel free to contribute if you have better tokenizer for any of these languages :) + LANGUAGE_ALIASES = { + "slovak": "czech", + } + + # improve tokenizer by adding specific abbreviations it has issues with + # note the final point in these items must not be included + LANGUAGE_EXTRA_ABREVS = { + "english": ["e.g", "al", "i.e"], + "german": ["al", "z.B", "Inc", "engl", "z. 
B", "vgl", "lat", "bzw", "S"], + } + + def __init__(self, language): + self._language = language + + tokenizer_language = self.LANGUAGE_ALIASES.get(language, language) + self._sentence_tokenizer = self._sentence_tokenizer(tokenizer_language) + + @property + def language(self): + return self._language + + def _sentence_tokenizer(self, language): + try: + path = to_string("tokenizers/punkt/%s.pickle") % to_string(language) + return nltk.data.load(path) + except (LookupError, zipfile.BadZipfile): + raise LookupError( + "NLTK tokenizers are missing. Download them by following command: " + '''python -c "import nltk; nltk.download('punkt')"''' + ) + + def to_sentences(self, paragraph): + extra_abbreviations = self.LANGUAGE_EXTRA_ABREVS.get(self._language, []) + self._sentence_tokenizer._params.abbrev_types.update(extra_abbreviations) + sentences = self._sentence_tokenizer.tokenize(to_unicode(paragraph)) + return tuple(map(unicode.strip, sentences)) + + def to_words(self, sentence): + words = nltk.word_tokenize(to_unicode(sentence)) + return tuple(filter(self._is_word, words)) + + def _is_word(self, word): + return bool(Tokenizer._WORD_PATTERN.search(word)) diff --git a/baselines/baseline2/summarize/sumy/parsers/__init__.py b/baselines/baseline2/summarize/sumy/parsers/__init__.py new file mode 100755 index 0000000..90bc3d4 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/parsers/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +from .parser import DocumentParser diff --git a/baselines/baseline2/summarize/sumy/parsers/html.py b/baselines/baseline2/summarize/sumy/parsers/html.py new file mode 100755 index 0000000..e88bde0 --- /dev/null +++ b/baselines/baseline2/summarize/sumy/parsers/html.py @@ -0,0 +1,102 @@ +# -*- coding: utf8 -*- + +from __future__ import absolute_import +from __future__ import division, print_function, unicode_literals + +from breadability.readable import Article +from ..utils import cached_property, fetch_url +from ..models.dom import Sentence, Paragraph, ObjectDocumentModel +from .parser import DocumentParser + + +class HtmlParser(DocumentParser): + """Parser of text from HTML format into DOM.""" + + SIGNIFICANT_TAGS = ( + "h1", "h2", "h3", + "b", "strong", + "big", + "dfn", + "em", + ) + + @classmethod + def from_string(cls, string, url, tokenizer): + return cls(string, tokenizer, url) + + @classmethod + def from_file(cls, file_path, url, tokenizer): + with open(file_path, "rb") as file: + return cls(file.read(), tokenizer, url) + + @classmethod + def from_url(cls, url, tokenizer): + data = fetch_url(url) + return cls(data, tokenizer, url) + + def __init__(self, html_content, tokenizer, url=None): + super(HtmlParser, self).__init__(tokenizer) + self._article = Article(html_content, url) + + @cached_property + def significant_words(self): + words = [] + for paragraph in self._article.main_text: + for text, annotations in paragraph: + if self._contains_any(annotations, *self.SIGNIFICANT_TAGS): + words.extend(self.tokenize_words(text)) + + if words: + return tuple(words) + else: + return self.SIGNIFICANT_WORDS + + @cached_property + def stigma_words(self): + words = [] + for paragraph in self._article.main_text: + for text, annotations in paragraph: + if self._contains_any(annotations, "a", "strike", "s"): + words.extend(self.tokenize_words(text)) + + if words: + return tuple(words) + else: + return self.STIGMA_WORDS + + def _contains_any(self, sequence, 
*args): + if sequence is None: + return False + + for item in args: + if item in sequence: + return True + + return False + + @cached_property + def document(self): + # "a", "abbr", "acronym", "b", "big", "blink", "blockquote", "cite", "code", + # "dd", "del", "dfn", "dir", "dl", "dt", "em", "h", "h1", "h2", "h3", "h4", + # "h5", "h6", "i", "ins", "kbd", "li", "marquee", "menu", "ol", "pre", "q", + # "s", "samp", "strike", "strong", "sub", "sup", "tt", "u", "ul", "var", + + annotated_text = self._article.main_text + + paragraphs = [] + for paragraph in annotated_text: + sentences = [] + + current_text = "" + for text, annotations in paragraph: + if annotations and ("h1" in annotations or "h2" in annotations or "h3" in annotations): + sentences.append(Sentence(text, self._tokenizer, is_heading=True)) + # skip
<pre> nodes
+                elif not (annotations and "pre" in annotations):
+                    current_text += " " + text
+
+            new_sentences = self.tokenize_sentences(current_text)
+            sentences.extend(Sentence(s, self._tokenizer) for s in new_sentences)
+            paragraphs.append(Paragraph(sentences))
+
+        return ObjectDocumentModel(paragraphs)
diff --git a/baselines/baseline2/summarize/sumy/parsers/parser.py b/baselines/baseline2/summarize/sumy/parsers/parser.py
new file mode 100755
index 0000000..fd6037c
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/parsers/parser.py
@@ -0,0 +1,41 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+
+class DocumentParser(object):
+    """Abstract parser of input format into DOM."""
+
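+    # Default significant/stigma words are Czech (roughly: "significant",
+    # "excellent", "important", ... / "worst", "bad", "ugly"). Parsers fall back
+    # to these defaults when a document yields no such words of its own.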
+    SIGNIFICANT_WORDS = (
+        "významný",
+        "vynikající",
+        "podstatný",
+        "význačný",
+        "důležitý",
+        "slavný",
+        "zajímavý",
+        "eminentní",
+        "vlivný",
+        "supr",
+        "super",
+        "nejlepší",
+        "dobrý",
+        "kvalitní",
+        "optimální",
+        "relevantní",
+    )
+    STIGMA_WORDS = (
+        "nejhorší",
+        "zlý",
+        "šeredný",
+    )
+
+    def __init__(self, tokenizer):
+        self._tokenizer = tokenizer
+
+    def tokenize_sentences(self, paragraph):
+        return self._tokenizer.to_sentences(paragraph)
+
+    def tokenize_words(self, sentence):
+        return self._tokenizer.to_words(sentence)
diff --git a/baselines/baseline2/summarize/sumy/parsers/plaintext.py b/baselines/baseline2/summarize/sumy/parsers/plaintext.py
new file mode 100755
index 0000000..9fca62b
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/parsers/plaintext.py
@@ -0,0 +1,102 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from .._compat import to_unicode
+from ..utils import cached_property
+from ..models.dom import Sentence, Paragraph, ObjectDocumentModel
+from .parser import DocumentParser
+
+
+class PlaintextParser(DocumentParser):
+    """
+    Parses simple plain text in the following format:
+
+    HEADING
+    This is text of 1st paragraph. Some another sentence.
+
+    This is next paragraph.
+
+    HEADING IS LINE ALL IN UPPER CASE
+    This is 3rd paragraph with heading. Sentence in 3rd paragraph.
+    Another sentence in 3rd paragraph.
+
+    Paragraphs are separated by empty lines. And that's all :)
+    """
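+
+    # A minimal usage sketch (hypothetical snippet, assuming NLTK punkt data is
+    # installed and Tokenizer is imported from ..nlp.tokenizers):
+    #
+    #     parser = PlaintextParser.from_string("HEADING\nFirst sentence. Second one.", Tokenizer("english"))
+    #     for paragraph in parser.document.paragraphs:
+    #         print(paragraph.sentences)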
+
+    @classmethod
+    def from_string(cls, string, tokenizer):
+        return cls(string, tokenizer)
+
+    @classmethod
+    def from_file(cls, file_path, tokenizer):
+        with open(file_path) as file:
+            return cls(file.read(), tokenizer)
+
+    def __init__(self, text, tokenizer):
+        super(PlaintextParser, self).__init__(tokenizer)
+        self._text = to_unicode(text).strip()
+
+    @cached_property
+    def significant_words(self):
+        words = []
+        for paragraph in self.document.paragraphs:
+            for heading in paragraph.headings:
+                words.extend(heading.words)
+
+        if words:
+            return tuple(words)
+        else:
+            return self.SIGNIFICANT_WORDS
+
+    @cached_property
+    def stigma_words(self):
+        return self.STIGMA_WORDS
+
+    @cached_property
+    def document(self):
+        current_paragraph = []
+        paragraphs = []
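+        # An all-uppercase line is treated as a heading, a blank line closes the
+        # current paragraph, and any other non-empty line is collected as body text.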
+        for line in self._text.splitlines():
+            line = line.strip()
+            if line.isupper():
+                heading = Sentence(line, self._tokenizer, is_heading=True)
+                current_paragraph.append(heading)
+            elif not line and current_paragraph:
+                sentences = self._to_sentences(current_paragraph)
+                paragraphs.append(Paragraph(sentences))
+                current_paragraph = []
+            elif line:
+                current_paragraph.append(line)
+
+        sentences = self._to_sentences(current_paragraph)
+        paragraphs.append(Paragraph(sentences))
+
+        return ObjectDocumentModel(paragraphs)
+
+    def _to_sentences(self, lines):
+        text = u""
+        sentence_objects = []
+
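+        # Heading Sentence objects are passed through unchanged; runs of plain
+        # lines are joined and re-split by the sentence tokenizer.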
+        for line in lines:
+            if isinstance(line, Sentence):
+                if text:
+                    sentences = self.tokenize_sentences(text)
+                    sentence_objects += map(self._to_sentence, sentences)
+
+                sentence_objects.append(line)
+                text = u""
+            else:
+                text += u" " + line
+
+        text = text.strip()
+        if text:
+            sentences = self.tokenize_sentences(text)
+            sentence_objects += map(self._to_sentence, sentences)
+
+        return sentence_objects
+
+    def _to_sentence(self, text):
+        assert text.strip()
+        return Sentence(text, self._tokenizer)
diff --git a/baselines/baseline2/summarize/sumy/summarizers/__init__.py b/baselines/baseline2/summarize/sumy/summarizers/__init__.py
new file mode 100755
index 0000000..4e0750e
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from ._summarizer import AbstractSummarizer
diff --git a/baselines/baseline2/summarize/sumy/summarizers/_summarizer.py b/baselines/baseline2/summarize/sumy/summarizers/_summarizer.py
new file mode 100755
index 0000000..f718b7a
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/_summarizer.py
@@ -0,0 +1,56 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+
+from collections import namedtuple
+from operator import attrgetter
+from ..utils import ItemsCount
+from .._compat import to_unicode
+from ..nlp.stemmers import null_stemmer
+from nltk import word_tokenize
+
+SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rating",))
+
+
+class AbstractSummarizer(object):
+    def __init__(self, stemmer=null_stemmer):
+        if not callable(stemmer):
+            raise ValueError("Stemmer has to be a callable object")
+
+        self._stemmer = stemmer
+
+    def __call__(self, document, sentences_count):
+        raise NotImplementedError("This method should be overridden in a subclass")
+
+    def stem_word(self, word):
+        return self._stemmer(self.normalize_word(word))
+
+    def normalize_word(self, word):
+        return to_unicode(word).lower()
+
+    def _get_break_index(self, infos, summary_size):
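+        # Walk the rated sentences in order, counting words with nltk.word_tokenize,
+        # and return the cut-off index once the summary_size word budget is reached.
+        # Falling through returns None, which makes the caller keep all sentences.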
+        s_length = 0
+        for i,_ in enumerate(infos):
+            s_length += len(word_tokenize("%s" % (infos[i].sentence)))
+            if summary_size <= s_length:
+                return i+1
+
+    def _get_best_sentences(self, sentences, summary_size, rating, *args, **kwargs):
+        rate = rating
+        if isinstance(rating, dict):
+            assert not args and not kwargs
+            rate = lambda s: rating[s]
+
+        infos = (SentenceInfo(s, o, rate(s, *args, **kwargs))
+            for o, s in enumerate(sentences))
+
+        # sort sentences by rating in descending order
+        infos = sorted(infos, key=attrgetter("rating"), reverse=True)
+        # get `count` first best rated sentences
+        index = self._get_break_index(infos, summary_size)
+        # sort sentences by their order in document
+        infos = sorted(infos[:index], key=attrgetter("order"))
+
+        return [unicode(i.sentence) for i in infos]
diff --git a/baselines/baseline2/summarize/sumy/summarizers/edmundson.py b/baselines/baseline2/summarize/sumy/summarizers/edmundson.py
new file mode 100755
index 0000000..dad164a
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/edmundson.py
@@ -0,0 +1,138 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from collections import defaultdict
+from ..nlp.stemmers import null_stemmer
+from ._summarizer import AbstractSummarizer
+from .edmundson_cue import EdmundsonCueMethod
+from .edmundson_key import EdmundsonKeyMethod
+from .edmundson_title import EdmundsonTitleMethod
+from .edmundson_location import EdmundsonLocationMethod
+
+
+_EMPTY_SET = frozenset()
+
+
+class EdmundsonSummarizer(AbstractSummarizer):
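+    # Combines Edmundson's cue, key, title and location heuristics. A heuristic is
+    # applied whenever its weight is positive, and the per-sentence ratings of all
+    # applied heuristics are summed (the weights only gate which methods run).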
+    _bonus_words = _EMPTY_SET
+    _stigma_words = _EMPTY_SET
+    _null_words = _EMPTY_SET
+
+    def __init__(self, stemmer=null_stemmer, cue_weight=1.0, key_weight=0.0,
+            title_weight=1.0, location_weight=1.0):
+        super(EdmundsonSummarizer, self).__init__(stemmer)
+
+        self._ensure_correct_weights(cue_weight, key_weight, title_weight,
+            location_weight)
+
+        self._cue_weight = float(cue_weight)
+        self._key_weight = float(key_weight)
+        self._title_weight = float(title_weight)
+        self._location_weight = float(location_weight)
+
+    def _ensure_correct_weights(self, *weights):
+        for w in weights:
+            if w < 0.0:
+                raise ValueError("Negative weights are not allowed.")
+
+    @property
+    def bonus_words(self):
+        return self._bonus_words
+
+    @bonus_words.setter
+    def bonus_words(self, collection):
+        self._bonus_words = frozenset(map(self.stem_word, collection))
+
+    @property
+    def stigma_words(self):
+        return self._stigma_words
+
+    @stigma_words.setter
+    def stigma_words(self, collection):
+        self._stigma_words = frozenset(map(self.stem_word, collection))
+
+    @property
+    def null_words(self):
+        return self._null_words
+
+    @null_words.setter
+    def null_words(self, collection):
+        self._null_words = frozenset(map(self.stem_word, collection))
+
+    def __call__(self, document, sentences_count):
+        ratings = defaultdict(int)
+
+        if self._cue_weight > 0.0:
+            method = self._build_cue_method_instance()
+            ratings = self._update_ratings(ratings, method.rate_sentences(document))
+        if self._key_weight > 0.0:
+            method = self._build_key_method_instance()
+            ratings = self._update_ratings(ratings, method.rate_sentences(document))
+        if self._title_weight > 0.0:
+            method = self._build_title_method_instance()
+            ratings = self._update_ratings(ratings, method.rate_sentences(document))
+        if self._location_weight > 0.0:
+            method = self._build_location_method_instance()
+            ratings = self._update_ratings(ratings, method.rate_sentences(document))
+
+        return self._get_best_sentences(document.sentences, sentences_count, ratings)
+
+    def _update_ratings(self, ratings, new_ratings):
+        assert len(ratings) == 0 or len(ratings) == len(new_ratings)
+
+        for sentence, rating in new_ratings.items():
+            ratings[sentence] += rating
+
+        return ratings
+
+    def cue_method(self, document, sentences_count, bunus_word_value=1, stigma_word_value=1):
+        summarization_method = self._build_cue_method_instance()
+        return summarization_method(document, sentences_count, bunus_word_value,
+            stigma_word_value)
+
+    def _build_cue_method_instance(self):
+        self.__check_bonus_words()
+        self.__check_stigma_words()
+
+        return EdmundsonCueMethod(self._stemmer, self._bonus_words, self._stigma_words)
+
+    def key_method(self, document, sentences_count, weight=0.5):
+        summarization_method = self._build_key_method_instance()
+        return summarization_method(document, sentences_count, weight)
+
+    def _build_key_method_instance(self):
+        self.__check_bonus_words()
+
+        return EdmundsonKeyMethod(self._stemmer, self._bonus_words)
+
+    def title_method(self, document, sentences_count):
+        summarization_method = self._build_title_method_instance()
+        return summarization_method(document, sentences_count)
+
+    def _build_title_method_instance(self):
+        self.__check_null_words()
+
+        return EdmundsonTitleMethod(self._stemmer, self._null_words)
+
+    def location_method(self, document, sentences_count, w_h=1, w_p1=1, w_p2=1, w_s1=1, w_s2=1):
+        summarization_method = self._build_location_method_instance()
+        return summarization_method(document, sentences_count, w_h, w_p1, w_p2, w_s1, w_s2)
+
+    def _build_location_method_instance(self):
+        self.__check_null_words()
+
+        return EdmundsonLocationMethod(self._stemmer, self._null_words)
+
+    def __check_bonus_words(self):
+        if not self._bonus_words:
+            raise ValueError("Set of bonus words is empty. Please set attribute 'bonus_words' with collection of words.")
+
+    def __check_stigma_words(self):
+        if not self._stigma_words:
+            raise ValueError("Set of stigma words is empty. Please set attribute 'stigma_words' with collection of words.")
+
+    def __check_null_words(self):
+        if not self._null_words:
+            raise ValueError("Set of null words is empty. Please set attribute 'null_words' with collection of words.")
diff --git a/baselines/baseline2/summarize/sumy/summarizers/edmundson_cue.py b/baselines/baseline2/summarize/sumy/summarizers/edmundson_cue.py
new file mode 100755
index 0000000..b5765be
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/edmundson_cue.py
@@ -0,0 +1,54 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from ._summarizer import AbstractSummarizer
+
+
+class EdmundsonCueMethod(AbstractSummarizer):
+    def __init__(self, stemmer, bonus_words, stigma_words):
+        super(EdmundsonCueMethod, self).__init__(stemmer)
+        self._bonus_words = bonus_words
+        self._stigma_words = stigma_words
+
+    def __call__(self, document, sentences_count, bunus_word_weight, stigma_word_weight):
+        return self._get_best_sentences(document.sentences,
+            sentences_count, self._rate_sentence, bunus_word_weight,
+            stigma_word_weight)
+
+    def _rate_sentence(self, sentence, bunus_word_weight, stigma_word_weight):
+        # count the number of bonus/stigma words in the sentence
+        words = map(self.stem_word, sentence.words)
+        bonus_words_count, stigma_words_count = self._count_words(words)
+
+        # compute positive & negative rating
+        bonus_rating = bonus_words_count*bunus_word_weight
+        stigma_rating = stigma_words_count*stigma_word_weight
+
+        # rating of sentence is (positive - negative) rating
+        return bonus_rating - stigma_rating
+
+    def _count_words(self, words):
+        """
+        Counts number of bonus/stigma words.
+
+        :param iterable words:
+            Collection of words.
+        :returns pair:
+            Tuple with number of words (bonus words, stigma words).
+        """
+        bonus_words_count = 0
+        stigma_words_count = 0
+
+        for word in words:
+            if word in self._bonus_words:
+                bonus_words_count +=1
+            if word in self._stigma_words:
+                stigma_words_count += 1
+
+        return bonus_words_count, stigma_words_count
+
+    def rate_sentences(self, document, bunus_word_weight=1, stigma_word_weight=1):
+        return {sentence: self._rate_sentence(sentence, bunus_word_weight,
+                stigma_word_weight) for sentence in document.sentences}
diff --git a/baselines/baseline2/summarize/sumy/summarizers/edmundson_key.py b/baselines/baseline2/summarize/sumy/summarizers/edmundson_key.py
new file mode 100755
index 0000000..9f1cb3a
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/edmundson_key.py
@@ -0,0 +1,54 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from .._compat import Counter
+from ._summarizer import AbstractSummarizer
+
+
+class EdmundsonKeyMethod(AbstractSummarizer):
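+    # Key method: "significant" words are the bonus words whose document frequency,
+    # relative to the most frequent bonus word, exceeds ``weight``; each sentence is
+    # rated by how many significant words it contains.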
+    def __init__(self, stemmer, bonus_words):
+        super(EdmundsonKeyMethod, self).__init__(stemmer)
+        self._bonus_words = bonus_words
+
+    def __call__(self, document, sentences_count, weight):
+        significant_words = self._compute_significant_words(document, weight)
+
+        return self._get_best_sentences(document.sentences,
+            sentences_count, self._rate_sentence, significant_words)
+
+    def _compute_significant_words(self, document, weight):
+        # keep only stems contained in bonus words
+        words = map(self.stem_word, document.words)
+        words = filter(self._is_bonus_word, words)
+
+        # compute frequencies of bonus words in document
+        word_counts = Counter(words)
+        word_frequencies = word_counts.values()
+
+        # no frequencies means no significant words
+        if not word_frequencies:
+            return ()
+
+        # return only words greater than weight
+        max_word_frequency = max(word_frequencies)
+        return tuple(word for word, frequency in word_counts.items()
+            if frequency/max_word_frequency > weight)
+
+    def _is_bonus_word(self, word):
+        return word in self._bonus_words
+
+    def _rate_sentence(self, sentence, significant_words):
+        words = map(self.stem_word, sentence.words)
+        return sum(w in significant_words for w in words)
+
+    def rate_sentences(self, document, weight=0.5):
+        significant_words = self._compute_significant_words(document, weight)
+
+        rated_sentences = {}
+        for sentence in document.sentences:
+            rated_sentences[sentence] = self._rate_sentence(sentence,
+                significant_words)
+
+        return rated_sentences
diff --git a/baselines/baseline2/summarize/sumy/summarizers/edmundson_location.py b/baselines/baseline2/summarize/sumy/summarizers/edmundson_location.py
new file mode 100755
index 0000000..406597f
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/edmundson_location.py
@@ -0,0 +1,66 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from itertools import chain
+from operator import attrgetter
+from .._compat import ffilter
+from ._summarizer import AbstractSummarizer
+
+
+class EdmundsonLocationMethod(AbstractSummarizer):
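+    # Location method: a sentence's base rating is the number of heading-derived
+    # significant words it contains, scaled by w_h, plus positional bonuses
+    # w_p1/w_p2 for the first/last paragraph and w_s1/w_s2 for the first/last
+    # sentence of its paragraph.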
+    def __init__(self, stemmer, null_words):
+        super(EdmundsonLocationMethod, self).__init__(stemmer)
+        self._null_words = null_words
+
+    def __call__(self, document, sentences_count, w_h, w_p1, w_p2, w_s1, w_s2):
+        significant_words = self._compute_significant_words(document)
+        ratings = self._rate_sentences(document, significant_words, w_h, w_p1,
+            w_p2, w_s1, w_s2)
+
+        return self._get_best_sentences(document.sentences, sentences_count, ratings)
+
+    def _compute_significant_words(self, document):
+        headings = document.headings
+
+        significant_words = chain(*map(attrgetter("words"), headings))
+        significant_words = map(self.stem_word, significant_words)
+        significant_words = ffilter(self._is_null_word, significant_words)
+
+        return frozenset(significant_words)
+
+    def _is_null_word(self, word):
+        return word in self._null_words
+
+    def _rate_sentences(self, document, significant_words, w_h, w_p1, w_p2, w_s1, w_s2):
+        rated_sentences = {}
+        paragraphs = document.paragraphs
+
+        for paragraph_order, paragraph in enumerate(paragraphs):
+            sentences = paragraph.sentences
+            for sentence_order, sentence in enumerate(sentences):
+                rating = self._rate_sentence(sentence, significant_words)
+                rating *= w_h
+
+                if paragraph_order == 0:
+                    rating += w_p1
+                elif paragraph_order == len(paragraphs) - 1:
+                    rating += w_p2
+
+                if sentence_order == 0:
+                    rating += w_s1
+                elif sentence_order == len(sentences) - 1:
+                    rating += w_s2
+
+                rated_sentences[sentence] = rating
+
+        return rated_sentences
+
+    def _rate_sentence(self, sentence, significant_words):
+        words = map(self.stem_word, sentence.words)
+        return sum(w in significant_words for w in words)
+
+    def rate_sentences(self, document, w_h=1, w_p1=1, w_p2=1, w_s1=1, w_s2=1):
+        significant_words = self._compute_significant_words(document)
+        return self._rate_sentences(document, significant_words, w_h, w_p1, w_p2, w_s1, w_s2)
diff --git a/baselines/baseline2/summarize/sumy/summarizers/edmundson_title.py b/baselines/baseline2/summarize/sumy/summarizers/edmundson_title.py
new file mode 100755
index 0000000..c477920
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/edmundson_title.py
@@ -0,0 +1,48 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from operator import attrgetter
+from itertools import chain
+from .._compat import ffilter
+from ._summarizer import AbstractSummarizer
+
+
+class EdmundsonTitleMethod(AbstractSummarizer):
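+    # Title method: a sentence is rated by how many of its stems occur among the
+    # stemmed heading words (the null-word list is used to filter that vocabulary).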
+    def __init__(self, stemmer, null_words):
+        super(EdmundsonTitleMethod, self).__init__(stemmer)
+        self._null_words = null_words
+
+    def __call__(self, document, sentences_count):
+        sentences = document.sentences
+        significant_words = self._compute_significant_words(document)
+
+        return self._get_best_sentences(sentences, sentences_count,
+            self._rate_sentence, significant_words)
+
+    def _compute_significant_words(self, document):
+        heading_words = map(attrgetter("words"), document.headings)
+
+        significant_words = chain(*heading_words)
+        significant_words = map(self.stem_word, significant_words)
+        significant_words = ffilter(self._is_null_word, significant_words)
+
+        return frozenset(significant_words)
+
+    def _is_null_word(self, word):
+        return word in self._null_words
+
+    def _rate_sentence(self, sentence, significant_words):
+        words = map(self.stem_word, sentence.words)
+        return sum(w in significant_words for w in words)
+
+    def rate_sentences(self, document):
+        significant_words = self._compute_significant_words(document)
+
+        rated_sentences = {}
+        for sentence in document.sentences:
+            rated_sentences[sentence] = self._rate_sentence(sentence,
+                significant_words)
+
+        return rated_sentences
diff --git a/baselines/baseline2/summarize/sumy/summarizers/kl.py b/baselines/baseline2/summarize/sumy/summarizers/kl.py
new file mode 100755
index 0000000..107bd9f
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/kl.py
@@ -0,0 +1,147 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+import math
+
+from ._summarizer import AbstractSummarizer
+
+
+class KLSummarizer(AbstractSummarizer):
+    """
+    Method that greedily adds sentences to a summary so long as it decreases the 
+    KL Divergence.
+    Source: http://www.aclweb.org/anthology/N09-1041
+    """
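+    # Greedy selection: each pass moves into the summary the sentence whose addition
+    # yields the smallest KL divergence between the summary's word distribution and
+    # the document's term frequencies; ratings record the negated selection order.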
+
+    stop_words = frozenset()
+
+    def __call__(self, document, sentences_count):
+        ratings = self._get_ratings(document)
+        return self._get_best_sentences(document.sentences, sentences_count, ratings)
+
+    def _get_ratings(self, document):
+        sentences = document.sentences
+
+        ratings = self._compute_ratings(sentences)
+        return ratings
+
+    def _get_all_words_in_doc(self, sentences):
+        return [w for s in sentences for w in s.words]
+
+    def _get_content_words_in_sentence(self, sentence):
+        normalized_words = self._normalize_words(sentence.words)   
+        normalized_content_words = self._filter_out_stop_words(normalized_words)
+        return normalized_content_words
+
+    def _normalize_words(self, words):
+        return [self.normalize_word(w) for w in words]
+
+    def _filter_out_stop_words(self, words):
+        return [w for w in words if w not in self.stop_words]
+
+    def _compute_word_freq(self, list_of_words):
+        word_freq = {}
+        for w in list_of_words:
+            word_freq[w] = word_freq.get(w, 0) + 1
+        return word_freq
+
+    def _get_all_content_words_in_doc(self, sentences):
+        all_words = self._get_all_words_in_doc(sentences)
+        content_words = self._filter_out_stop_words(all_words)
+        normalized_content_words = self._normalize_words(content_words)
+        return normalized_content_words
+        
+    def compute_tf(self, sentences):
+        """
+        Computes the normalized term frequency as explained in http://www.tfidf.com/
+
+        :type sentences: [sumy.models.dom.Sentence]
+        """
+        content_words = self._get_all_content_words_in_doc(sentences)
+        content_words_count = len(content_words)
+        content_words_freq = self._compute_word_freq(content_words)
+        content_word_tf = dict((w, f / content_words_count) for w, f in content_words_freq.items())
+        return content_word_tf
+
+    def _joint_freq(self, word_list_1, word_list_2):
+        # combined length of the word lists
+        total_len = len(word_list_1) + len(word_list_2)
+
+        # word frequencies within each list
+        wc1 = self._compute_word_freq(word_list_1)
+        wc2 = self._compute_word_freq(word_list_2)
+
+        # inputs the counts from the first list
+        joint = wc1.copy()
+
+        # adds in the counts of the second list
+        for k in wc2:
+            if k in joint: 
+                joint[k] += wc2[k]
+            else:
+                joint[k] = wc2[k]
+
+        # divides total counts by the combined length
+        for k in joint:
+            joint[k] /= float(total_len)
+
+        return joint
+
+    def _kl_divergence(self, summary_freq, doc_freq):
+        """
+        Note: scipy.stats.entropy(doc_freq, summary_freq) would give an equivalent
+        value, but this implementation avoids the extra dependency.
+        """
+        sum_val = 0
+        for w in summary_freq:
+            frequency = doc_freq.get(w)
+            if frequency:  # missing or zero = no frequency
+                sum_val += frequency * math.log(frequency / summary_freq[w])
+
+        return sum_val
+
+    def _find_index_of_best_sentence(self, kls):
+        """
+        the best sentence is the one with the smallest kl_divergence
+        """
+        return kls.index(min(kls))
+
+    def _compute_ratings(self, sentences):
+        word_freq = self.compute_tf(sentences)
+        ratings = {}
+        summary = []
+
+        # make it a list so that it can be modified
+        sentences_list = list(sentences)
+
+        # get all content words once for efficiency
+        sentences_as_words = [self._get_content_words_in_sentence(s) for s in sentences]
+        
+        # Removes one sentence per iteration by adding to summary
+        while len(sentences_list) > 0:
+            # will store all the kls values for this pass
+            kls = []
+            
+            # converts summary to word list
+            summary_as_word_list = self._get_all_words_in_doc(summary)
+            
+            for s in sentences_as_words:
+                # calculates the joint frequency through combining the word lists
+                joint_freq = self._joint_freq(s, summary_as_word_list)
+
+                # adds the calculated kl divergence to the list in index = sentence used
+                kls.append(self._kl_divergence(joint_freq, word_freq))
+
+            # pick the sentence with the lowest KL divergence and add it to the summary
+            indexToRemove = self._find_index_of_best_sentence(kls)
+            best_sentence = sentences_list.pop(indexToRemove)
+            del sentences_as_words[indexToRemove]
+            summary.append(best_sentence)
+
+            # the rating is the removal iteration multiplied by -1, so the
+            # sentences removed first (the most important ones) get the highest values
+            ratings[best_sentence] = -1 * len(ratings)
+
+        return ratings
+
diff --git a/baselines/baseline2/summarize/sumy/summarizers/lex_rank.py b/baselines/baseline2/summarize/sumy/summarizers/lex_rank.py
new file mode 100755
index 0000000..d1f06c5
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/lex_rank.py
@@ -0,0 +1,147 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+import math
+
+try:
+    import numpy
+except ImportError:
+    numpy = None
+
+from ._summarizer import AbstractSummarizer
+from .._compat import Counter
+
+
+class LexRankSummarizer(AbstractSummarizer):
+    """
+    LexRank: Graph-based Centrality as Salience in Text Summarization
+    Source: http://tangra.si.umich.edu/~radev/lexrank/lexrank.pdf
+    """
+    threshold = 0.1
+    epsilon = 0.1
+    _stop_words = frozenset()
+
+    @property
+    def stop_words(self):
+        return self._stop_words
+
+    @stop_words.setter
+    def stop_words(self, words):
+        self._stop_words = frozenset(map(self.normalize_word, words))
+
+    def __call__(self, document, sentences_count):
+        self._ensure_dependencies_installed()
+
+        sentences_words = [self._to_words_set(s) for s in document.sentences]
+        if not sentences_words:
+            return tuple()
+
+        tf_metrics = self._compute_tf(sentences_words)
+        idf_metrics = self._compute_idf(sentences_words)
+
+        matrix = self._create_matrix(sentences_words, self.threshold, tf_metrics, idf_metrics)
+        scores = self.power_method(matrix, self.epsilon)
+        ratings = dict(zip(document.sentences, scores))
+
+        return self._get_best_sentences(document.sentences, sentences_count, ratings)
+
+    @staticmethod
+    def _ensure_dependencies_installed():
+        if numpy is None:
+            raise ValueError("LexRank summarizer requires NumPy. Please install it with 'pip install numpy'.")
+
+    def _to_words_set(self, sentence):
+        words = map(self.normalize_word, sentence.words)
+        return [self.stem_word(w) for w in words if w not in self._stop_words]
+
+    def _compute_tf(self, sentences):
+        tf_values = map(Counter, sentences)
+
+        tf_metrics = []
+        for sentence in tf_values:
+            metrics = {}
+            max_tf = self._find_tf_max(sentence)
+
+            for term, tf in sentence.items():
+                metrics[term] = tf / max_tf
+
+            tf_metrics.append(metrics)
+
+        return tf_metrics
+
+    @staticmethod
+    def _find_tf_max(terms):
+        return max(terms.values()) if terms else 1
+
+    @staticmethod
+    def _compute_idf(sentences):
+        idf_metrics = {}
+        sentences_count = len(sentences)
+
+        for sentence in sentences:
+            for term in sentence:
+                if term not in idf_metrics:
+                    n_j = sum(1 for s in sentences if term in s)
+                    idf_metrics[term] = math.log(sentences_count / (1 + n_j))
+
+        return idf_metrics
+
+    def _create_matrix(self, sentences, threshold, tf_metrics, idf_metrics):
+        """
+        Creates matrix of shape |sentences|×|sentences|.
+        """
+        # create matrix |sentences|×|sentences| filled with zeroes
+        sentences_count = len(sentences)
+        matrix = numpy.zeros((sentences_count, sentences_count))
+        degrees = numpy.zeros((sentences_count, ))
+
+        for row, (sentence1, tf1) in enumerate(zip(sentences, tf_metrics)):
+            for col, (sentence2, tf2) in enumerate(zip(sentences, tf_metrics)):
+                matrix[row, col] = self._compute_cosine(sentence1, sentence2, tf1, tf2, idf_metrics)
+
+                if matrix[row, col] > threshold:
+                    matrix[row, col] = 1.0
+                    degrees[row] += 1
+                else:
+                    matrix[row, col] = 0
+
+        for row in range(sentences_count):
+            for col in range(sentences_count):
+                if degrees[row] == 0:
+                    degrees[row] = 1
+
+                matrix[row][col] = matrix[row][col] / degrees[row]
+
+        return matrix
+
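+    # idf-modified cosine similarity between two sentences:
+    # sim(x, y) = sum_w tf_x(w) * tf_y(w) * idf(w)^2
+    #             / (||tf_x * idf|| * ||tf_y * idf||)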
+    @staticmethod
+    def _compute_cosine(sentence1, sentence2, tf1, tf2, idf_metrics):
+        common_words = frozenset(sentence1) & frozenset(sentence2)
+
+        numerator = 0.0
+        for term in common_words:
+            numerator += tf1[term]*tf2[term] * idf_metrics[term]**2
+
+        denominator1 = sum((tf1[t]*idf_metrics[t])**2 for t in sentence1)
+        denominator2 = sum((tf2[t]*idf_metrics[t])**2 for t in sentence2)
+
+        if denominator1 > 0 and denominator2 > 0:
+            return numerator / (math.sqrt(denominator1) * math.sqrt(denominator2))
+        else:
+            return 0.0
+
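+    # Power iteration: starting from a uniform vector, repeatedly apply the
+    # transposed (row-normalised) adjacency matrix until successive vectors differ
+    # by less than `epsilon` in Euclidean norm; the fixed point gives the scores.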
+    @staticmethod
+    def power_method(matrix, epsilon):
+        transposed_matrix = matrix.T
+        sentences_count = len(matrix)
+        p_vector = numpy.array([1.0 / sentences_count] * sentences_count)
+        lambda_val = 1.0
+
+        while lambda_val > epsilon:
+            next_p = numpy.dot(transposed_matrix, p_vector)
+            lambda_val = numpy.linalg.norm(numpy.subtract(next_p, p_vector))
+            p_vector = next_p
+
+        return p_vector
diff --git a/baselines/baseline2/summarize/sumy/summarizers/lsa.py b/baselines/baseline2/summarize/sumy/summarizers/lsa.py
new file mode 100755
index 0000000..2a29a90
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/lsa.py
@@ -0,0 +1,122 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+import math
+
+from warnings import warn
+
+try:
+    import numpy
+except ImportError:
+    numpy = None
+
+try:
+    from numpy.linalg import svd as singular_value_decomposition
+except ImportError:
+    singular_value_decomposition = None
+from ._summarizer import AbstractSummarizer
+
+
+class LsaSummarizer(AbstractSummarizer):
+    MIN_DIMENSIONS = 3
+    REDUCTION_RATIO = 1/1
+    _stop_words = frozenset()
+
+    @property
+    def stop_words(self):
+        return self._stop_words
+
+    @stop_words.setter
+    def stop_words(self, words):
+        self._stop_words = frozenset(map(self.normalize_word, words))
+
+    def __call__(self, document, sentences_count):
+        self._ensure_dependencies_installed()
+
+        dictionary = self._create_dictionary(document)
+        # empty document
+        if not dictionary:
+            return ()
+
+        matrix = self._create_matrix(document, dictionary)
+        matrix = self._compute_term_frequency(matrix)
+        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
+
+        ranks = iter(self._compute_ranks(sigma, v))
+        return self._get_best_sentences(document.sentences, sentences_count,
+            lambda s: next(ranks))
+
+    def _ensure_dependencies_installed(self):
+        if numpy is None:
+            raise ValueError("LSA summarizer requires NumPy. Please install it with 'pip install numpy'.")
+
+    def _create_dictionary(self, document):
+        """Creates mapping key = word, value = row index"""
+        words = map(self.normalize_word, document.words)
+        unique_words = frozenset(self.stem_word(w) for w in words if w not in self._stop_words)
+
+        return dict((w, i) for i, w in enumerate(unique_words))
+
+    def _create_matrix(self, document, dictionary):
+        """
+        Creates matrix of shape |unique words|×|sentences| where cells
+        contain the number of occurrences of words (rows) in sentences (cols).
+        """
+        sentences = document.sentences
+
+        words_count = len(dictionary)
+        sentences_count = len(sentences)
+        if words_count < sentences_count:
+            message = (
+                "Number of words (%d) is lower than number of sentences (%d). "
+                "LSA algorithm may not work properly."
+            )
+            warn(message % (words_count, sentences_count))
+
+        # create matrix |unique words|×|sentences| filled with zeroes
+        matrix = numpy.zeros((words_count, sentences_count))
+        for col, sentence in enumerate(sentences):
+            for word in map(self.stem_word, sentence.words):
+                # only valid words are counted (not stop-words, ...)
+                if word in dictionary:
+                    row = dictionary[word]
+                    matrix[row, col] += 1
+
+        return matrix
+
+    def _compute_term_frequency(self, matrix, smooth=0.4):
+        """
+        Computes TF metrics for each sentence (column) in the given matrix.
+        You can read more about smoothing parameter at URL below:
+        http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
+        """
+        assert 0.0 <= smooth < 1.0
+
+        max_word_frequencies = numpy.max(matrix, axis=0)
+        rows, cols = matrix.shape
+        for row in range(rows):
+            for col in range(cols):
+                max_word_frequency = max_word_frequencies[col]
+                if max_word_frequency != 0:
+                    frequency = matrix[row, col]/max_word_frequency
+                    matrix[row, col] = smooth + (1.0 - smooth)*frequency
+
+        return matrix
+
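+    # Sentence rank = sqrt(sum_k sigma_k^2 * v_k^2), where the sum runs over the
+    # retained singular dimensions (at least MIN_DIMENSIONS, scaled by
+    # REDUCTION_RATIO) and v_k are the entries of the sentence's column in V.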
+    def _compute_ranks(self, sigma, v_matrix):
+        assert len(sigma) == v_matrix.shape[0], "Matrices should be multiplicable"
+
+        dimensions = max(LsaSummarizer.MIN_DIMENSIONS,
+            int(len(sigma)*LsaSummarizer.REDUCTION_RATIO))
+        powered_sigma = tuple(s**2 if i < dimensions else 0.0
+            for i, s in enumerate(sigma))
+
+        ranks = []
+        # iterate over columns of matrix (rows of transposed matrix)
+        for column_vector in v_matrix.T:
+            rank = sum(s*v**2 for s, v in zip(powered_sigma, column_vector))
+            ranks.append(math.sqrt(rank))
+
+        return ranks
diff --git a/baselines/baseline2/summarize/sumy/summarizers/luhn.py b/baselines/baseline2/summarize/sumy/summarizers/luhn.py
new file mode 100755
index 0000000..e16750e
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/luhn.py
@@ -0,0 +1,85 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from ..models import TfDocumentModel
+from ._summarizer import AbstractSummarizer
+
+
+class LuhnSummarizer(AbstractSummarizer):
+    max_gap_size = 4
+    # TODO: better recognition of significant words (automatic)
+    significant_percentage = 1
+    _stop_words = frozenset()
+
+    @property
+    def stop_words(self):
+        return self._stop_words
+
+    @stop_words.setter
+    def stop_words(self, words):
+        self._stop_words = frozenset(map(self.normalize_word, words))
+
+    def __call__(self, document, sentences_count):
+        words = self._get_significant_words(document.words)
+        return self._get_best_sentences(document.sentences,
+            sentences_count, self.rate_sentence, words)
+
+    def _get_significant_words(self, words):
+        words = map(self.normalize_word, words)
+        words = tuple(self.stem_word(w) for w in words if w not in self._stop_words)
+
+        model = TfDocumentModel(words)
+
+        # take only best `significant_percentage` % words
+        best_words_count = int(len(words) * self.significant_percentage)
+        words = model.most_frequent_terms(best_words_count)
+
+        # take only words contained multiple times in document
+        return tuple(t for t in words if model.term_frequency(t) > 1)
+
+    def rate_sentence(self, sentence, significant_stems):
+        ratings = self._get_chunk_ratings(sentence, significant_stems)
+        return max(ratings) if ratings else 0
+
+    def _get_chunk_ratings(self, sentence, significant_stems):
+        chunks = []
+        NONSIGNIFICANT_CHUNK = [0]*self.max_gap_size
+
+        in_chunk = False
+        for order, word in enumerate(sentence.words):
+            stem = self.stem_word(word)
+            # new chunk
+            if stem in significant_stems and not in_chunk:
+                in_chunk = True
+                chunks.append([1])
+            # append word to chunk
+            elif in_chunk:
+                is_significant_word = int(stem in significant_stems)
+                chunks[-1].append(is_significant_word)
+
+            # end of chunk
+            if chunks and chunks[-1][-self.max_gap_size:] == NONSIGNIFICANT_CHUNK:
+                in_chunk = False
+
+        return tuple(map(self._get_chunk_rating, chunks))
+
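+    # Chunk score = (number of significant words)^2 / chunk length after trailing
+    # zeros are dropped, e.g. the chunk [1, 0, 1, 1] scores 3**2 / 4 = 2.25;
+    # chunks with a single significant word score 0.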
+    def _get_chunk_rating(self, chunk):
+        chunk = self.__remove_trailing_zeros(chunk)
+        words_count = len(chunk)
+        assert words_count > 0
+
+        significant_words = sum(chunk)
+        if significant_words == 1:
+            return 0
+        else:
+            return significant_words**2 / words_count
+
+    def __remove_trailing_zeros(self, collection):
+        """Removes trailing zeroes from indexable collection of numbers"""
+        index = len(collection) - 1
+        while index >= 0 and collection[index] == 0:
+            index -= 1
+
+        return collection[:index + 1]
diff --git a/baselines/baseline2/summarize/sumy/summarizers/random.py b/baselines/baseline2/summarize/sumy/summarizers/random.py
new file mode 100755
index 0000000..badb439
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/random.py
@@ -0,0 +1,24 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+import random
+
+from ._summarizer import AbstractSummarizer
+
+
+class RandomSummarizer(AbstractSummarizer):
+    """Summarizer that picks sentences randomly."""
+
+    def __call__(self, document, sentences_count):
+        sentences = document.sentences
+        ratings = self._get_random_ratings(sentences)
+
+        return self._get_best_sentences(sentences, sentences_count, ratings)
+
+    def _get_random_ratings(self, sentences):
+        ratings = list(range(len(sentences)))
+        random.shuffle(ratings)
+
+        return dict((s, r) for s, r in zip(sentences, ratings))
diff --git a/baselines/baseline2/summarize/sumy/summarizers/sum_basic.py b/baselines/baseline2/summarize/sumy/summarizers/sum_basic.py
new file mode 100755
index 0000000..fce3d81
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/sum_basic.py
@@ -0,0 +1,109 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+import math
+
+from ._summarizer import AbstractSummarizer
+from ..utils import get_stop_words
+
+
+class SumBasicSummarizer(AbstractSummarizer):
+    """
+    SumBasic: a frequency-based summarization system that adjusts word frequencies as 
+    sentences are extracted.
+    Source: http://www.cis.upenn.edu/~nenkova/papers/ipm.pdf
+
+    """
+
+    def __call__(self, document, sentences_count):
+        sentences = document.sentences
+        ratings = self._compute_ratings(sentences)
+        return self._get_best_sentences(document.sentences, sentences_count, ratings)
+
+    def _get_all_words_in_doc(self, sentences):
+        return [w for s in sentences for w in s.words]
+
+    def _get_content_words_in_sentence(self, sentence):
+        normalized_words = self._normalize_words(sentence.words)   
+        normalized_content_words = self._filter_out_stop_words(normalized_words)
+        return normalized_content_words
+
+    def _normalize_words(self, words):
+        return [self.normalize_word(w) for w in words]
+
+    def _filter_out_stop_words(self, words):
+        return [w for w in words if w not in self.stop_words]
+
+    def _compute_word_freq(self, list_of_words):
+        word_freq = {}
+        for w in list_of_words:
+            word_freq[w] = word_freq.get(w, 0) + 1
+        return word_freq
+
+    def _get_all_content_words_in_doc(self, sentences):
+        all_words = self._get_all_words_in_doc(sentences)
+        content_words = self._filter_out_stop_words(all_words)
+        normalized_content_words = self._normalize_words(content_words)
+        return normalized_content_words
+
+    def _compute_tf(self, sentences):
+        '''
+        Computes the normalized term frequency as explained in http://www.tfidf.com/
+        '''
+        content_words = self._get_all_content_words_in_doc(sentences)
+        content_words_count = len(content_words)
+        content_words_freq = self._compute_word_freq(content_words)
+        content_word_tf = dict((k, v / content_words_count) for (k, v) in content_words_freq.items())
+        return content_word_tf
+
+    def _compute_average_probability_of_words(self, word_freq_in_doc, content_words_in_sentence):
+        content_words_count = len(content_words_in_sentence)
+        if content_words_count > 0:
+            word_freq_sum = sum([word_freq_in_doc[w] for w in content_words_in_sentence])
+            word_freq_avg = word_freq_sum / content_words_count
+            return word_freq_avg
+        else: 
+            return 0
+
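+    # SumBasic update: after a sentence is selected, the probability of each of
+    # its content words is squared (p <- p * p), which discourages picking
+    # sentences that repeat already-covered words.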
+    def _update_tf(self, word_freq, words_to_update):
+        for w in words_to_update:
+            word_freq[w] *= word_freq[w]
+        return word_freq
+
+
+    def _find_index_of_best_sentence(self, word_freq, sentences_as_words):
+        min_possible_freq = -1
+        max_value = min_possible_freq
+        best_sentence_index = 0
+        for i, words in enumerate(sentences_as_words):
+            word_freq_avg = self._compute_average_probability_of_words(word_freq, words)
+            if (word_freq_avg > max_value): 
+                max_value = word_freq_avg
+                best_sentence_index = i
+        return best_sentence_index
+
+
+    def _compute_ratings(self, sentences):
+        word_freq = self._compute_tf(sentences)
+        ratings = {}
+        
+        # make it a list so that it can be modified
+        sentences_list = list(sentences)
+
+        # get all content words once for efficiency
+        sentences_as_words = [self._get_content_words_in_sentence(s) for s in sentences]
+        
+        # Removes one sentence per iteration by adding to summary
+        while len(sentences_list) > 0:
+            best_sentence_index = self._find_index_of_best_sentence(word_freq, sentences_as_words)
+            best_sentence = sentences_list.pop(best_sentence_index)
+
+            # the rating is the removal iteration multiplied by -1, so the
+            # sentences removed first (the most important ones) get the highest values
+            ratings[best_sentence] = -1 * len(ratings)
+
+            # update probabilities
+            best_sentence_words = sentences_as_words.pop(best_sentence_index)
+            self._update_tf(word_freq, best_sentence_words)
+
+        return ratings
\ No newline at end of file
diff --git a/baselines/baseline2/summarize/sumy/summarizers/text_rank.py b/baselines/baseline2/summarize/sumy/summarizers/text_rank.py
new file mode 100755
index 0000000..488bd24
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/summarizers/text_rank.py
@@ -0,0 +1,56 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+import math
+
+from itertools import combinations
+from collections import defaultdict
+from ._summarizer import AbstractSummarizer
+
+
+class TextRankSummarizer(AbstractSummarizer):
+    """Source: https://github.com/adamfabish/Reduction"""
+
+    _stop_words = frozenset()
+
+    @property
+    def stop_words(self):
+        return self._stop_words
+
+    @stop_words.setter
+    def stop_words(self, words):
+        self._stop_words = frozenset(map(self.normalize_word, words))
+
+    def __call__(self, document, sentences_count):
+        ratings = self.rate_sentences(document)
+        return self._get_best_sentences(document.sentences, sentences_count, ratings)
+
+    def rate_sentences(self, document):
+        sentences_words = [(s, self._to_words_set(s)) for s in document.sentences]
+        ratings = defaultdict(float)
+
+        for (sentence1, words1), (sentence2, words2) in combinations(sentences_words, 2):
+            rank = self._rate_sentences_edge(words1, words2)
+            ratings[sentence1] += rank
+            ratings[sentence2] += rank
+
+        return ratings
+
+    def _to_words_set(self, sentence):
+        words = map(self.normalize_word, sentence.words)
+        return [self.stem_word(w) for w in words if w not in self._stop_words]
+
+    def _rate_sentences_edge(self, words1, words2):
+        rank = 0
+        for w1 in words1:
+            for w2 in words2:
+                rank += int(w1 == w2)
+
+        if rank == 0:
+            return 0.0
+
+        assert len(words1) > 0 and len(words2) > 0
+        norm = math.log(len(words1)) + math.log(len(words2))
+        return 0.0 if norm == 0.0 else rank / norm
diff --git a/baselines/baseline2/summarize/sumy/utils.py b/baselines/baseline2/summarize/sumy/utils.py
new file mode 100755
index 0000000..189a531
--- /dev/null
+++ b/baselines/baseline2/summarize/sumy/utils.py
@@ -0,0 +1,92 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+import sys
+import requests
+import pkgutil
+
+from functools import wraps
+from contextlib import closing
+from os.path import dirname, abspath, join, exists
+from . import __version__
+from ._compat import to_string, to_unicode, string_types
+
+import sys, os.path as path
+path_dir = path.dirname(path.abspath(__file__))
+
+_HTTP_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36 OPR/31.0.1889.174",
+    # "User-Agent": "Sumy (Automatic text summarizer) Version/%s" % __version__,
+}
+
+
+def fetch_url(url):
+    with closing(requests.get(url, headers=_HTTP_HEADERS)) as response:
+        response.raise_for_status()
+        return response.content
+
+
+def cached_property(getter):
+    """
+    Decorator that converts a method into memoized property.
+    The decorator works as expected only for classes with
+    attribute '__dict__' and immutable properties.
+    """
+    @wraps(getter)
+    def decorator(self):
+        key = "_cached_property_" + getter.__name__
+
+        if not hasattr(self, key):
+            setattr(self, key, getter(self))
+
+        return getattr(self, key)
+
+    return property(decorator)
+
+
+def expand_resource_path(path):
+    directory = dirname(sys.modules["sumy"].__file__)
+    directory = abspath(directory)
+    return join(directory, to_string("data"), to_string(path))
+
+
+def get_stop_words(language):
+    try:
+        stopwords_data = pkgutil.get_data("sumy", "data/stopwords/%s.txt" % (language))
+    except IOError as e:
+        raise LookupError("Stop-words are not available for language %s." % language)
+    return parse_stop_words(stopwords_data)
+
+
+def read_stop_words(filename):
+    with open(filename, "rb") as open_file:
+        return parse_stop_words(open_file.read())
+
+
+def parse_stop_words(data):
+    return frozenset(w.rstrip() for w in to_unicode(data).splitlines() if w)
+
+
+class ItemsCount(object):
+    def __init__(self, value):
+        self._value = value
+
+    def __call__(self, sequence):
+        if isinstance(self._value, string_types):
+            if self._value.endswith("%"):
+                total_count = len(sequence)
+                percentage = int(self._value[:-1])
+                # at least one sentence should be chosen
+                count = max(1, total_count*percentage // 100)
+                return sequence[:count]
+            else:
+                return sequence[:int(self._value)]
+        elif isinstance(self._value, (int, float)):
+            return sequence[:int(self._value)]
+        else:
+            raise ValueError("Unsupported value of items count '%s'." % self._value)
+
+    def __repr__(self):
+        return to_string("<ItemsCount: %r>" % self._value)
diff --git a/baselines/baseline2/summarize/upper_bound.py b/baselines/baseline2/summarize/upper_bound.py
new file mode 100755
index 0000000..8015c0e
--- /dev/null
+++ b/baselines/baseline2/summarize/upper_bound.py
@@ -0,0 +1,243 @@
+import pulp
+import numpy as np
+
+import sys
+import os
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem.snowball import SnowballStemmer
+from rouge import Rouge
+import argparse
+import logging
+
+logger = logging.getLogger(__name__)
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from utils.misc import mkdirp
+from utils.misc import set_logger
+from utils.data_helpers import extract_ngrams
+from utils.data_helpers import untokenize
+from utils.data_helpers import load_data
+
+
+class Sentence:
+    """The sentence data structure.
+    Args: 
+        tokens (list of str): the list of word tokens.
+        doc_id (str): the identifier of the document from which the sentence
+          comes.
+        position (int): the position of the sentence in the source document.
+    """
+
+    def __init__(self, tokens, doc_id, position):
+        self.tokens = tokens
+        """ tokens as a list. """
+
+        self.doc_id = doc_id
+        """ document identifier of the sentence. """
+
+        self.position = position
+        """ position of the sentence within the document. """
+
+        self.concepts = []
+        """ concepts of the sentence. """
+
+        self.untokenized_form = ''
+        """ untokenized form of the sentence. """
+
+        self.length = 0
+        """ length of the untokenized sentence. """
+
+
+class ExtractiveUpperbound():
+    def __init__(self, language):
+        self.sentences = []
+        self.docs = []
+        self.models = []
+        self.doc_sent_dict = {}
+        self.ref_ngrams = []
+        self.LANGUAGE = language
+        self.stemmer = SnowballStemmer(self.LANGUAGE)
+        self.stoplist = set(stopwords.words(self.LANGUAGE))
+
+    def __call__(self, docs, models, length, ngram_type=2):
+        self.sum_length = int(length)
+        self.load_data(docs, models)
+        self.get_ref_ngrams(ngram_type)
+
+        self.sentences_idx = range(len(self.sentences))
+        self.ref_ngrams_idx = range(len(self.ref_ngrams))
+
+        summary_idx = self.solve_ilp(ngram_type)
+        summary_txt = self.get_summary_text(summary_idx)
+
+        return summary_txt
+
+    def load_data(self, docs, models):
+        '''
+        Load the data into
+            :doc_sent_dict
+            :sentences
+
+        Parameters:
+        docs: list of documents, each represented as a list of sentences
+            [['sent1','sent2','sent3'],['sent1','sent2','sent3']]
+        models: list of reference summaries, each represented as a list of sentences
+            [['sent1','sent2','sent3'], ['sent1','sent2','sent3']]
+
+        '''
+        self.docs = docs
+        self.models = models
+        self.sentences = []
+        self.doc_sent_dict = {}
+
+        doc_id = 0
+        for doc_id, doc in enumerate(docs):
+            doc_sents = doc
+            total = len(self.sentences)
+            for sent_id, sentence in enumerate(doc_sents):
+                token_sentence = word_tokenize(sentence, self.LANGUAGE)
+                sentence_s = Sentence(token_sentence, doc_id, sent_id + 1)
+
+                untokenized_form = untokenize(token_sentence)
+                sentence_s.untokenized_form = untokenized_form
+                sentence_s.length = len(untokenized_form.split(' '))
+                self.doc_sent_dict[total + sent_id] = "%s_%s" % (str(doc_id), str(sent_id))
+                self.sentences.append(sentence_s)
+
+    def get_ref_ngrams(self, N):
+        for summary in self.models:
+            self.ref_ngrams.extend(extract_ngrams(summary, self.stoplist, self.stemmer, self.LANGUAGE, N))
+
+    def get_summary_text(self, summary_idx):
+        return [self.sentences[idx].untokenized_form for idx in summary_idx]
+
+    def solve_ilp(self, N):
+        # build the A matrix: a_ij is 1 if j-th gram appears in the i-th sentence
+
+        A = np.zeros((len(self.sentences_idx), len(self.ref_ngrams_idx)))
+        for i in self.sentences_idx:
+            sent = self.sentences[i].untokenized_form
+            sngrams = list(extract_ngrams([sent], self.stoplist, self.stemmer, self.LANGUAGE, N))
+            for j in self.ref_ngrams_idx:
+                if self.ref_ngrams[j] in sngrams:
+                    A[i][j] = 1
+
+        # Define ILP variable, x_i is 1 if sentence i is selected, z_j is 1 if gram j appears in the created summary
+        x = pulp.LpVariable.dicts('sentences', self.sentences_idx, lowBound=0, upBound=1, cat=pulp.LpInteger)
+        z = pulp.LpVariable.dicts('grams', self.ref_ngrams_idx, lowBound=0, upBound=1, cat=pulp.LpInteger)
+
+        # Define ILP problem, maximum coverage of grams from the reference summaries
+        prob = pulp.LpProblem("ExtractiveUpperBound", pulp.LpMaximize)
+        prob += pulp.lpSum(z[j] for j in self.ref_ngrams_idx)
+
+        # Define ILP constraints, length constraint and consistency constraint (impose that z_j is 1 if j
+        # appears in the created summary)
+        prob += pulp.lpSum(x[i] * self.sentences[i].length for i in self.sentences_idx) <= self.sum_length
+
+        for j in self.ref_ngrams_idx:
+            prob += pulp.lpSum(A[i][j] * x[i] for i in self.sentences_idx) >= z[j]
+
+        # Solve ILP problem and post-processing to get the summary
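+        # CPLEX is tried first; if it is unavailable, fall back to GLPK (this
+        # assumes the glpsol binary is installed for the fallback to work).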
+        try:
+            prob.solve(pulp.CPLEX(msg=0))
+        except Exception:
+            prob.solve(pulp.GLPK(msg=0))
+
+        summary_idx = []
+        for idx in self.sentences_idx:
+            if x[idx].value() == 1.0:
+                summary_idx.append(idx)
+
+        return summary_idx
+
+
+def get_args():
+    '''This function parses and returns the arguments passed in'''
+
+    parser = argparse.ArgumentParser(description='Upper Bound for Summarization')
+    # -- summary_len: 100, 200, 400
+    parser.add_argument('-s', '--summary_size', type=str, help='Summary Length', required=False)
+
+    # --data_set: DUC2001, DUC2002, DUC2004
+    parser.add_argument('-d', '--data_set', type=str, help='Data set ex: DUC2004', required=True)
+
+    # --language: english, german
+    parser.add_argument('-l', '--language', type=str, help='Language: english, german', required=False,
+                        default='english')
+
+    parser.add_argument('-io', '--iobasedir', type=str, help='IO base directory', required=False,
+                        default=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data"))
+    args = parser.parse_args()
+
+    return args
+
+
+def print_scores(algo_name, summary_sents, refs, rouge):
+    hyps, refs = map(list, zip(*[[' '.join(summary_sents), ' '.join(model)] for model in refs]))
+    score = rouge.get_scores(hyps, refs, avg=True)
+    logger.info('%s: ROUGE-1: %4f %4f %4f, ROUGE-2: %4f %4f %4f, ROUGE-L: %4f %4f %4f' % (algo_name, \
+                                                                                          score['rouge-1']['f'],
+                                                                                          score['rouge-1']['p'],
+                                                                                          score['rouge-1']['r'], \
+                                                                                          score['rouge-2']['f'],
+                                                                                          score['rouge-2']['p'],
+                                                                                          score['rouge-2']['r'], \
+                                                                                          score['rouge-l']['f'],
+                                                                                          score['rouge-l']['p'],
+                                                                                          score['rouge-l']['r']))
+
+    scores = [score['rouge-1']['f'], score['rouge-1']['p'], score['rouge-1']['r'], \
+              score['rouge-2']['f'], score['rouge-2']['p'], score['rouge-2']['r'], \
+              score['rouge-l']['f'], score['rouge-l']['p'], score['rouge-l']['r']]
+
+    return scores
+
+
+def get_summary_scores(algo, docs, refs, summary_size, language, rouge):
+    if algo == 'UB1':
+        UB = ExtractiveUpperbound(language)
+        summary = UB(docs, refs, summary_size, ngram_type=1)
+    if algo == 'UB2':
+        UB = ExtractiveUpperbound(language)
+        summary = UB(docs, refs, summary_size, ngram_type=2)
+
+    print_scores(algo, summary, refs, rouge)
+
+
+def main():
+    args = get_args()
+    rouge = Rouge()
+    data_path = os.path.join(args.iobasedir, 'processed/downloads', args.data_set)
+    log_path = os.path.join(args.iobasedir, 'logs')
+    log_file = os.path.join(args.iobasedir, 'logs', 'UB.log')
+    mkdirp(log_path)
+    set_logger(log_file)
+
+    for filename in os.listdir(data_path):
+        data_file = os.path.join(data_path, filename)
+        topic = filename[:-5]
+
+        docs, refs = load_data(data_file)
+        if not refs:
+            continue
+
+        if not args.summary_size:
+            summary_size = len(' '.join(refs[0]).split(' '))
+        else:
+            summary_size = int(args.summary_size)
+
+        logger.info('Topic ID: %s ', topic)
+        logger.info('###')
+        logger.info('Summary_len: %d', summary_size)
+
+        algos = ['UB1', 'UB2']
+        for algo in algos:
+            get_summary_scores(algo, docs, refs, summary_size, args.language, rouge)
+
+        logger.info('###')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/baselines/baseline2/utils/__init__.py b/baselines/baseline2/utils/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/baselines/baseline2/utils/aggregate_baselines.py b/baselines/baseline2/utils/aggregate_baselines.py
new file mode 100755
index 0000000..ef661d3
--- /dev/null
+++ b/baselines/baseline2/utils/aggregate_baselines.py
@@ -0,0 +1,83 @@
+import sys, os.path as path
+
+sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))
+import argparse
+import numpy as np
+import re
+
+
+def get_vals(score):
+    vals = score.split(' ')
+    return [float(val) for val in vals]
+
+
+def get_scores(text):
+    pattern = re.search(' ([^\s:]+): ROUGE-1: ([^,]+), ROUGE-2: ([^,]+), ROUGE-L: ([^\n]+)', text)
+    system = pattern.group(1)
+    scores = []
+    for index in range(2, 5):
+        scores.append(get_vals(pattern.group(index)))
+    return system, scores
+
+
+def aggregate(file_name):
+    systems = ['UB1', 'UB2', 'Luhn', 'LexRank', 'TextRank', 'LSA']
+    scores_rtype = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
+    scores_measures = ['F', 'P', 'R']
+
+    baselines = []
+    with open(file_name, 'r') as fp:
+        lines = fp.read().splitlines()
+
+    i = 0
+    items = []
+    while (i < len(lines)):
+        # if re.search("Topic", lines[i]):
+        #    print('Topic', lines[i])
+        if re.search('###', lines[i]):
+            if items:
+                baselines.append(items)
+            items = []
+        elif re.search('UB1:', lines[i]):
+            for j in range(len(systems)):
+                # print('I AM Here', lines[i])
+                system, scores = get_scores(lines[i])
+                # print(system, scores)
+                items.append(scores)
+                i += 1
+        i += 1
+
+    if items:
+        baselines.append(items)
+
+    print '### Total topics: %d' % (len(baselines))
+    for index in range(len(systems)):
+        print '%s: ' % systems[index],
+        for i, scores_type in enumerate(scores_rtype):
+            vals = []
+            for j, scores_measure in enumerate(scores_measures):
+                # print index, i, j, baselines
+                vals.append(np.array([x[index][i][j] for x in baselines]))
+            print ' %s: %4f %4f %4f, ' % (scores_type, np.mean(vals[0]), np.mean(vals[1]), np.mean(vals[2])),
+        print
+
+
+def get_args():
+    '''This function parses and returns the arguments passed in'''
+
+    parser = argparse.ArgumentParser(description='Baselines Results Aggregator')
+    parser.add_argument('-l', '--summary_length', type=str, help='Scores file', required=False)
+    parser.add_argument('-d', '--data_set', type=str, help='Year of the data set', required=True)
+
+    args = parser.parse_args()
+    summary_len = args.summary_length
+    data_set = args.data_set
+    return summary_len, data_set
+
+
+if __name__ == '__main__':
+    summary_len, data_set = get_args()
+    ios_basedir = path.join(path.dirname(path.dirname(path.abspath(__file__))))
+
+    data_path = '%s/data/logs/baselines_%s.log' % (ios_basedir, data_set)
+    aggregate(data_path)
diff --git a/baselines/baseline2/utils/bbc.py b/baselines/baseline2/utils/bbc.py
new file mode 100755
index 0000000..f39cf88
--- /dev/null
+++ b/baselines/baseline2/utils/bbc.py
@@ -0,0 +1,234 @@
+import re
+from requests import get
+from lxml import html
+from nltk.tokenize import sent_tokenize
+from bs4 import BeautifulSoup
+import itertools
+
+def get_summary(tree):
+    
+    summary_block = tree.xpath('//ol[@class="lx-c-summary-points gel-long-primer"]')
+    if len(summary_block) == 1:
+        summary = [line + "." for line in summary_block[0].xpath(".//li/text()")]
+    else:
+        summary = [""]
+    return summary
+
+def get_documents_bbc(tree):
+    documents = []
+    prev_hour = ['00:00']
+    articles = tree.xpath(".//article")
+    for article in articles:
+        # source title
+        source_title = article.xpath('.//header[@class="lx-stream-post__header gs-o-media"]')
+        if len(source_title) == 1:
+            source_title = text_normalization(BeautifulSoup(html.tostring(source_title[0]), "html.parser").get_text())
+        # hour
+        hour = re.findall(r"[0-9]{2}:[0-9]{2}", html.tostring(article))  # get the hour linked to the article
+        if not hour:
+            hour = prev_hour
+        # lines
+        lines = article.xpath('.//div[@class="lx-stream-post-body"]//p')
+        
+        # text
+        text_lines = []
+        if len(lines) >= 1:
+            for line in lines:
+                text_lines.append(BeautifulSoup(html.tostring(line), "html.parser").get_text())
+        # author
+        author = article.xpath('.//div[@class="lx-stream-post__contributor gs-o-media"]')
+        if len(author) == 1:
+            author = author[0].xpath(".//p/text()")  # get the description of the author of the article
+        else:
+            author = ''
+        # extract the links form the block
+        
+        lines = article.xpath('.//div[@class="lx-stream-post-body"]')
+        if len(lines) == 1:
+            cont = html.tostring(lines[0])
+            links = set(re.findall(r'https?://[a-z\.]+/[a-z\-_0-9/]+\.[a-z]{2,4}', cont))
+            links = links.union(re.findall(r'https?://[A-Za-z\.]+/[A-Za-z\-_0-9/]+', cont))
+        else:
+            cont = html.tostring(article)
+            links = set(re.findall(r'https?://[a-z\.]+/[a-z\-_0-9/]+\.[a-z]{2,4}', cont))
+            links.union(re.findall(r'https?://[A-Za-z\.]+/[A-Za-z\-_0-9/]+', cont))
+
+        try:
+            for link in links:
+                # print link
+                if "https://twitter.com/" in link and "status" in link:
+                    # we extract the content from the twitter status
+                    twi_page = get(link).text
+                    twi_tree = html.fromstring(twi_page)
+                    tweets = twi_tree.xpath('//p[contains(@class, "tweet-text")]')
+                    if len(tweets) >= 1:
+                        for tweet in tweets: 
+                            twi_text = BeautifulSoup(html.tostring(tweet), "html.parser").get_text()
+                            text_lines.append(twi_text)
+        except:
+            pass
+        
+        # retrieving of links in the text
+        block_id = article.get("id")
+        
+        block_text = [sent_tokenize(text_normalization(line.strip())) for line in text_lines if line.strip() != u""]
+        block_text = list(itertools.chain.from_iterable(block_text))
+        
+        if len(block_text) == 1:
+            if block_text[0] == '':
+                block_text = [source_title]
+        if len(block_text) == 0: 
+            block_text = [source_title]
+        
+        
+        d_block = {"time": hour[0], "text": block_text, "block_id": block_id,
+                    "author": author, "title": source_title}
+        prev_hour = hour
+        documents.append(d_block)
+    return documents
+
+
+def process_html_bbc(blog_id, url, WEBPAGE_content, id_=-1):
+    """
+    That's the best extractor for BBC articles !!!
+    Uses lxml to parse the page.
+    :param url: simple url
+    :param WEBPAGE_content: retrieved from a database
+    :param id_: the id of the live blog in the database; needed because the url
+    may be changed to a normalized form
+    :return:
+    """
+    tree = html.fromstring(WEBPAGE_content)
+
+    title = tree.xpath("//title/text()")
+    summary = get_summary(tree)
+    documents = []
+    if len(summary) <= 2:
+        print 'summary too short!'
+    else:
+        documents = get_documents_bbc(tree)
+    genre = get_genre_bbc(url)
+    if len(summary) > 2 and not re.search('sport|football|cricket', genre):
+        quality = 'high'
+    else:
+        quality = 'low'
+
+    summary_text = [summary_normalization(sent) for sent in summary]
+
+    data = {'blog_id': blog_id, 'url': url, 'genre': genre,
+            'title': title[0], 'summary': summary_text, 'documents': documents, 'quality': quality}
+
+    return data
+
+def get_genre_bbc(url):
+    """
+    Extract the "genre" from the bbc links. It can give an idea of the category of the live blgos, nevertheless the
+    usage of this information depends on when the live blog was made.
+    :param url: string
+    :return: a genre !
+    """
+    url = remove_question_mark_and_point(url)
+    url = url.split("/")[5]
+    url = re.sub("[0-9]", "", url)
+    url = re.sub("-$", "", url)
+    return url
+
+def remove_question_mark_and_point(url):
+    """
+    Used by the BBC links
+    :param url:
+    :return:
+    """
+    if url != "":
+        pos = url.find("?")
+        if pos != -1:
+            url = url[:pos]
+        poi = url.find(".app")
+        if poi != -1:
+            url = url[:poi]
+    return url
+
+def summary_normalization(summary):
+    try:
+        summary = unicode(summary)
+    except:
+        pass
+    summary = text_normalization(summary)
+    summary = re.sub(u" {2,10}", u". ", summary)
+    if summary != u"":
+        summary = re.sub(u'[a-zA-Z]$', summary[-1] + ".", summary)       
+        
+    return summary
+
+
+
+def text_normalization(text):
+    '''
+    Normalize text
+    Remove & replace unnecessary characters
+    Parameter argument:
+    text: a string (e.g. '.... *** New York N.Y is a city...')
+    
+    Return:
+    text: a string (New York N.Y is a city.)
+    '''
+    text = re.sub(u'\u201e|\u201c',u'', text)
+    text = re.sub(u"\u2022",u'. ', text)  
+    text = re.sub(u"([.?!]);",u"\\1", text)
+    text = re.sub(u'``', u'``', text)
+    text = re.sub(u"\.\.+",u" ", text)
+    text = re.sub(u"\s+\.",u".", text)
+    text = re.sub(u"\?\.",u"?", text)
+    #Dash to remove patterns like NAME (Twitter id)
+    text = re.sub(u"\u2014[^\n]+", u'', text)
+    
+    #Line of format Month day, year (ex:March 7, 2017)
+    text = re.sub(u"\n[a-zA-Z]+\s+\d+,\s+\d{4}", u'', text)
+    
+    #Line of format Time GMT (ex:6.20pm GMT)
+    text = re.sub(u"\d+\.\d+(am|pm) (GMT|BST)\n", u'', text)
+    #Line of format 15:35
+    text = re.sub(u"\d+:\d+\n", u'', text)
+    
+    text = re.sub(u"pic[^ \n]+", u'', text)
+    text = re.sub(u"Photograph: [a-zA-Z]+", u'', text)
+
+    #BBC specific twitter:
+    text = re.sub(u"twitter: [^\s]+\s", u'', text)
+    text = re.sub(u"twitter: ", u'', text)  
+    text = re.sub(u"http[^ \n]+", u'', text)
+    
+    text = re.sub(u" @[^ ]+", u' @twitterid', text)
+    text = re.sub(u"^@[^ ]+", u'@twitterid', text)
+    text = re.sub(u'^[\n_]+',u'', text)
+    #text = re.sub(u'[\s\t]+',u' ', text)
+    text = re.sub(u'[\n_]+',u'\n', text)
+    text = re.sub(u"[*]",u"", text)
+    text = re.sub(u"\-+",u"-", text)
+    text = re.sub(u'^ ',u'', text)
+    text = re.sub(u'\u00E2',u'', text)
+    text = re.sub(u'\u00E0',u'a', text)
+    text = re.sub(u'\u00E9',u'e', text)
+    text = re.sub(u'\u2019',u"'", text)
+    text = re.sub(u'\u2018',u"'", text)
+    text = re.sub(u'\u201d',u'"', text)
+    text = re.sub(u'\u201c',u'"', text)
+    text = re.sub(u'\u2013',u'-', text)
+    text = re.sub(u'\u2026',u'', text)
+    text = re.sub(u"\u00A3",u"\u00A3 ", text)
+    text = re.sub(u"\nBBC ",u"", text)
+    text = re.sub(u"^BBC ",u"", text)
+    text = re.sub(u"\. BBC ",u". ", text)
+    text = re.sub(u"([.?!]);",u"\\1", text)
+    text = re.sub(u'[\n\s\t\r_]+', u' ', text)
+    text = re.sub(u"\\u00A0", u" ", text)
+    text = re.sub(u"\u00A0", u" ", text) 
+    text = re.sub(u' +$', u'', text)  
+    text = re.sub(u" {2,10}", u". ", text)
+    text = re.sub(u'\xa0', u' ', text)
+    text = re.sub(u' +$', u'', text)  
+    text = re.sub(u'View more on twitter', u'', text)
+    text = re.sub(u'\^CT', u'', text)
+    text = re.sub(u'http[^ ]+', u'', text)
+    text = re.sub(u'[^ ]+twitter.com[^ ]+', u'', text)
+    return text
diff --git a/baselines/baseline2/utils/data_helpers.py b/baselines/baseline2/utils/data_helpers.py
new file mode 100755
index 0000000..6c0f27f
--- /dev/null
+++ b/baselines/baseline2/utils/data_helpers.py
@@ -0,0 +1,120 @@
+import codecs
+import json
+import os
+import re
+from nltk.tokenize import word_tokenize
+
+
+def load_data(data_path):
+    """Load the live blogs corpus data
+
+    Args:
+        data_path: path to the json file
+
+    return:
+        doc_data: list of input documents represented as a list of sentences.
+        summaries: list of summaries represented as a list of sentences. 
+
+    """
+    doc_data = []
+    with codecs.open(data_path, "r", encoding='utf-8') as fp:
+        # json_text = fp.readline()
+        # json_data = json.loads(json_text)
+        json_data = json.load(fp)
+
+        summaries = [json_data['summary']]
+
+        documents = json_data['documents']
+
+        for doc in documents:
+            doc_data.append(doc['text'])
+    return doc_data, summaries
+
+
+def extract_ngrams(sentences, stoplist, stemmer, language='english', n=2):
+    """Extract the ngrams of words from the input sentences.
+
+    Args:
+        n (int): the number of words for ngrams, defaults to 2
+    """
+    concepts = []
+    for i, sentence in enumerate(sentences):
+
+        # for each ngram of words
+        tokens = sent2tokens(sentence, language)
+        for j in range(len(tokens) - (n - 1)):
+
+            # initialize ngram container
+            ngram = []
+
+            # for each token of the ngram
+            for k in range(j, j + n):
+                ngram.append(tokens[k].lower())
+
+            # do not consider ngrams containing punctuation marks
+            marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
+            if len(marks) > 0:
+                continue
+
+            # do not consider ngrams composed of only stopwords
+            stops = [t for t in ngram if t in stoplist]
+            if len(stops) == len(ngram):
+                continue
+
+            # stem the ngram
+            ngram = [stemmer.stem(t) for t in ngram]
+
+            # add the ngram to the concepts
+            concepts.append(' '.join(ngram))
+    return concepts
+
+
+def untokenize(tokens):
+    """Untokenizing a list of tokens. 
+
+    Args:
+        tokens (list of str): the list of tokens to untokenize.
+
+    Returns:
+        a string
+
+    """
+    text = u' '.join(tokens)
+    text = re.sub(u"\s+", u" ", text.strip())
+    text = re.sub(u" ('[a-z]) ", u"\g<1> ", text)
+    text = re.sub(u" ([\.;,-]) ", u"\g<1> ", text)
+    text = re.sub(u" ([\.;,-?!])$", u"\g<1>", text)
+    text = re.sub(u" _ (.+) _ ", u" _\g<1>_ ", text)
+    text = re.sub(u" \$ ([\d\.]+) ", u" $\g<1> ", text)
+    text = text.replace(u" ' ", u"' ")
+    text = re.sub(u"([\W\s])\( ", u"\g<1>(", text)
+    text = re.sub(u" \)([\W\s])", u")\g<1>", text)
+    text = text.replace(u"`` ", u"``")
+    text = text.replace(u" ''", u"''")
+    text = text.replace(u" n't", u"n't")
+    text = re.sub(u'(^| )" ([^"]+) "( |$)', u'\g<1>"\g<2>"\g<3>', text)
+
+    # times
+    text = re.sub('(\d+) : (\d+ [ap]\.m\.)', '\g<1>:\g<2>', text)
+
+    text = re.sub('^" ', '"', text)
+    text = re.sub(' "$', '"', text)
+    text = re.sub(u"\s+", u" ", text.strip())
+
+    return text
+
+
+def sent2tokens(sent, language='english', lower=True):
+    '''
+    Sentence to stemmed tokens
+    Parameter arguments:
+    words = list of words e.g. sent = '... The boy is playing.'
+
+    return:
+    list of tokens
+    ['the', 'boy', 'is', 'playing','.']
+    '''
+    if lower == True:
+        sent = sent.lower()
+    words = word_tokenize(sent, language)
+    return words
diff --git a/baselines/baseline2/utils/guardian.py b/baselines/baseline2/utils/guardian.py
new file mode 100755
index 0000000..106a540
--- /dev/null
+++ b/baselines/baseline2/utils/guardian.py
@@ -0,0 +1,258 @@
+
+import re
+from nltk.tokenize import sent_tokenize
+from lxml import html
+from bs4 import BeautifulSoup
+import itertools
+
+def get_timeline(timeline_section):
+    timeline = []
+    if len(timeline_section) == 1:
+        timeline_blocks = timeline_section[0]
+        for a in timeline_blocks.xpath(".//ul/li"):
+            b = a.xpath("a")[0]
+            timeline.append({"block": b.get('data-event-id'), "text": sent_tokenize(summary_normalization(b.text_content()))})
+    return timeline
+
+def get_keypoints(keypoints_section):
+    keypoints = []
+    if len(keypoints_section) != 0:
+        for i in keypoints_section[0].getchildren():
+            if i.tag == "p":
+                sum_now = {"text": summary_normalization(i.text_content()), "link": []}
+                for k in i.xpath(".//a"):
+                    sum_now["link"].append(k.get("href"))
+                keypoints.append(sum_now)
+            elif i.tag == "ul":
+                for j in i.xpath(".//li"):
+                    sum_now = {"text": summary_normalization(j.text_content()), "link": []}
+                    for k in j.xpath(".//a"):
+                        sum_now["link"].append(k.get("href"))
+                    keypoints.append(sum_now)
+    return keypoints
+
+def get_summary(tree):
+    summary = {"bulletin": [], "key_events": []}
+    if len(tree.xpath('//div[@class="content__standfirst"]')) != 0:
+        timeline_links = []
+        timeline_block = tree.xpath('//div[@data-component="timeline"]')
+        if len(timeline_block) == 1:
+            timeline = timeline_block[0]
+            for a in timeline.xpath(".//ul/li"):
+                b = a.xpath("a")[0]
+                timeline_links.append({"block": b.get('data-event-id'), "text": b.text_content()})
+        for i in tree.xpath('//div[@class="content__standfirst"]')[0].getchildren():
+            type_ = i.tag
+            if i.tag == "p":
+                sum_now = {"text": [], "link": [], "key_points": [], "time": "", "type": type_}
+                sum_now["text"] = text_normalization(summary_normalization(i.text_content()))
+                # sum_now["link"].extend(re.findall(r'https?://[A-Za-z\.]+/[A-Za-z\-_0-9/]+', html.tostring(i)))
+                for k in i.xpath(".//a"):
+                    sum_now["link"].append(k.get("href"))
+                summary["bulletin"].append(sum_now)
+            elif i.tag == "ul":
+                for j in i.xpath(".//li"):
+                    sum_now = {"text": [], "link": [], "key_points": [], "time": "", "type": type_}
+                    sum_now["text"] = text_normalization(summary_normalization(j.text_content()))
+                    for k in j.xpath(".//a"):
+                        sum_now["link"].append(k.get("href"))
+                    summary["bulletin"].append(sum_now)
+        summary["key_events"].extend(timeline_links)
+    return summary 
+
+def extraction_date_hour(date_hour):
+    """
+    From a string whose structure is : ...
+    :param date_hour:
+    :return: [year, month, day, hour, minute], where each element is an integer
+    """
+    slash = date_hour.split('-')
+    year = slash[0]
+    month = slash[1]
+    t = slash[2].split("T")
+    day = t[0]
+    two_points = t[1].split(":")
+    hour = two_points[0]
+    minute = two_points[1]
+    return int(year), int(month), int(day), int(hour), int(minute)
+
+def extract_documents(articles):
+    body =[]
+    if len(articles) != 0:
+        for article in articles:
+            block_time = article.xpath('.//p[@class="block-time published-time"]')
+            if len(block_time) != 0:
+                cc = block_time[0].xpath(".//time")
+                for c in cc:
+                    datetime = c.get("datetime")
+                    if datetime is not None:  # "2014-07-23T12:02:45.546Z"
+                        time_creation = extraction_date_hour(datetime)
+
+            # extract the text from the block
+            text_lines = []
+            block_lines = article.xpath('.//div[@itemprop="articleBody"]')
+            for lines in block_lines:
+                text_lines.append(unicode(text_normalization(BeautifulSoup(html.tostring(lines), "html.parser").get_text())))
+
+            """
+            # Get the links inside the block
+            links = re.findall(r'https?://[a-z\.]+/[a-z\-_0-9/]+\.[a-z]{2,4}',
+                               html.tostring(article))  # retrieving of links in the text
+            links = [link for link in links if link.split('.')[-1] not in ["jpg", "jpeg", "png"]]
+            links.extend(re.findall(r'https?://[a-z\.]+/[a-z\-_0-9/]+',
+                                    html.tostring(article)))
+            """
+            
+            # Get the title of the block
+            part_title = article.xpath('.//h2[@class="block-title"]')
+            block_title = ''
+            if len(part_title) != 0:
+                block_title = part_title[0].text_content()
+            
+            block_id = article.get("id")
+            if block_id is None:
+                continue
+            
+            block_kind = article.get("class")
+            is_key_event = False
+            
+            # Check if the block is a summary point
+            section = article.get("class")
+            if re.search('is-key-event|is-summary', section):
+                is_key_event = True
+            
+            block_text = [sent_tokenize(line.strip()) for line in text_lines if line.strip() != u""]
+            block_text = list(itertools.chain.from_iterable(block_text))
+            
+            d_block = {"time": time_creation, "text": block_text, "block_id": block_id,
+                        "title": unicode(text_normalization(block_title)), "block_kind": block_kind,
+                         'is_key_event': is_key_event}
+            body.append(d_block)
+                       
+    return body       
+    
+
+def get_documents_guardian(tree):
+    article = tree.xpath('.//div[@itemprop="liveBlogUpdate"]')
+    documents = extract_documents(article)
+    if not documents:
+        article = tree.xpath('.//div[@itemprop="articleBody"]')
+        documents = extract_documents(article)
+    return documents
+
+def process_html_guardian(blog_id, url, html_content):
+
+    tree = html.fromstring(html_content)
+    # the title
+    title = tree.xpath("//title/text()")
+
+    documents = get_documents_guardian(tree)
+    summary = get_summary(tree)
+
+    summary_text = [summary_normalization(key_event['text']) for key_event in summary['key_events']]
+    genre = get_genre(url)
+    if len(summary_text) > 2 and not re.search('sport|football|cricket', genre):
+        quality = 'high'
+    else:
+        quality = 'low'
+
+    data = {'blog_id': blog_id, 'url': url, 'genre': genre,
+            'title': title[0], 'summary': summary_text, 'summary_block': summary, 'documents': documents, 'quality': quality}
+
+    return data
+
+
+def summary_normalization(summary):
+    try:
+        summary = unicode(summary)
+    except:
+        pass
+    summary = text_normalization(summary)
+    summary = re.sub(u" {2,10}", u". ", summary)
+    if summary != u"":
+        summary = re.sub(u'[a-zA-Z]$', summary[-1] + ".", summary)       
+        
+    return summary
+
+def text_normalization(text):
+    '''
+    Normalize text
+    Remove & replace unnecessary characters
+    Parameter argument:
+    text: a string (e.g. '.... *** New York N.Y is a city...')
+
+    Return:
+    text: a string (New York N.Y is a city.)
+    '''
+    text = re.sub(u'\u201e|\u201c',u'', text)
+    text = re.sub(u"\u2022",u'. ', text)  
+    text = re.sub(u"([.?!]);",u"\\1", text)
+    text = re.sub(u'``', u'``', text)
+    text = re.sub(u"\.\.+",u" ", text)
+    text = re.sub(u"\s+\.",u".", text)
+    text = re.sub(u"\?\.",u"?", text)
+    #Dash to remove patterns like NAME (Twitter id)
+    text = re.sub(u"\u2014[^\n]+", u'', text)
+    
+    #Line of format Month day, year (ex:March 7, 2017)
+    text = re.sub(u"\n[a-zA-Z]+\s+\d+,\s+\d{4}", u'', text)
+    
+    #Line of format Time GMT (ex:6.20pm GMT)
+    text = re.sub(u"\d+\.\d+(am|pm) (GMT|BST)\n", u'', text)
+    #Line of format 15:35
+    text = re.sub(u"\d+:\d+\n", u'', text)
+    
+    text = re.sub(u"pic[^ \n]+", u'', text)
+    text = re.sub(u"Photograph: [a-zA-Z]+", u'', text)
+
+    #BBC specific twitter
+    text = re.sub(u"twitter: [^\s]+\s", u'', text)
+    text = re.sub(u"twitter: ", u'', text)  
+    text = re.sub(u"http[^ \n]+", u'', text)
+    
+    text = re.sub(u" @[a-zA-Z]+", u' @twitterid', text)
+    text = re.sub(u'^[\n_]+',u'', text)
+    #text = re.sub(u'[\s\t]+',u' ', text)
+    text = re.sub(u'[\n_]+',u'\n', text)
+    text = re.sub(u"[*]",u"", text)
+    text = re.sub(u"\-+",u"-", text)
+    text = re.sub(u'^ ',u'', text)
+    
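+    # The substitutions below strip common mojibake byte sequences (UTF-8 text mis-decoded as Latin-1)
+    # and map curly quotes, dashes and non-breaking spaces to plain ASCII equivalents.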
+    text = re.sub(u'\u00e2\u0080\u0093',u"", text)
+    text = re.sub(u'\u0080\u009c',u"", text)
+    text = re.sub(u'\u0080\u009d',u"", text)
+    text = re.sub(u'\u0080\u0099',u"'", text)
+    text = re.sub(u'\u0080\u0093',u"", text)
+    text = re.sub(u'\u00E2',u'', text)
+    text = re.sub(u'\u0080\u0094',u'', text)
+    text = re.sub(u'\u00c3\u00b3',u'', text)
+    text = re.sub(u'\u00E0',u'a', text)
+    text = re.sub(u'\u00E9',u'e', text)
+    text = re.sub(u'\u2019',u"'", text)
+    text = re.sub(u'\u2018',u"'", text)
+    text = re.sub(u'\u201d',u'"', text)
+    text = re.sub(u'\u201c',u'"', text)
+    text = re.sub(u'\u2013',u'-', text)
+    text = re.sub(u'\u2026',u'', text)
+    text = re.sub(u"\u00A3",u"\u00A3 ", text)
+    text = re.sub(u"\nBBC ",u"", text)
+    text = re.sub(u"^BBC ",u"", text)
+    text = re.sub(u"\. BBC ",u". ", text)
+    text = re.sub(u"([.?!]);",u"\\1", text)
+    text = re.sub(u'[\n\s\t\r_]+', u' ', text)
+    text = re.sub(u"\\u00A0", u" ", text)
+    text = re.sub(u"\u00A0", u" ", text) 
+    text = re.sub(u' +$', u'', text)  
+    text = re.sub(u" {2,10}", u". ", text)
+    text = re.sub(u'\xa0', u' ', text)
+    text = re.sub(u' +$', u'', text)  
+    return text
+    
+def get_genre(url):
+    """
+    Extract the "genre" from the Guardian links. It can give an idea of the category of the live blgos, nevertheless the
+    usage of this information depends on when the live blog was made.
+    :param url: a url struction
+    :return: the genre
+    """
+    return url.split("/")[3]
\ No newline at end of file
diff --git a/baselines/baseline2/utils/misc.py b/baselines/baseline2/utils/misc.py
new file mode 100755
index 0000000..6069e3a
--- /dev/null
+++ b/baselines/baseline2/utils/misc.py
@@ -0,0 +1,64 @@
+import os
+import errno
+import logging
+import sys
+
+def mkdirp(path):
+    """Checks if a path exists otherwise creates it
+    Each line in the filename should contain a list of URLs separated by comma.
+    Args:
+        path: The path to check or create
+    """
+    print(path)
+    if path == '':
+        return
+    try:
+        os.makedirs(path)
+    except OSError as exc: # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else: raise
+
+def set_logger(log_file):
+    console_format = '[%(levelname)s] (%(name)s) %(message)s'
+    logger = logging.getLogger()
+    logger.setLevel(logging.DEBUG)
+    console = logging.StreamHandler()
+    console.setLevel(logging.INFO)
+    console.setFormatter(logging.Formatter(console_format))
+    logger.addHandler(console)
+
+    if os.path.dirname(log_file):
+        file_format = '[%(levelname)s] (%(name)s) %(message)s'
+        log_file = logging.FileHandler(log_file, mode='w')
+        log_file.setLevel(logging.DEBUG)
+        log_file.setFormatter(logging.Formatter(file_format))
+        logger.addHandler(log_file)
+
+
+class ProgressBar(object):
+    """Simple progress bar.
+    Output example:
+        100.00% [2152/2152]
+    """
+
+    def __init__(self, total=100, stream=sys.stderr):
+        self.total = total
+        self.stream = stream
+        self.last_len = 0
+        self.curr = 0
+
+    def Increment(self):
+        self.curr += 1
+        self.PrintProgress(self.curr)
+
+        if self.curr == self.total:
+            print('')
+
+    def PrintProgress(self, value):
+        self.stream.write('\b' * self.last_len)
+        pct = 100 * self.curr / float(self.total)
+        out = '{:.2f}% [{}/{}]'.format(pct, value, self.total)
+        self.last_len = len(out)
+        self.stream.write(out)
+        self.stream.flush()
\ No newline at end of file
diff --git a/baselines/baseline_acl18/Vocab.py b/baselines/baseline_acl18/Vocab.py
new file mode 100644
index 0000000..b04e6be
--- /dev/null
+++ b/baselines/baseline_acl18/Vocab.py
@@ -0,0 +1,62 @@
+# coding=utf-8
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+
+class Vocab:
+    def __init__(self, embed, word2id):
+        self.embed = embed
+        self.word2id = word2id
+        self.id2word = {v: k for k, v in word2id.items()}
+        assert len(self.word2id) == len(self.id2word)
+        self.PAD_IDX = 0
+        self.UNK_IDX = 1
+        self.PAD_TOKEN = 'PAD_TOKEN'
+        self.UNK_TOKEN = 'UNK_TOKEN'
+
+    def __len__(self):
+        return len(self.word2id)
+
+    def i2w(self, idx):
+        return self.id2word[idx]
+
+    def w2i(self, w):
+        if w in self.word2id:
+            return self.word2id[w]
+        else:
+            return self.UNK_IDX
+
+    def make_features(self, blog, args):
+        summary = ' '.join(blog["summary"])  # 当前blog的summary
+        sents = []  # 当前blog的所有句子,用索引表示
+        sents_content = []  # 存储原句子
+        opt_extract = blog["opt_sents"]  # 存储最佳抽取结果
+        sents_target = blog["gain"]  # 存储句子得分,每个句子都有len(opt_extract)个得分
+
+        for sent in blog["sents"]:
+            sents.append(sent)
+            sents_content.append(sent)
+
+        # Min-Max normalize all the scores at each extraction step
+        for i, scores in enumerate(sents_target):
+            scores = np.array(scores)
+            max_score = scores.max()
+            min_score = scores.min()
+            sents_target[i] = [(tmp - min_score) / (max_score - min_score) for tmp in scores]
+
+        # truncate every sentence to sent_trunc words: longer ones are cut, shorter ones are padded
+        for i, sent in enumerate(sents):
+            sent = sent.split()
+            cur_sent_len = len(sent)
+            if cur_sent_len > args.sent_trunc:
+                sent = sent[:args.sent_trunc]
+            else:
+                sent += (args.sent_trunc - cur_sent_len) * [self.PAD_TOKEN]
+            sent = [self.w2i(_) for _ in sent]
+            sents[i] = sent
+        sents = torch.LongTensor(sents)
+        targets = torch.FloatTensor(sents_target)
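+        # args.alpha acts as an inverse temperature: larger values sharpen the softmax target distribution over sentences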
+        targets = F.softmax(targets * args.alpha, dim=1)
+
+        return sents, targets, summary, sents_content, opt_extract
diff --git a/baselines/baseline_acl18/main.py b/baselines/baseline_acl18/main.py
new file mode 100644
index 0000000..ce30ff2
--- /dev/null
+++ b/baselines/baseline_acl18/main.py
@@ -0,0 +1,173 @@
+# coding: utf-8
+
+# Implements the model from the paper "Neural Document Summarization by Jointly Learning to Score and Select Sentences"
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch.nn.utils import clip_grad_norm_
+from tqdm import tqdm
+from Vocab import Vocab
+from model import Model
+import numpy as np
+import math
+import re
+import sys
+import os, json, argparse, random
+
+sys.path.append('../')
+from myrouge.rouge import get_rouge_score
+
+parser = argparse.ArgumentParser(description='ACL18 Summarization')
+parser.add_argument('-save_dir', type=str, default='checkpoints1/')
+parser.add_argument('-embed_dim', type=int, default=100)
+parser.add_argument('-embed_num', type=int, default=100)
+parser.add_argument('-embed_frozen', type=bool, default=True)
+parser.add_argument('-hidden_size', type=int, default=256)
+parser.add_argument('-teacher_forcing', type=float, default=0.0)
+parser.add_argument('-lr', type=float, default=1e-3)
+parser.add_argument('-max_norm', type=float, default=5.0)
+parser.add_argument('-sent_dropout', type=float, default=0.0)
+parser.add_argument('-doc_dropout', type=float, default=0.0)
+parser.add_argument('-sent_trunc', type=int, default=30)
+parser.add_argument('-alpha', type=float, default=20.0)
+parser.add_argument('-epochs', type=int, default=10)
+parser.add_argument('-seed', type=int, default=1)
+parser.add_argument('-embedding', type=str, default='../word2vec/embedding.npz')
+parser.add_argument('-word2id', type=str, default='../word2vec/word2id.json')
+parser.add_argument('-train_dir', type=str, default='../data/bbc_acl/train/')
+parser.add_argument('-valid_dir', type=str, default='../data/bbc_acl/test/')
+parser.add_argument('-test_dir', type=str, default='../data/bbc_acl/test/')
+parser.add_argument('-valid_every', type=int, default=500)
+parser.add_argument('-load_model', type=str, default='')
+parser.add_argument('-sum_len', type=int, default=1)  # summary length as a multiple of the reference summary length
+parser.add_argument('-test', action='store_true')
+parser.add_argument('-use_cuda', type=bool, default=False)
+
+use_cuda = torch.cuda.is_available()
+args = parser.parse_args()
+if use_cuda:
+    torch.cuda.manual_seed(args.seed)
+torch.manual_seed(args.seed)
+random.seed(args.seed)
+np.random.seed(args.seed)
+args.use_cuda = use_cuda
+
+
+def train():
+    print('Loading vocab, train and val dataset...')
+    embed = torch.Tensor(np.load(args.embedding)['embedding'])
+    args.embed_num = embed.size(0)
+    args.embed_dim = embed.size(1)
+    with open(args.word2id) as f:
+        word2id = json.load(f)
+    vocab = Vocab(embed, word2id)
+    train_data = []
+    for fn in tqdm(os.listdir(args.train_dir)):
+        f = open(args.train_dir + fn, 'r')
+        train_data.append(json.load(f))
+        f.close()
+    val_data = []
+    for fn in tqdm(os.listdir(args.valid_dir)):
+        f = open(args.valid_dir + fn, 'r')
+        val_data.append(json.load(f))
+        f.close()
+
+    net = Model(args, embed)
+    criterion = nn.KLDivLoss(size_average=False, reduce=True)
+    if use_cuda:
+        net.cuda()
+    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr)
+    net.train()
+    min_loss = float('inf')
+
+    for epoch in range(1, args.epochs + 1):
+        for i, blog in enumerate(train_data):
+            sents, targets, _1, _2, opt = vocab.make_features(blog, args)
+            sents, targets = Variable(sents), Variable(targets)
+            if use_cuda:
+                sents = sents.cuda()
+                targets = targets.cuda()
+            probs = net(sents, opt)
+            loss = criterion(probs, targets)
+            optimizer.zero_grad()
+            loss.backward()
+            clip_grad_norm_(net.parameters(), args.max_norm)
+            optimizer.step()
+            if i % 5 == 0:
+                print('EPOCH [%d/%d]: BLOG_ID=[%d/%d] loss=%f' % (epoch, args.epochs, i, len(train_data), loss))
+
+            cnt = (epoch - 1) * len(train_data) + i
+            if cnt % args.valid_every == 0:
+                print('Begin valid... Epoch %d, Batch %d' % (epoch, i))
+                cur_loss, r1, r2, rl, rsu = evaluate(net, vocab, val_data, True)
+                if cur_loss < min_loss:
+                    min_loss = cur_loss
+                save_path = args.save_dir + 'RNN_GCN' + '_%d_%.4f_%.4f_%.4f_%.4f_%.4f' % (
+                    cnt / args.valid_every, cur_loss, r1, r2, rl, rsu)
+                net.save(save_path)
+                print('Epoch: %2d Min_Val_Loss: %f Cur_Val_Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' %
+                      (epoch, min_loss, cur_loss, r1, r2, rl, rsu))
+    return
+
+
+def evaluate(net, vocab, data_iter, train_next):
+    net.eval()
+    criterion = nn.KLDivLoss(size_average=False, reduce=True)
+    loss, r1, r2, rl, rsu = .0, .0, .0, .0, .0
+    for blog in tqdm(data_iter):
+        sents, targets, summary, sents_content, opt = vocab.make_features(blog, args)
+        sents, targets = Variable(sents), Variable(targets)
+        if use_cuda:
+            sents = sents.cuda()
+            targets = targets.cuda()
+        probs = net(sents, opt)
+        loss += criterion(probs, targets).data.item()
+        hyp = sents_selection(probs.tolist(), sents_content, len(summary.split()))
+        score = get_rouge_score(hyp, summary)
+        r1 += score['ROUGE-1']['r']
+        r2 += score['ROUGE-2']['r']
+        rl += score['ROUGE-L']['r']
+        rsu += score['ROUGE-SU4']['r']
+    blog_num = len(data_iter)
+    loss = loss / blog_num
+    r1 = r1 / blog_num
+    r2 = r2 / blog_num
+    rl = rl / blog_num
+    rsu = rsu / blog_num
+    if train_next:
+        net.train()
+    return loss, r1, r2, rl, rsu
+
+
+def sents_selection(scores, sents, sum_len):
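+    # Greedy decoding over the predicted score matrix: at each step pick the highest-scoring sentence
+    # not selected yet, then keep appending sentences (ranked by the last step's scores) until the
+    # word budget sum_len is reached; the output is finally truncated to exactly sum_len words.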
+    rst = ""
+    selected = []
+    for i, score in enumerate(scores):
+        scores[i] = (-np.array(score)).argsort()
+        for s in scores[i]:
+            if s not in selected:
+                selected.append(s)
+                break
+    for s in scores[-1]:
+        if s not in selected:
+            selected.append(s)
+    for idx in selected:
+        rst += " " + sents[idx]
+        cur_len = len(rst.strip().split())
+        if cur_len > sum_len:
+            break
+    rst = " ".join(rst.strip().split()[:sum_len])
+    return rst
+
+
+def test():
+    # TODO
+    return
+
+
+if __name__ == '__main__':
+    if args.test:
+        test()
+    else:
+        train()
diff --git a/baselines/baseline_acl18/model.py b/baselines/baseline_acl18/model.py
new file mode 100644
index 0000000..f2cd8cd
--- /dev/null
+++ b/baselines/baseline_acl18/model.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+
+import torch
+import random
+import torch.nn as nn
+import torch.nn.functional as F
+
+use_cuda = torch.cuda.is_available()
+
+
+class Model(nn.Module):
+    def __init__(self, args, embed=None):
+        super(Model, self).__init__()
+        self.model_name = 'Model'
+        self.args = args
+        V = args.embed_num
+        D = args.embed_dim
+        self.H = args.hidden_size
+
+        self.embed = nn.Embedding(V, D, padding_idx=0)
+        if embed is not None:
+            self.embed.weight.data.copy_(embed)
+
+        # Sentence embedding layer
+        self.sent_RNN = nn.GRU(
+            input_size=D,
+            hidden_size=self.H,
+            batch_first=True,
+            bidirectional=True,
+        )
+        self.sent_Dropout = nn.Dropout(args.sent_dropout)  # a separate Dropout layer is added to process the GRU output
+
+        # Document embedding layer
+        self.doc_RNN = nn.GRU(
+            input_size=2 * self.H,
+            hidden_size=self.H,
+            batch_first=True,
+            bidirectional=True,
+        )
+        self.doc_Dropout = nn.Dropout(args.doc_dropout)
+
+        # Prediction layer
+        self.predict = nn.GRUCell(input_size=2 * self.H, hidden_size=self.H)
+        self.wq = nn.Linear(self.H, self.H)
+        self.wd = nn.Linear(2 * self.H, self.H)
+        self.ws = nn.Linear(self.H, 1)
+        self.wm = nn.Linear(self.H, self.H)
+
+    def forward(self, sents, opt):
+        # Sentence embedding; handled differently depending on whether the embedding is frozen
+        if self.args.embed_frozen:
+            sents = self.embed(sents).detach()
+        else:
+            sents = self.embed(sents)
+        _, hn = self.sent_RNN(sents)
+        hn = self.sent_Dropout(hn)
+        sents = torch.cat((hn[0], hn[1]), dim=1)  # 1 * sent_num * (2*self.H)
+        sents = sents.unsqueeze(0)
+        outputs, hn = self.doc_RNN(sents)
+        outputs, hn = self.doc_Dropout(outputs), self.doc_Dropout(hn)
+        sents = outputs.squeeze(0)  # sent_num * (2*self.H)
+
+        # Prediction loop
+        teacher_forcing = random.random() < self.args.teacher_forcing
+        ht = F.tanh(self.wm(hn[1][0]))  # initial hidden state: a linear transform of the backward GRU's last hidden state
+        st = torch.zeros(2 * self.H)  # no sentence has been selected yet, so s0 is the zero vector
+        if use_cuda:
+            st = st.cuda()
+        rst = []
+        for i in range(0, len(opt)):
+            ht = self.predict(st.unsqueeze(0), ht.unsqueeze(0)).squeeze(0)  # ht encodes the set of sentences selected so far
+            ht1 = ht.repeat(sents.size(0)).view(sents.size(0), -1)
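+            # additive (Bahdanau-style) scoring of every sentence against the current decoder state ht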
+            cur_scores = self.ws(F.tanh(self.wq(ht1) + self.wd(sents))).view(-1)
+            cur_scores = F.log_softmax(cur_scores, dim=0)
+            rst.append(cur_scores)
+            if teacher_forcing:
+                st = sents[opt[i]]
+            else:
+                st = sents[torch.argmax(cur_scores).data.item()]
+
+        return torch.cat(rst).view(len(opt), -1)
+
+    def save(self, dir):
+        checkpoint = {'model': self.state_dict(), 'args': self.args}
+        torch.save(checkpoint, dir)
\ No newline at end of file
diff --git a/baselines/baseline_featureSVR/evaluate.py b/baselines/baseline_featureSVR/evaluate.py
new file mode 100644
index 0000000..a00b874
--- /dev/null
+++ b/baselines/baseline_featureSVR/evaluate.py
@@ -0,0 +1,181 @@
+# coding: utf-8
+
+# Build summaries from the scores predicted by the trained model and compute ROUGE
+
+import sys
+
+sys.path.append('../')
+import json
+import re
+import os
+import math
+from myrouge.rouge import get_rouge_score
+from tqdm import tqdm
+
+valid_data = []
+valid_pre = []
+test_data = []
+test_pre = []
+corpus = 'bbc'
+label_method = 'cont_1'
+valid_dir = '../data/' + corpus + '_' + label_method + '/valid/'
+test_dir = '../data/' + corpus + '_' + label_method + '/test/'
+blog_trunc = 80  # keep only the first 80 docs of each live blog
+pre_dir = './data/' + corpus + '/'
+candidate_num = 3  # number of top-scoring sentences kept as candidates
+mmr = 0.75
+
+
+class Blog:
+    def __init__(self, blog_json):
+        self.id = blog_json['blog_id']
+        self.summary = ' '.join(blog_json['summary'])
+        self.docs = []
+        self.scores = []
+        for i, doc in enumerate(blog_json['documents']):
+            if i >= blog_trunc:
+                break
+            self.docs.append(doc['text'])
+            self.scores.append(doc['sent_label'])
+
+
+# rank sentence indices by score in descending order
+def get_rank(pre):
+    a = sorted(enumerate(pre), key=lambda x: x[1])
+    rst = []
+    for tup in a:
+        rst.append(tup[0])
+    rst.reverse()
+    return rst
+
+
+# use ROUGE-1 F1 as the similarity between two sentences
+def rouge_1_f(hyp, ref):
+    hyp = re.sub(r'[^a-z]', ' ', hyp.lower()).strip().split()
+    ref = re.sub(r'[^a-z]', ' ', ref.lower()).strip().split()
+    if len(hyp) == 0 or len(ref) == 0:
+        return .0
+    ref_flag = [0 for _ in ref]
+    hit = .0
+    for w in hyp:
+        for i in range(0, len(ref)):
+            if w == ref[i] and ref_flag[i] == 0:
+                hit += 1
+                ref_flag[i] = 1
+                break
+    p = hit / len(hyp)
+    r = hit / len(ref)
+    if math.fabs(p + r) < 1e-10:
+        f = .0
+    else:
+        f = 2 * p * r / (p + r)
+    return f
+
+
+# Second re_rank method: uses the MMR redundancy-removal strategy
+def re_rank(sents, scores, ref_len):
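+    # MMR score of candidate i: mmr * scores[i] - (1 - mmr) * max_{j in chosen} sim(i, j),
+    # where sim is ROUGE-1 F1; sentences are appended until the reference length ref_len is reached.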
+    sents_num = len(sents)
+    sim = [sents_num * [.0] for _ in range(0, sents_num)]
+    for i in range(0, sents_num):
+        for j in range(i, sents_num):
+            if j == i:
+                sim[i][j] = 1.0
+            else:
+                sim[i][j] = sim[j][i] = rouge_1_f(sents[i], sents[j])
+    chosen = []
+    candidates = list(range(0, sents_num))
+    summary = ''
+    cur_len = 0
+    while len(candidates) != 0:
+        max_point = -1e20
+        next = -1
+        for i in candidates:
+            max_sim = .0
+            for j in chosen:
+                max_sim = max(max_sim, sim[i][j])
+            cur_point = mmr * scores[i] - (1 - mmr) * max_sim
+            if cur_point > max_point:
+                max_point = cur_point
+                next = i
+        chosen.append(next)
+        candidates.remove(next)
+        tmp = sents[next]
+        tmp = tmp.split()
+        tmp_len = len(tmp)
+        if cur_len + tmp_len > ref_len:
+            summary += ' '.join(tmp[:ref_len - cur_len])
+            break
+        else:
+            summary += ' '.join(tmp) + ' '
+            cur_len += tmp_len
+    return summary
+
+
+def main():
+    print('Loading data...')
+    for fn in os.listdir(valid_dir):
+        f = open(os.path.join(valid_dir, fn), 'r')
+        valid_data.append(Blog(json.load(f)))
+        f.close()
+    with open(pre_dir + 'valid_pre.txt', 'r') as f:
+        for line in f.readlines():
+            valid_pre.append(float(line))
+    for fn in os.listdir(test_dir):
+        f = open(os.path.join(test_dir, fn), 'r')
+        test_data.append(Blog(json.load(f)))
+        f.close()
+    with open(pre_dir + 'test_pre.txt', 'r') as f:
+        for line in f.readlines():
+            test_pre.append(float(line))
+    """
+    print('Evaluating valid set...')
+    r1, r2, rl, rsu = .0, .0, .0, .0
+    start = 0
+    blog_num = .0
+    for blog in tqdm(valid_data):
+        sents = []
+        for doc in blog.docs:
+            sents.extend(doc)
+        cur_pre = valid_pre[start: start + len(sents)]
+        start = start + len(sents)
+        ref_len = len(blog.summary.strip().split())
+        hyp = re_rank(sents, cur_pre, ref_len)
+        score = get_rouge_score(hyp, blog.summary)
+        r1 += score['ROUGE-1']['r']
+        r2 += score['ROUGE-2']['r']
+        rl += score['ROUGE-L']['r']
+        rsu += score['ROUGE-SU4']['r']
+        blog_num += 1
+    r1 = r1 / blog_num
+    r2 = r2 / blog_num
+    rl = rl / blog_num
+    rsu = rsu / blog_num
+    print(r1, r2, rl, rsu)
+    """
+    print('Evaluating test set...')
+    r1, r2, rl, rsu = .0, .0, .0, .0
+    start = 0
+    blog_num = .0
+    for blog in tqdm(test_data):
+        sents = []
+        for doc in blog.docs:
+            sents.extend(doc)
+        cur_pre = test_pre[start: start + len(sents)]
+        start = start + len(sents)
+        ref_len = len(blog.summary.strip().split())
+        hyp = re_rank(sents, cur_pre, ref_len)
+        score = get_rouge_score(hyp, blog.summary)
+        r1 += score['ROUGE-1']['r']
+        r2 += score['ROUGE-2']['r']
+        rl += score['ROUGE-L']['r']
+        rsu += score['ROUGE-SU4']['r']
+        blog_num += 1
+    r1 = r1 / blog_num
+    r2 = r2 / blog_num
+    rl = rl / blog_num
+    rsu = rsu / blog_num
+    print(r1, r2, rl, rsu)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/baselines/baseline_featureSVR/features.py b/baselines/baseline_featureSVR/features.py
new file mode 100644
index 0000000..b8d6cc3
--- /dev/null
+++ b/baselines/baseline_featureSVR/features.py
@@ -0,0 +1,242 @@
+# coding: utf-8
+
+# Supervised baseline: compute a set of features for every sentence and infer the final sentence score from them.
+# Features: surface (position-related), content (content-related) and rel (inter-sentence relation) features.
+
+import json
+import re
+import math
+import os
+from tqdm import tqdm
+from copy import deepcopy
+from nltk.corpus import stopwords
+from nltk.text import TextCollection
+import numpy as np
+
+train_data = []
+valid_data = []
+test_data = []
+corpus = 'bbc'
+train_dir = '../data/' + corpus + '_cont_1/train/'
+valid_dir = '../data/' + corpus + '_cont_1/valid/'
+test_dir = '../data/' + corpus + '_cont_1/test/'
+out_dir = './data/' + corpus + '/'
+blog_trunc = 80  # keep only the first 80 docs of each live blog
+if os.path.exists(out_dir):
+    os.system('rm -r ' + out_dir)
+os.mkdir(out_dir)
+stop_words = stopwords.words('english')
+
+
+class Blog:
+    def __init__(self, blog_json):
+        self.id = blog_json['blog_id']
+        self.summary = ' '.join(blog_json['summary'])
+        self.docs = []
+        self.scores = []
+        for i, doc in enumerate(blog_json['documents']):
+            if i >= blog_trunc:
+                break
+            self.docs.append(doc['text'])
+            self.scores.append(doc['sent_label'])
+
+
+def surface(blog, doc_idx, sent_idx, sent):  # compute the surface features of one sentence
+    rst = []
+    rst.append(float(doc_idx))  # abs_blog_pos
+    rst.append(float(doc_idx) / len(blog.docs))  # rel_blog_pos
+    rst.append(float(sent_idx))  # abs_doc_pos
+    rst.append(float(sent_idx) / len(blog.docs[doc_idx]))  # rel_doc_pos
+    if doc_idx == 0:  # blog_first
+        rst.append(1.0)
+    else:
+        rst.append(.0)
+    if sent_idx == 0:  # doc_first
+        rst.append(1.0)
+    else:
+        rst.append(.0)
+    rst.append(float(len(sent.split())))  # sent_len
+    return rst
+
+
+def content(blog, doc_idx, sent_idx, sent, text_collection):  # compute the content features of one sentence: tf, idf, tf_idf
+    rst = [.0, .0, .0]  # average tf, idf and tf_idf values
+    cnt = 0
+    for w in sent.split():
+        if w in stop_words:
+            continue
+        rst[0] += text_collection.tf(w, blog.docs[doc_idx])
+        rst[1] += text_collection.idf(w)
+        rst[2] += text_collection.tf_idf(w, blog.docs[doc_idx])
+        cnt += 1
+    if cnt != 0:
+        rst = [t / cnt for t in rst]
+    return rst
+
+
+# use ROUGE-1 F1 as the similarity between two sentences
+def rouge_1_f(hyp, ref):
+    hyp = re.sub(r'[^a-z]', ' ', hyp.lower()).strip().split()
+    ref = re.sub(r'[^a-z]', ' ', ref.lower()).strip().split()
+    if len(hyp) == 0 or len(ref) == 0:
+        return .0
+    ref_flag = [0 for _ in ref]
+    hit = .0
+    for w in hyp:
+        for i in range(0, len(ref)):
+            if w == ref[i] and ref_flag[i] == 0:
+                hit += 1
+                ref_flag[i] = 1
+                break
+    p = hit / len(hyp)
+    r = hit / len(ref)
+    if math.fabs(p + r) < 1e-10:
+        f = .0
+    else:
+        f = 2 * p * r / (p + r)
+    return f
+
+
+def PageRank(blog):
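+    # LexRank-style sentence centrality: power iteration over a sentence-similarity graph whose edge
+    # weights are ROUGE-1 F1 scores, with damping factor 0.85.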
+    sents = []
+    for doc in blog.docs:
+        sents.extend(doc)
+    sent_num = len(sents)
+    unipai = [1.0 / sent_num for i in range(0, sent_num)]
+    unipai = np.mat(unipai)  # uniform distribution
+    pai = deepcopy(unipai)  # initial scores
+
+    # build the initial transition probability matrix p0 from pairwise similarities
+    p0 = []
+    for i in range(0, sent_num):
+        tmp = [0.0 for i in range(0, sent_num)]
+        p0.append(tmp)
+    for i in range(0, sent_num):
+        for j in range(i + 1, sent_num):
+            sim = rouge_1_f(sents[i], sents[j])
+            p0[i][j] = p0[j][i] = sim
+
+    # row-normalize the transition probabilities
+    for i in range(0, sent_num):
+        sumi = sum(p0[i])
+        if sumi != 0.0:
+            p0[i] = [p0[i][j] / sumi for j in range(0, sent_num)]
+    p0 = np.mat(p0)
+
+    iters = 100
+    a = 0.85
+    for i in range(0, iters):
+        oldpai = deepcopy(pai)
+        pai = a * oldpai * p0 + (1 - a) * unipai  # pageRank
+        # stop iterating once pai has (almost) stopped changing
+        stop = True
+        for j in range(0, sent_num):
+            if np.fabs(oldpai[0, j] - pai[0, j]) > 1e-10:
+                stop = False
+                break
+        if stop:
+            break
+    scores = [pai[0, j] for j in range(0, sent_num)]
+    cnt = 0
+    rst = deepcopy(blog.scores)
+    for i, doc in enumerate(blog.docs):
+        for j, sent in enumerate(blog.docs[i]):
+            rst[i][j] = scores[cnt]
+            cnt += 1
+    return rst
+
+
+def rel(blog, doc_idx, sent_idx, sent):
+    rst = [.0, .0]
+    if len(blog.docs[0]) > 0:
+        rst[0] = rouge_1_f(sent, blog.docs[0][0])
+    rst[1] = rouge_1_f(sent, blog.docs[doc_idx][0])
+    return rst
+
+
+def normalize(feats):
+    leni = len(feats)
+    lenj = len(feats[0])
+    Max = [1e-10 for _ in range(0, lenj)]
+    Min = [1e10 for _ in range(0, lenj)]
+    for feat in feats:
+        for j in range(0, lenj):
+            Max[j] = max(Max[j], feat[j])
+            Min[j] = min(Min[j], feat[j])
+    for i in range(0, leni):
+        for j in range(0, lenj):
+            if Max[j] - Min[j] > 1e-10:
+                feats[i][j] = (feats[i][j] - Min[j]) / (Max[j] - Min[j])
+            else:
+                feats[i][j] = .0
+    return feats
+
+
+def compute_features(blog):
+    features = []
+    text_collection = []
+    for doc in blog.docs:
+        text_collection.append(' '.join(doc))
+    text_collection = TextCollection(text_collection)  # for convenient tf_idf computation
+    pageranks = PageRank(blog)  # PageRank score of every sentence
+    for i, doc in enumerate(blog.docs):
+        for j, sent in enumerate(doc):
+            cur_feat = []
+            cur_feat.extend(surface(blog, i, j, sent))
+            cur_feat.extend(content(blog, i, j, sent, text_collection))
+            cur_feat.extend(rel(blog, i, j, sent))
+            cur_feat.append(pageranks[i][j])
+            features.append(cur_feat)
+    features = normalize(features)
+    return features
+
+
+def main():
+    print('Loading data...')
+    for fn in os.listdir(train_dir):
+        f = open(os.path.join(train_dir, fn), 'r')
+        train_data.append(Blog(json.load(f)))
+        f.close()
+    for fn in os.listdir(valid_dir):
+        f = open(os.path.join(valid_dir, fn), 'r')
+        valid_data.append(Blog(json.load(f)))
+        f.close()
+    for fn in os.listdir(test_dir):
+        f = open(os.path.join(test_dir, fn), 'r')
+        test_data.append(Blog(json.load(f)))
+        f.close()
+
+    print('Computing features...')
+    train_features = []
+    for blog in tqdm(train_data):
+        features = compute_features(blog)
+        train_features.extend(features)
+    with open(os.path.join(out_dir, 'train.txt'), 'w') as f:
+        for i in range(0, len(train_features)):
+            for feat in train_features[i]:
+                f.write(str(feat) + ' ')
+            f.write('\n')
+
+    valid_features = []
+    for blog in tqdm(valid_data):
+        features = compute_features(blog)
+        valid_features.extend(features)
+    with open(os.path.join(out_dir, 'valid.txt'), 'w') as f:
+        for i in range(0, len(valid_features)):
+            for feat in valid_features[i]:
+                f.write(str(feat) + ' ')
+            f.write('\n')
+
+    test_features = []
+    for blog in tqdm(test_data):
+        features = compute_features(blog)
+        test_features.extend(features)
+    with open(os.path.join(out_dir, 'test.txt'), 'w') as f:
+        for i in range(0, len(test_features)):
+            for feat in test_features[i]:
+                f.write(str(feat) + ' ')
+            f.write('\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/baselines/baseline_featureSVR/train.py b/baselines/baseline_featureSVR/train.py
new file mode 100644
index 0000000..3167d21
--- /dev/null
+++ b/baselines/baseline_featureSVR/train.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+
+# Train an SVR model on the extracted features and predict the scores of the validation and test sets
+
+import os
+import json
+from sklearn import linear_model
+from sklearn.svm import LinearSVR
+
+
+corpus = 'bbc'
+label_method = 'cont_1'
+train_label_dir = '../data/' + corpus + '_' + label_method + '/train/'
+blog_trunc = 80
+feature_dir = './data/' + corpus + '/'
+
+
+class Blog:
+    def __init__(self, blog_json):
+        self.id = blog_json['blog_id']
+        self.summary = ' '.join(blog_json['summary'])
+        self.docs = []
+        self.scores = []
+        for i, doc in enumerate(blog_json['documents']):
+            if i >= blog_trunc:
+                break
+            self.docs.append(doc['text'])
+            self.scores.append(doc['sent_label'])
+
+
+def load_train_label():
+    train_data = []
+    for fn in os.listdir(train_label_dir):
+        f = open(os.path.join(train_label_dir, fn), 'r')
+        train_data.append(Blog(json.load(f)))
+        f.close()
+    train_label = []
+    for blog in train_data:
+        for score in blog.scores:
+            train_label.extend(score)
+    return train_label
+
+
+def Reg(x, y):
+    # reg = linear_model.SGDRegressor(max_iter=1000)
+    reg = LinearSVR()
+    reg.fit(x, y)
+    return reg
+
+
+def main():
+    print('Loading data...')
+    train_x = []
+    with open(feature_dir + 'train.txt', 'r') as f:
+        for line in f.readlines():
+            data = line.strip().split()
+            train_x.append([float(_) for _ in data])
+    train_y = load_train_label()
+    valid_x = []
+    with open(feature_dir + 'valid.txt', 'r') as f:
+        for line in f.readlines():
+            data = line.strip().split()
+            valid_x.append([float(_) for _ in data])
+    test_x = []
+    with open(feature_dir + 'test.txt', 'r') as f:
+        for line in f.readlines():
+            data = line.strip().split()
+            test_x.append([float(_) for _ in data])
+
+    print('Training model...')
+    reg = Reg(train_x, train_y)
+    print('Predicting...')
+    valid_pre = reg.predict(valid_x)
+    test_pre = reg.predict(test_x)
+    with open(feature_dir + 'valid_pre.txt', 'w') as f:
+        for p in valid_pre:
+            f.write(str(p) + '\n')
+    with open(feature_dir + 'test_pre.txt', 'w') as f:
+        for p in test_pre:
+            f.write(str(p) + '\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/baselines/baseline_gcn/RNN_GCN.py b/baselines/baseline_gcn/RNN_GCN.py
new file mode 100644
index 0000000..83d4671
--- /dev/null
+++ b/baselines/baseline_gcn/RNN_GCN.py
@@ -0,0 +1,89 @@
+# coding: utf-8
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+use_cuda = torch.cuda.is_available()
+
+
+class RNN_GCN(nn.Module):
+    def __init__(self, args, embed=None):
+        super(RNN_GCN, self).__init__()
+
+        self.model_name = 'RNN_GCN'
+        self.args = args
+        V = args.embed_num
+        D = args.embed_dim
+        self.H = args.hidden_size
+        self.G = args.gcn_size
+
+        self.embed = nn.Embedding(V, D, padding_idx=0)
+        if embed is not None:
+            self.embed.weight.data.copy_(embed)
+        self.word_RNN = nn.GRU(
+            input_size=D,
+            hidden_size=self.H,
+            batch_first=True,
+        )
+
+        # each GCN layer has its own weight matrix, kept here as separate parameters
+        self.graph_w_0 = nn.Parameter(torch.FloatTensor(self.H, self.H).uniform_(-0.1, 0.1))
+        self.graph_w_1 = nn.Parameter(torch.FloatTensor(self.H, self.H).uniform_(-0.1, 0.1))
+        self.graph_w_2 = nn.Parameter(torch.FloatTensor(self.H, self.H).uniform_(-0.1, 0.1))
+
+        self.sent_RNN = nn.GRU(
+            input_size=self.H,
+            hidden_size=self.H,
+            batch_first=True,
+        )
+
+        self.pre_linear_0 = nn.Linear(self.H, self.H, bias=False)
+        self.pre_linear_1 = nn.Linear(self.H, self.H, bias=False)
+        self.pre_linear_2 = nn.Linear(self.H, 1, bias=False)
+
+        self.content = nn.Linear(self.H, 1, bias=False)
+        self.saliance = nn.Bilinear(self.H, self.H, 1, bias=False)
+        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1, 0.1))
+
+    # sents: sent_num * sent_trunc, doc_lens: doc_num, sim_matrix: sent_num * sent_num
+    def forward(self, sents, doc_lens, sim_matrix):
+        # Sentence Embedding
+        sents = self.embed(sents)  # sent_num * sent_trunc * D
+        _, hn = self.word_RNN(sents)  # hn: 1 * sent_num * H
+        sents = hn.squeeze(0)  # sent_num * H
+
+        # GCN
+        # sents = F.relu(sim_matrix.mm(sents).mm(self.graph_w_0))  # sent_num * H
+        # sents = F.relu(sim_matrix.mm(sents).mm(self.graph_w_1))
+        # sents = F.relu(sim_matrix.mm(sents).mm(self.graph_w_2))
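+        # NOTE: the GCN propagation layers above are commented out, so the model currently reduces to
+        # the word-level and sentence-level RNN encoders only.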
+
+        # Doc Embedding
+        docs = []
+        start = 0
+        for doc_len in doc_lens:
+            cur_doc = sents[start: start + doc_len]
+            cur_doc = cur_doc.unsqueeze(0)
+            _, hn = self.sent_RNN(cur_doc)
+            docs.append(hn[0][0])
+            start += doc_len
+
+        # Blog presentation
+        blog = docs[0]
+        for doc in docs[1:]:
+            blog += doc
+        blog /= float(len(docs))
+
+        # predict
+        probs = []
+        for sent in sents:
+            sent_pre = self.pre_linear_2(F.tanh(self.pre_linear_0(blog) + self.pre_linear_1(sent)))
+            # sent_pre = self.content(sent) + self.saliance(sent, blog) + self.bias
+            probs.append(sent_pre)
+
+        probs = torch.cat(probs).squeeze()
+        return F.softmax(probs, dim=0)
+
+    def save(self, dir):
+        checkpoint = {'model': self.state_dict(), 'args': self.args}
+        torch.save(checkpoint, dir)
diff --git a/baselines/baseline_gcn/Vocab.py b/baselines/baseline_gcn/Vocab.py
new file mode 100644
index 0000000..161f928
--- /dev/null
+++ b/baselines/baseline_gcn/Vocab.py
@@ -0,0 +1,57 @@
+# coding=utf-8
+import torch
+import torch.nn.functional as F
+
+
+class Vocab:
+    def __init__(self, embed, word2id):
+        self.embed = embed
+        self.word2id = word2id
+        self.id2word = {v: k for k, v in word2id.items()}
+        assert len(self.word2id) == len(self.id2word)
+        self.PAD_IDX = 0
+        self.UNK_IDX = 1
+        self.PAD_TOKEN = 'PAD_TOKEN'
+        self.UNK_TOKEN = 'UNK_TOKEN'
+
+    def __len__(self):
+        return len(self.word2id)
+
+    def i2w(self, idx):
+        return self.id2word[idx]
+
+    def w2i(self, w):
+        if w in self.word2id:
+            return self.word2id[w]
+        else:
+            return self.UNK_IDX
+
+    def make_features(self, blog, args):
+        summary = ' '.join(blog["summary"])  # 当前blog的summary
+        sim_matrix = torch.FloatTensor(blog["sim_matrix"])
+        sents = []  # 当前blog的所有句子,用索引表示
+        sents_target = []  # 存储句子得分
+        sents_content = []  # 存储原句子
+        doc_lens = []  # 存储每个doc所包含的句子数
+
+        for doc in blog["documents"]:
+            sents.extend(doc["text"])
+            sents_target.extend(doc["sent_label"])
+            sents_content.extend(doc["text"])
+            doc_lens.append(len(doc["text"]))
+
+        # truncate every sentence to sent_trunc words: longer ones are cut, shorter ones are padded
+        for i, sent in enumerate(sents):
+            sent = sent.split()
+            cur_sent_len = len(sent)
+            if cur_sent_len > args.sent_trunc:
+                sent = sent[:args.sent_trunc]
+            else:
+                sent += (args.sent_trunc - cur_sent_len) * [self.PAD_TOKEN]
+            sent = [self.w2i(_) for _ in sent]
+            sents[i] = sent
+        sents = torch.LongTensor(sents)
+        targets = torch.FloatTensor(sents_target)
+        targets = F.softmax(targets * args.alpha, dim=0)
+
+        return sents, targets, sim_matrix, doc_lens, sents_content, summary
diff --git a/baselines/baseline_gcn/main.py b/baselines/baseline_gcn/main.py
new file mode 100644
index 0000000..11c714c
--- /dev/null
+++ b/baselines/baseline_gcn/main.py
@@ -0,0 +1,194 @@
+# coding: utf-8
+
+# Implements the model from the paper "Graph-based Neural Multi-Document Summarization"
+
+import torch
+from torch.autograd import Variable
+from torch.nn.utils import clip_grad_norm_
+from tqdm import tqdm
+from Vocab import Vocab
+from RNN_GCN import RNN_GCN
+from myloss import myLoss
+import numpy as np
+import math
+import re
+import sys
+import os, json, argparse, random
+
+sys.path.append('../')
+from myrouge.rouge import get_rouge_score
+
+parser = argparse.ArgumentParser(description='Graph-based Summarization')
+parser.add_argument('-save_dir', type=str, default='checkpoints3/')
+parser.add_argument('-embed_dim', type=int, default=100)
+parser.add_argument('-embed_num', type=int, default=100)
+parser.add_argument('-hidden_size', type=int, default=300)
+parser.add_argument('-gcn_size', type=int, default=3)
+parser.add_argument('-lr', type=float, default=1e-3)
+parser.add_argument('-max_norm', type=float, default=1.0)
+parser.add_argument('-epochs', type=int, default=10)
+parser.add_argument('-seed', type=int, default=1)
+parser.add_argument('-embedding', type=str, default='../word2vec/embedding.npz')
+parser.add_argument('-word2id', type=str, default='../word2vec/word2id.json')
+parser.add_argument('-train_dir', type=str, default='../data/bbc_graph_1/train/')
+parser.add_argument('-valid_dir', type=str, default='../data/bbc_graph_1/test/')
+parser.add_argument('-test_dir', type=str, default='../data/bbc_graph_1/test/')
+parser.add_argument('-sent_trunc', type=int, default=20)
+parser.add_argument('-valid_every', type=int, default=500)
+parser.add_argument('-load_model', type=str, default='')
+parser.add_argument('-greedy', type=float, default=0.3)
+parser.add_argument('-alpha', type=float, default=1.0)
+parser.add_argument('-sum_len', type=int, default=1)  # summary length as a multiple of the reference summary length
+parser.add_argument('-test', action='store_true')
+parser.add_argument('-use_cuda', type=bool, default=False)
+
+use_cuda = torch.cuda.is_available()
+args = parser.parse_args()
+if use_cuda:
+    torch.cuda.manual_seed(args.seed)
+torch.manual_seed(args.seed)
+random.seed(args.seed)
+np.random.seed(args.seed)
+args.use_cuda = use_cuda
+
+
+def train():
+    print('Loading vocab, train and val dataset...')
+    embed = torch.Tensor(np.load(args.embedding)['embedding'])
+    args.embed_num = embed.size(0)
+    args.embed_dim = embed.size(1)
+    with open(args.word2id) as f:
+        word2id = json.load(f)
+    vocab = Vocab(embed, word2id)
+    train_data = []
+    for fn in tqdm(os.listdir(args.train_dir)):
+        f = open(args.train_dir + fn, 'r')
+        train_data.append(json.load(f))
+        f.close()
+    val_data = []
+    for fn in tqdm(os.listdir(args.valid_dir)):
+        f = open(args.valid_dir + fn, 'r')
+        val_data.append(json.load(f))
+        f.close()
+
+    net = RNN_GCN(args, embed)
+    criterion = myLoss()
+    if use_cuda:
+        net.cuda()
+    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr)
+    net.train()
+    min_loss = float('inf')
+
+    for epoch in range(1, args.epochs + 1):
+        for i, blog in enumerate(train_data):
+            sents, targets, sim_matrix, doc_lens, _1, _2 = vocab.make_features(blog, args)
+            sents, targets, sim_matrix = Variable(sents), Variable(targets), Variable(sim_matrix)
+            if use_cuda:
+                sents = sents.cuda()
+                targets = targets.cuda()
+                sim_matrix = sim_matrix.cuda()
+            probs = net(sents, doc_lens, sim_matrix)
+            loss = criterion(probs, targets)
+            optimizer.zero_grad()
+            loss.backward()
+            clip_grad_norm_(net.parameters(), args.max_norm)
+            optimizer.step()
+            print('EPOCH [%d/%d]: BLOG_ID=[%d/%d] loss=%f' % (epoch, args.epochs, i, len(train_data), loss))
+
+            cnt = (epoch - 1) * len(train_data) + i
+            if cnt % args.valid_every == 0:
+                print('Begin valid... Epoch %d, Batch %d' % (epoch, i))
+                cur_loss, r1, r2, rl, rsu = evaluate(net, vocab, val_data, True)
+                if cur_loss < min_loss:
+                    min_loss = cur_loss
+                save_path = args.save_dir + 'RNN_GCN' + '_%d_%.4f_%.4f_%.4f_%.4f_%.4f' % (
+                    cnt / args.valid_every, cur_loss, r1, r2, rl, rsu)
+                net.save(save_path)
+                print('Epoch: %2d Min_Val_Loss: %f Cur_Val_Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' %
+                      (epoch, min_loss, cur_loss, r1, r2, rl, rsu))
+    return
+
+
+def evaluate(net, vocab, data_iter, train_next):
+    net.eval()
+    criterion = myLoss()
+    loss, r1, r2, rl, rsu = .0, .0, .0, .0, .0
+    for blog in tqdm(data_iter):
+        sents, targets, sim_matrix, doc_lens, sents_content, summary = vocab.make_features(blog, args)
+        sents, targets, sim_matrix = Variable(sents), Variable(targets), Variable(sim_matrix)
+        if use_cuda:
+            sents = sents.cuda()
+            targets = targets.cuda()
+            sim_matrix = sim_matrix.cuda()
+        probs = net(sents, doc_lens, sim_matrix)
+        loss += criterion(probs, targets).data.item()
+        hyp = greedy_selection(probs.tolist(), sents_content, len(summary.split()))
+        score = get_rouge_score(hyp, summary)
+        r1 += score['ROUGE-1']['r']
+        r2 += score['ROUGE-2']['r']
+        rl += score['ROUGE-L']['r']
+        rsu += score['ROUGE-SU4']['r']
+    blog_num = len(data_iter)
+    loss = loss / blog_num
+    r1 = r1 / blog_num
+    r2 = r2 / blog_num
+    rl = rl / blog_num
+    rsu = rsu / blog_num
+    if train_next:
+        net.train()
+    return loss, r1, r2, rl, rsu
+
+
+# use ROUGE-1 F1 as the similarity between two sentences
+def rouge_1_f(hyp, ref):
+    hyp = re.sub(r'[^a-z]', ' ', hyp.lower()).strip().split()
+    ref = re.sub(r'[^a-z]', ' ', ref.lower()).strip().split()
+    if len(hyp) == 0 or len(ref) == 0:
+        return .0
+    ref_flag = [0 for _ in ref]
+    hit = .0
+    for w in hyp:
+        for i in range(0, len(ref)):
+            if w == ref[i] and ref_flag[i] == 0:
+                hit += 1
+                ref_flag[i] = 1
+                break
+    p = hit / len(hyp)
+    r = hit / len(ref)
+    if math.fabs(p + r) < 1e-10:
+        f = .0
+    else:
+        f = 2 * p * r / (p + r)
+    return f
+
+
+def greedy_selection(probs, sents_content, ref_len):
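+    # Greedy selection with a redundancy filter: iterate over sentences in descending score order and
+    # skip any sentence whose ROUGE-1 F1 overlap with an already selected sentence exceeds args.greedy.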
+    hyp = ""
+    selected = []
+    sorted_idx = np.array(probs).argsort().tolist()
+    sorted_idx.reverse()
+    for i in sorted_idx:
+        if len(hyp.split()) >= ref_len:
+            hyp = " ".join(hyp.split()[:ref_len])
+            break
+        valid = True
+        for j in selected:
+            if rouge_1_f(sents_content[i], sents_content[j]) > args.greedy:
+                valid = False
+                break
+        if valid:
+            selected.append(i)
+            hyp += ' ' + sents_content[i]
+    return hyp
+
+
+def test():
+    # TODO
+    return
+
+
+if __name__ == '__main__':
+    if args.test:
+        test()
+    else:
+        train()
diff --git a/baselines/baseline_gcn/myloss.py b/baselines/baseline_gcn/myloss.py
new file mode 100644
index 0000000..3e51da9
--- /dev/null
+++ b/baselines/baseline_gcn/myloss.py
@@ -0,0 +1,18 @@
+# coding: utf-8
+
+# Custom loss function, kept consistent with the loss described in the paper
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+use_cuda = torch.cuda.is_available()
+
+
+class myLoss(nn.Module):
+    def __init__(self):
+        super(myLoss, self).__init__()
+
+    def forward(self, predict, target):
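+        # cross-entropy between the target and predicted distributions: loss = -sum_i target_i * log(predict_i);
+        # `predict` is expected to already be a probability distribution (softmax output of the model)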
+        loss = -torch.dot(target, torch.log(predict))
+        return loss.sum()
diff --git a/baselines/baseline_summarunner/Dataset.py b/baselines/baseline_summarunner/Dataset.py
new file mode 100644
index 0000000..19fcf8c
--- /dev/null
+++ b/baselines/baseline_summarunner/Dataset.py
@@ -0,0 +1,50 @@
+import csv
+import torch
+import torch.utils.data as data
+from torch.autograd import Variable
+from Vocab import Vocab
+import numpy as np
+
+
+class Dataset(data.Dataset):
+    def __init__(self, examples):
+        super(Dataset, self).__init__()
+        # data: {'sents':xxxx,'labels':'xxxx', 'summaries':[1,0]}
+        self.examples = examples
+        self.training = False
+
+    def train(self):
+        self.training = True
+        return self
+
+    def test(self):
+        self.training = False
+        return self
+
+    def shuffle(self, words):
+        np.random.shuffle(words)
+        return ' '.join(words)
+
+    def dropout(self, words, p=0.3):
+        l = len(words)
+        drop_index = np.random.choice(l, int(l * p))
+        keep_words = [words[i] for i in range(l) if i not in drop_index]
+        return ' '.join(keep_words)
+
+    def __getitem__(self, idx):
+        ex = self.examples[idx]
+        return ex
+        # words = ex['sents'].split()
+        # guess = np.random.random()
+
+        # if self.training:
+        #    if guess > 0.5:
+        #        sents = self.dropout(words,p=0.3)
+        #    else:
+        #        sents = self.shuffle(words)
+        # else:
+        #    sents = ex['sents']
+        # return {'id':ex['id'],'sents':sents,'labels':ex['labels']}
+
+    def __len__(self):
+        return len(self.examples)
diff --git a/baselines/baseline_summarunner/RNN_RNN.py b/baselines/baseline_summarunner/RNN_RNN.py
new file mode 100644
index 0000000..55dbffb
--- /dev/null
+++ b/baselines/baseline_summarunner/RNN_RNN.py
@@ -0,0 +1,135 @@
+# coding:utf-8
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from torch.autograd import Variable
+
+use_cuda = torch.cuda.is_available()
+
+
+class RNN_RNN(nn.Module):
+    def __init__(self, args, embed=None):
+        super(RNN_RNN, self).__init__()
+        self.model_name = 'RNN_RNN'
+        self.args = args
+        V = args.embed_num  # vocabulary size
+        D = args.embed_dim  # word embedding dimension
+        self.H = args.hidden_size  # hidden state dimension
+        S = args.seg_num  # for relative position: the text is split into a fixed number of segments, and a sentence's segment index is its relative position
+        P_V = args.pos_num
+        P_D = args.pos_dim
+        self.abs_pos_embed = nn.Embedding(P_V, P_D)
+        self.rel_pos_embed = nn.Embedding(S, P_D)
+        self.embed = nn.Embedding(V, D, padding_idx=0)
+        if embed is not None:
+            self.embed.weight.data.copy_(embed)
+
+        self.word_RNN = nn.GRU(
+            input_size=D,
+            hidden_size=self.H,
+            batch_first=True,
+            bidirectional=True
+        )
+
+        self.sent_RNN = nn.GRU(
+            input_size=2 * self.H,
+            hidden_size=self.H,
+            batch_first=True,
+            bidirectional=True
+        )
+
+        # when predicting sentence labels, consider sentence content, salience w.r.t. the blog, novelty (redundancy), sentence position and a bias term
+        self.sent_content = nn.Linear(2 * self.H, 1, bias=False)
+        self.sent_salience = nn.Bilinear(2 * self.H, 2 * self.H, 1, bias=False)
+        self.novelty = nn.Bilinear(2 * self.H, 2 * self.H, 1, bias=False)
+        self.abs_pos = nn.Linear(P_D, 1, bias=False)
+        self.rel_pos = nn.Linear(P_D, 1, bias=False)
+        self.sent_bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1, 0.1))
+
+        self.blog_fc = nn.Linear(2 * self.H, 2 * self.H)
+        self.sent_fc = nn.Linear(2 * self.H, 2 * self.H)
+
+    def max_pool1d(self, x, seq_lens):
+        out = []
+        for index, t in enumerate(x):
+            if seq_lens[index] == 0:
+                if use_cuda:
+                    out.append(torch.zeros(1, 2 * self.H, 1).cuda())
+                else:
+                    out.append(torch.zeros(1, 2 * self.H, 1))
+            else:
+                t = t[:seq_lens[index], :]
+                t = torch.t(t).unsqueeze(0)
+                out.append(F.max_pool1d(t, t.size(2)))
+
+        out = torch.cat(out).squeeze(2)
+        return out
+
+    def forward(self, x, doc_nums, doc_lens):
+        # x: total_sent_num * word_num
+        sent_lens = torch.sum(torch.sign(x), dim=1).data
+        x = self.embed(x)  # total_sent_num * word_num * D
+        x = self.word_RNN(x)[0]  # total_sent_num * word_num * (2*H)
+        sent_vec = self.max_pool1d(x, sent_lens)  # total_sent_num * (2*H)
+
+        # now combine all sentence vectors belonging to one blog into a single blog vector
+        blog_lens = []
+        doc_lens_start = 0
+        for doc_num in doc_nums:
+            cur_doc_lens = doc_lens[doc_lens_start: doc_lens_start + doc_num]
+            doc_lens_start += doc_num
+            blog_lens.append(np.array(cur_doc_lens).sum())
+
+        x = self.padding(sent_vec, blog_lens, self.args.pos_num)  # batch_size * pos_num * (2*H)
+        x = self.sent_RNN(x)[0]  # batch_size * pos_num * (2*H)
+        blog_vec = self.max_pool1d(x, blog_lens)  # batch_size * (2*H)
+
+        # predict sentence labels
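+        # SummaRuNNer-style scoring: P(select sentence j) = sigmoid(content + salience + abs_pos + rel_pos - novelty + bias),
+        # where novelty is computed against the running, probability-weighted sum `s` of previously scored sentence vectors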
+        probs = []
+        start = 0
+        for i in range(0, len(doc_nums)):
+            context = F.tanh(self.blog_fc(blog_vec[i])).view(1,-1)
+            end = start + blog_lens[i]
+            s = Variable(torch.zeros(1, 2 * self.H))
+            if use_cuda:
+                s = s.cuda()
+            for j in range(start, end):
+                sent = F.tanh(self.sent_fc(sent_vec[j])).view(1,-1)
+                sent_content = self.sent_content(sent)
+                sent_salience = self.sent_salience(sent, context)
+                sent_abs_index = torch.LongTensor([[j - start]])
+                sent_rel_index = torch.LongTensor([[(j - start) * 10 / blog_lens[i]]])
+                if use_cuda:
+                    sent_abs_index = sent_abs_index.cuda()
+                    sent_rel_index = sent_rel_index.cuda()
+                sent_abs_pos = self.abs_pos(self.abs_pos_embed(sent_abs_index).squeeze(0))
+                sent_rel_pos = self.rel_pos(self.rel_pos_embed(sent_rel_index).squeeze(0))
+                novelty = -1 * self.novelty(sent, s)
+                prob = F.sigmoid(sent_content + sent_salience + sent_abs_pos + sent_rel_pos + novelty + self.sent_bias)
+                s = s + torch.mm(prob, sent)
+                probs.append(prob)
+            start = end
+
+        return torch.cat(probs).squeeze()  # 1-D tensor with one probability per (non-padding) sentence
+
+    # pad a sequence up to length trunc, filling the remainder with all-zero vectors
+    def padding(self, vec, seq_lens, trunc):
+        pad_dim = vec.size(1)
+        result = []
+        start = 0
+        for seq_len in seq_lens:
+            stop = start + seq_len
+            valid = vec[start:stop]
+            start = stop
+            pad = Variable(torch.zeros(trunc - seq_len, pad_dim))
+            if use_cuda:
+                pad = pad.cuda()
+            result.append(torch.cat([valid, pad]).unsqueeze(0))
+        result = torch.cat(result, dim=0)
+        return result
+
+    def save(self, dir):
+        checkpoint = {'model': self.state_dict(), 'args': self.args}
+        torch.save(checkpoint, dir)
diff --git a/baselines/baseline_summarunner/Vocab.py b/baselines/baseline_summarunner/Vocab.py
new file mode 100644
index 0000000..d74cfc2
--- /dev/null
+++ b/baselines/baseline_summarunner/Vocab.py
@@ -0,0 +1,78 @@
+# coding=utf-8
+import torch
+import numpy as np
+import math
+
+
+class Vocab():
+    def __init__(self, embed, word2id):
+        self.embed = embed
+        self.word2id = word2id
+        self.id2word = {v: k for k, v in word2id.items()}
+        assert len(self.word2id) == len(self.id2word)
+        self.PAD_IDX = 0
+        self.UNK_IDX = 1
+        self.PAD_TOKEN = 'PAD_TOKEN'
+        self.UNK_TOKEN = 'UNK_TOKEN'
+
+    def __len__(self):
+        return len(self.word2id)
+
+    def i2w(self, idx):
+        return self.id2word[idx]
+
+    def w2i(self, w):
+        if w in self.word2id:
+            return self.word2id[w]
+        else:
+            return self.UNK_IDX
+
+    def make_features(self, batch, args):
+        # sent_trunc: truncate every sentence to sent_trunc words; shorter sentences are padded
+        # doc_trunc: truncate every document to doc_trunc sentences; no padding
+        # blog_trunc: truncate every live blog to blog_trunc documents; no padding
+        sent_trunc = args.sent_trunc
+        doc_trunc = args.doc_trunc
+        blog_trunc = args.blog_trunc
+
+        summarys = []
+        for s in batch["summary"]:
+            summarys.append(' '.join(s))
+        doc_nums = []  # number of documents in each live blog
+        for i, d in enumerate(batch["documents"]):
+            if len(d) > blog_trunc:
+                batch["documents"][i] = d[:blog_trunc]
+            doc_nums.append(len(batch["documents"][i]))
+
+        sents = []  # all sentences
+        sents_target = []  # label of every sentence
+        sents_content = []  # raw text of every sentence, same length as sents_target, kept for later ROUGE computation
+        doc_lens = []  # number of sentences in each document
+        for d in batch["documents"]:
+            for td in d:
+                cur_sent_num = len(td["text"])
+                if cur_sent_num > doc_trunc:
+                    sents.extend(td["text"][:doc_trunc])
+                    sents_target.extend(td["sent_label"][:doc_trunc])
+                    sents_content.extend(td["text"][:doc_trunc])
+                    doc_lens.append(doc_trunc)
+                else:
+                    sents.extend(td["text"])
+                    sents_target.extend(td["sent_label"])
+                    sents_content.extend(td["text"])
+                    doc_lens.append(cur_sent_num)
+        # Fix every sentence to sent_trunc words: truncate longer sentences, pad shorter ones
+        for i, sent in enumerate(sents):
+            sent = sent.split()
+            cur_sent_len = len(sent)
+            if cur_sent_len > sent_trunc:
+                sent = sent[:sent_trunc]
+            else:
+                sent += (sent_trunc - cur_sent_len) * [self.PAD_TOKEN]
+            sent = [self.w2i(_) for _ in sent]
+            sents[i] = sent
+        sents = torch.LongTensor(sents)
+        targets = sents_target
+        targets = torch.FloatTensor(targets)
+
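+        # Return-value sketch (added for clarity, not in the original code):
+        #   sents          LongTensor (total_sents, sent_trunc) of word ids
+        #   targets        FloatTensor (total_sents,) of 0/1 sentence labels
+        #   sents_content  raw sentence strings, aligned with targets
+        #   summarys       one reference summary string per live blog
+        #   doc_nums       number of documents kept for each blog
+        #   doc_lens       number of sentences kept for each document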
+        return sents, targets, sents_content, summarys, doc_nums, doc_lens
diff --git a/baselines/baseline_summarunner/main.py b/baselines/baseline_summarunner/main.py
new file mode 100644
index 0000000..f8170bf
--- /dev/null
+++ b/baselines/baseline_summarunner/main.py
@@ -0,0 +1,294 @@
+# coding:utf-8
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch.nn.utils import clip_grad_norm_
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import numpy as np
+import math
+import re
+import sys
+from Vocab import Vocab
+from Dataset import Dataset
+from RNN_RNN import RNN_RNN
+import os, json, argparse, random
+
+sys.path.append('../')
+from myrouge.rouge import get_rouge_score
+
+parser = argparse.ArgumentParser(description='SummaRuNNer')
+# model
+parser.add_argument('-save_dir', type=str, default='checkpoints2/')
+parser.add_argument('-embed_dim', type=int, default=100)
+parser.add_argument('-embed_num', type=int, default=100)
+parser.add_argument('-hidden_size', type=int, default=200)
+parser.add_argument('-pos_dim', type=int, default=50)
+parser.add_argument('-pos_num', type=int, default=800)
+parser.add_argument('-seg_num', type=int, default=10)
+# train
+parser.add_argument('-lr', type=float, default=1e-3)
+parser.add_argument('-max_norm', type=float, default=5.0)
+parser.add_argument('-batch_size', type=int, default=5)
+parser.add_argument('-epochs', type=int, default=10)
+parser.add_argument('-seed', type=int, default=1)
+parser.add_argument('-embedding', type=str, default='../word2vec/embedding.npz')
+parser.add_argument('-word2id', type=str, default='../word2vec/word2id.json')
+parser.add_argument('-train_dir', type=str, default='../data/bbc_opt/train/')
+parser.add_argument('-valid_dir', type=str, default='../data/bbc_opt/test/')
+parser.add_argument('-sent_trunc', type=int, default=20)
+parser.add_argument('-doc_trunc', type=int, default=10)
+parser.add_argument('-blog_trunc', type=int, default=80)
+parser.add_argument('-valid_every', type=int, default=100)
+# test
+parser.add_argument('-load_model', type=str, default='')
+parser.add_argument('-test_dir', type=str, default='../data/bbc_opt/test/')
+parser.add_argument('-ref', type=str, default='outputs/ref/')
+parser.add_argument('-hyp', type=str, default='outputs/hyp/')
+parser.add_argument('-sum_len', type=int, default=1)  # summary length as a multiple of the reference summary length
+parser.add_argument('-mmr', type=float, default=0.75)
+# other
+parser.add_argument('-test', action='store_true')
+parser.add_argument('-use_cuda', type=bool, default=False)
+
+use_cuda = torch.cuda.is_available()
+args = parser.parse_args()
+if use_cuda:
+    torch.cuda.manual_seed(args.seed)
+torch.manual_seed(args.seed)
+random.seed(args.seed)
+np.random.seed(args.seed)
+args.use_cuda = use_cuda
+
+
+def my_collate(batch):
+    return {key: [d[key] for d in batch] for key in batch[0]}
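+
+
+# Illustrative example (not in the original source): my_collate turns the list of
+# per-blog dicts produced by the Dataset into a dict of lists, e.g.
+#   [{'summary': s1, 'documents': d1}, {'summary': s2, 'documents': d2}]
+#     -> {'summary': [s1, s2], 'documents': [d1, d2]}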
+
+
+# Use the ROUGE-1 F-score as the similarity between two sentences
+def rouge_1_f(hyp, ref):
+    hyp = re.sub(r'[^a-z]', ' ', hyp.lower()).strip().split()
+    ref = re.sub(r'[^a-z]', ' ', ref.lower()).strip().split()
+    if len(hyp) == 0 or len(ref) == 0:
+        return .0
+    ref_flag = [0 for _ in ref]
+    hit = .0
+    for w in hyp:
+        for i in range(0, len(ref)):
+            if w == ref[i] and ref_flag[i] == 0:
+                hit += 1
+                ref_flag[i] = 1
+                break
+    p = hit / len(hyp)
+    r = hit / len(ref)
+    if math.fabs(p + r) < 1e-10:
+        f = .0
+    else:
+        f = 2 * p * r / (p + r)
+    return f
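+
+
+# Worked example (illustrative, not part of the original code):
+#   rouge_1_f('the cat sat', 'the cat ran')
+# matches 'the' and 'cat', giving hit = 2, p = r = 2/3 and an F-score of about 0.667.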
+
+
+# Re-rank sentences with an MMR strategy after scoring, to reduce redundancy
+def re_rank(sents, scores, ref_len):
+    sents_num = len(sents)
+    sim = [sents_num * [.0] for _ in range(0, sents_num)]
+    for i in range(0, sents_num):
+        for j in range(i, sents_num):
+            if j == i:
+                sim[i][j] = 1.0
+            else:
+                sim[i][j] = sim[j][i] = rouge_1_f(sents[i], sents[j])
+    chosen = []
+    candidates = list(range(0, sents_num))  # materialise as a list so .remove() also works under Python 3
+    summary = ''
+    cur_len = 0
+    while len(candidates) != 0:
+        max_point = -1e20
+        next = -1
+        for i in candidates:
+            max_sim = .0
+            for j in chosen:
+                max_sim = max(max_sim, sim[i][j])
+            cur_point = args.mmr * scores[i] - (1.0 - args.mmr) * max_sim
+            if cur_point > max_point:
+                max_point = cur_point
+                next = i
+        chosen.append(next)
+        candidates.remove(next)
+        tmp = sents[next]
+        tmp = tmp.split()
+        tmp_len = len(tmp)
+        if cur_len + tmp_len > ref_len:
+            summary += ' '.join(tmp[:ref_len - cur_len])
+            break
+        else:
+            summary += ' '.join(tmp) + ' '
+            cur_len += tmp_len
+    return summary
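+
+
+# MMR note (added for clarity, not in the original code): each remaining sentence i is
+# scored as
+#   mmr(i) = args.mmr * scores[i] - (1 - args.mmr) * max_{j in chosen} sim(i, j)
+# with sim given by rouge_1_f above, so the default args.mmr = 0.75 weights the model
+# score three times as heavily as the redundancy penalty; selection stops once the
+# summary reaches ref_len words.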
+
+
+# Compute the loss and ROUGE scores on the validation or test set
+def evaluate(net, vocab, data_iter, train_next):  # train_next indicates whether training continues afterwards
+    net.eval()
+    criterion = nn.MSELoss()
+    loss, r1, r2, rl, rsu = .0, .0, .0, .0, .0  # ROUGE-1 / ROUGE-2 / ROUGE-L / ROUGE-SU4 recall, with summary length limited to the reference length
+    batch_num = .0
+    blog_num = .0
+    for i, batch in enumerate(tqdm(data_iter)):
+        # Compute the loss
+        features, targets, sents_content, summaries, doc_nums, doc_lens = vocab.make_features(batch, args)
+        features, targets = Variable(features), Variable(targets.float())
+        if use_cuda:
+            features = features.cuda()
+            targets = targets.cuda()
+        probs = net(features, doc_nums, doc_lens)
+        batch_num += 1
+        loss += criterion(probs, targets).data.item()
+        probs_start = 0  # start index of probs for the current blog
+        doc_lens_start = 0  # start index of doc_lens for the current blog
+        sents_start = 0  # start index of sents_content for the current blog
+        for i in range(0, len(doc_nums)):  # the last batch may contain fewer than batch_size blogs
+            sents_num = 0
+            for j in range(doc_lens_start, doc_lens_start + doc_nums[i]):
+                sents_num += doc_lens[j]
+            cur_probs = probs[probs_start:probs_start + sents_num]
+            cur_sents = sents_content[sents_start: sents_start + sents_num]
+            probs_start = probs_start + sents_num
+            doc_lens_start = doc_lens_start + doc_nums[i]
+            sents_start = sents_start + sents_num
+            if use_cuda:
+                cur_probs = cur_probs.cpu()
+            cur_probs = list(cur_probs.detach().numpy())
+            sorted_index = list(np.argsort(cur_probs))  # indices of cur_probs in ascending order of score
+            sorted_index.reverse()
+            ref = summaries[i].strip()
+            ref_len = len(ref.split())
+            hyp = re_rank(cur_sents, cur_probs, ref_len)
+            score = get_rouge_score(hyp, ref)
+            r1 += score['ROUGE-1']['r']
+            r2 += score['ROUGE-2']['r']
+            rl += score['ROUGE-L']['r']
+            rsu += score['ROUGE-SU4']['r']
+            blog_num += 1
+
+    loss = loss / batch_num
+    r1 = r1 / blog_num
+    r2 = r2 / blog_num
+    rl = rl / blog_num
+    rsu = rsu / blog_num
+    if train_next:  # switch the network back to training mode if training continues
+        net.train()
+    return loss, r1, r2, rl, rsu
+
+
+def train():
+    print('Loading vocab, train and val dataset...')
+    embed = torch.Tensor(np.load(args.embedding)['embedding'])
+    args.embed_num = embed.size(0)
+    args.embed_dim = embed.size(1)
+    with open(args.word2id) as f:
+        word2id = json.load(f)
+    vocab = Vocab(embed, word2id)
+
+    train_data = []
+    for fn in os.listdir(args.train_dir):
+        f = open(args.train_dir + fn, 'r')
+        train_data.append(json.load(f))
+        f.close()
+    train_dataset = Dataset(train_data)
+
+    val_data = []
+    for fn in os.listdir(args.valid_dir):
+        f = open(args.valid_dir + fn, 'r')
+        val_data.append(json.load(f))
+        f.close()
+    val_dataset = Dataset(val_data)
+
+    net = RNN_RNN(args, embed)
+    criterion = nn.BCELoss()
+    if use_cuda:
+        net.cuda()
+
+    train_iter = DataLoader(dataset=train_dataset,
+                            batch_size=args.batch_size,
+                            shuffle=False,
+                            collate_fn=my_collate)
+
+    val_iter = DataLoader(dataset=val_dataset,
+                          batch_size=args.batch_size,
+                          shuffle=False,
+                          collate_fn=my_collate)
+    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr)
+    net.train()
+    min_loss = float('inf')
+
+    for epoch in range(1, args.epochs + 1):
+        for i, batch in enumerate(train_iter):
+            features, targets, _1, _2, doc_nums, doc_lens = vocab.make_features(batch, args)
+            features, targets = Variable(features), Variable(targets.float())
+            if use_cuda:
+                features = features.cuda()
+                targets = targets.cuda()
+            probs = net(features, doc_nums, doc_lens)
+            loss = criterion(probs, targets)
+            optimizer.zero_grad()
+            loss.backward()
+            clip_grad_norm_(net.parameters(), args.max_norm)
+            optimizer.step()
+
+            print('EPOCH [%d/%d]: BATCH_ID=[%d/%d] loss=%f' % (
+                epoch, args.epochs, i, len(train_iter), loss))
+
+            cnt = (epoch - 1) * len(train_iter) + i
+            if cnt % args.valid_every == 0:
+                print('Begin valid... Epoch %d, Batch %d' % (epoch, i))
+                cur_loss, r1, r2, rl, rsu = evaluate(net, vocab, val_iter, True)
+                if cur_loss < min_loss:
+                    min_loss = cur_loss
+                save_path = args.save_dir + 'RNN_RNN' + '_%d_%.4f_%.4f_%.4f_%.4f_%.4f' % (
+                    cnt / args.valid_every, cur_loss, r1, r2, rl, rsu)
+                net.save(save_path)
+                print('Epoch: %2d Min_Val_Loss: %f Cur_Val_Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' %
+                      (epoch, min_loss, cur_loss, r1, r2, rl, rsu))
+
+
+def test():
+    print('Loading vocab and test dataset...')
+    embed = torch.Tensor(np.load(args.embedding)['embedding'])
+    args.embed_num = embed.size(0)
+    args.embed_dim = embed.size(1)
+    with open(args.word2id) as f:
+        word2id = json.load(f)
+    vocab = Vocab(embed, word2id)
+
+    test_data = []
+    for fn in os.listdir(args.test_dir):
+        f = open(args.test_dir + fn, 'r')
+        test_data.append(json.load(f))
+        f.close()
+    test_dataset = Dataset(test_data)
+    test_iter = DataLoader(dataset=test_dataset,
+                           batch_size=args.batch_size,
+                           shuffle=False,
+                           collate_fn=my_collate)
+    print('Loading model...')
+    if use_cuda:
+        checkpoint = torch.load(args.save_dir + args.load_model)
+    else:
+        checkpoint = torch.load(args.save_dir + args.load_model, map_location=lambda storage, loc: storage)
+    net = RNN_RNN(checkpoint['args'])
+    net.load_state_dict(checkpoint['model'])
+    if use_cuda:
+        net.cuda()
+    net.eval()
+
+    print('Begin test...')
+    test_loss, r1, r2, rl, rsu = evaluate(net, vocab, test_iter, False)
+    print('Test_Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' % (test_loss, r1, r2, rl, rsu))
+
+
+if __name__ == '__main__':
+    if args.test:
+        test()
+    else:
+        train()
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..706cd68
--- /dev/null
+++ b/main.py
@@ -0,0 +1,303 @@
+# coding:utf-8
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch.nn.utils import clip_grad_norm_
+from torch.utils.data import DataLoader
+from myrouge.rouge import get_rouge_score
+from tqdm import tqdm
+import numpy as np
+import math
+import re
+import utils
+import model
+import os, json, argparse, random
+
+parser = argparse.ArgumentParser(description='LiveBlogSum')
+# model paras
+parser.add_argument('-model', type=str, default='Module6')
+parser.add_argument('-embed_frozen', type=bool, default=False)
+parser.add_argument('-embed_dim', type=int, default=100)
+parser.add_argument('-embed_num', type=int, default=100)
+parser.add_argument('-hidden_size', type=int, default=200)
+parser.add_argument('-pos_dim', type=int, default=10)
+parser.add_argument('-sum_len', type=int, default=1)
+parser.add_argument('-mmr', type=float, default=0.75)
+# train paras
+parser.add_argument('-save_dir', type=str, default='checkpoints3/')
+parser.add_argument('-lr', type=float, default=1e-3)
+parser.add_argument('-max_norm', type=float, default=5.0)
+parser.add_argument('-batch_size', type=int, default=5)
+parser.add_argument('-epochs', type=int, default=8)
+parser.add_argument('-seed', type=int, default=1)
+parser.add_argument('-sent_trunc', type=int, default=20)
+parser.add_argument('-doc_trunc', type=int, default=10)
+parser.add_argument('-blog_trunc', type=int, default=80)
+parser.add_argument('-srl_trunc', type=int, default=200)  # number of SRL tuples kept per blog
+parser.add_argument('-topic_trunc', type=int, default=10)  # number of topics kept per blog
+parser.add_argument('-topic_word_trunc', type=int, default=5)  # number of words kept per topic
+parser.add_argument('-valid_every', type=int, default=100)
+parser.add_argument('-load_model', type=str, default='')
+parser.add_argument('-test', action='store_true')
+parser.add_argument('-use_cuda', type=bool, default=False)
+# data
+parser.add_argument('-embedding', type=str, default='word2vec/embedding.npz')
+parser.add_argument('-word2id', type=str, default='word2vec/word2id.json')
+parser.add_argument('-train_dir', type=str, default='data/bbc_srl_2/train/')
+parser.add_argument('-valid_dir', type=str, default='data/bbc_srl_2/test/')
+parser.add_argument('-test_dir', type=str, default='data/bbc_srl_2/test/')
+parser.add_argument('-ref', type=str, default='outputs/ref/')
+parser.add_argument('-hyp', type=str, default='outputs/hyp/')
+
+use_cuda = torch.cuda.is_available()
+args = parser.parse_args()
+if use_cuda:
+    torch.cuda.manual_seed(args.seed)
+torch.manual_seed(args.seed)
+random.seed(args.seed)
+np.random.seed(args.seed)
+args.use_cuda = use_cuda
+
+
+def my_collate(batch):
+    return {key: [d[key] for d in batch] for key in batch[0]}
+
+
+# Use the ROUGE-1 F-score as the similarity between two sentences
+def rouge_1_f(hyp, ref):
+    hyp = re.sub(r'[^a-z]', ' ', hyp.lower()).strip().split()
+    ref = re.sub(r'[^a-z]', ' ', ref.lower()).strip().split()
+    if len(hyp) == 0 or len(ref) == 0:
+        return .0
+    ref_flag = [0 for _ in ref]
+    hit = .0
+    for w in hyp:
+        for i in range(0, len(ref)):
+            if w == ref[i] and ref_flag[i] == 0:
+                hit += 1
+                ref_flag[i] = 1
+                break
+    p = hit / len(hyp)
+    r = hit / len(ref)
+    if math.fabs(p + r) < 1e-10:
+        f = .0
+    else:
+        f = 2 * p * r / (p + r)
+    return f
+
+
+# Re-rank sentences with an MMR strategy after scoring, to reduce redundancy
+def re_rank(sents, scores, ref_len):
+    sents_num = len(sents)
+    sim = [sents_num * [.0] for _ in range(0, sents_num)]
+    for i in range(0, sents_num):
+        for j in range(i, sents_num):
+            if j == i:
+                sim[i][j] = 1.0
+            else:
+                sim[i][j] = sim[j][i] = rouge_1_f(sents[i], sents[j])
+    chosen = []
+    candidates = list(range(0, sents_num))  # materialise as a list so .remove() also works under Python 3
+    summary = ''
+    cur_len = 0
+    while len(candidates) != 0:
+        max_point = -1e20
+        next = -1
+        for i in candidates:
+            max_sim = .0
+            for j in chosen:
+                max_sim = max(max_sim, sim[i][j])
+            cur_point = args.mmr * scores[i] - (1.0 - args.mmr) * max_sim
+            if cur_point > max_point:
+                max_point = cur_point
+                next = i
+        chosen.append(next)
+        candidates.remove(next)
+        tmp = sents[next]
+        tmp = tmp.split()
+        tmp_len = len(tmp)
+        if cur_len + tmp_len > ref_len:
+            summary += ' '.join(tmp[:ref_len - cur_len])
+            break
+        else:
+            summary += ' '.join(tmp) + ' '
+            cur_len += tmp_len
+    return summary
+
+
+# Compute the loss and ROUGE scores on the validation or test set
+def evaluate(net, vocab, data_iter, train_next):  # train_next indicates whether training continues afterwards
+    net.eval()
+    criterion = nn.MSELoss()
+    loss, r1, r2, rl, rsu = .0, .0, .0, .0, .0  # ROUGE-1 / ROUGE-2 / ROUGE-L / ROUGE-SU4 recall, with summary length limited to the reference length
+    batch_num = .0
+    blog_num = .0
+    for i, batch in enumerate(tqdm(data_iter)):
+        # Compute the loss
+        features, targets, events, event_weights, sents_content, summaries, doc_nums, doc_lens = vocab.make_features(
+            batch, args)
+        features, targets, events, event_weights = Variable(features), Variable(targets.float()), Variable(
+            events), Variable(event_weights.float())
+        if use_cuda:
+            features = features.cuda()
+            targets = targets.cuda()
+            events = events.cuda()
+            event_weights = event_weights.cuda()
+        probs = net(features, doc_nums, doc_lens, events, event_weights)
+        batch_num += 1
+        doc_nums_sum = np.array(doc_nums).sum()
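+        # Layout note (added for clarity, not in the original code): the model outputs one
+        # score per document followed by one score per sentence, so the first doc_nums_sum
+        # entries are document-level predictions and are excluded from both the loss and
+        # the summary construction below.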
+        loss += criterion(probs[doc_nums_sum:], targets[doc_nums_sum:]).data.item()
+        probs = probs[doc_nums_sum:]  # drop the document-level predictions at the front of probs
+        probs_start = 0  # start index of probs for the current blog
+        doc_lens_start = 0  # start index of doc_lens for the current blog
+        sents_start = 0  # start index of sents_content for the current blog
+        for i in range(0, len(doc_nums)):  # the last batch may contain fewer than batch_size blogs
+            sents_num = 0
+            for j in range(doc_lens_start, doc_lens_start + doc_nums[i]):
+                sents_num += doc_lens[j]
+            cur_probs = probs[probs_start:probs_start + sents_num]
+            cur_sents = sents_content[sents_start: sents_start + sents_num]
+            probs_start = probs_start + sents_num
+            doc_lens_start = doc_lens_start + doc_nums[i]
+            sents_start = sents_start + sents_num
+            if use_cuda:
+                cur_probs = cur_probs.cpu()
+            cur_probs = list(cur_probs.detach().numpy())
+            sorted_index = list(np.argsort(cur_probs))  # indices of cur_probs in ascending order of score
+            sorted_index.reverse()
+            ref = summaries[i].strip()
+            ref_len = len(ref.split())
+            hyp = re_rank(cur_sents, cur_probs, ref_len)
+            score = get_rouge_score(hyp, ref)
+            r1 += score['ROUGE-1']['r']
+            r2 += score['ROUGE-2']['r']
+            rl += score['ROUGE-L']['r']
+            rsu += score['ROUGE-SU4']['r']
+            blog_num += 1
+
+    loss = loss / batch_num
+    r1 = r1 / blog_num
+    r2 = r2 / blog_num
+    rl = rl / blog_num
+    rsu = rsu / blog_num
+    if train_next:  # switch the network back to training mode if training continues
+        net.train()
+    return loss, r1, r2, rl, rsu
+
+
+def train():
+    print('Loading vocab, train and val dataset...')
+    embed = torch.Tensor(np.load(args.embedding)['embedding'])
+    args.embed_num = embed.size(0)
+    args.embed_dim = embed.size(1)
+    with open(args.word2id) as f:
+        word2id = json.load(f)
+    vocab = utils.Vocab(embed, word2id)
+
+    train_data = []
+    for fn in os.listdir(args.train_dir):
+        f = open(args.train_dir + fn, 'r')
+        train_data.append(json.load(f))
+        f.close()
+    train_dataset = utils.Dataset(train_data)
+
+    val_data = []
+    for fn in os.listdir(args.valid_dir):
+        f = open(args.valid_dir + fn, 'r')
+        val_data.append(json.load(f))
+        f.close()
+    val_dataset = utils.Dataset(val_data)
+
+    net = getattr(model, args.model)(args, embed)
+    my_loss = getattr(model, 'myLoss')()
+    if use_cuda:
+        net.cuda()
+        my_loss.cuda()
+
+    train_iter = DataLoader(dataset=train_dataset,
+                            batch_size=args.batch_size,
+                            shuffle=False,
+                            collate_fn=my_collate)
+
+    val_iter = DataLoader(dataset=val_dataset,
+                          batch_size=args.batch_size,
+                          shuffle=False,
+                          collate_fn=my_collate)
+    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr)
+    net.train()
+    min_loss = float('inf')
+
+    for epoch in range(1, args.epochs + 1):
+        for i, batch in enumerate(train_iter):
+            features, targets, events, event_weights, _1, _2, doc_nums, doc_lens = vocab.make_features(batch, args)
+            features, targets, events, event_weights = Variable(features), Variable(targets.float()), Variable(
+                events), Variable(event_weights.float())
+            if use_cuda:
+                features = features.cuda()
+                targets = targets.cuda()
+                events = events.cuda()
+                event_weights = event_weights.cuda()
+            probs = net(features, doc_nums, doc_lens, events, event_weights)
+            doc_num = np.array(doc_nums).sum()
+            loss = my_loss(probs, targets, doc_num)
+            optimizer.zero_grad()
+            loss.backward()
+            clip_grad_norm_(net.parameters(), args.max_norm)
+            optimizer.step()
+
+            print('EPOCH [%d/%d]: BATCH_ID=[%d/%d] loss=%f' % (
+                epoch, args.epochs, i, len(train_iter), loss))
+
+            cnt = (epoch - 1) * len(train_iter) + i
+            if cnt % args.valid_every == 0:
+                print('Begin valid... Epoch %d, Batch %d' % (epoch, i))
+                cur_loss, r1, r2, rl, rsu = evaluate(net, vocab, val_iter, True)
+                if cur_loss < min_loss:
+                    min_loss = cur_loss
+                save_path = args.save_dir + args.model + '_%d_%.4f_%.4f_%.4f_%.4f_%.4f' % (
+                    cnt / args.valid_every, cur_loss, r1, r2, rl, rsu)
+                net.save(save_path)
+                print('Epoch: %2d Min_Val_Loss: %f Cur_Val_Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' %
+                      (epoch, min_loss, cur_loss, r1, r2, rl, rsu))
+
+
+def test():
+    print('Loading vocab and test dataset...')
+    embed = torch.Tensor(np.load(args.embedding)['embedding'])
+    args.embed_num = embed.size(0)
+    args.embed_dim = embed.size(1)
+    with open(args.word2id) as f:
+        word2id = json.load(f)
+    vocab = utils.Vocab(embed, word2id)
+
+    test_data = []
+    for fn in os.listdir(args.test_dir):
+        f = open(args.test_dir + fn, 'r')
+        test_data.append(json.load(f))
+        f.close()
+    test_dataset = utils.Dataset(test_data)
+    test_iter = DataLoader(dataset=test_dataset,
+                           batch_size=args.batch_size,
+                           shuffle=False,
+                           collate_fn=my_collate)
+    print('Loading model...')
+    if use_cuda:
+        checkpoint = torch.load(args.save_dir + args.load_model)
+    else:
+        checkpoint = torch.load(args.save_dir + args.load_model, map_location=lambda storage, loc: storage)
+    net = getattr(model, checkpoint['args'].model)(checkpoint['args'])
+    net.load_state_dict(checkpoint['model'])
+    if use_cuda:
+        net.cuda()
+    net.eval()
+
+    print('Begin test...')
+    test_loss, r1, r2, rl, rsu = evaluate(net, vocab, test_iter, False)
+    print('Test_Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' % (test_loss, r1, r2, rl, rsu))
+
+
+if __name__ == '__main__':
+    if args.test:
+        test()
+    else:
+        train()
diff --git a/myrouge/Rouge155.py b/myrouge/Rouge155.py
new file mode 100755
index 0000000..921ff42
--- /dev/null
+++ b/myrouge/Rouge155.py
@@ -0,0 +1,595 @@
+from __future__ import print_function, unicode_literals, division
+
+import os
+import re
+import codecs
+import platform
+
+from subprocess import check_output
+from tempfile import mkdtemp
+from functools import partial
+
+try:
+    from configparser import ConfigParser
+except ImportError:
+    from ConfigParser import ConfigParser
+
+from utils.file_utils import DirectoryProcessor
+from utils.file_utils import verify_dir
+
+
+class Rouge155(object):
+    """
+    This is a wrapper for the ROUGE 1.5.5 summary evaluation package.
+    This class is designed to simplify the evaluation process by:
+
+        1) Converting summaries into a format ROUGE understands.
+        2) Generating the ROUGE configuration file automatically based
+            on filename patterns.
+
+    This class can be used within Python like this:
+
+    rouge = Rouge155()
+    rouge.system_dir = 'test/systems'
+    rouge.model_dir = 'test/models'
+
+    # The system filename pattern should contain one group that
+    # matches the document ID.
+    rouge.system_filename_pattern = 'SL.P.10.R.11.SL062003-(\d+).html'
+
+    # The model filename pattern has '#ID#' as a placeholder for the
+    # document ID. If there are multiple model summaries, pyrouge
+    # will use the provided regex to automatically match them with
+    # the corresponding system summary. Here, [A-Z] matches
+    # multiple model summaries for a given #ID#.
+    rouge.model_filename_pattern = 'SL.P.10.R.[A-Z].SL062003-#ID#.html'
+
+    rouge_output = rouge.evaluate()
+    print(rouge_output)
+    output_dict = rouge.output_to_dict(rouge_output)
+    print(output_dict)
+    ->    {'rouge_1_f_score': 0.95652,
+         'rouge_1_f_score_cb': 0.95652,
+         'rouge_1_f_score_ce': 0.95652,
+         'rouge_1_precision': 0.95652,
+        [...]
+
+
+    To evaluate multiple systems:
+
+        rouge = Rouge155()
+        rouge.system_dir = '/PATH/TO/systems'
+        rouge.model_dir = 'PATH/TO/models'
+        for system_id in ['id1', 'id2', 'id3']:
+            rouge.system_filename_pattern = \
+                'SL.P/.10.R.{}.SL062003-(\d+).html'.format(system_id)
+            rouge.model_filename_pattern = \
+                'SL.P.10.R.[A-Z].SL062003-#ID#.html'
+            rouge_output = rouge.evaluate(system_id)
+            print(rouge_output)
+
+    """
+
+    def __init__(self, rouge_dir=None, rouge_args=None, log_level=None):
+        """
+        Create a Rouge155 object.
+
+            rouge_dir:  Directory containing Rouge-1.5.5.pl
+            rouge_args: Arguments to pass through to ROUGE if you
+                        don't want to use the default pyrouge
+                        arguments.
+
+        """
+        self.__set_dir_properties()
+        self._config_file = None
+        self._settings_file = self.__get_config_path()
+        self.__set_rouge_dir(rouge_dir)
+        self.args = self.__clean_rouge_args(rouge_args)
+        self._system_filename_pattern = None
+        self._model_filename_pattern = None
+
+    def save_home_dir(self):
+        config = ConfigParser()
+        section = 'pyrouge settings'
+        config.add_section(section)
+        config.set(section, 'home_dir', self._home_dir)
+        with open(self._settings_file, 'w') as f:
+            config.write(f)
+
+    @property
+    def settings_file(self):
+        """
+        Path of the settings file, which stores the ROUGE home dir.
+
+        """
+        return self._settings_file
+
+    @property
+    def bin_path(self):
+        """
+        The full path of the ROUGE binary (although it's technically
+        a script), i.e. rouge_home_dir/ROUGE-1.5.5.pl
+
+        """
+        if self._bin_path is None:
+            raise Exception(
+                "ROUGE path not set. Please set the ROUGE home directory "
+                "and ensure that ROUGE-1.5.5.pl exists in it.")
+        return self._bin_path
+
+    @property
+    def system_filename_pattern(self):
+        """
+        The regular expression pattern for matching system summary
+        filenames. The regex string.
+
+        E.g. "SL.P.10.R.11.SL062003-(\d+).html" will match the system
+        filenames in the SPL2003/system folder of the ROUGE SPL example
+        in the "sample-test" folder.
+
+        Currently, there is no support for multiple systems.
+
+        """
+        return self._system_filename_pattern
+
+    @system_filename_pattern.setter
+    def system_filename_pattern(self, pattern):
+        self._system_filename_pattern = pattern
+
+    @property
+    def model_filename_pattern(self):
+        """
+        The regular expression pattern for matching model summary
+        filenames. The pattern needs to contain the string "#ID#",
+        which is a placeholder for the document ID.
+
+        E.g. "SL.P.10.R.[A-Z].SL062003-#ID#.html" will match the model
+        filenames in the SPL2003/system folder of the ROUGE SPL
+        example in the "sample-test" folder.
+
+        "#ID#" is a placeholder for the document ID which has been
+        matched by the "(\d+)" part of the system filename pattern.
+        The different model summaries for a given document ID are
+        matched by the "[A-Z]" part.
+
+        """
+        return self._model_filename_pattern
+
+    @model_filename_pattern.setter
+    def model_filename_pattern(self, pattern):
+        self._model_filename_pattern = pattern
+
+    @property
+    def config_file(self):
+        return self._config_file
+
+    @config_file.setter
+    def config_file(self, path):
+        config_dir, _ = os.path.split(path)
+        verify_dir(config_dir, "configuration file")
+        self._config_file = path
+
+    def split_sentences(self):
+        """
+        ROUGE requires texts split into sentences. In case the texts
+        are not already split, this method can be used.
+
+        """
+        from pyrouge.utils.sentence_splitter import PunktSentenceSplitter
+        ss = PunktSentenceSplitter()
+        sent_split_to_string = lambda s: "\n".join(ss.split(s))
+        process_func = partial(
+            DirectoryProcessor.process, function=sent_split_to_string)
+        self.__process_summaries(process_func)
+
+    @staticmethod
+    def convert_summaries_to_rouge_format(input_dir, output_dir):
+        """
+        Convert all files in input_dir into a format ROUGE understands
+        and saves the files to output_dir. The input files are assumed
+        to be plain text with one sentence per line.
+
+            input_dir:  Path of directory containing the input files.
+            output_dir: Path of directory in which the converted files
+                        will be saved.
+
+        """
+        DirectoryProcessor.process(
+            input_dir, output_dir, Rouge155.convert_text_to_rouge_format)
+
+    @staticmethod
+    def convert_text_to_rouge_format(text, title="dummy title"):
+        """
+        Convert a text to a format ROUGE understands. The text is
+        assumed to contain one sentence per line.
+
+            text:   The text to convert, containing one sentence per line.
+            title:  Optional title for the text. The title will appear
+                    in the converted file, but doesn't seem to have
+                    any other relevance.
+
+        Returns: The converted text as string.
+
+        """
+        sentences = text.split("\n")
+        sent_elems = [
+            "<a name=\"{i}\">[{i}]</a> <a href=\"#{i}\" id={i}>"
+            "{text}</a>".format(i=i, text=sent)
+            for i, sent in enumerate(sentences, start=1)]
+        html = """<html>
+<head>
+<title>{title}</title>
+</head>
+<body bgcolor="white">
+{elems}
+</body>
+</html>""".format(title=title, elems="\n".join(sent_elems))
+
+        return html
+
+    @staticmethod
+    def write_config_static(system_dir, system_filename_pattern,
+                            model_dir, model_filename_pattern,
+                            config_file_path, system_id=None):
+        """
+        Write the ROUGE configuration file, which is basically a list
+        of system summary files and their corresponding model summary
+        files.
+
+        pyrouge uses regular expressions to automatically find the
+        matching model summary files for a given system summary file
+        (cf. docstrings for system_filename_pattern and
+        model_filename_pattern).
+
+            system_dir:                 Path of directory containing
+                                        system summaries.
+            system_filename_pattern:    Regex string for matching
+                                        system summary filenames.
+            model_dir:                  Path of directory containing
+                                        model summaries.
+            model_filename_pattern:     Regex string for matching model
+                                        summary filenames.
+            config_file_path:           Path of the configuration file.
+            system_id:                  Optional system ID string which
+                                        will appear in the ROUGE output.
+
+        """
+        system_filenames = [f for f in os.listdir(system_dir)]
+        system_models_tuples = []
+
+        system_filename_pattern = re.compile(system_filename_pattern)
+        for system_filename in sorted(system_filenames):
+            match = system_filename_pattern.match(system_filename)
+            if match:
+                id = match.groups(0)[0]
+                model_filenames = Rouge155.__get_model_filenames_for_id(
+                    id, model_dir, model_filename_pattern)
+                system_models_tuples.append(
+                    (system_filename, sorted(model_filenames)))
+        if not system_models_tuples:
+            raise Exception(
+                "Did not find any files matching the pattern {} "
+                "in the system summaries directory {}.".format(
+                    system_filename_pattern.pattern, system_dir))
+
+        with codecs.open(config_file_path, 'w', encoding='utf-8') as f:
+            f.write('<ROUGE-EVAL version="1.55">')
+            for task_id, (system_filename, model_filenames) in enumerate(
+                    system_models_tuples, start=1):
+
+                eval_string = Rouge155.__get_eval_string(
+                    task_id, system_id,
+                    system_dir, system_filename,
+                    model_dir, model_filenames)
+                f.write(eval_string)
+            f.write("")
+
+    def write_config(self, config_file_path=None, system_id=None):
+        """
+        Write the ROUGE configuration file, which is basically a list
+        of system summary files and their matching model summary files.
+
+        This is a non-static version of write_config_file_static().
+
+            config_file_path:   Path of the configuration file.
+            system_id:          Optional system ID string which will
+                                appear in the ROUGE output.
+
+        """
+        if not system_id:
+            system_id = 1
+        if (not config_file_path) or (not self._config_dir):
+            self._config_dir = mkdtemp()
+            config_filename = "rouge_conf.xml"
+        else:
+            config_dir, config_filename = os.path.split(config_file_path)
+            verify_dir(config_dir, "configuration file")
+        self._config_file = os.path.join(self._config_dir, config_filename)
+        Rouge155.write_config_static(
+            self._system_dir, self._system_filename_pattern,
+            self._model_dir, self._model_filename_pattern,
+            self._config_file, system_id)
+
+    def evaluate(self, system_id=1, rouge_args=None):
+        """
+        Run ROUGE to evaluate the system summaries in system_dir against
+        the model summaries in model_dir. The summaries are assumed to
+        be in the one-sentence-per-line HTML format ROUGE understands.
+
+            system_id:  Optional system ID which will be printed in
+                        ROUGE's output.
+
+        Returns: Rouge output as string.
+
+        """
+        self.write_config(system_id=system_id)
+        options = self.__get_options(rouge_args)
+        command = [self._bin_path] + options
+        env = None
+        if hasattr(self, "_home_dir") and self._home_dir:
+            env = {'ROUGE_EVAL_HOME': self._home_dir}
+        rouge_output = check_output(command, env=env).decode("UTF-8")
+        return rouge_output
+
+    def convert_and_evaluate(self, system_id=1,
+                             split_sentences=False, rouge_args=None):
+        """
+        Convert plain text summaries to ROUGE format and run ROUGE to
+        evaluate the system summaries in system_dir against the model
+        summaries in model_dir. Optionally split texts into sentences
+        in case they aren't already.
+
+        This is just a convenience method combining
+        convert_summaries_to_rouge_format() and evaluate().
+
+            split_sentences:    Optional argument specifying if
+                                sentences should be split.
+            system_id:          Optional system ID which will be printed
+                                in ROUGE's output.
+
+        Returns: ROUGE output as string.
+
+        """
+        if split_sentences:
+            self.split_sentences()
+        self.__write_summaries()
+        rouge_output = self.evaluate(system_id, rouge_args)
+        return rouge_output
+
+    def output_to_dict(self, output):
+        """
+        Convert the ROUGE output into python dictionary for further
+        processing.
+
+        """
+        #0 ROUGE-1 Average_R: 0.02632 (95%-conf.int. 0.02632 - 0.02632)
+        pattern = re.compile(
+            r"(\d+) (ROUGE-\S+) (Average_\w): (\d.\d+) "
+            r"\(95%-conf.int. (\d.\d+) - (\d.\d+)\)")
+        results = {}
+        for line in output.split("\n"):
+            match = pattern.match(line)
+            if match:
+                sys_id, rouge_type, measure, result, conf_begin, conf_end = \
+                    match.groups()
+                measure = {
+                    'Average_R': 'recall',
+                    'Average_P': 'precision',
+                    'Average_F': 'f_score'
+                    }[measure]
+                rouge_type = rouge_type.lower().replace("-", '_')
+                key = "{}_{}".format(rouge_type, measure)
+                results[key] = float(result)
+                results["{}_cb".format(key)] = float(conf_begin)
+                results["{}_ce".format(key)] = float(conf_end)
+        return results
+
+    ###################################################################
+    # Private methods
+
+    def __set_rouge_dir(self, home_dir=None):
+        """
+        Verify presence of ROUGE-1.5.5.pl and data folder, and set
+        those paths.
+
+        """
+        if not home_dir:
+            self._home_dir = self.__get_rouge_home_dir_from_settings()
+        else:
+            self._home_dir = home_dir
+            self.save_home_dir()
+        self._bin_path = os.path.join(self._home_dir, 'ROUGE-1.5.5.pl')
+        self.data_dir = os.path.join(self._home_dir, 'data')
+        if not os.path.exists(self._bin_path):
+            raise Exception(
+                "ROUGE binary not found at {}. Please set the "
+                "correct path by running pyrouge_set_rouge_path "
+                "/path/to/rouge/home.".format(self._bin_path))
+
+    def __get_rouge_home_dir_from_settings(self):
+        config = ConfigParser()
+        with open(self._settings_file) as f:
+            if hasattr(config, "read_file"):
+                config.read_file(f)
+            else:
+                # use deprecated python 2.x method
+                config.readfp(f)
+        rouge_home_dir = config.get('pyrouge settings', 'home_dir')
+        return rouge_home_dir
+
+    @staticmethod
+    def __get_eval_string(
+            task_id, system_id,
+            system_dir, system_filename,
+            model_dir, model_filenames):
+        """
+        ROUGE can evaluate several system summaries for a given text
+        against several model summaries, i.e. there is an m-to-n
+        relation between system and model summaries. The system
+        summaries are listed in the <PEERS> tag and the model summaries
+        in the <MODELS> tag. pyrouge currently only supports one system
+        summary per text, i.e. it assumes a 1-to-n relation between
+        system and model summaries.
+
+        """
+        peer_elems = "

{name}

".format( + id=system_id, name=system_filename) + + model_elems = ["{name}".format( + id=chr(65 + i), name=name) + for i, name in enumerate(model_filenames)] + + model_elems = "\n\t\t\t".join(model_elems) + eval_string = """ + + {model_root} + {peer_root} + + + + {peer_elems} + + + {model_elems} + + +""".format( + task_id=task_id, + model_root=model_dir, model_elems=model_elems, + peer_root=system_dir, peer_elems=peer_elems) + return eval_string + + def __process_summaries(self, process_func): + """ + Helper method that applies process_func to the files in the + system and model folders and saves the resulting files to new + system and model folders. + + """ + temp_dir = mkdtemp() + new_system_dir = os.path.join(temp_dir, "system") + os.mkdir(new_system_dir) + new_model_dir = os.path.join(temp_dir, "model") + os.mkdir(new_model_dir) + process_func(self._system_dir, new_system_dir) + process_func(self._model_dir, new_model_dir) + self._system_dir = new_system_dir + self._model_dir = new_model_dir + + def __write_summaries(self): + self.__process_summaries(self.convert_summaries_to_rouge_format) + + @staticmethod + def __get_model_filenames_for_id(id, model_dir, model_filenames_pattern): + pattern = re.compile(model_filenames_pattern.replace('#ID#', id)) + model_filenames = [ + f for f in os.listdir(model_dir) if pattern.match(f)] + if not model_filenames: + raise Exception( + "Could not find any model summaries for the system" + " summary with ID {}. Specified model filename pattern was: " + "{}".format(id, model_filenames_pattern)) + return model_filenames + + def __get_options(self, rouge_args=None): + """ + Get supplied command line arguments for ROUGE or use default + ones. + + """ + if self.args: + options = self.args.split() + elif rouge_args: + options = rouge_args.split() + else: + options = [ + '-e', self._data_dir, + '-c', 95, + '-2', 4, + '-u', + '-r', 1000, + '-n', 4, + '-w', 1.2, + '-a', + ] + options = list(map(str, options)) + + options = self.__add_config_option(options) + return options + + def __create_dir_property(self, dir_name, docstring): + """ + Generate getter and setter for a directory property. + + """ + property_name = "{}_dir".format(dir_name) + private_name = "_" + property_name + setattr(self, private_name, None) + + def fget(self): + return getattr(self, private_name) + + def fset(self, path): + verify_dir(path, dir_name) + setattr(self, private_name, path) + + p = property(fget=fget, fset=fset, doc=docstring) + setattr(self.__class__, property_name, p) + + def __set_dir_properties(self): + """ + Automatically generate the properties for directories. + + """ + directories = [ + ("home", "The ROUGE home directory."), + ("data", "The path of the ROUGE 'data' directory."), + ("system", "Path of the directory containing system summaries."), + ("model", "Path of the directory containing model summaries."), + ] + for (dirname, docstring) in directories: + self.__create_dir_property(dirname, docstring) + + def __clean_rouge_args(self, rouge_args): + """ + Remove enclosing quotation marks, if any. 
+ + """ + if not rouge_args: + return + quot_mark_pattern = re.compile('"(.+)"') + match = quot_mark_pattern.match(rouge_args) + if match: + cleaned_args = match.group(1) + return cleaned_args + else: + return rouge_args + + def __add_config_option(self, options): + return options + ['-m'] + [self._config_file] + + def __get_config_path(self): + if platform.system() == "Windows": + parent_dir = os.getenv("APPDATA") + config_dir_name = "pyrouge" + elif os.name == "posix": + parent_dir = os.path.expanduser("~") + config_dir_name = ".pyrouge" + else: + parent_dir = os.path.dirname(__file__) + config_dir_name = "" + config_dir = os.path.join(parent_dir, config_dir_name) + if not os.path.exists(config_dir): + os.makedirs(config_dir) + return os.path.join(config_dir, 'settings.ini') + + +if __name__ == "__main__": + import argparse + from utils.argparsers import rouge_path_parser + + parser = argparse.ArgumentParser(parents=[rouge_path_parser]) + args = parser.parse_args() + + rouge = Rouge155(args.rouge_home) + rouge.save_home_dir() diff --git a/myrouge/__init__.py b/myrouge/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/myrouge/rouge.py b/myrouge/rouge.py new file mode 100644 index 0000000..e7a78a7 --- /dev/null +++ b/myrouge/rouge.py @@ -0,0 +1,55 @@ +# coding: utf-8 + +# 封装了一个计算标准Rouge值的函数,可以根据(hyp, ref)计算各种Rouge值 +# 鉴于rouge包和标准Rouge结果差距比较大,之后会使用标准Rouge替换 + +import os +from Rouge155 import Rouge155 +import sys +import random + +reload(sys) +sys.setdefaultencoding('utf-8') + + +def get_rouge_score(hyp, ref): + score = {} + tmp_dir_name = random.random() + hyp_dir = './%s_1/' % tmp_dir_name + ref_dir = './%s_2/' % tmp_dir_name + if os.path.exists(hyp_dir): + os.system('rm -r %s' % hyp_dir) + if os.path.exists(ref_dir): + os.system('rm -r %s' % ref_dir) + os.mkdir(hyp_dir) + os.mkdir(ref_dir) + with open(os.path.join(hyp_dir, '1.txt'), 'w') as f: + f.write(hyp) + with open(os.path.join(ref_dir, '1.txt'), 'w') as f: + f.write(ref) + r = Rouge155() + r.system_dir = hyp_dir + r.model_dir = ref_dir + r.system_filename_pattern = '(\d+).txt' + r.model_filename_pattern = '#ID#.txt' + + output = r.convert_and_evaluate() + metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'ROUGE-SU4'] + for m in metrics: + score[m] = {} + for line in output.split('\n'): + if m in line: + if 'Average_R' in line: + score[m]['r'] = float(line.split()[3]) + if 'Average_P' in line: + score[m]['p'] = float(line.split()[3]) + if 'Average_F' in line: + score[m]['f'] = float(line.split()[3]) + os.system('rm -r %s' % hyp_dir) + os.system('rm -r %s' % ref_dir) + return score + + +if __name__ == '__main__': + s = get_rouge_score('I went to the Mars from my living town.', 'I went to Mars') + print(s) diff --git a/myrouge/utils/__init__.py b/myrouge/utils/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/myrouge/utils/argparsers.py b/myrouge/utils/argparsers.py new file mode 100755 index 0000000..aacebe3 --- /dev/null +++ b/myrouge/utils/argparsers.py @@ -0,0 +1,85 @@ +import argparse + +io_parser = argparse.ArgumentParser(add_help=False) +io_parser.add_argument( + '-i', '--input-files-dir', + help="Path of the directory containing the files to be converted.", + type=str, action="store", dest="input_dir", + required=True + ) +io_parser.add_argument( + '-o', '--output-files-dir', + help="Path of the directory in which the converted files will be saved.", + type=str, action="store", dest="output_dir", + required=True + ) + +ss_parser = argparse.ArgumentParser(add_help=False) +ss_parser.add_argument( + 
'-ss', '--split-sentences', + help="ROUGE assumes one sentence per line as default summary format. Use " + "this flag to split sentences using NLTK if the summary texts have " + "another format.", + action="store_true", dest="split_sents" + ) + +rouge_path_parser = argparse.ArgumentParser(add_help=False) +rouge_path_parser.add_argument( + '-hd', '--home-dir', + help="Path of the directory containing ROUGE-1.5.5.pl.", + type=str, action="store", dest="rouge_home", + required=True + ) + +model_sys_parser = argparse.ArgumentParser(add_help=False) +model_sys_parser.add_argument( + '-mfp', '--model-fn-pattern', + help="Regexp matching model filenames.", + type=str, action="store", dest="model_filename_pattern", + required=True + ) +model_sys_parser.add_argument( + '-sfp', '--system-fn-pattern', + help="Regexp matching system filenames.", + type=str, action="store", dest="system_filename_pattern", + required=True + ) +model_sys_parser.add_argument( + '-m', '--model-dir', + help="Path of the directory containing model summaries.", + type=str, action="store", dest="model_dir", + required=True + ) +model_sys_parser.add_argument( + '-s', '--system-dir', + help="Path of the directory containing system summaries.", + type=str, action="store", dest="system_dir", + required=True + ) +model_sys_parser.add_argument( + '-id', '--system-id', + help="Optional system ID. This is useful when comparing several systems.", + action="store", dest="system_id" + ) + +config_parser = argparse.ArgumentParser(add_help=False) +config_parser.add_argument( + '-c', '--config-file-path', + help="Path of configfile to be written, including file name.", + type=str, action="store", dest="config_file_path", + required=True + ) + +main_parser = argparse.ArgumentParser( + parents=[model_sys_parser], add_help=False) +main_parser.add_argument( + '-hd', '--home-dir', + help="Path of the directory containing ROUGE-1.5.5.pl.", + type=str, action="store", dest="rouge_home", + ) +main_parser.add_argument( + '-rargs', '--rouge-args', + help="Override pyrouge default ROUGE command line options with the " + "ROUGE_ARGS string, enclosed in qoutation marks.", + type=str, action="store", dest="rouge_args" + ) diff --git a/myrouge/utils/file_utils.py b/myrouge/utils/file_utils.py new file mode 100755 index 0000000..765f4e5 --- /dev/null +++ b/myrouge/utils/file_utils.py @@ -0,0 +1,85 @@ +from __future__ import print_function, unicode_literals, division + +import os +import re +import codecs +import xml.etree.ElementTree as et + + +class DirectoryProcessor: + + @staticmethod + def process(input_dir, output_dir, function): + """ + Apply function to all files in input_dir and save the resulting ouput + files in output_dir. + + """ + if not os.path.exists(output_dir): + os.makedirs(output_dir) + #logger = log.get_global_console_logger() + #logger.info("Processing files in {}.".format(input_dir)) + input_file_names = os.listdir(input_dir) + for input_file_name in input_file_names: + #logger.info("Processing {}.".format(input_file_name)) + input_file = os.path.join(input_dir, input_file_name) + with codecs.open(input_file, "r", encoding="UTF-8") as f: + input_string = f.read() + output_string = function(input_string) + output_file = os.path.join(output_dir, input_file_name) + with codecs.open(output_file, "w", encoding="UTF-8") as f: + f.write(output_string) + #logger.info("Saved processed files to {}.".format(output_dir)) + + +def str_from_file(path): + """ + Return file contents as string. 
+ + """ + with open(path) as f: + s = f.read().strip() + return s + + +def xml_equal(xml_file1, xml_file2): + """ + Parse xml and convert to a canonical string representation so we don't + have to worry about semantically meaningless differences + + """ + def canonical(xml_file): + # poor man's canonicalization, since we don't want to install + # external packages just for unittesting + s = et.tostring(et.parse(xml_file).getroot()).decode("UTF-8") + s = re.sub("[\n|\t]*", "", s) + s = re.sub("\s+", " ", s) + s = "".join(sorted(s)).strip() + return s + + return canonical(xml_file1) == canonical(xml_file2) + + +def list_files(dir_path, recursive=True): + """ + Return a list of files in dir_path. + + """ + + for root, dirs, files in os.walk(dir_path): + file_list = [os.path.join(root, f) for f in files] + if recursive: + for dir in dirs: + dir = os.path.join(root, dir) + file_list.extend(list_files(dir, recursive=True)) + return file_list + + +def verify_dir(path, name=None): + if name: + name_str = "Cannot set {} directory because t".format(name) + else: + name_str = "T" + msg = "{}he path {} does not exist.".format(name_str, path) + if not os.path.exists(path): + raise Exception(msg) diff --git a/myrouge/utils/sentence_splitter.py b/myrouge/utils/sentence_splitter.py new file mode 100755 index 0000000..36e2675 --- /dev/null +++ b/myrouge/utils/sentence_splitter.py @@ -0,0 +1,52 @@ +from __future__ import print_function, unicode_literals, division + +from pyrouge.utils import log +from pyrouge.utils.string_utils import cleanup +from pyrouge.utils.file_utils import DirectoryProcessor + + +class PunktSentenceSplitter: + """ + Splits sentences using the NLTK Punkt sentence tokenizer. If installed, + PunktSentenceSplitter can use the default NLTK data for English, otherwise + custom trained data has to be provided. + + """ + + def __init__(self, language="en", punkt_data_path=None): + self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"} + self.log = log.get_global_console_logger() + try: + import nltk.data + except ImportError: + self.log.error( + "Cannot import NLTK data for the sentence splitter. Please " + "check if the 'punkt' NLTK-package is installed correctly.") + try: + if not punkt_data_path: + punkt_data_path = self.lang2datapath[language] + self.sent_detector = nltk.data.load(punkt_data_path) + except KeyError: + self.log.error( + "No sentence splitter data for language {}.".format(language)) + except: + self.log.error( + "Could not load sentence splitter data: {}".format( + self.lang2datapath[language])) + + def split(self, text): + """Splits text and returns a list of the resulting sentences.""" + text = cleanup(text) + return self.sent_detector.tokenize(text.strip()) + + @staticmethod + def split_files(input_dir, output_dir, lang="en", punkt_data_path=None): + ss = PunktSentenceSplitter(lang, punkt_data_path) + DirectoryProcessor.process(input_dir, output_dir, ss.split) + +if __name__ == '__main__': + text = "Punkt knows that the periods in Mr. Smith and Johann S. Bach do " + "not mark sentence boundaries. And sometimes sentences can start with " + "non-capitalized words. i is a good variable name." 
+ ss = PunktSentenceSplitter() + print(ss.split(text)) diff --git a/myrouge/utils/string_utils.py b/myrouge/utils/string_utils.py new file mode 100755 index 0000000..e5ef2b3 --- /dev/null +++ b/myrouge/utils/string_utils.py @@ -0,0 +1,20 @@ +from __future__ import print_function, unicode_literals, division + +import re + + +def remove_newlines(s): + p = re.compile("[\n|\r\n|\n\r]") + s = re.sub(p, " ", s) + s = remove_extraneous_whitespace(s) + return s + + +def remove_extraneous_whitespace(s): + p = re.compile("(\s+)") + s = re.sub(p, " ", s) + return s + + +def cleanup(s): + return remove_newlines(s) diff --git a/test.py b/test.py new file mode 100644 index 0000000..3e3f0e2 --- /dev/null +++ b/test.py @@ -0,0 +1,36 @@ +# coding: utf-8 + +# 检验live blog中对应预训练embedding的比例,结果有96.7%的词都对应有与训练embedding + +import os +import json + +word2id_f = './word2vec/word2id.json' +word2id = {} +data_dir = './data/bbc_label/' +types = ['train', 'valid', 'test'] + + +def main(): + with open(word2id_f, 'r') as f: + word2id = json.load(f) + print(len(word2id)) + all_cnt = .0 + hit_cnt = .0 + for t in types: + print(t) + cur_dir = data_dir + t + '/' + for fn in os.listdir(cur_dir): + cur_f = open(cur_dir + fn, 'r') + blog = json.load(cur_f) + for doc in blog['documents']: + for sent in doc['text']: + for word in sent.strip().split(): + all_cnt += 1 + if word in word2id: + hit_cnt += 1 + print(hit_cnt / all_cnt) + + +if __name__ == '__main__': + main() diff --git a/utils/Dataset.py b/utils/Dataset.py new file mode 100644 index 0000000..d4b9eae --- /dev/null +++ b/utils/Dataset.py @@ -0,0 +1,50 @@ +import csv +import torch +import torch.utils.data as data +from torch.autograd import Variable +from .Vocab import Vocab +import numpy as np + + +class Dataset(data.Dataset): + def __init__(self, examples): + super(Dataset, self).__init__() + # data: {'sents':xxxx,'labels':'xxxx', 'summaries':[1,0]} + self.examples = examples + self.training = False + + def train(self): + self.training = True + return self + + def test(self): + self.training = False + return self + + def shuffle(self, words): + np.random.shuffle(words) + return ' '.join(words) + + def dropout(self, words, p=0.3): + l = len(words) + drop_index = np.random.choice(l, int(l * p)) + keep_words = [words[i] for i in range(l) if i not in drop_index] + return ' '.join(keep_words) + + def __getitem__(self, idx): + ex = self.examples[idx] + return ex + # words = ex['sents'].split() + # guess = np.random.random() + + # if self.training: + # if guess > 0.5: + # sents = self.dropout(words,p=0.3) + # else: + # sents = self.shuffle(words) + # else: + # sents = ex['sents'] + # return {'id':ex['id'],'sents':sents,'labels':ex['labels']} + + def __len__(self): + return len(self.examples) diff --git a/utils/Vocab.py b/utils/Vocab.py new file mode 100644 index 0000000..1a48320 --- /dev/null +++ b/utils/Vocab.py @@ -0,0 +1,139 @@ +# coding=utf-8 +import torch +import numpy as np +import math + + +class Vocab(): + def __init__(self, embed, word2id): + self.embed = embed + self.word2id = word2id + self.id2word = {v: k for k, v in word2id.items()} + assert len(self.word2id) == len(self.id2word) + self.PAD_IDX = 0 + self.UNK_IDX = 1 + self.PAD_TOKEN = 'PAD_TOKEN' + self.UNK_TOKEN = 'UNK_TOKEN' + + def __len__(self): + return len(self.word2id) + + def i2w(self, idx): + return self.id2word[idx] + + def w2i(self, w): + if w in self.word2id: + return self.word2id[w] + else: + return self.UNK_IDX + + def make_features(self, batch, args): + # sent_trunc: 每个句子的词数截取到sent_trunc,不足补全 + # 
diff --git a/utils/Dataset.py b/utils/Dataset.py
new file mode 100644
index 0000000..d4b9eae
--- /dev/null
+++ b/utils/Dataset.py
@@ -0,0 +1,50 @@
+import csv
+import torch
+import torch.utils.data as data
+from torch.autograd import Variable
+from .Vocab import Vocab
+import numpy as np
+
+
+class Dataset(data.Dataset):
+    def __init__(self, examples):
+        super(Dataset, self).__init__()
+        # data: {'sents':xxxx,'labels':'xxxx', 'summaries':[1,0]}
+        self.examples = examples
+        self.training = False
+
+    def train(self):
+        self.training = True
+        return self
+
+    def test(self):
+        self.training = False
+        return self
+
+    def shuffle(self, words):
+        np.random.shuffle(words)
+        return ' '.join(words)
+
+    def dropout(self, words, p=0.3):
+        l = len(words)
+        drop_index = np.random.choice(l, int(l * p))
+        keep_words = [words[i] for i in range(l) if i not in drop_index]
+        return ' '.join(keep_words)
+
+    def __getitem__(self, idx):
+        ex = self.examples[idx]
+        return ex
+        # words = ex['sents'].split()
+        # guess = np.random.random()
+
+        # if self.training:
+        #     if guess > 0.5:
+        #         sents = self.dropout(words, p=0.3)
+        #     else:
+        #         sents = self.shuffle(words)
+        # else:
+        #     sents = ex['sents']
+        # return {'id': ex['id'], 'sents': sents, 'labels': ex['labels']}
+
+    def __len__(self):
+        return len(self.examples)
diff --git a/utils/Vocab.py b/utils/Vocab.py
new file mode 100644
index 0000000..1a48320
--- /dev/null
+++ b/utils/Vocab.py
@@ -0,0 +1,139 @@
+# coding=utf-8
+import torch
+import numpy as np
+import math
+
+
+class Vocab():
+    def __init__(self, embed, word2id):
+        self.embed = embed
+        self.word2id = word2id
+        self.id2word = {v: k for k, v in word2id.items()}
+        assert len(self.word2id) == len(self.id2word)
+        self.PAD_IDX = 0
+        self.UNK_IDX = 1
+        self.PAD_TOKEN = 'PAD_TOKEN'
+        self.UNK_TOKEN = 'UNK_TOKEN'
+
+    def __len__(self):
+        return len(self.word2id)
+
+    def i2w(self, idx):
+        return self.id2word[idx]
+
+    def w2i(self, w):
+        if w in self.word2id:
+            return self.word2id[w]
+        else:
+            return self.UNK_IDX
+
+    def make_features(self, batch, args):
+        # sent_trunc: truncate each sentence to sent_trunc words, pad shorter ones
+        # doc_trunc: truncate each document to doc_trunc sentences, no padding
+        # blog_trunc: truncate each live blog to blog_trunc documents, no padding
+        sent_trunc = args.sent_trunc
+        doc_trunc = args.doc_trunc
+        blog_trunc = args.blog_trunc
+        srl_trunc = args.srl_trunc
+        topic_word_trunc = args.topic_word_trunc
+
+        summarys = []
+        for s in batch["summary"]:
+            summarys.append(' '.join(s))
+        doc_nums = []  # number of documents in each live blog
+        doc_targets = []  # label of each document
+        for i, d in enumerate(batch["documents"]):
+            if len(d) > blog_trunc:
+                batch["documents"][i] = d[:blog_trunc]
+            doc_nums.append(len(batch["documents"][i]))
+            for td in batch["documents"][i]:
+                target = td["doc_label"]
+                doc_targets.append(target)
+
+        sents = []  # all sentences
+        sents_target = []  # label of each sentence
+        sents_content = []  # raw sentence text (same length as sents_target), kept for later ROUGE computation
+        doc_lens = []  # number of sentences in each document
+        for d in batch["documents"]:
+            for td in d:
+                cur_sent_num = len(td["text"])
+                if cur_sent_num > doc_trunc:
+                    sents.extend(td["text"][:doc_trunc])
+                    sents_target.extend(td["sent_label"][:doc_trunc])
+                    sents_content.extend(td["text"][:doc_trunc])
+                    doc_lens.append(doc_trunc)
+                else:
+                    sents.extend(td["text"])
+                    sents_target.extend(td["sent_label"])
+                    sents_content.extend(td["text"])
+                    doc_lens.append(cur_sent_num)
+        # Fix every sentence to sent_trunc words: truncate longer ones, pad shorter ones
+        for i, sent in enumerate(sents):
+            sent = sent.split()
+            cur_sent_len = len(sent)
+            if cur_sent_len > sent_trunc:
+                sent = sent[:sent_trunc]
+            else:
+                sent += (sent_trunc - cur_sent_len) * [self.PAD_TOKEN]
+            sent = [self.w2i(_) for _ in sent]
+            sents[i] = sent
+        sents = torch.LongTensor(sents)
+        targets = doc_targets + sents_target
+        targets = torch.FloatTensor(targets)
+
+        events = []  # all events, i.e. SRL 4-tuples
+        event_weights = []  # weight of each event
+        for d in batch["events"]:
+            cur_events = []
+            cur_weights = []
+            for td in d:
+                cur_events.append(td["tuple"])
+                cur_weights.append(td["score"])
+                if len(cur_events) == srl_trunc:
+                    break
+            if len(cur_events) < srl_trunc:
+                cur_events += (srl_trunc - len(cur_events)) * ["-\t-\t-\t-"]
+                cur_weights += (srl_trunc - len(cur_weights)) * [0.0]
+            cur_weights_sum = np.array(cur_weights).sum()
+            cur_weights = [_ / cur_weights_sum for _ in cur_weights]
+            events.extend(cur_events)
+            event_weights.extend(cur_weights)
+        for i, event in enumerate(events):
+            event = event.replace('-', self.PAD_TOKEN)
+            event = event.strip().split('\t')
+            new_event = []
+            for w in event:
+                if w != self.PAD_TOKEN:
+                    new_event.append(w)
+            new_event += (4 - len(new_event)) * [self.PAD_TOKEN]
+            assert len(new_event) == 4
+            event = [self.w2i(_) for _ in new_event]
+            events[i] = event
+        events = torch.LongTensor(events)
+        event_weights = torch.FloatTensor(event_weights)
+        return sents, targets, events, event_weights, sents_content, summarys, doc_nums, doc_lens  # NOTE: this early return skips the topic features built below
+
+        topics = []  # all topics, each represented by its top words
+        topic_word_weights = []  # weight of each word within its topic
+        topic_scores = []  # score of each topic
+        for d in batch["topics"]:  # d holds all topics of one live blog
+            for td in d:
+                content = td["words"]
+                score = td["score"]
+                cur_topic = []
+                cur_word_weights = []
+                for tup in content[0: topic_word_trunc]:
+                    cur_topic.append(tup[0])
+                    cur_word_weights.append(tup[1])
+                cur_word_weight_sum = np.array(cur_word_weights).sum()
+                if math.fabs(cur_word_weight_sum) > 1e-5:
+                    cur_word_weights = [w / cur_word_weight_sum for w in cur_word_weights]  # normalize
+                topics.append(cur_topic)
+                topic_word_weights.append(cur_word_weights)
+                topic_scores.append(score)
+        for i, topic in enumerate(topics):
+            topics[i] = [self.w2i(_) for _ in topic]
+        topics = torch.LongTensor(topics)
+        topic_word_weights = torch.FloatTensor(topic_word_weights)
+        topic_scores = torch.FloatTensor(topic_scores)
+        return sents, targets, events, event_weights, topics, topic_word_weights, topic_scores, sents_content, summarys, doc_nums, doc_lens
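(Not part of the patch.) A rough sketch of how Vocab.make_features could be driven; the batch schema is inferred from the code above, while the toy vocabulary, field values and truncation settings are invented:

    import numpy as np
    from argparse import Namespace
    from utils.Vocab import Vocab

    word2id = {'PAD_TOKEN': 0, 'UNK_TOKEN': 1, 'goal': 2, 'match': 3}
    embed = np.random.rand(len(word2id), 50)  # stand-in embedding matrix
    vocab = Vocab(embed, word2id)

    args = Namespace(sent_trunc=10, doc_trunc=5, blog_trunc=3,
                     srl_trunc=4, topic_word_trunc=3)
    batch = {
        'summary': [['a one sentence reference summary .']],
        'documents': [[{'doc_label': 1,
                        'text': ['the match started .', 'a goal was scored .'],
                        'sent_label': [0, 1]}]],
        'events': [[{'tuple': 'striker\tscored\tgoal\t-', 'score': 0.7}]],
        'topics': [[{'words': [('goal', 0.5), ('match', 0.3)], 'score': 0.9}]],
    }
    sents, targets, events, event_weights, contents, summaries, doc_nums, doc_lens = \
        vocab.make_features(batch, args)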
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..018e768
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,2 @@
+from .Dataset import Dataset
+from .Vocab import Vocab
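(Not part of the patch.) Continuing the toy example above, one hypothetical way to wire utils.Dataset and utils.Vocab into a PyTorch DataLoader; the per-blog dict, batch size and collate function are assumptions, not the repository's training code:

    from torch.utils.data import DataLoader
    from utils import Dataset

    # One per-blog example; same field layout as the toy batch above, but unbatched.
    blog = {'summary': ['a one sentence reference summary .'],
            'documents': [{'doc_label': 1,
                           'text': ['the match started .', 'a goal was scored .'],
                           'sent_label': [0, 1]}],
            'events': [{'tuple': 'striker\tscored\tgoal\t-', 'score': 0.7}],
            'topics': [{'words': [('goal', 0.5), ('match', 0.3)], 'score': 0.9}]}

    dataset = Dataset([blog, blog]).test()
    # Merge a list of per-blog dicts into the dict-of-lists batch that make_features expects.
    loader = DataLoader(dataset, batch_size=2, shuffle=False,
                        collate_fn=lambda blogs: {k: [b[k] for b in blogs] for k in blogs[0]})
    for batch in loader:
        out = vocab.make_features(batch, args)  # reuses vocab and args from the sketch above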