Showing 83 changed files with 8,211 additions and 52 deletions.
@@ -1,52 +1,9 @@
-# Prerequisites
-*.d
-
-# Object files
-*.o
-*.ko
-*.obj
-*.elf
-
-# Linker output
-*.ilk
-*.map
-*.exp
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Libraries
-*.lib
-*.a
-*.la
-*.lo
-
-# Shared objects (inc. Windows DLLs)
-*.dll
-*.so
-*.so.*
-*.dylib
-
-# Executables
-*.exe
-*.out
-*.app
-*.i*86
-*.x86_64
-*.hex
-
-# Debug files
-*.dSYM/
-*.su
-*.idb
-*.pdb
-
-# Kernel Module Compile Results
-*.mod*
-*.cmd
-.tmp_versions/
-modules.order
-Module.symvers
-Mkfile.old
-dkms.conf
+.idea
+*.pyc
+.DS_Store
+*/.DS_Store
+.ftpconfig
+data/
+baseline/
+mate-tool/
+word2vec/
@@ -0,0 +1,118 @@
import sys
import os
import argparse

'''
Standard ROUGE
guardian(L)
UB1      Rouge-1: 0.498439 Rouge-2: 0.216667 Rouge-l: 0.324901 Rouge-SU*: 0.216997
UB2      Rouge-1: 0.469815 Rouge-2: 0.278474 Rouge-l: 0.344528 Rouge-SU*: 0.208485
LexRank  Rouge-1: 0.210933 Rouge-2: 0.037603 Rouge-l: 0.131110 Rouge-SU*: 0.046715
TextRank Rouge-1: 0.184086 Rouge-2: 0.029617 Rouge-l: 0.117287 Rouge-SU*: 0.037783
ICSI     Rouge-1: 0.257562 Rouge-2: 0.060022 Rouge-l: 0.157313 Rouge-SU*: 0.065799
Luhn     Rouge-1: 0.154681 Rouge-2: 0.022884 Rouge-l: 0.100451 Rouge-SU*: 0.027575
bbc(L)
UB1      Rouge-1: 0.464780 Rouge-2: 0.195108 Rouge-l: 0.272242 Rouge-SU4: 0.197798
UB2      Rouge-1: 0.413318 Rouge-2: 0.227026 Rouge-l: 0.268316 Rouge-SU4: 0.193755
LexRank  Rouge-1: 0.160842 Rouge-2: 0.024327 Rouge-l: 0.097632 Rouge-SU4: 0.042892
TextRank Rouge-1: 0.139200 Rouge-2: 0.021073 Rouge-l: 0.093124 Rouge-SU4: 0.037206
Luhn     Rouge-1: 0.141699 Rouge-2: 0.023175 Rouge-l: 0.091994 Rouge-SU4: 0.038216
ICSI     Rouge-1: 0.209584 Rouge-2: 0.046293 Rouge-l: 0.135454 Rouge-SU4: 0.063704
'''

sys.path.append('../')

from utils.data_helpers import load_data
from tqdm import tqdm
from myrouge.rouge import get_rouge_score

from summarize.upper_bound import ExtractiveUpperbound
from summarize.sume_wrap import SumeWrap
from summarize.sumy.nlp.tokenizers import Tokenizer
from summarize.sumy.parsers.plaintext import PlaintextParser
from summarize.sumy.summarizers.lsa import LsaSummarizer
from summarize.sumy.summarizers.kl import KLSummarizer
from summarize.sumy.summarizers.luhn import LuhnSummarizer
from summarize.sumy.summarizers.lex_rank import LexRankSummarizer
from summarize.sumy.summarizers.text_rank import TextRankSummarizer
from summarize.sumy.nlp.stemmers import Stemmer
from nltk.corpus import stopwords

# Python 2 only: force UTF-8 as the default string encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

parser = argparse.ArgumentParser(description='LiveBlogSum Baseline')
parser.add_argument('-corpus', type=str, default='bbc')
parser.add_argument('-path', type=str, default='../data/')
parser.add_argument('-sum_len', type=int, default=1)

args = parser.parse_args()
args.path = args.path + args.corpus + '/test/'


def get_summary_scores(algo, docs, refs, summary_size):
    """Summarize docs with the given algorithm and score against refs."""
    language = 'english'
    summary = ''
    if algo == 'UB1':
        summarizer = ExtractiveUpperbound(language)
        summary = summarizer(docs, refs, summary_size, ngram_type=1)
    elif algo == 'UB2':
        summarizer = ExtractiveUpperbound(language)
        summary = summarizer(docs, refs, summary_size, ngram_type=2)
    elif algo == 'ICSI':
        summarizer = SumeWrap(language)
        summary = summarizer(docs, summary_size)
    else:
        doc_string = u'\n'.join([u'\n'.join(doc_sents) for doc_sents in docs])
        parser = PlaintextParser.from_string(doc_string, Tokenizer(language))
        stemmer = Stemmer(language)
        if algo == 'LSA':
            summarizer = LsaSummarizer(stemmer)
        if algo == 'KL':
            summarizer = KLSummarizer(stemmer)
        if algo == 'Luhn':
            summarizer = LuhnSummarizer(stemmer)
        if algo == 'LexRank':
            summarizer = LexRankSummarizer(stemmer)
        if algo == 'TextRank':
            summarizer = TextRankSummarizer(stemmer)

        summarizer.stop_words = frozenset(stopwords.words(language))
        summary = summarizer(parser.document, summary_size)
    hyps, refs = map(list, zip(*[[' '.join(summary), ' '.join(model)] for model in refs]))
    # Truncate the hypothesis to the summary budget before scoring.
    hyp = str(hyps[0]).split()
    hyp = ' '.join(hyp[:summary_size])
    ref = str(refs[0])
    score = get_rouge_score(hyp, ref)
    return score['ROUGE-1']['r'], score['ROUGE-2']['r'], score['ROUGE-L']['r'], score['ROUGE-SU4']['r']


if __name__ == '__main__':
    file_names = os.listdir(args.path)
    algos = ['UB1', 'UB2', 'LexRank', 'TextRank', 'Luhn', 'ICSI']
    R1 = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    R2 = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    Rl = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    Rsu = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    for filename in tqdm(file_names):
        data_file = os.path.join(args.path, filename)
        docs, refs = load_data(data_file)
        # Summary budget: sum_len times the length of the first reference.
        sum_len = len(' '.join(refs[0]).split(' ')) * args.sum_len
        print '####', filename, '####'
        for algo in algos:
            r1, r2, rl, rsu = get_summary_scores(algo, docs, refs, sum_len)
            print algo, r1, r2, rl, rsu
            R1[algo] += r1
            R2[algo] += r2
            Rl[algo] += rl
            Rsu[algo] += rsu
    print 'Final Results'
    for algo in algos:
        R1[algo] /= len(file_names)
        R2[algo] /= len(file_names)
        Rl[algo] /= len(file_names)
        Rsu[algo] /= len(file_names)
        print '%s Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' % (algo, R1[algo], R2[algo], Rl[algo], Rsu[algo])
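
To make the expected data shapes concrete, here is a minimal, hypothetical call to get_summary_scores. The toy docs/refs values are invented; in the script they come from load_data (a list of documents, each a list of tokenized sentence strings, and reference summaries in the same shape). It also assumes the vendored sumy/sume dependencies are importable, as the script itself does.

# Hypothetical usage sketch, not part of the commit:
docs = [[u'first sentence of post one .', u'second sentence .'],
        [u'first sentence of post two .']]
refs = [[u'a short reference summary .']]
r1, r2, rl, rsu = get_summary_scores('LexRank', docs, refs, 10)
print 'LexRank', r1, r2, rl, rsu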
Empty file.
@@ -0,0 +1,2 @@
from base import *
from models import *
@@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-

""" Base structures and functions for the sume module.
Base contains the Sentence, LoadFile and State classes.
author: florian boudin (florian.boudin@univ-nantes.fr)
version: 0.1
date: Nov. 2014
"""

import re
import os
import codecs
from collections import Counter


class State:
    """ State class
    Internal class used as a structure to keep track of the search state in
    the tabu_search method.
    Args:
        subset (set): a subset of sentences
        concepts (Counter): a set of concepts for the subset
        length (int): the length in words
        score (int): the score for the subset
    """
    def __init__(self):
        self.subset = set()
        self.concepts = Counter()
        self.length = 0
        self.score = 0

class Sentence:
    """The sentence data structure.
    Args:
        tokens (list of str): the list of word tokens.
        doc_id (str): the identifier of the document from which the sentence
          comes.
        position (int): the position of the sentence in the source document.
    """
    def __init__(self, tokens, doc_id, position, phrases=None, dict_tokens_pos=None):

        # Use None defaults instead of mutable default arguments, which
        # would be shared across all Sentence instances.
        if phrases is None:
            phrases = []
        if dict_tokens_pos is None:
            dict_tokens_pos = {}

        self.tokens = tokens
        """ tokens as a list. """

        self.doc_id = doc_id
        """ document identifier of the sentence. """

        self.position = position
        """ position of the sentence within the document. """

        self.concepts = []
        """ concepts of the sentence. """

        self.untokenized_form = ''
        """ untokenized form of the sentence. """

        self.length = 0
        """ length of the untokenized sentence. """

        self.phrases = phrases
        """ phrases of the sentence. """

        self.tokens_pos = dict_tokens_pos

class LoadFile(object):
    """Objects which inherit from this class have read file functions.
    """

    def __init__(self, input_directory):
        """
        Args:
            input_directory (str): the path of the directory containing the
              input files.
        """
        self.input_directory = input_directory
        self.sentences = []

    def read_documents(self, file_extension="txt"):
        """Read the input files in the given directory.
        Load the input files and populate the sentence list. Input files are
        expected to be in one tokenized sentence per line format.
        Args:
            file_extension (str): the file extension for input documents,
              defaults to txt.
        """
        for infile in os.listdir(self.input_directory):

            # skip files with the wrong extension
            if not infile.endswith(file_extension):
                continue

            with codecs.open(self.input_directory + '/' + infile,
                             'r',
                             'utf-8') as f:

                # load the sentences
                lines = f.readlines()

                # loop over sentences
                for i in range(len(lines)):

                    # split the sentence into tokens
                    tokens = lines[i].strip().split(' ')

                    # add the sentence
                    if len(tokens) > 0:
                        sentence = Sentence(tokens, infile, i)
                        untokenized_form = untokenize(tokens)
                        sentence.untokenized_form = untokenized_form
                        sentence.length = len(untokenized_form.split(' '))
                        self.sentences.append(sentence)

def untokenize(tokens):
    """Untokenize a list of tokens into a readable string.
    Args:
        tokens (list of str): the list of tokens to untokenize.
    Returns:
        a string
    """
    text = u' '.join(tokens)
    text = re.sub(u"\s+", u" ", text.strip())
    text = re.sub(u" ('[a-z]) ", u"\g<1> ", text)
    text = re.sub(u" ([\.;,-]) ", u"\g<1> ", text)
    text = re.sub(u" ([\.;,-?!])$", u"\g<1>", text)
    text = re.sub(u" _ (.+) _ ", u" _\g<1>_ ", text)
    text = re.sub(u" \$ ([\d\.]+) ", u" $\g<1> ", text)
    text = text.replace(u" ' ", u"' ")
    text = re.sub(u"([\W\s])\( ", u"\g<1>(", text)
    text = re.sub(u" \)([\W\s])", u")\g<1>", text)
    text = text.replace(u"`` ", u"``")
    text = text.replace(u" ''", u"''")
    text = text.replace(u" n't", u"n't")
    text = re.sub(u'(^| )" ([^"]+) "( |$)', u'\g<1>"\g<2>"\g<3>', text)
    # Penn Treebank bracket tokens
    text = re.sub(u' -RRB-', u')', text)
    text = re.sub(u'-LRB- ', u'(', text)
    # times
    text = re.sub('(\d+) : (\d+ [ap]\.m\.)', '\g<1>:\g<2>', text)

    text = re.sub('^" ', '"', text)
    text = re.sub(' "$', '"', text)
    text = re.sub(u"\s+", u" ", text.strip())

    return text
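
To make the pieces above concrete, here is a minimal usage sketch. It is hypothetical: the token list and document id are invented, and it assumes Sentence and untokenize from this module are in scope.

# Hypothetical example (Python 2, matching this module):
tokens = [u'it', u'is', u"n't", u'$', u'3.50', u'today', u'.']
sent = Sentence(tokens, 'doc1.txt', 0)
sent.untokenized_form = untokenize(tokens)
sent.length = len(sent.untokenized_form.split(' '))
print sent.untokenized_form   # -> it isn't $3.50 today.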
@@ -0,0 +1 @@
from concept_based import *