Commit 61b9f5a

Initialize repository

PKULiuHui committed Oct 23, 2018
1 parent b2c573a commit 61b9f5a
Showing 83 changed files with 8,211 additions and 52 deletions.
61 changes: 9 additions & 52 deletions .gitignore
@@ -1,52 +1,9 @@
# Prerequisites
*.d

# Object files
*.o
*.ko
*.obj
*.elf

# Linker output
*.ilk
*.map
*.exp

# Precompiled Headers
*.gch
*.pch

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

# Debug files
*.dSYM/
*.su
*.idb
*.pdb

# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf
.idea
*.pyc
.DS_Store
*/.DS_Store
.ftpconfig
data/
baseline/
mate-tool/
word2vec/
118 changes: 118 additions & 0 deletions baselines/baseline2/baseline.py
@@ -0,0 +1,118 @@
import sys
import os
import argparse

'''
Standard ROUGE
guardian(L)
UB1 Rouge-1: 0.498439 Rouge-2: 0.216667 Rouge-l: 0.324901 Rouge-SU*: 0.216997
UB2 Rouge-1: 0.469815 Rouge-2: 0.278474 Rouge-l: 0.344528 Rouge-SU*: 0.208485
LexRank Rouge-1: 0.210933 Rouge-2: 0.037603 Rouge-l: 0.131110 Rouge-SU*: 0.046715
TextRank Rouge-1: 0.184086 Rouge-2: 0.029617 Rouge-l: 0.117287 Rouge-SU*: 0.037783
ICSI Rouge-1: 0.257562 Rouge-2: 0.060022 Rouge-l: 0.157313 Rouge-SU*: 0.065799
Luhn Rouge-1: 0.154681 Rouge-2: 0.022884 Rouge-l: 0.100451 Rouge-SU*: 0.027575
bbc(L)
UB1 Rouge-1: 0.464780 Rouge-2: 0.195108 Rouge-l: 0.272242 Rouge-SU4: 0.197798
UB2 Rouge-1: 0.413318 Rouge-2: 0.227026 Rouge-l: 0.268316 Rouge-SU4: 0.193755
LexRank Rouge-1: 0.160842 Rouge-2: 0.024327 Rouge-l: 0.097632 Rouge-SU4: 0.042892
TextRank Rouge-1: 0.139200 Rouge-2: 0.021073 Rouge-l: 0.093124 Rouge-SU4: 0.037206
Luhn Rouge-1: 0.141699 Rouge-2: 0.023175 Rouge-l: 0.091994 Rouge-SU4: 0.038216
ICSI Rouge-1: 0.209584 Rouge-2: 0.046293 Rouge-l: 0.135454 Rouge-SU4: 0.063704
'''
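# Usage sketch (assumed invocation; the flags mirror the argparse options below):
#   python baseline.py -corpus bbc -path ../data/ -sum_len 1
# Scores every baseline on ../data/<corpus>/test/ and prints per-file and
# averaged ROUGE recall values.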

sys.path.append('../')

from utils.data_helpers import load_data
from tqdm import tqdm
from myrouge.rouge import get_rouge_score

from summarize.upper_bound import ExtractiveUpperbound
from summarize.sume_wrap import SumeWrap
from summarize.sumy.nlp.tokenizers import Tokenizer
from summarize.sumy.parsers.plaintext import PlaintextParser
from summarize.sumy.summarizers.lsa import LsaSummarizer
from summarize.sumy.summarizers.kl import KLSummarizer
from summarize.sumy.summarizers.luhn import LuhnSummarizer
from summarize.sumy.summarizers.lex_rank import LexRankSummarizer
from summarize.sumy.summarizers.text_rank import TextRankSummarizer
from summarize.sumy.nlp.stemmers import Stemmer
from nltk.corpus import stopwords
# Python 2 only: reset the default string encoding to UTF-8
# (sys is already imported at the top of this file).
reload(sys)
sys.setdefaultencoding('utf-8')
parser = argparse.ArgumentParser(description='LiveBlogSum Baseline')
parser.add_argument('-corpus', type=str, default='bbc')
parser.add_argument('-path', type=str, default='../data/')
parser.add_argument('-sum_len', type=int, default=1)

args = parser.parse_args()
args.path = args.path + args.corpus + '/test/'


def get_summary_scores(algo, docs, refs, summary_size):
    language = 'english'
    summary = ''
    if algo == 'UB1':
        summarizer = ExtractiveUpperbound(language)
        summary = summarizer(docs, refs, summary_size, ngram_type=1)
    elif algo == 'UB2':
        summarizer = ExtractiveUpperbound(language)
        summary = summarizer(docs, refs, summary_size, ngram_type=2)
    elif algo == 'ICSI':
        summarizer = SumeWrap(language)
        summary = summarizer(docs, summary_size)
    else:
        doc_string = u'\n'.join([u'\n'.join(doc_sents) for doc_sents in docs])
        parser = PlaintextParser.from_string(doc_string, Tokenizer(language))
        stemmer = Stemmer(language)
        if algo == 'LSA':
            summarizer = LsaSummarizer(stemmer)
        if algo == 'KL':
            summarizer = KLSummarizer(stemmer)
        if algo == 'Luhn':
            summarizer = LuhnSummarizer(stemmer)
        if algo == 'LexRank':
            summarizer = LexRankSummarizer(stemmer)
        if algo == 'TextRank':
            summarizer = TextRankSummarizer(stemmer)

        summarizer.stop_words = frozenset(stopwords.words(language))
        summary = summarizer(parser.document, summary_size)
    hyps, refs = map(list, zip(*[[' '.join(summary), ' '.join(model)] for model in refs]))
    hyp = str(hyps[0]).split()
    hyp = ' '.join(hyp[:summary_size])
    ref = str(refs[0])
    score = get_rouge_score(hyp, ref)
    return score['ROUGE-1']['r'], score['ROUGE-2']['r'], score['ROUGE-L']['r'], score['ROUGE-SU4']['r']
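
# Example call (sketch): score one algorithm on a single liveblog with a
# 100-word budget; returns four ROUGE recall values (R-1, R-2, R-L, R-SU4).
#   r1, r2, rl, rsu = get_summary_scores('LexRank', docs, refs, 100)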


if __name__ == '__main__':
    file_names = os.listdir(args.path)
    algos = ['UB1', 'UB2', 'LexRank', 'TextRank', 'Luhn', 'ICSI']
    R1 = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    R2 = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    Rl = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    Rsu = {'UB1': .0, 'UB2': .0, 'ICSI': .0, 'LSA': .0, 'KL': .0, 'Luhn': .0, 'LexRank': .0, 'TextRank': .0}
    for filename in tqdm(file_names):
        data_file = os.path.join(args.path, filename)
        docs, refs = load_data(data_file)
        sum_len = len(' '.join(refs[0]).split(' ')) * args.sum_len
        print('#### ' + filename + ' ####')
        for algo in algos:
            r1, r2, rl, rsu = get_summary_scores(algo, docs, refs, sum_len)
            print algo, r1, r2, rl, rsu
            R1[algo] += r1
            R2[algo] += r2
            Rl[algo] += rl
            Rsu[algo] += rsu
    print('Final Results')
    for algo in algos:
        R1[algo] /= len(file_names)
        R2[algo] /= len(file_names)
        Rl[algo] /= len(file_names)
        Rsu[algo] /= len(file_names)
        print('%s Rouge-1: %f Rouge-2: %f Rouge-l: %f Rouge-SU4: %f' % (algo, R1[algo], R2[algo], Rl[algo], Rsu[algo]))
2 changes: 2 additions & 0 deletions baselines/baseline2/summarize/sume/__init__.py
@@ -0,0 +1,2 @@
from base import *
from models import *
156 changes: 156 additions & 0 deletions baselines/baseline2/summarize/sume/base.py
@@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-

""" Base structures and functions for the sume module.
Base contains the Sentence, LoadFile and State classes.
author: florian boudin (florian.boudin@univ-nantes.fr)
version: 0.1
date: Nov. 2014
"""

import re
import os
import codecs
from collections import Counter

class State:
    """ State class

    Internal class used as a structure to keep track of the search state in
    the tabu_search method.

    Args:
        subset (set): a subset of sentences
        concepts (Counter): a set of concepts for the subset
        length (int): the length in words
        score (int): the score for the subset
    """
    def __init__(self):
        self.subset = set()
        self.concepts = Counter()
        self.length = 0
        self.score = 0

class Sentence:
    """The sentence data structure.

    Args:
        tokens (list of str): the list of word tokens.
        doc_id (str): the identifier of the document from which the sentence
          comes.
        position (int): the position of the sentence in the source document.
    """
    def __init__(self, tokens, doc_id, position, phrases=None, dict_tokens_pos=None):
        # None defaults avoid sharing mutable default arguments across instances.

        self.tokens = tokens
        """ tokens as a list. """

        self.doc_id = doc_id
        """ document identifier of the sentence. """

        self.position = position
        """ position of the sentence within the document. """

        self.concepts = []
        """ concepts of the sentence. """

        self.untokenized_form = ''
        """ untokenized form of the sentence. """

        self.length = 0
        """ length of the untokenized sentence. """

        self.phrases = phrases if phrases is not None else []
        """ phrases of the sentence. """

        self.tokens_pos = dict_tokens_pos if dict_tokens_pos is not None else {}

class LoadFile(object):
    """Objects which inherit from this class have read file functions.
    """

    def __init__(self, input_directory):
        """
        Args:
            input_directory (str): the path of the directory containing the
              input files.
        """
        self.input_directory = input_directory
        self.sentences = []

    def read_documents(self, file_extension="txt"):
        """Read the input files in the given directory.

        Load the input files and populate the sentence list. Input files are
        expected to be in a one-tokenized-sentence-per-line format.

        Args:
            file_extension (str): the file extension for input documents,
              defaults to txt.
        """
        for infile in os.listdir(self.input_directory):

            # skip files with the wrong extension
            if not infile.endswith(file_extension):
                continue

            with codecs.open(self.input_directory + '/' + infile,
                             'r',
                             'utf-8') as f:

                # load the sentences
                lines = f.readlines()

                # loop over sentences
                for i in range(len(lines)):

                    # split the sentence into tokens
                    tokens = lines[i].strip().split(' ')

                    # add the sentence
                    if len(tokens) > 0:
                        sentence = Sentence(tokens, infile, i)
                        untokenized_form = untokenize(tokens)
                        sentence.untokenized_form = untokenized_form
                        sentence.length = len(untokenized_form.split(' '))
                        self.sentences.append(sentence)
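
# Example use (sketch; the directory path is hypothetical): read every *.txt
# file of one-tokenized-sentence-per-line documents and inspect the result.
#   loader = LoadFile('/path/to/tokenized/docs')
#   loader.read_documents(file_extension='txt')
#   print(loader.sentences[0].untokenized_form)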

def untokenize(tokens):
    """Untokenize a list of tokens.

    Args:
        tokens (list of str): the list of tokens to untokenize.

    Returns:
        a string
    """
    text = u' '.join(tokens)
    text = re.sub(u"\s+", u" ", text.strip())
    text = re.sub(u" ('[a-z]) ", u"\g<1> ", text)
    text = re.sub(u" ([\.;,-]) ", u"\g<1> ", text)
    text = re.sub(u" ([\.;,-?!])$", u"\g<1>", text)
    text = re.sub(u" _ (.+) _ ", u" _\g<1>_ ", text)
    text = re.sub(u" \$ ([\d\.]+) ", u" $\g<1> ", text)
    text = text.replace(u" ' ", u"' ")
    text = re.sub(u"([\W\s])\( ", u"\g<1>(", text)
    text = re.sub(u" \)([\W\s])", u")\g<1>", text)
    text = text.replace(u"`` ", u"``")
    text = text.replace(u" ''", u"''")
    text = text.replace(u" n't", u"n't")
    text = re.sub(u'(^| )" ([^"]+) "( |$)', u'\g<1>"\g<2>"\g<3>', text)
    text = re.sub(u' -RRB-', u')', text)
    text = re.sub(u'-LRB- ', u'(', text)
    # times, e.g. "5 : 30 p.m." -> "5:30 p.m."
    text = re.sub(u'(\d+) : (\d+ [ap]\.m\.)', u'\g<1>:\g<2>', text)

    text = re.sub(u'^" ', u'"', text)
    text = re.sub(u' "$', u'"', text)
    text = re.sub(u"\s+", u" ", text.strip())

    return text
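
# Expected behaviour (sketch):
#   untokenize([u'Hello', u',', u'world', u'!'])        -> u'Hello, world!'
#   untokenize([u'-LRB-', u'see', u'below', u'-RRB-'])  -> u'(see below)'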
1 change: 1 addition & 0 deletions baselines/baseline2/summarize/sume/models/__init__.py
@@ -0,0 +1 @@
from concept_based import *