Skip to content

Commit

Permalink
Add baseline(PKUSUMSUM)
Browse files Browse the repository at this point in the history
  • Loading branch information
PKULiuHui committed Oct 23, 2018
1 parent 61b9f5a commit 41cad81
Show file tree
Hide file tree
Showing 157 changed files with 2,144 additions and 1 deletion.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,5 @@
*/.DS_Store
.ftpconfig
data/
baseline/
mate-tool/
word2vec/
86 changes: 86 additions & 0 deletions baselines/baseline/baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# coding:utf-8

# 使用PKUSUMSUM得到summary,并计算rouge值
#
# guardian(L):
# Coverage 0.205367676471 0.0358395294118 0.129378588235 0.0444649705882
# Lead 0.182529264706 0.0286435588235 0.117340205882 0.0370704117647
# Centroid 0.205031264706 0.0322918529412 0.128586235294 0.0436074705882
# LexPageRank 0.213439529412 0.0319259411765 0.130743382353 0.046391
# TextRank 0.228982382353 0.0417126470588 0.142264088235 0.0546559705882
# Submodular 0.253892411765 0.0536839117647 0.154026176471 0.0643656470588
#
# bbc(L):
# Coverage 0.16716185 0.0246575 0.10559635 0.03175085
# Lead 0.1445328 0.0159233 0.09499425 0.0253807
# Centroid 0.16769825 0.02915425 0.10635125 0.03189055
# LexPageRank 0.1832988 0.0290539 0.11394075 0.03591895
# TextRank 0.21151125 0.04204835 0.1329824 0.04681465
# Submodular 0.1995797 0.04061245 0.1227405 0.04277865

import os
import json
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append('../')

from tqdm import tqdm
from myrouge.rouge import get_rouge_score

corpus = 'bbc'
# jar_path = '/Users/liuhui/Desktop/Lab/Tools/PKUSUMSUM/PKUSUMSUM.jar'
jar_path = '/home/liuhui/PKUSUMSUM/PKUSUMSUM.jar'
data_dir = '../data/' + corpus + '_cont_1/test/'
tmp_dir = './tmp/'
tmp_out = './out'
sum_len = 1 # 摘要长度是原摘要长度的几倍
methods = [6]
methods_name = {0: 'Coverage', 1: 'Lead', 2: 'Centroid', 3: 'ILP', 4: 'LexPageRank', 5: 'TextRank', 6: 'Submodular'}

if __name__ == '__main__':
recall = []
for i in range(0, 7):
recall.append({'rouge-1': .0, 'rouge-2': .0, 'rouge-l': .0, 'rouge-su*': .0})
if not os.path.exists(tmp_dir):
os.mkdir(tmp_dir)
for fn in tqdm(os.listdir(data_dir)):
f = open(os.path.join(data_dir, fn), 'r')
blog = json.load(f)
f.close()
ref = str(' '.join(blog['summary']))
sum_size = len(ref.strip().split()) * sum_len

if os.path.exists(tmp_dir): # 保证每次该文件夹都是空的
for tf in os.listdir(tmp_dir):
os.remove(tmp_dir + tf)

for i, doc in enumerate(blog['documents']):
tmp_f = open(tmp_dir + str(i), 'w')
for sent in doc['text']:
tmp_f.write(sent)
tmp_f.write('\n')
tmp_f.close()

for m in methods:
os.system('java -jar %s -T 2 -input %s -output %s -L 2 -n %d -m %d -stop stopword' % (
jar_path, tmp_dir, tmp_out, 2 * sum_size, m))
f = open(tmp_out, 'r')
hyp = ' '.join(str(f.read()).strip().split()[:sum_size])
f.close()
score = get_rouge_score(hyp, ref)
r1, r2, rl, rsu = score['ROUGE-1']['r'], score['ROUGE-2']['r'], score['ROUGE-L']['r'], score['ROUGE-SU4']['r']
print methods_name[m], r1, r2, rl, rsu
recall[m]['rouge-1'] += r1
recall[m]['rouge-2'] += r2
recall[m]['rouge-l'] += rl
recall[m]['rouge-su*'] += rsu

print('Final Results:')
for m in methods:
recall[m]['rouge-1'] /= len(os.listdir(data_dir))
recall[m]['rouge-2'] /= len(os.listdir(data_dir))
recall[m]['rouge-l'] /= len(os.listdir(data_dir))
recall[m]['rouge-su*'] /= len(os.listdir(data_dir))
print methods_name[m], recall[m]['rouge-1'], recall[m]['rouge-2'], recall[m]['rouge-l'], recall[m]['rouge-su*']
14 changes: 14 additions & 0 deletions baselines/baseline/out
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
it's his second oscar and the second award for del toro's film tonight so far.
inclusion water?
could this be the first of many awards tonight for three billboards?
it's a great speech, his second great speech of the night.
rita moreno at the oscars in the same dress 56 years later.
give the winners that time instead.
she is also one of the few women to wear black.
while her reem acra red gown is a pretty safe choice for her first time at the oscars, she has already won best sleeves.
plus it's the first speech cut short by music tonight.
still chose to place him on the red carpet tonight.
but no latinos were nominated for acting oscars this year.
could get out go on to win best it's seeming more likely.
and they won a lot of academy awards.
guillermo del toro's film the shape of water is also leading the oscar's race with 13 nominations.
Loading

0 comments on commit 41cad81

Please sign in to comment.