evaluate.py
import sys
import os
import json

# Make the data-generation and metric packages importable.
sys.path.append('/root/autodl-tmp/SSC/DataGeneration')
metric_root = '/root/autodl-tmp/SSC/Metrics'
sys.path.append(metric_root)
# Also add every metric sub-directory so the individual *_eval modules resolve.
for folder_name, subfolders, filenames in os.walk(metric_root):
    for folder in subfolders:
        sys.path.append(os.path.join(metric_root, folder))
sys.path.append('../..')
from DataGeneration.benchmark import SummaCBenchmark, load_dataset
from Metrics.ClozE.cloze_eval import ClozEEval
from Metrics.CoLA.cola_eval import ColaEval
from Metrics.DAE.dae_eval import DAEEval
from Metrics.FactCC.factcc_eval import FactccEval
from Metrics.FEQA.feqa_eval import FEQAEval
from Metrics.QUALS.quals_eval import QUALSEval
from Metrics.SummaC.summacconv_eval import SummaCConvEval
from Metrics.SummaC.summaczs_eval import SummaCZSEval
from tqdm import tqdm
import requests
class Scorer():
    """Thin HTTP client for a metric service that scores a (document, claim) pair."""
    def __init__(self, url, name='scorer'):
        self.url = url
        self.name = name

    def score(self, claim, document):
        # Build the request payload.
        data = {'document': document, 'claim': claim}
        # Send the POST request and read the response.
        response = requests.post(self.url, json=data)
        # Parse the JSON response; fall back to 0 if it cannot be parsed.
        try:
            result = json.loads(response.text)
        except Exception:
            result = 0
        return result
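# Example (a minimal sketch, not called anywhere in this script): how a Scorer is
# expected to be used against one of the locally running metric services. The
# URL/route is the dae_doc entry used elsewhere in this file; the document and
# claim strings are purely illustrative.
def _scorer_usage_example():
    scorer = Scorer(url='http://localhost:10007/dae_doc', name='dae_doc')
    # The service is assumed to return a JSON-encoded score (e.g. a float);
    # Scorer.score falls back to 0 if the response cannot be parsed.
    return scorer.score(claim='The cat sat on the mat.',
                        document='A cat was sitting on a mat in the kitchen.')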
# The class-based metric wrappers can be restored by uncommenting the line below;
# the active list holds HTTP-backed Scorer instances instead.
# fact_eval = [ClozEEval, DAEEval, FactccEval, FEQAEval, QUALSEval, SummaCConvEval]
fact_eval = [Scorer(url='http://localhost:10007/dae_doc', name='dae_doc')]
acceptance_eval = [ColaEval]
def evaluate_frank_type():
    """Score FRANK examples grouped by error type with every metric in fact_eval."""
    benchmark = SummaCBenchmark()
    frank_result = {}
    frank_type = ['RelE', 'EntE', 'CircE', 'CorefE', 'LinkE', 'OutE', 'GramE']
    for eval_metric in fact_eval:
        # Prefer the explicit scorer name when available, else the class name.
        eval_name = getattr(eval_metric, 'name', type(eval_metric).__name__)
        frank_result[eval_name] = {}
        print('________start to evaluate on ' + eval_name + '____________')
        for data_type in frank_type:
            score = []
            data = benchmark.load_frank_sentence_by_error(data_type)
            print('___________________________________________________')
            print('________start to evaluate ' + data_type + '____________')
            print('___________________________________________________')
            for d in tqdm(data):
                document = str(d['document'])
                claim = str(d['claim'])
                # Pass by keyword: Scorer.score takes (claim, document) in that order.
                s = eval_metric.score(document=document, claim=claim)
                score.append(s)
            frank_result[eval_name][data_type] = score
        del eval_metric
    with open('/root/autodl-tmp/SSC/data/score/frank_type_score_0421_dae.json', 'w') as f:
        json.dump(frank_result, f)
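# The JSON written by evaluate_frank_type is expected to have the shape below
# (a sketch; key names come from the loops above, the scores are illustrative):
#   { "dae_doc": { "RelE": [0.93, ...], "EntE": [...], ..., "GramE": [...] } }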
def evaluate_xsum():
    """Score the first 2000 XSum test summaries with every metric in fact_eval."""
    benchmark = SummaCBenchmark()
    benchmark.xsum = load_dataset("xsum")["test"]
    xsum = {}
    # fact_eval already holds instantiated scorers, so no construction is needed here.
    for eval_metric in fact_eval:
        eval_name = getattr(eval_metric, 'name', type(eval_metric).__name__)
        score = []
        print('________start to evaluate on ' + eval_name + '____________')
        for index in tqdm(range(2000)):
            d = benchmark.xsum[index]
            s = eval_metric.score(document=str(d['document']), claim=str(d['summary']))
            score.append(s)
        xsum[eval_name] = score
    with open('/root/autodl-tmp/SSC/data/score/xsum_2000_score_0329_FactCC.json', 'w') as f:
        json.dump(xsum, f)
def evaluate_xsum_fake_feature():
    """Score synthetically corrupted ("fake") summaries, one result list per fake-data file."""
    path = '/root/autodl-tmp/SSC/data/fake_data'
    file_list = os.listdir(path)
    xsum_result = {}
    for eval_metric in fact_eval:
        eval_name = getattr(eval_metric, 'name', type(eval_metric).__name__)
        xsum_result[eval_name] = {}
        print('________start to evaluate on ' + eval_name + '____________')
        for file_name in file_list:
            score = []
            with open(path + '/' + file_name, 'r') as f:
                # Keep only examples whose fake summary actually differs from the original.
                data = [item for item in json.load(f) if item['summary'] != item['fake_summary']][:1000]
            print('___________________________________________________')
            print('________start to evaluate ' + file_name + '____________')
            print('___________________________________________________')
            try:
                for d in tqdm(data):
                    document = str(d['document'])
                    claim = str(d['fake_summary'])
                    s = eval_metric.score(document=document, claim=claim)
                    score.append(s)
            except Exception as e:
                print(e)
                score = None
            xsum_result[eval_name][file_name] = score
        del eval_metric
    with open('/root/autodl-tmp/SSC/data/score/fake_feature_0409_SummaCZS.json', 'w') as f:
        json.dump(xsum_result, f)
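# Each fake_data file is assumed to be a JSON list of records with at least the
# fields read above: 'document', 'summary', and 'fake_summary' (the corrupted
# summary that gets scored).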
def evaluate_file(scorers):
    """Score every corrected claim in the correction_result files with each given scorer."""
    total_result = {}
    path = '/root/autodl-tmp/SSC/data/correction_result'
    file_list = os.listdir(path)
    for file_name in tqdm(file_list):
        total_result[file_name] = {}
        with open(path + '/' + file_name, 'r') as f:
            data = json.load(f)
        for scorer in scorers:
            result = []
            for item in tqdm(data):
                document = item['document']
                claim = item['corrected_claim']
                score = scorer.score(document=document, claim=claim)
                result.append(score)
            total_result[file_name][scorer.name] = result
    name = '_'.join(scorer.name for scorer in scorers)
    with open('/root/autodl-tmp/SSC/data/score/frank_corrected_0422_' + name + '.json', 'w') as f:
        json.dump(total_result, f)
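# Sketch of the per-example record layout evaluate_file assumes for each JSON file
# under correction_result (field names come from the loop above; the values here
# are purely illustrative):
_EXAMPLE_CORRECTION_RECORD = {
    'document': 'Full source article text ...',
    'corrected_claim': 'The claim after the correction step ...',
}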
if __name__ == "__main__":
    # Each scorer points at a locally running metric service.
    cloze_scorer = Scorer(url='http://localhost:10000/cloze', name='cloze')
    dae_scorer = Scorer(url='http://localhost:10002/dae', name='dae')
    factcc_scorer = Scorer(url='http://localhost:10003/factcc', name='factcc')
    feqa_scorer = Scorer(url='http://localhost:10004/feqa', name='feqa')
    quals_scorer = Scorer(url='http://localhost:10005/quals', name='quals')
    summacconv_scorer = Scorer(url='http://localhost:10006/summacconv', name='summacconv')
    dae_doc_scorer = Scorer(url='http://localhost:10007/dae_doc', name='dae_doc')
    scorer_list = [dae_doc_scorer]
    evaluate_file(scorer_list)