-
Notifications
You must be signed in to change notification settings - Fork 0
/
kappa_adjust.py
109 lines (83 loc) · 3.41 KB
/
kappa_adjust.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import csv
import pickle
import numpy as np
from datalabel import label_data
# Directory prefix for the pickled kappa scores written at the end of the run.
TARGET_DIRECTORY = './'

# Grids of candidate thresholds (0.0, 0.1, ..., 1.0) that are handed pairwise
# to label_data() during the sweep.
POSITIVE_THRESHOLD_VALUES = np.linspace(0, 1, num=11)
NEGATIVE_THRESHOLD_VALUES = np.linspace(0, 1, num=11)

# Label string -> integer category index used to index the rating matrices.
mapping = dict(positive=0, neutral=1, negative=2)
def build_weight_matrix(categories, mode):
    """Return the (categories x categories) disagreement-weight matrix.

    Args:
        categories: Number of rating categories (matrix side length).
        mode: 'unweighted', 'squared', or anything else for linear weights.

    Returns:
        An integer ndarray whose entry [i, j] is the penalty for one rater
        choosing category i while the other chooses category j. The diagonal
        is always zero (perfect agreement carries no penalty).
    """
    # Pairwise distance |i - j| between category indices, built in one shot.
    # The builtin-int default dtype replaces the removed `np.int` alias.
    indices = np.arange(categories)
    distance = np.abs(indices[:, None] - indices[None, :])

    if mode == 'unweighted':
        # [[0, 1, 1],
        #  [1, 0, 1],
        #  [1, 1, 0]]
        return (distance != 0).astype(int)
    if mode == 'squared':
        # [[0, 1, 4],
        #  [1, 0, 1],
        #  [4, 1, 0]]
        return distance ** 2
    # linear (fallback for every other mode value)
    # [[0, 1, 2],
    #  [1, 0, 1],
    #  [2, 1, 0]]
    return distance
def build_observed_matrix(categories, subjects, ratings):
    """Return the observed joint proportions of the two raters' categories.

    Entry [i, j] is the fraction of the first `subjects` rows of `ratings`
    whose column 0 equals i and whose column 1 equals j.
    """
    counts = np.zeros((categories, categories))
    for subject in range(subjects):
        first = ratings[subject, 0]
        second = ratings[subject, 1]
        counts[first, second] += 1
    # Normalise the raw counts into proportions.
    return counts / subjects
def build_distributions_matrix(categories, subjects, ratings):
    """Return each rater's marginal category proportions.

    The result has shape (categories, 2): column 0 holds the proportions of
    the values in ratings[:, 0], column 1 those of ratings[:, 1], both taken
    over the first `subjects` rows.
    """
    tallies = np.zeros((categories, 2))
    for subject in range(subjects):
        # One tally per rater column for this subject.
        for rater in (0, 1):
            tallies[ratings[subject, rater], rater] += 1
    return tallies / subjects
def build_expected_matrix(categories, distributions):
    """Return the joint proportions expected if the two raters were independent.

    Args:
        categories: Number of rating categories (matrix side length).
        distributions: Array-like of shape (>= categories, 2) whose columns
            are the two raters' marginal category proportions.

    Returns:
        A float ndarray of shape (categories, categories) where entry [i, j]
        is distributions[i, 0] * distributions[j, 1].
    """
    # Force a float array: the original relied on the `np.float` alias, which
    # no longer exists in current NumPy releases.
    marginals = np.asarray(distributions, dtype=float)
    return np.outer(marginals[:categories, 0], marginals[:categories, 1])
def calculate_kappa(weighted, observed, expected):
    """Return kappa = 1 - (weighted observed sum / weighted expected sum).

    When the weighted expected sum is zero the ratio is treated as zero, so
    the function returns 1.0 instead of dividing by zero.
    """
    expected_disagreement = sum(sum(weighted * expected))
    if expected_disagreement == 0:
        return 1.0
    observed_disagreement = sum(sum(weighted * observed))
    return 1.0 - observed_disagreement / expected_disagreement
# Sweep every (positive, negative) threshold pair, score the automatic labels
# against the manual ones with kappa, then print and pickle the ranked results.
kappa_scores = []

# The manual label is read from the third column of each CSV row.
# newline='' is the csv-module recommendation so that embedded line breaks in
# quoted fields are parsed correctly.
with open('./data/kappa/labels_kappa.csv', newline='') as f:
    org = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
    manual_labels = [row[2] for row in org]

# Loop-invariant work, hoisted out of the threshold sweep: the manual labels
# and the weighting mode do not depend on the thresholds.
manual_numerical_labels_300 = [mapping[label] for label in manual_labels[:300]]  # list of 0,1,2
mode = 'unweighted'

for pos in POSITIVE_THRESHOLD_VALUES:
    for neg in NEGATIVE_THRESHOLD_VALUES:
        labels = label_data(pos, neg)
        auto_numerical_labels_300 = [mapping[label] for label in labels[:300]]  # list of 0,1,2

        # One row per subject: column 0 = automatic label, column 1 = manual.
        ratings = np.array(
            [[auto, man]
             for man, auto in zip(manual_numerical_labels_300, auto_numerical_labels_300)]
        )

        # Category count is inferred from the largest label index present.
        categories = int(np.amax(ratings)) + 1
        subjects = int(ratings.size / 2)

        weighted = build_weight_matrix(categories, mode)
        observed = build_observed_matrix(categories, subjects, ratings)
        distributions = build_distributions_matrix(categories, subjects, ratings)
        expected = build_expected_matrix(categories, distributions)
        kappa = calculate_kappa(weighted, observed, expected)

        print('Kappa (' + mode + ') for pos/neg ' + str(pos) + ', ' + str(neg) + ': ' + str(kappa))
        kappa_scores.append((pos, neg, kappa))

# Ascending by kappa, so the best-agreeing threshold pair is printed last.
kappa_scores.sort(key=lambda score: score[2])
for score in kappa_scores:
    print(score)

# The context manager closes the file even if pickling raises.
with open(TARGET_DIRECTORY + "kappa_scores.pkl", "wb") as f:
    pickle.dump(kappa_scores, f)