utils.py

import csv

# texts, labels: list of raw sentences, list of labels.
# Yelp labels run 1-5 in the raw data; they are aligned to 0-4 here.
def load_data(data_path):
    texts = []
    labels = []
    print("loading data...")
    with open(data_path) as csvfile:
        reader = csv.reader(csvfile)  # iterator over CSV rows
        next(reader)  # skip header
        for row in reader:
            text = row[0].strip()
            label = int(row[1].strip())
            texts.append(text)
            if 'yelp' in data_path:
                labels.append(label - 1)  # align labels 1-5 to 0-4
            else:
                labels.append(label)
    return texts, labels
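
# Example (hypothetical CSV, not from the original repo): a file with a
# header row and "text,label" columns, e.g.
#   text,label
#   "great food and service",5
#   "never coming back",1
# loaded via load_data("yelp_train.csv") would return labels shifted to
# 0-4 because the path contains "yelp".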

def unique(xs):
    # Return the set of distinct elements in xs.
    return set(xs)

# texts, labels: list of texts, list of corresponding labels.
def split_words_by_label(texts, labels, label_set):
    total_labeled_words = {label: {} for label in label_set}
    label_portion = {label: 0 for label in label_set}
    for text, label in zip(texts, labels):
        # Count how many texts of each label contain each word
        # (document frequency: a word counts at most once per text).
        for word in set(text.split()):
            if word not in total_labeled_words[label]:
                total_labeled_words[label][word] = 1
            else:
                total_labeled_words[label][word] += 1
        # Count how many texts carry each label.
        label_portion[label] += 1
    return total_labeled_words, label_portion
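
# Toy illustration (values assumed, not from the original code):
#   split_words_by_label(["good good film", "bad film"], [1, 0], {0, 1})
# yields total_labeled_words == {0: {"bad": 1, "film": 1},
#                                1: {"good": 1, "film": 1}}
# and label_portion == {0: 1, 1: 1}. Note "good" counts once for label 1
# even though it appears twice: set(text.split()) records per-text
# presence, not raw term frequency.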

def get_vocab_size(train_texts):
    # Collect the distinct words across all training texts.
    vocab = set()
    for seq in train_texts:
        vocab.update(seq.split())
    return len(vocab), vocab
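
# Illustration (assumed inputs): get_vocab_size(["a b", "b c"])
# returns (3, {"a", "b", "c"}).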

def calculate_avg_length(texts):
    # Average number of whitespace-separated tokens per text.
    cnt = 0
    for t in texts:
        cnt += len(t.split())
    return cnt / len(texts)

def prob_Laplace_smoothing(word, total_labeled_words, label, vocab_size, label_portion, alpha=0.2):
    # Laplace-smoothed estimate:
    # P(word | label) = (count(word, label) + alpha) / (N_label + vocab_size * alpha)
    up = total_labeled_words[label][word] + alpha
    down = label_portion[label] + vocab_size * alpha
    return up / down
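
# Worked example (illustrative numbers, not from the original code): with
# alpha = 0.2, vocab_size = 10, label_portion[label] = 4 and
# total_labeled_words[label][word] = 2, the estimate is
# (2 + 0.2) / (4 + 10 * 0.2) = 2.2 / 6.0 ≈ 0.367.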

def accuracy(TP, TN, FP, FN):
    return (TP + TN) / (TP + FP + FN + TN)

def macro_F1(TP, FP, FN):
    # Per-class F1; eval() averages these over all labels to get macro-F1.
    if TP == 0:
        # Avoids division by zero when precision and recall are both 0.
        return 0
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return 2 * precision * recall / (precision + recall)
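
# Illustration (assumed counts): TP = 3, FP = 1, FN = 2 gives
# precision = 0.75, recall = 0.6, F1 = 2 * 0.75 * 0.6 / 1.35 ≈ 0.667.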

# Note: the name eval shadows the Python builtin of the same name.
def eval(word_to_prob, label_set, valid_texts, valid_labels, vocab_size, label_portion, unseen):
    TP = {label: 0 for label in label_set}
    TN = {label: 0 for label in label_set}
    FP = {label: 0 for label in label_set}
    FN = {label: 0 for label in label_set}
    for text, gold_label in zip(valid_texts, valid_labels):
        max_likelihood = 0
        # Caveat: if every label's probability underflows to 0.0, the
        # comparison below never fires and the prediction defaults to
        # label 1; working in log space would avoid this.
        max_label = 1
        for label in label_set:
            # Scale up the prior to delay floating-point underflow.
            prob = label_portion[label] * 1000000000
            for word in set(text.split()):
                if (word, label) in word_to_prob:
                    prob *= word_to_prob[(word, label)]
                else:
                    prob *= unseen[label]
            if prob > max_likelihood:
                max_label = label
                max_likelihood = prob
        # Update the per-label confusion counts.
        if max_label == gold_label:
            TP[gold_label] += 1
            for label in label_set:
                TN[label] += 1
            TN[gold_label] -= 1
        else:
            for label in label_set:
                TN[label] += 1
            TN[gold_label] -= 1
            TN[max_label] -= 1
            FN[gold_label] += 1
            FP[max_label] += 1
    print("TP: ", TP)
    print("TN: ", TN)
    print("FP: ", FP)
    print("FN: ", FN)
    Macro_f1 = 0
    for label in label_set:
        Macro_f1 += macro_F1(TP[label], FP[label], FN[label])
    Macro_f1 /= len(label_set)
    print("Macro-F1 score: ", Macro_f1)
    return Macro_f1

def MSE(res, gold):
    # res, gold are expected to be NumPy arrays (elementwise arithmetic).
    n = len(res)
    return ((res - gold) ** 2).sum() / n
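
# --- Minimal end-to-end sketch (hypothetical, not part of the original
# file): shows one way these helpers could fit together. The file names
# "train.csv" and "valid.csv" are placeholders, and alpha below is
# assumed to match the default in prob_Laplace_smoothing. ---
if __name__ == "__main__":
    train_texts, train_labels = load_data("train.csv")  # assumed path
    valid_texts, valid_labels = load_data("valid.csv")  # assumed path
    label_set = unique(train_labels)
    total_labeled_words, label_portion = split_words_by_label(
        train_texts, train_labels, label_set)
    vocab_size, vocab = get_vocab_size(train_texts)
    # Precompute smoothed P(word | label) for every observed pair.
    word_to_prob = {}
    for label in label_set:
        for word in total_labeled_words[label]:
            word_to_prob[(word, label)] = prob_Laplace_smoothing(
                word, total_labeled_words, label, vocab_size, label_portion)
    # Smoothed probability for words never seen with a given label
    # (the count term in the numerator is 0).
    alpha = 0.2
    unseen = {label: alpha / (label_portion[label] + vocab_size * alpha)
              for label in label_set}
    eval(word_to_prob, label_set, valid_texts, valid_labels,
         vocab_size, label_portion, unseen)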