import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

from program_synthesis.synthesizer import Synthesizer
from program_synthesis.verifier import Verifier


class HeuristicGenerator(object):
    """
    A class to go through the synthesizer-verifier loop
    """
    def __init__(self, train_primitive_matrix, val_primitive_matrix,
                 val_ground, train_ground=None, b=0.5):
        """
        Initialize HeuristicGenerator object

        train_primitive_matrix: primitive feature matrix for the training set
        val_primitive_matrix: primitive feature matrix for the validation set
        val_ground: ground truth labels for the validation set
        train_ground: ground truth labels for the training set (optional)
        b: class prior of most likely class (TODO: use somewhere)
        beta: threshold to decide whether to abstain or label for heuristics
        gamma: threshold to decide whether to call a point vague or not
        """
        self.train_primitive_matrix = train_primitive_matrix
        self.val_primitive_matrix = val_primitive_matrix
        self.val_ground = val_ground
        self.train_ground = train_ground
        self.b = b

        self.vf = None
        self.syn = None
        self.hf = []
        self.feat_combos = []
    def apply_heuristics(self, heuristics, primitive_matrix, feat_combos, beta_opt):
        """
        Apply given heuristics to given feature matrix X and abstain by beta

        heuristics: list of pre-trained logistic regression models
        primitive_matrix: feature matrix to apply the heuristics to
        feat_combos: primitive indices to apply heuristics to
        beta_opt: optimal beta value for each heuristic
        """
        def marginals_to_labels(hf, X, beta):
            # Convert class-1 marginals into labels in {-1, 0, 1}, abstaining
            # (label 0) on points whose marginal falls within beta of the prior b
            marginals = hf.predict_proba(X)[:, 1]
            labels_cutoff = np.zeros(np.shape(marginals))
            labels_cutoff[marginals <= (self.b - beta)] = -1.
            labels_cutoff[marginals >= (self.b + beta)] = 1.
            return labels_cutoff

        L = np.zeros((np.shape(primitive_matrix)[0], len(heuristics)))
        for i, hf in enumerate(heuristics):
            L[:, i] = marginals_to_labels(hf, primitive_matrix[:, feat_combos[i]], beta_opt[i])
        return L
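
    # Example with hypothetical numbers: for b = 0.5 and beta = 0.25, a
    # marginal of 0.2 maps to -1, 0.9 maps to 1, and 0.6 falls inside the
    # abstain band (0.25, 0.75), so the heuristic abstains with label 0.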
    def prune_heuristics(self, heuristics, feat_combos, keep=1):
        """
        Selects the best heuristics based on Jaccard Distance and Reliability Metric

        keep: number of heuristics to keep from all generated heuristics
        """
        def calculate_jaccard_distance(num_labeled_total, num_labeled_L):
            scores = np.zeros(np.shape(num_labeled_L)[1])
            for i in range(np.shape(num_labeled_L)[1]):
                scores[i] = np.sum(np.minimum(num_labeled_L[:, i], num_labeled_total)) / np.sum(np.maximum(num_labeled_L[:, i], num_labeled_total))
            return 1 - scores

        L_val = np.array([])
        L_train = np.array([])
        beta_opt = np.array([])
        max_cardinality = len(heuristics)
        for i in range(max_cardinality):
            # Note that the LFs are being applied to the entire val set
            # though they were developed on a subset...
            beta_opt_temp = self.syn.find_optimal_beta(heuristics[i], self.val_primitive_matrix, feat_combos[i], self.val_ground)
            L_temp_val = self.apply_heuristics(heuristics[i], self.val_primitive_matrix, feat_combos[i], beta_opt_temp)
            L_temp_train = self.apply_heuristics(heuristics[i], self.train_primitive_matrix, feat_combos[i], beta_opt_temp)

            beta_opt = np.append(beta_opt, beta_opt_temp)
            if i == 0:
                L_val = np.append(L_val, L_temp_val)  # converts to 1D array automatically
                L_val = np.reshape(L_val, np.shape(L_temp_val))
                L_train = np.append(L_train, L_temp_train)  # converts to 1D array automatically
                L_train = np.reshape(L_train, np.shape(L_temp_train))
            else:
                L_val = np.concatenate((L_val, L_temp_val), axis=1)
                L_train = np.concatenate((L_train, L_temp_train), axis=1)

        # Use F1 trade-off for reliability
        acc_cov_scores = [f1_score(self.val_ground, L_val[:, i], average='micro') for i in range(np.shape(L_val)[1])]
        acc_cov_scores = np.nan_to_num(acc_cov_scores)

        if self.vf is not None:
            # Calculate Jaccard score for diversity against committed heuristics
            train_num_labeled = np.sum(np.abs(self.vf.L_train.T), axis=0)
            jaccard_scores = calculate_jaccard_distance(train_num_labeled, np.abs(L_train))
        else:
            jaccard_scores = np.ones(np.shape(acc_cov_scores))

        # Weight the two scores equally to find the best heuristics
        combined_scores = 0.5 * acc_cov_scores + 0.5 * jaccard_scores
        sort_idx = np.argsort(combined_scores)[::-1][0:keep]
        return sort_idx
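
    # Example with hypothetical numbers: a heuristic with micro-F1 0.8 on the
    # validation set and Jaccard distance 0.6 from the committed heuristics
    # receives a combined score of 0.5*0.8 + 0.5*0.6 = 0.7.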
    def run_synthesizer(self, max_cardinality=1, idx=None, keep=1, model='lr'):
        """
        Generates Synthesizer object and saves all generated heuristics

        max_cardinality: max number of features candidate programs take as input
        idx: indices of validation set to fit programs over
        keep: number of heuristics to pass to verifier
        model: train logistic regression ('lr') or decision tree ('dt')
        """
        if idx is None:
            primitive_matrix = self.val_primitive_matrix
            ground = self.val_ground
        else:
            primitive_matrix = self.val_primitive_matrix[idx, :]
            ground = self.val_ground[idx]

        # Generate all possible heuristics
        self.syn = Synthesizer(primitive_matrix, ground, b=self.b)

        # Un-flatten indices: map a flat index over the nested per-cardinality
        # lists back to the corresponding element
        def index(a, inp):
            i = 0
            remainder = 0
            while inp >= 0:
                remainder = inp
                inp -= len(a[i])
                i += 1
            return a[i - 1][remainder]

        # Select keep best heuristics from generated heuristics
        hf, feat_combos = self.syn.generate_heuristics(model, max_cardinality)
        sort_idx = self.prune_heuristics(hf, feat_combos, keep)
        for i in sort_idx:
            self.hf.append(index(hf, i))
            self.feat_combos.append(index(feat_combos, i))

        # Create appended L matrices for validation and train set
        beta_opt = self.syn.find_optimal_beta(self.hf, self.val_primitive_matrix, self.feat_combos, self.val_ground)
        self.L_val = self.apply_heuristics(self.hf, self.val_primitive_matrix, self.feat_combos, beta_opt)
        self.L_train = self.apply_heuristics(self.hf, self.train_primitive_matrix, self.feat_combos, beta_opt)
    def run_verifier(self):
        """
        Generates Verifier object and saves marginals
        """
        # Note: this is where the Snorkel flag is set (has_snorkel=False)
        self.vf = Verifier(self.L_train, self.L_val, self.val_ground, has_snorkel=False)
        self.vf.train_gen_model()
        self.vf.assign_marginals()
    def gamma_optimizer(self, marginals):
        """
        Returns the best gamma parameter for abstain threshold given marginals

        marginals: confidences for data from a single heuristic
        """
        # The threshold depends only on the number of committed heuristics m
        m = len(self.hf)
        gamma = 0.5 - (1 / (m ** (3 / 2.)))
        return gamma
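
    # Example: with m = 4 committed heuristics, gamma = 0.5 - 1/(4**1.5) = 0.375.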
    def find_feedback(self):
        """
        Finds vague points according to gamma parameter

        self.gamma: confidence past 0.5 that relates to a vague or incorrect point
        """
        # TODO: flag for re-classifying incorrect points
        # incorrect_idx = self.vf.find_incorrect_points(b=self.b)

        gamma_opt = self.gamma_optimizer(self.vf.val_marginals)
        vague_idx = self.vf.find_vague_points(b=self.b, gamma=gamma_opt)
        incorrect_idx = vague_idx  # incorrect points currently alias the vague points
        self.feedback_idx = list(set(np.concatenate((vague_idx, incorrect_idx))))
    def evaluate(self):
        """
        Calculate the accuracy and coverage for train and validation sets
        """
        self.val_marginals = self.vf.val_marginals
        self.train_marginals = self.vf.train_marginals

        def calculate_accuracy(marginals, b, ground):
            # Points with a marginal of exactly 0.5 are treated as abstains
            # and excluded from the denominator
            total = np.shape(np.where(marginals != 0.5))[1]
            labels = np.sign(2 * (marginals - 0.5))
            return np.sum(labels == ground) / float(total)

        def calculate_coverage(marginals, b, ground):
            total = np.shape(np.where(marginals != 0.5))[1]
            labels = np.sign(2 * (marginals - 0.5))
            return total / float(len(labels))

        self.val_accuracy = calculate_accuracy(self.val_marginals, self.b, self.val_ground)
        self.train_accuracy = calculate_accuracy(self.train_marginals, self.b, self.train_ground)
        self.val_coverage = calculate_coverage(self.val_marginals, self.b, self.val_ground)
        self.train_coverage = calculate_coverage(self.train_marginals, self.b, self.train_ground)
        return self.val_accuracy, self.train_accuracy, self.val_coverage, self.train_coverage
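
    # Example with hypothetical numbers: if 80 of 100 validation points get a
    # non-abstain marginal and 60 of those match the ground truth, accuracy is
    # 60/80 = 0.75 and coverage is 80/100 = 0.8.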
    def heuristic_stats(self):
        """
        For each heuristic, we want the following:
        - idx of the features it relies on
        - if dt, then the thresholds?
        """
        def calculate_accuracy(labels, b, ground):
            # Entries here are already labels in {-1, 0, 1}; 0 means abstain,
            # so abstains are excluded from the denominator
            total = np.shape(np.where(labels != 0))[1]
            return np.sum(labels == ground) / float(total)

        def calculate_coverage(labels, b, ground):
            total = np.shape(np.where(labels != 0))[1]
            return total / float(len(labels))

        stats_table = np.zeros((len(self.hf), 6))
        for i in range(len(self.hf)):
            stats_table[i, 0] = int(self.feat_combos[i][0])
            try:
                stats_table[i, 1] = int(self.feat_combos[i][1])
            except IndexError:
                stats_table[i, 1] = -1.  # cardinality-1 heuristics have no second feature
            stats_table[i, 2] = calculate_accuracy(self.L_val[:, i], self.b, self.val_ground)
            stats_table[i, 3] = calculate_accuracy(self.L_train[:, i], self.b, self.train_ground)
            stats_table[i, 4] = calculate_coverage(self.L_val[:, i], self.b, self.val_ground)
            stats_table[i, 5] = calculate_coverage(self.L_train[:, i], self.b, self.train_ground)

        # Make table
        column_headers = ['Feat 1', 'Feat 2', 'Val Acc', 'Train Acc', 'Val Cov', 'Train Cov']
        pandas_stats_table = pd.DataFrame(stats_table, columns=column_headers)
        return pandas_stats_table
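

if __name__ == '__main__':
    # Minimal usage sketch (hypothetical data): the primitive matrices and
    # ground-truth labels in {-1, 1} would normally come from a featurized
    # dataset; random values are used here only to illustrate the loop.
    np.random.seed(0)
    train_primitive_matrix = np.random.rand(200, 10)
    val_primitive_matrix = np.random.rand(100, 10)
    train_ground = np.random.choice([-1., 1.], size=200)
    val_ground = np.random.choice([-1., 1.], size=100)

    hg = HeuristicGenerator(train_primitive_matrix, val_primitive_matrix,
                            val_ground, train_ground, b=0.5)
    for iteration in range(3):
        # Fit candidate heuristics (on the feedback set after the first pass),
        # verify them, then collect the vague points for the next iteration
        idx = hg.feedback_idx if iteration > 0 else None
        hg.run_synthesizer(max_cardinality=1, idx=idx, keep=3, model='lr')
        hg.run_verifier()
        hg.find_feedback()
        print(hg.evaluate())
    print(hg.heuristic_stats())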