-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPerspective.py
496 lines (372 loc) · 15.9 KB
/
Perspective.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
import copy
import itertools
import pathlib

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.feature_selection import RFE

import Classes as gp
import Data as dta
path = str(pathlib.Path(__file__).parent.absolute())
def LoadCSV(FilePath):
    """Load up to 50,000 rows from the CSV at FilePath, with missing values replaced by 0."""
    frame = dta.load_data(FilePath, 50000)
    return frame.fillna(0)
def LoadCSV_instNum(FilePath,instNum):
    """Load up to instNum rows from the CSV at FilePath, with missing values replaced by 0."""
    frame = dta.load_data(FilePath, instNum)
    return frame.fillna(0)
def viewFeatures(df,target):
    """Return the feature column names of df, i.e. every column except target."""
    without_target = df.drop([target], axis=1)
    return list(without_target)
def getFeatures(df,target):
    """Return [[name, is_discrete], ...] for every non-target column of df.

    is_discrete is True for discrete columns, False for real-valued ones,
    as judged by dta.is_discrete.
    """
    feature_names = list(df.drop([target], axis=1))
    return [[name, dta.is_discrete(df, name)] for name in feature_names]
#This function calculates the average change that occurs between the instances of real-valued features.
#That average change is called the significant change value which indicates the minimum value for which we will
#record a change for that particular feature
def cal_sig_change(dataFrame,feature):
    """Return the mean absolute change between consecutive values of a real-valued column.

    Fills NaNs in dataFrame[feature] with 0.0 in place (side effect kept from
    the original), then averages |row[i] - row[i-1]| over all consecutive pairs.

    Fixes vs. the original:
    - n rows produce n-1 changes, so the mean divides by n-1 (the original
      divided by n, slightly underestimating the stated "average change").
    - A column with fewer than two rows now returns 0.0 instead of raising
      ZeroDivisionError (or dividing an empty sum).
    """
    #Fill in blanks or nan with 0.0s
    dataFrame[feature] = dataFrame[feature].fillna(0.0)
    values = dataFrame[feature].values
    # Fewer than two rows record no change at all.
    if len(values) < 2:
        return 0.0
    # zip(values, values[1:]) walks consecutive pairs without index bookkeeping.
    changes = [abs(curr - prev) for prev, curr in zip(values, values[1:])]
    return sum(changes) / len(changes)
#This function generates a csv file with the real valued features and their significant change value
def sigValCsv(dataFrame,target):
    """Write SigChange.csv mapping each real-valued (non-discrete) feature to its significant change value."""
    rows = [
        [name, cal_sig_change(dataFrame, name)]
        for name, discrete in getFeatures(dataFrame, target)
        if discrete == False
    ]
    out = pd.DataFrame(rows, columns=['Feature','Significant Value'])
    out.to_csv(path + "/SigChange.csv")
#A function that generates a list of all possible pairs of features
def pairWise(dataFrame,target):
    """Return every unordered pair of non-target column names as [f1, f2] lists.

    Parameters:
        dataFrame: the data whose columns are paired.
        target: the label column, excluded from pairing.

    Returns a list of two-element lists, each pair appearing exactly once.
    itertools.combinations replaces the original hand-rolled shrinking-list
    loop and yields the pairs in the same order.
    """
    features = list(dataFrame.drop([target], axis=1))
    return [list(pair) for pair in itertools.combinations(features, 2)]
#gets the significant change value of a particular feature
def getSigValue(feature):
    """Look up feature's significant change value in SigChange.csv; return -1 if absent."""
    sigDf = pd.read_csv(path + "/SigChange.csv")
    # Row position doubles as the index into the sig-value column.
    for idx, name in enumerate(sigDf["Feature"]):
        if name == feature:
            return sigDf["Significant Value"][idx]
    return -1
#Returns True if the value changed between the previous and current row, False otherwise.
#Set real=True and pass sigVal for real-valued features: only differences exceeding sigVal count.
def didChange(curr,prev,real = False, sigVal = 0):
    """Report whether a value changed between consecutive rows.

    For real-valued features (real=True with a non-zero sigVal) a change is
    only registered when |prev - curr| exceeds sigVal; otherwise any
    inequality counts as a change.
    """
    if real and sigVal != 0:
        # Real-valued: require the change to clear the significance threshold.
        return abs(prev - curr) > sigVal
    # Discrete (or no threshold supplied): any difference is a change.
    return curr != prev
#Calculate type one scores - If the attribute values for both features change at the same time.
#Calculate type two scores - If the attribute values for both features change and the class label change at the same time.
def getScores(f1,f2,target,dataFrame):
    """Score how strongly features f1 and f2 change together across consecutive rows.

    Returns (score, score2):
        score  - 0.001 per row where both f1 and f2 changed vs. the previous row.
        score2 - 1 per such row where the target changed as well.

    A column's significant-change value stays -1 when the column is discrete;
    real-valued columns get their threshold from SigChange.csv via getSigValue.
    The original expanded the discrete/real combinations into four nearly
    identical copy-pasted loops; they are collapsed into one loop here with
    identical behavior.
    """
    # -1 marks "discrete: compare by plain inequality, no threshold".
    sigValf1 = -1
    sigValf2 = -1
    sigValTarget = -1
    if dta.is_discrete(dataFrame, f1) != True:
        sigValf1 = getSigValue(f1)
    if dta.is_discrete(dataFrame, f2) != True:
        sigValf2 = getSigValue(f2)
    if dta.is_discrete(dataFrame, target) != True:
        sigValTarget = getSigValue(target)

    def _changed(column, sig, i):
        # Compare row i against row i-1; real columns use their threshold.
        # Note: a real column missing from SigChange.csv gets sig == -1 and
        # falls back to plain inequality, exactly as the original did.
        if sig == -1:
            return didChange(dataFrame[column][i], dataFrame[column][i - 1])
        return didChange(dataFrame[column][i], dataFrame[column][i - 1], real=True, sigVal=sig)

    score = 0
    score2 = 0
    for i in range(1, len(dataFrame[f1])):
        if _changed(f1, sigValf1, i) and _changed(f2, sigValf2, i):
            score += 0.001
            # Only rows where both features changed are tested against the target.
            if _changed(target, sigValTarget, i):
                score2 += 1
    return score, score2
#Using the results of the pairwise function this function generates a relationship score for every possible pair of features and stores it in a csv file
def generateRelations(dataFrame,target):
    """Score every feature pair with getScores and write the results to Relations.csv.

    Each output row is [Feature 1, Feature 2, Score] where Score is the sum of
    the pair's type-one and type-two scores. Progress is printed per pair.
    """
    # Build the O(n^2) pair list once; the original called pairWise twice
    # (once for the length, once to iterate).
    pairs = pairWise(dataFrame, target)
    length = len(pairs)
    data = []
    for count, (f1, f2) in enumerate(pairs, start=1):
        score, score2 = getScores(f1, f2, target, dataFrame)
        data.append([f1, f2, score + score2])
        print ("{} of {}...".format(count,length))
    new_df = pd.DataFrame(data, columns=['Feature 1','Feature 2','Score'])
    new_df.to_csv(path + "/Relations.csv")
#This function returns the max relation given a list of relations in this format - ['Id', 'MSSubClass', 1613]
def max_score(scoresList):
    """Return the relation with the highest score (element index 2).

    Returns [] when the list is empty or no relation scores above 0.
    """
    best = []
    best_score = 0
    for relation in scoresList:
        if relation[2] > best_score:
            best_score = relation[2]
            best = relation
    return best
#write a function to get all relations of a given feature
def get_relations(feature):
    """Return every [f1, f2, score] row of Relations.csv that involves feature.

    Fix: the original built the path as path + "\\Relations.csv", which only
    resolves on Windows; use a forward slash like every other path in this
    module (cf. getFeaturesRelations).
    """
    location = path + "/Relations.csv"
    rdf = pd.read_csv(location)
    rdfNames = list(rdf)
    # Column 0 is the unnamed index written by to_csv; data starts at column 1.
    f1 = list(rdf[rdfNames[1]])
    f2 = list(rdf[rdfNames[2]])
    score = list(rdf[rdfNames[3]])
    rList = []
    for i in range(len(f1)):
        if f1[i] == feature or f2[i] == feature:
            rList.append([f1[i], f2[i], score[i]])
    return rList
# Write a function to get the link between 2 given features
def getFeaturesRelations(feature1,feature2):
    """Return the relation score linking feature1 and feature2 (either order), or -1 if no row matches."""
    rdf = pd.read_csv(path + "/Relations.csv")
    cols = list(rdf)
    # Column 0 is the unnamed index written by to_csv; data starts at column 1.
    for a, b, s in zip(rdf[cols[1]], rdf[cols[2]], rdf[cols[3]]):
        if (a == feature1 and b == feature2) or (a == feature2 and b == feature1):
            return s
    return -1
#This function returns the feature with the strongest link for the given feature
def get_best_link(feature):
    """Return the partner feature in feature's highest-scoring relation, or -1 if it has none.

    Fix: removed the unreachable trailing `return -1` that followed the
    exhaustive if/else in the original.
    """
    best = max_score(get_relations(feature))
    if best == []:
        # No relation involving this feature scored above 0.
        return -1
    # best is [f1, f2, score]; return whichever endpoint isn't `feature`.
    if feature == best[0]:
        return best[1]
    return best[0]
#Write a function to group features with the strongest links
def groupFeatures (dataFrame):
    """Partition the columns of dataFrame into groups of strongly linked features.

    Works by side effect on the gp.Groups class registry: each feature is
    pulled off a worklist, its strongest link is looked up via get_best_link,
    and the feature is either discarded (no link), added to its link's
    existing group, or paired with its link in a brand-new group.
    Returns None; results live in gp.Groups.
    """
    print("==> Grouping Features...")
    #get a list of all features (note: list(dataFrame) includes the target
    #column, which has no rows in Relations.csv and so ends up discarded)
    all_features = list(dataFrame)
    discard = []
    # print (all_features)
    #While the list of all features is not empty
    while (all_features!= []):
        #Pull the next feature from that list
        feature = all_features[0]
        print ("Curent Feature - {}".format(feature))
        #Get this feature's stongest link
        best_link = get_best_link(feature)
        print ("Best Link - {}".format(best_link))
        #Check if the strongest link is in a group
        print (gp.Groups.is_grouped(best_link))
        print (best_link)
        if(best_link == -1):
            #No scoring relation at all: drop this feature from consideration
            all_features.remove(feature)
            discard.append(feature)
        elif(gp.Groups.is_grouped(best_link)):
            #get the group that it is in
            best_link_group = gp.Groups.get_group(best_link)
            #put this geature in the same group as its stongest link
            best_link_group.add_member(feature)
            #remove the feature from the list of all features
            all_features.remove(feature)
        else:
            #Create a group and put both features in it
            new_group = gp.Groups(feature)
            new_group.add_member(best_link)
            #remove both features from the main list of features
            all_features.remove(feature)
            # if (best_link in all_features):
            # print (all_features)
            #NOTE(review): assumes best_link is still present in all_features;
            #the commented-out guard above suggests a ValueError was once hit
            #here — confirm against the Relations data before relying on it.
            all_features.remove(best_link)
# Write a function that when given a set of features that represents a single perspective returns a dataframe
# with only those features and the data to go along with them. This function should also create a csv file of the
# created Perspective.
def createPerspective (fList,dataFrame,targetName,file_name):
    """Keep only the columns in fList (plus the target), save them to file_name, and return the frame."""
    # Everything not in the perspective and not the target gets dropped.
    dropList = [
        col for col in dataFrame.columns
        if col not in fList and col != targetName
    ]
    newDataFrame = dataFrame.drop(dropList, axis=1)
    newDataFrame.to_csv(path + "/" +file_name)
    return newDataFrame
# Create a function that generate all perspectives as a list of dataFrames given the list of all the
# grouped features.
def generatePerspectives (dataFrame,targetName,selected_features=None):
    """Build one DataFrame per feature group and return them as a list.

    Parameters:
        dataFrame: the full data set.
        targetName: the label column, kept in every perspective.
        selected_features: optional list of feature groups; when None (or
            empty, as before) the groups come from getTopFeatures.

    Each group is also written out as perspect<i>.csv by createPerspective.
    Fix: the original used a mutable default argument (selected_features=[]);
    None is the safe sentinel and preserves the old behavior for all callers.
    """
    groupFeatures(dataFrame)
    print("==> Generating Perspectives...")
    perspectiveList = []
    if not selected_features:
        groups = getTopFeatures(dataFrame,targetName)
        #gp.Groups.print_all_groups()
    else:
        groups = selected_features
    for i, group in enumerate(groups):
        file_name = "perspect"+str(i)+".csv"
        perspectiveList.append(createPerspective (group,dataFrame,targetName,file_name))
    print("From Perspective.py")
    print (len(perspectiveList))
    print("From Groups")
    print (groups)
    return perspectiveList
def viewPerspectives(dataFrame):
    """Group the features of dataFrame and return whatever gp.Groups.print_all_groups reports."""
    groupFeatures(dataFrame)
    return gp.Groups.print_all_groups()
#Write a function that counts the number of perspect[#].csv files to determine the number of perspectives
def countPerspectives():
    """Count consecutive perspect<i>.csv files (starting at i=0) in the working directory.

    Returns the first index i for which perspect<i>.csv does not exist.
    Fix: the original called f.close() in a `finally` even when open() failed,
    so a missing perspect0.csv raised UnboundLocalError instead of returning 0;
    `with` closes the handle only when it was actually opened.
    """
    count = 0
    while True:
        try:
            with open("perspect"+str(count)+".csv"):
                count += 1
        except IOError:
            return count
# Write a function to accept a df and return the name (string) of the top ranking features
def getTopFeatures(dataFrame,targetName):
    """Derive feature groups from RFE rankings merged with the existing gp.Groups.

    Fits a decision-tree-based RFE on dataFrame, keeps the rank-1 features
    whose best link is also rank-1, appends those top features to every
    current group, then appends the top-feature list itself plus a snapshot
    of the original groups. Returns the resulting list of groups.

    Fix: the original removed elements from top_features while iterating it,
    which silently skips the element after each removal; iterating a copy
    visits every feature. get_best_link is also called once per feature
    instead of twice.
    """
    algo = tree.DecisionTreeClassifier()
    # Insert code to - Add top features to the group if it is not already in it.
    dta.convert_discrete(dataFrame,1)
    x_train, x_test, y_train, y_test = dta.data_setup(dataFrame,targetName)
    rfe = RFE(estimator=algo,step=1)
    rfe = rfe.fit(x_train,y_train)
    selected_rfe_features_df = pd.DataFrame({'Feature': list(x_train.columns),'Ranking':rfe.ranking_})
    # Snapshot the groups before mutation so the originals can be re-appended.
    oldGroups = copy.deepcopy(gp.Groups.print_all_groups())
    groups = gp.Groups.print_all_groups()
    # Rank 1 marks the features RFE selected.
    top_features = [feature for feature, rank in selected_rfe_features_df.values if rank == 1]
    # Iterate over a copy: we mutate top_features inside the loop.
    for feature in list(top_features):
        best = get_best_link(feature)
        print("Best Link for {} is {}".format(feature,best))
        if best not in top_features:
            top_features.remove(feature)
    # Make every existing group also contain the surviving top features.
    for per in groups:
        for feature in top_features:
            if feature not in per:
                per.append(feature)
    groups.append(top_features)
    for OG in oldGroups:
        groups.append(OG)
    print ("Top Features")
    print (groups)
    return groups
# dataFrame = pd.read_csv (path + '/botnet_train3.csv')
# sigValCsv(dataFrame,"class")
# generateRelations(dataFrame,"class")
# generatePerspectives (dataFrame,"class")