
Commit d7430a3

First Commit
0 parents  commit d7430a3

22 files changed: +297,637 −0 lines changed

Analyse.py (+192 lines)
import Learn as lr
import Data as dta
import Perspective as pst

import copy

from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn import svm


df = dta.load_data("/home/sean/Downloads/Research/Projects/MPML Library/botnet_train3.csv", 50000)

thresh = 5
# Algo = GaussianNB()
# Algo = tree.DecisionTreeClassifier()
Algo = svm.SVC(gamma='scale', probability=True)

# dta.convert_discrete(df, thresh)

# perspectiveList = pst.generatePerspectives(df, "class")

# lr.MPML(df, GaussianNB(), "class", thresh, perspectiveList)


# Removes one perspective at a time and records the ensemble's result without
# each, then hands off to analyseFeatures for a per-feature breakdown.
def analysePerspective(DataFrame, target, instNum):

    print(DataFrame.iloc[instNum])

    models = []
    predictions = []
    y = 0
    y_hat = 0
    d = 0

    y2 = 0
    y2_hat = 0
    d2 = [0, 0]

    impactRatings = []

    perspectiveList = pst.generatePerspectives(DataFrame, target)
    # print("\nResult with all Perspectives")
    models = lr.MPML(DataFrame, Algo, target, thresh, perspectiveList)

    new_df = lr.instancePrediction(perspectiveList, target, models)
    y = lr.majorityVote(new_df)
    print("\n--------------------------------------------------------------------------------------")

    # the last column of new_df holds the average confidence for the instance
    print("Majority vote accuracy with all perspectives = {}".format(y))
    print("Results for instance #{}".format(instNum))
    print("1 is Not | 2 is Bot")
    print("\n======================================================")
    print(new_df.iloc[instNum])

    y2 = new_df.iloc[instNum][-1]

    # record each perspective's stand-alone accuracy on its own test split
    for x in range(0, len(models)):
        x_train, x_test, y_train, y_test = dta.data_setup(perspectiveList[x], target)
        predictions.append(accuracy_score(y_test, models[x].predict(x_test)))

    for i in range(0, len(models)):

        print("\n--------------------------------------------------------------------------------------")
        print("Result without Perspective {}".format(i))

        models2 = copy.deepcopy(models)
        perspectiveList2 = copy.deepcopy(perspectiveList)
        del models2[i]
        del perspectiveList2[i]

        print("Perspective {} accuracy on its own = {}".format(i, predictions[i]))
        new_df = lr.instancePrediction(perspectiveList2, target, models2)
        y_hat = lr.majorityVote(new_df)

        print("Majority vote accuracy without this perspective = {}".format(y_hat))

        d = y - y_hat
        impactRatings.append(d)
        print("Majority vote impact score = {}".format(d))

        print("Confidence level without perspective {}".format(i))
        print(new_df.iloc[instNum][-1])

        y2_hat = new_df.iloc[instNum][-1]

        # print()
        # lr.combinePerspectives(target, GaussianNB(), new_df)
        # print(max(impactRatings))

        perspective = perspectiveList[i].drop(target, axis=1).values
        print("\n======================================================")
        print("Current perspective prediction and confidence:")
        print("Prediction = {}".format(models[i].predict(perspective[instNum].reshape(1, -1))[0]))
        print("Confidence = {}".format(models[i].predict_proba(perspective[instNum].reshape(1, -1))[0] * 100))

        print("\nConfidence impact score: y - ŷ = d")
        d2[0] = y2[0] - y2_hat[0]
        d2[1] = y2[1] - y2_hat[1]
        print(d2)

    analyseFeatures(perspectiveList, target, Algo, instNum)


# Gives the confidence and prediction results for a single model and instance,
# removing one feature at a time from each perspective to measure its impact.
def analyseFeatures(perspectiveList, target, clf, instNum):

    confidence = []
    y = 0
    y_hat = 0
    d = [0, 0]

    for i in range(0, len(perspectiveList)):

        # deep copy so convert_discrete does not mutate the caller's perspectives
        perspectiveList2 = copy.deepcopy(perspectiveList)

        print("\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        print("Confidence with all features")

        dta.convert_discrete(perspectiveList2[i], thresh)
        perspective0 = perspectiveList2[i].drop(target, axis=1).values

        x_train, x_test, y_train, y_test = dta.data_setup(perspectiveList2[i], target)

        clf = clf.fit(x_train, y_train)

        print(clf.predict(perspective0[instNum].reshape(1, -1))[0])
        confidence = clf.predict_proba(perspective0[instNum].reshape(1, -1))[0] * 100

        print(confidence)

        y = confidence

        for feature in perspectiveList2[i].columns:

            perspectiveList3 = copy.deepcopy(perspectiveList2)

            if feature != target:  # do not use the class label as a feature
                print("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

                # print(perspectiveList[i].columns)

                print("Confidence of P{} without feature - {}".format(i, feature))

                del perspectiveList3[i][feature]

                # print(perspectiveList3[i].columns)

                dta.convert_discrete(perspectiveList3[i], thresh)
                perspective = perspectiveList3[i].drop(target, axis=1).values

                x_train, x_test, y_train, y_test = dta.data_setup(perspectiveList3[i], target)

                clf = clf.fit(x_train, y_train)

                # print(clf.predict(perspective[instNum].reshape(1, -1))[0])

                y_hat = clf.predict_proba(perspective[instNum].reshape(1, -1))[0] * 100

                # print(clf.predict_proba(perspective[instNum].reshape(1, -1))[0] * 100)

                print("\nConfidence impact score: y - ŷ = d for features")
                d[0] = y[0] - y_hat[0]
                d[1] = y[1] - y_hat[1]
                print(d)

                print("\n******************************************************************************")
                print("Relations of P{} features - {}".format(i, feature))

                for f2 in perspectiveList3[i].columns:
                    relation = pst.getFeaturesRelations(feature, f2)
                    print("{} & {} => {}".format(feature, f2, relation))


analysePerspective(df, "class", 345)

# TODO: Make this function more efficient by not re-creating all the models
# every time. Find a way to build the models once, then iterate through them
# to get the result on each dataset. A possible shape for that refactor is
# sketched below.
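
# A minimal sketch of the caching refactor the TODO above describes: accept
# prebuilt models and perspectives so repeated calls skip retraining. The
# function name, optional parameters, and return value are hypothetical
# additions; the fallback branch mirrors what analysePerspective already does.
def analysePerspectiveCached(DataFrame, target, instNum,
                             models=None, perspectiveList=None):
    if perspectiveList is None:
        perspectiveList = pst.generatePerspectives(DataFrame, target)
    if models is None:
        models = lr.MPML(DataFrame, Algo, target, thresh, perspectiveList)
    # ...delegate the per-instance analysis to the existing logic here...
    return models, perspectiveList

# Hypothetical usage: the first call trains, later calls reuse the cache.
# cached_models, cached_pl = analysePerspectiveCached(df, "class", 345)
# analysePerspectiveCached(df, "class", 346,
#                          models=cached_models, perspectiveList=cached_pl)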

Classes.py (+51 lines)
# The Groups class manages the groups of features as they are created.

class Groups:

    all_groups = []

    def __init__(self, leader):
        self.leader = leader
        self.members = [leader]
        Groups.all_groups.append(self)

    # Add a member to the group
    def add_member(self, feature):
        self.members.append(feature)

    # Remove a member from the group
    def remove_member(self, feature):
        self.members.remove(feature)

    # Return the list of the group's members
    def get_members(self):
        return self.members

    # Check whether a given feature is in this group
    def is_in(self, feature):
        return feature in self.members

    # Class method that checks whether a feature is in any group
    @classmethod
    def is_grouped(cls, feature):
        for grp in cls.all_groups:
            if grp.is_in(feature):
                return True
        return False

    # Class method that returns the group object for a given feature,
    # or False if the feature is not in any group
    @classmethod
    def get_group(cls, feature):
        for grp in cls.all_groups:
            if grp.is_in(feature):
                return grp
        return False

    # Class method that collects every group's member list
    # (despite the name, it returns the lists rather than printing them)
    @classmethod
    def print_all_groups(cls):
        lst = []
        for grp in cls.all_groups:
            lst.append(grp.get_members())
        return lst
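
# A brief usage sketch of the Groups API; the feature names are made up.
if __name__ == "__main__":
    g = Groups("src_port")                    # new group led by src_port
    g.add_member("dst_port")

    print(Groups.is_grouped("dst_port"))      # True: dst_port is in some group
    print(Groups.get_group("dst_port") is g)  # True: it resolves back to g
    print(Groups.print_all_groups())          # [['src_port', 'dst_port']]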

Data.py (+57 lines)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Accepts a file path and, optionally, the number of rows to load, and returns
# a pandas DataFrame. (The original defined load_data twice, so the second
# definition silently replaced the first; the two are merged here with an
# optional rows parameter.)
def load_data(data, rows=None):
    return pd.read_csv(data, nrows=rows)

# Accepts a DataFrame and a column name and converts the nominal values to
# integers, starting from 1 up to n, where n is the number of distinct values.
def auto_int(dataFrame, col):
    NameList = dataFrame[col].unique()  # the distinct values in the column
    counter = 1
    for name in NameList:
        dataFrame[col] = dataFrame[col].replace([name], counter)  # replace each value with its integer code
        counter += 1

# Check whether a column is discrete or continuous.
# Compute the number of unique values as a percentage of the total number of
# values; if it exceeds the threshold percentage (thrsh, default 1), the column
# is considered continuous. Returns True if the column is discrete and False
# if it is continuous.
def is_discrete(dataFrame, col, thrsh=1):
    UniqueValues = len(dataFrame[col].unique()) * 1.0
    rows = dataFrame[col].shape[0]
    percent = (UniqueValues / rows) * 100.00

    if percent > thrsh:
        return False
    return True

# Converts all discrete-valued columns to int, given a DataFrame and a
# threshold percentage. If a column's unique-value percentage is greater than
# the threshold, it is treated as not discrete and left unchanged.
def convert_discrete(the_df, thrsh):
    colmns = list(the_df)
    for col in colmns:
        if is_discrete(the_df, col, thrsh):
            auto_int(the_df, col)

# Prepares a dataset for training by splitting it into x (features) and
# y (target), then into train and test sets.
def data_setup(dataFrame, target):
    x = dataFrame.drop(target, axis=1)
    y = dataFrame[target]

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    return X_train, X_test, y_train, y_test
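
# A short usage sketch mirroring how Analyse.py calls these helpers; the file
# name is shortened from the full path used there.
if __name__ == "__main__":
    df = load_data("botnet_train3.csv", rows=10000)  # first 10,000 rows
    convert_discrete(df, 5)  # integer-encode columns under a 5% unique-value ratio
    X_train, X_test, y_train, y_test = data_setup(df, "class")
    print(X_train.shape, X_test.shape)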
