
Commit d7430a3

First Commit
0 parents  commit d7430a3

22 files changed: +297,637 −0 lines changed

Analyse.py (+192 lines)
import Learn as lr
import Data as dta
import Perspective as pst

import copy

from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn import svm


df = dta.load_data("/home/sean/Downloads/Research/Projects/MPML Library/botnet_train3.csv", 50000)

thresh = 5
# Algo = GaussianNB()
# Algo = tree.DecisionTreeClassifier()
Algo = svm.SVC(gamma='scale', probability=True)

# dta.convert_discrete(df, thresh)

# perspectiveList = pst.generatePerspectives(df, "class")

# lr.MPML(df, GaussianNB(), "class", thresh, perspectiveList)


# Removes one perspective at a time and records the ensemble's result without
# each, then hands off to analyseFeatures for a per-feature breakdown.
def analysePerspective(DataFrame, target, instNum):

    print(DataFrame.iloc[instNum])

    models = []
    predictions = []
    y = 0
    y_hat = 0
    d = 0

    y2 = 0
    y2_hat = 0
    d2 = [0, 0]

    impactRatings = []

    perspectiveList = pst.generatePerspectives(DataFrame, target)
    # print("\nResult with all Perspectives")
    models = lr.MPML(DataFrame, Algo, target, thresh, perspectiveList)

    new_df = lr.instancePrediction(perspectiveList, target, models)
    y = lr.majorityVote(new_df)
    print("\n--------------------------------------------------------------------------------------")

    # the last column of new_df holds the average confidence for the instance
    print("Majority vote accuracy with all perspectives = {}".format(y))
    print("Results for instance #{}".format(instNum))
    print("1 is Not | 2 is Bot")
    print("\n======================================================")
    print(new_df.iloc[instNum])

    y2 = new_df.iloc[instNum][-1]

    # record each perspective's stand-alone accuracy on its own test split
    for x in range(0, len(models)):
        x_train, x_test, y_train, y_test = dta.data_setup(perspectiveList[x], target)
        predictions.append(accuracy_score(y_test, models[x].predict(x_test)))

    for i in range(0, len(models)):

        print("\n--------------------------------------------------------------------------------------")
        print("Result without Perspective {}".format(i))

        models2 = copy.deepcopy(models)
        perspectiveList2 = copy.deepcopy(perspectiveList)
        del models2[i]
        del perspectiveList2[i]

        print("Perspective {} accuracy on its own = {}".format(i, predictions[i]))
        new_df = lr.instancePrediction(perspectiveList2, target, models2)
        y_hat = lr.majorityVote(new_df)

        print("Majority vote accuracy without this perspective = {}".format(y_hat))

        d = y - y_hat
        impactRatings.append(d)
        print("Majority vote impact score = {}".format(d))

        print("Confidence level without perspective {}".format(i))
        print(new_df.iloc[instNum][-1])

        y2_hat = new_df.iloc[instNum][-1]

        # print()
        # lr.combinePerspectives(target, GaussianNB(), new_df)
        # print(max(impactRatings))

        perspective = perspectiveList[i].drop(target, axis=1).values
        print("\n======================================================")
        print("Current perspective prediction and confidence:")
        print("Prediction = {}".format(models[i].predict(perspective[instNum].reshape(1, -1))[0]))
        print("Confidence = {}".format(models[i].predict_proba(perspective[instNum].reshape(1, -1))[0] * 100))

        print("\nConfidence impact score: y - ŷ = d")
        d2[0] = y2[0] - y2_hat[0]
        d2[1] = y2[1] - y2_hat[1]
        print(d2)

    analyseFeatures(perspectiveList, target, Algo, instNum)


# Gives the confidence and prediction results for a single model and instance,
# removing one feature at a time from each perspective to measure its impact.
def analyseFeatures(perspectiveList, target, clf, instNum):

    confidence = []
    y = 0
    y_hat = 0
    d = [0, 0]

    for i in range(0, len(perspectiveList)):

        # deep copy so convert_discrete does not mutate the caller's perspectives
        perspectiveList2 = copy.deepcopy(perspectiveList)

        print("\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        print("Confidence with all features")

        dta.convert_discrete(perspectiveList2[i], thresh)
        perspective0 = perspectiveList2[i].drop(target, axis=1).values

        x_train, x_test, y_train, y_test = dta.data_setup(perspectiveList2[i], target)

        clf = clf.fit(x_train, y_train)

        print(clf.predict(perspective0[instNum].reshape(1, -1))[0])
        confidence = clf.predict_proba(perspective0[instNum].reshape(1, -1))[0] * 100

        print(confidence)

        y = confidence

        for feature in perspectiveList2[i].columns:

            perspectiveList3 = copy.deepcopy(perspectiveList2)

            if feature != target:  # do not use the class label as a feature
                print("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

                # print(perspectiveList[i].columns)

                print("Confidence of P{} without feature - {}".format(i, feature))

                del perspectiveList3[i][feature]

                # print(perspectiveList3[i].columns)

                dta.convert_discrete(perspectiveList3[i], thresh)
                perspective = perspectiveList3[i].drop(target, axis=1).values

                x_train, x_test, y_train, y_test = dta.data_setup(perspectiveList3[i], target)

                clf = clf.fit(x_train, y_train)

                # print(clf.predict(perspective[instNum].reshape(1, -1))[0])

                y_hat = clf.predict_proba(perspective[instNum].reshape(1, -1))[0] * 100

                # print(clf.predict_proba(perspective[instNum].reshape(1, -1))[0] * 100)

                print("\nConfidence impact score: y - ŷ = d for features")
                d[0] = y[0] - y_hat[0]
                d[1] = y[1] - y_hat[1]
                print(d)

                print("\n******************************************************************************")
                print("Relations of P{} features - {}".format(i, feature))

                for f2 in perspectiveList3[i].columns:
                    relation = pst.getFeaturesRelations(feature, f2)
                    print("{} & {} => {}".format(feature, f2, relation))


analysePerspective(df, "class", 345)

# TODO: Make this function more efficient by not re-creating all the models
# every time. Find a way to build the models once, then iterate through them
# to get the result on each dataset. A possible shape for that refactor is
# sketched below.
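
# A minimal sketch of the caching refactor the TODO above describes: accept
# prebuilt models and perspectives so repeated calls skip retraining. The
# function name, optional parameters, and return value are hypothetical
# additions; the fallback branch mirrors what analysePerspective already does.
def analysePerspectiveCached(DataFrame, target, instNum,
                             models=None, perspectiveList=None):
    if perspectiveList is None:
        perspectiveList = pst.generatePerspectives(DataFrame, target)
    if models is None:
        models = lr.MPML(DataFrame, Algo, target, thresh, perspectiveList)
    # ...delegate the per-instance analysis to the existing logic here...
    return models, perspectiveList

# Hypothetical usage: the first call trains, later calls reuse the cache.
# cached_models, cached_pl = analysePerspectiveCached(df, "class", 345)
# analysePerspectiveCached(df, "class", 346,
#                          models=cached_models, perspectiveList=cached_pl)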

Classes.py (+51 lines)
# The Groups class manages the groups of features as they are created.

class Groups:

    all_groups = []

    def __init__(self, leader):
        self.leader = leader
        self.members = [leader]
        Groups.all_groups.append(self)

    # Add a member to the group
    def add_member(self, feature):
        self.members.append(feature)

    # Remove a member from the group
    def remove_member(self, feature):
        self.members.remove(feature)

    # Return the list of the group's members
    def get_members(self):
        return self.members

    # Check whether a given feature is in this group
    def is_in(self, feature):
        return feature in self.members

    # Class method that checks whether a feature is in any group
    @classmethod
    def is_grouped(cls, feature):
        for grp in cls.all_groups:
            if grp.is_in(feature):
                return True
        return False

    # Class method that returns the group object for a given feature,
    # or False if the feature is not in any group
    @classmethod
    def get_group(cls, feature):
        for grp in cls.all_groups:
            if grp.is_in(feature):
                return grp
        return False

    # Class method that collects every group's member list
    # (despite the name, it returns the lists rather than printing them)
    @classmethod
    def print_all_groups(cls):
        lst = []
        for grp in cls.all_groups:
            lst.append(grp.get_members())
        return lst
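
# A brief usage sketch of the Groups API; the feature names are made up.
if __name__ == "__main__":
    g = Groups("src_port")                    # new group led by src_port
    g.add_member("dst_port")

    print(Groups.is_grouped("dst_port"))      # True: dst_port is in some group
    print(Groups.get_group("dst_port") is g)  # True: it resolves back to g
    print(Groups.print_all_groups())          # [['src_port', 'dst_port']]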

Data.py (+57 lines)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Accepts a file path and, optionally, the number of rows to load, and returns
# a pandas DataFrame. (The original defined load_data twice, so the second
# definition silently replaced the first; the two are merged here with an
# optional rows parameter.)
def load_data(data, rows=None):
    return pd.read_csv(data, nrows=rows)

# Accepts a DataFrame and a column name and converts the nominal values to
# integers, starting from 1 up to n, where n is the number of distinct values.
def auto_int(dataFrame, col):
    NameList = dataFrame[col].unique()  # the distinct values in the column
    counter = 1
    for name in NameList:
        dataFrame[col] = dataFrame[col].replace([name], counter)  # replace each value with its integer code
        counter += 1

# Check whether a column is discrete or continuous.
# Compute the number of unique values as a percentage of the total number of
# values; if it exceeds the threshold percentage (thrsh, default 1), the column
# is considered continuous. Returns True if the column is discrete and False
# if it is continuous.
def is_discrete(dataFrame, col, thrsh=1):
    UniqueValues = len(dataFrame[col].unique()) * 1.0
    rows = dataFrame[col].shape[0]
    percent = (UniqueValues / rows) * 100.00

    if percent > thrsh:
        return False
    return True

# Converts all discrete-valued columns to int, given a DataFrame and a
# threshold percentage. If a column's unique-value percentage is greater than
# the threshold, it is treated as not discrete and left unchanged.
def convert_discrete(the_df, thrsh):
    colmns = list(the_df)
    for col in colmns:
        if is_discrete(the_df, col, thrsh):
            auto_int(the_df, col)

# Prepares a dataset for training by splitting it into x (features) and
# y (target), then into train and test sets.
def data_setup(dataFrame, target):
    x = dataFrame.drop(target, axis=1)
    y = dataFrame[target]

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    return X_train, X_test, y_train, y_test
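
# A short usage sketch mirroring how Analyse.py calls these helpers; the file
# name is shortened from the full path used there.
if __name__ == "__main__":
    df = load_data("botnet_train3.csv", rows=10000)  # first 10,000 rows
    convert_discrete(df, 5)  # integer-encode columns under a 5% unique-value ratio
    X_train, X_test, y_train, y_test = data_setup(df, "class")
    print(X_train.shape, X_test.shape)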
