Skip to content

Commit

Permalink
Update Naive Bayes
Browse files Browse the repository at this point in the history
  • Loading branch information
avs8687 committed Nov 1, 2018
1 parent 544ef25 commit cd8ec7f
Showing 1 changed file with 71 additions and 121 deletions.
192 changes: 71 additions & 121 deletions Data Analytics/Naive Bayes/NB.py
Original file line number Diff line number Diff line change
@@ -1,137 +1,87 @@
import pandas as pd
import math
from statistics import mean

class NB:
def __init__(self):
    """Create a classifier with no data loaded and zeroed class priors."""
    # Placeholder for the dataset; starts out as an empty list.
    self.data = []
    # Category names handed to the discretisation step (five buckets).
    self.labels = ['very low', 'low', 'medium', 'high', 'very high']
    # Prior probabilities of class 0 and class 1; both start at zero.
    self.prob_0 = self.prob_1 = 0
#read the data
data = pd.read_csv('diabetes.csv')

def replace(self, avg, i, data):
    """Overwrite every zero in one column of ``data`` with ``avg``.

    Zeros are treated as missing-value markers. The frame is modified in
    place and is also returned.

    Args:
        avg: Value written over each zero cell.
        i: Zero-based positional index of the column to clean.
        data: pandas DataFrame to update.

    Returns:
        The same ``data`` object that was passed in.
    """
    # A single vectorised comparison replaces the per-cell Python loop.
    zero_rows = (data.iloc[:, i] == 0).to_numpy()
    # Skip the write entirely when nothing matches, so an untouched column
    # keeps its dtype exactly as a cell-by-cell update would.
    if zero_rows.any():
        data.iloc[zero_rows, i] = avg
    return data
#labels for discretization
labels = ['low','medium','high']

def discretize(self, data, labels):
    """Bin every column of ``data`` except the last into named categories.

    Each such column is replaced in place by ``pd.cut`` of itself, using
    ``len(labels)`` bins and ``labels`` as the bin names. The last column
    is left untouched.

    Args:
        data: pandas DataFrame whose leading columns are numeric.
        labels: Sequence of category names, one per bin.

    Returns:
        The same ``data`` object that was passed in.
    """
    n_bins = len(labels)
    for column in data.columns[:-1]:
        data[column] = pd.cut(data[column], bins=n_bins, labels=labels)
    return data
# Preprocessing: in every column but the last, replace zeros with the column
# mean (computed with the zeros still included), then bin into `labels`.
for j in data.columns[:-1]:
    # Named `column_mean` so the `mean` imported from `statistics` at the top
    # of the file is not rebound to a number.
    column_mean = data[j].mean()
    data[j] = data[j].replace(0, column_mean)
    data[j] = pd.cut(data[j], bins=len(labels), labels=labels)

def count(self, data, col, source, target):
    """Count rows where ``col`` equals ``source`` and the last column equals ``target``.

    Args:
        data: pandas DataFrame; its last column is compared with ``target``.
        col: Label of the column compared with ``source``.
        source: Value to look for in ``col``.
        target: Value to look for in the last column.

    Returns:
        Number of rows satisfying both conditions, as a plain ``int``.
    """
    # Vectorised form of the row-by-row scan: build both masks once and
    # count the rows where they overlap.
    matches = (data[col] == source) & (data.iloc[:, -1] == target)
    return int(matches.sum())
#train test split
split_per = [80,70,60]

def probability(self, data, col, source, target):
    """Return the share of class-``target`` rows whose ``col`` equals ``source``.

    The numerator is ``self.count(data, col, source, target)`` and the
    denominator is ``self.count(data, 'Outcome', target, target)``.

    Raises:
        ZeroDivisionError: If the denominator count is zero.
    """
    class_total = self.count(data, 'Outcome', target, target)
    joint_total = self.count(data, col, source, target)
    return joint_total / class_total
def count(data, colname, label, target, outcome_col='Outcome'):
    """Count rows where ``colname`` equals ``label`` and the class column equals ``target``.

    Args:
        data: pandas DataFrame to inspect.
        colname: Label of the column compared with ``label``.
        label: Value to look for in ``colname``.
        target: Value to look for in the class column.
        outcome_col: Label of the class column. Defaults to ``'Outcome'``,
            so existing four-argument calls behave as before.

    Returns:
        Number of matching rows, as a plain ``int``.
    """
    matches = (data[colname] == label) & (data[outcome_col] == target)
    # Summing the boolean mask counts the hits without building a filtered frame.
    return int(matches.sum())

def train(self, data):
    """Build the per-feature conditional probability tables.

    For every column of ``data`` except the last, and for every name in
    ``self.labels``, ``self.probability`` is queried once for class 1 and
    once for class 0.

    Returns:
        A list with one single-key dict per feature, in column order, shaped
        ``{feature: {1: {label: p, ...}, 0: {label: p, ...}}}``.
    """
    model = []
    for feature in data.columns[:-1]:
        given_pos = {}
        given_neg = {}
        for label in self.labels:
            given_pos[label] = self.probability(data, feature, label, 1)
            given_neg[label] = self.probability(data, feature, label, 0)
        model.append({feature: {1: given_pos, 0: given_neg}})
    return model
#Process starts here
for i in split_per:

def test(self, data, temp):
    """Predict class 1 or 0 for every row of ``data``.

    Args:
        data: pandas DataFrame of feature columns only. Column order must
            match the order of the tables in ``temp``.
        temp: List of per-feature tables shaped
            ``{feature: {1: {value: p}, 0: {value: p}}}``.

    Returns:
        A list with one prediction (1 or 0) per row, in row order.
    """
    columns = list(data.columns)
    predictions = []
    for row in range(len(data)):
        # Running products of the per-feature conditional probabilities.
        # Both start at 1; no class prior is multiplied in here.
        score_pos = 1
        score_neg = 1
        for position, feature in enumerate(columns):
            value = data.iloc[row, position]
            table = temp[position][feature]
            score_pos *= table[1][value]
            score_neg *= table[0][value]
        # Class 1 wins only on a strictly larger score; ties go to class 0.
        predictions.append(1 if score_pos > score_neg else 0)
    return predictions
#result list to store predicted values
predicted = []

def process(self,train_per):
#dictionary to store probabilities
probabilities = {0:{},1:{}}

#Read the input file
self.data = pd.read_csv("diabetes.csv")

# Traverse respective columns and replace missing values with the mean
for i in range(1,len(list(self.data.columns))-1):
avg = mean(self.data.iloc[:,i])
self.data = self.replace(math.floor(avg),i,self.data)

#Discretize the data
self.data = self.discretize(self.data,self.labels)
#calculate training length
train_len = int((i*len(data))/100)

#Calculate probability of output being 1 or 0
self.prob_1 = self.probability(self.data,'Outcome',1,1)
self.prob_0 = self.probability(self.data,'Outcome',0,0)
#Split training and testing data
train_X = data.iloc[:train_len,:]

# Split dataset into training and test data
train_len = len(self.data)*train_per/100
train_len = math.floor(train_len)
train_data = self.data.iloc[:train_len,:]
test_data = self.data.iloc[train_len+1:,:]
test_X = data.iloc[train_len+1:,:-1]
test_y = data.iloc[train_len+1:,-1]

#Train the model
temp = self.train(train_data)
#count total number of 0s and 1s
count_0 = count(train_X,'Outcome',0,0)
count_1 = count(train_X,'Outcome',1,1)

prob_0 = count_0/len(train_X)
prob_1 = count_1/len(train_X)

#Test the model
results = self.test(test_data.iloc[:,:-1],temp)
#Train the model
for j in train_X.columns[:-1]:

#Calculate confusion matrix
"""
TP : Actual Yes Predicted Yes
TN : Actual No Predicted No
FP : Actual No Predicted Yes
FN : Actual Yes Predicted No
"""
tp,tn = 0,0
fp,fn = 0,0
for i in range(0,len(results)):
if test_data.iloc[i,-1] == 1:
if results[i] == test_data.iloc[i,-1]:
tp+=1
else:
fn+=1
probabilities[0][j] = {}
probabilities[1][j] = {}

for k in labels:
count_k_0 = count(train_X,j,k,0)
count_k_1 = count(train_X,j,k,1)

probabilities[0][j][k] = count_k_0 / count_0
probabilities[1][j][k] = count_k_1 / count_1

#Test the model
for row in range(0,len(test_X)):
prod_0 = prob_0
prod_1 = prob_1
for feature in test_X.columns:
prod_0 *= probabilities[0][feature][test_X[feature].iloc[row]]
prod_1 *= probabilities[1][feature][test_X[feature].iloc[row]]

#Predict the outcome
if prod_0 > prod_1:
predicted.append(0)
else:
predicted.append(1)

#create confusion matrix
tp,tn,fp,fn = 0,0,0,0
for j in range(0,len(predicted)):
if predicted[j] == 0:
if test_y.iloc[j] == 0:
tp += 1
else:
if results[i] == test_data.iloc[i,-1]:
tn+=1
else:
fp+=1

accuracy = (tp+tn)/(tp+tn+fp+fn)
misclassification = (fp+fn)/(tp+tn+fp+fn)
print('\n----------------------------------------------')
print('\nConfusion Matrix with Training : '+str(train_per)+' Test : '+str(100-train_per))
print('\n\t\tActual Yes\tActual No')
print('Predicted Yes\tTP='+str(tp)+'\t\tFP='+str(fp))
print('Predicted No \tFN='+str(fn)+'\t\tTN='+str(tn))
print('\nAccuracy : ',(accuracy*100))
print('Misclassification rate : ',(misclassification*100))

n = NB()
n.process(70)
n.process(30)
n.process(80)
n.process(20)
fp += 1
else:
if test_y.iloc[j] == 1:
tn += 1
else:
fn += 1
print('Accuracy for training length '+str(i)+'% : ',((tp+tn)/len(test_y))*100)

0 comments on commit cd8ec7f

Please sign in to comment.