NikhilGupta1997
diff --git a/‎Gaussian Discriminant Analysis/GDA.py‎
Lines changed: 9 additions & 4 deletions b/‎Gaussian Discriminant Analysis/GDA.py‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎Linear Regression/linear_regression.py‎
Lines changed: 7 additions & 7 deletions b/‎Linear Regression/linear_regression.py‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎Logistic Regression/Logistic_Regression.py‎
Lines changed: 4 additions & 8 deletions b/‎Logistic Regression/Logistic_Regression.py‎
Lines changed: 4 additions & 8 deletions
diff --git a/‎Naive Bayes/naive_bayes.py‎
Lines changed: 154 additions & 0 deletions b/‎Naive Bayes/naive_bayes.py‎
Lines changed: 154 additions & 0 deletions
diff --git a/‎Naive Bayes/nb_data/.DS_Store‎
6 KB b/‎Naive Bayes/nb_data/.DS_Store‎
6 KB
@@ -19,6 +19,7 @@ def Xread():
 def Yread():
 	return np.matrix([[(line.strip())] for line in open('y.dat')])
 
+''' Generate Linear Boundary '''
 def LinBoundary(u0,u1,S,X):
 	Mat1 = u0*inv(S) - u1*inv(S)
 	Mat2 = inv(S)*(u0.T - u1.T)
@@ -33,6 +34,7 @@ def LinBoundary(u0,u1,S,X):
 		x2.append(temp)
 	return x1,x2
 
+''' Generate Quadratic Boundary '''
 def QuadBoundary(u0,u1,S0,S1,X):
     Mat1 = inv(S0) - inv(S1)
     Mat2 = (-2)*(inv(S0) * u0.T - inv(S1) *u1.T)
@@ -51,11 +53,11 @@ def QuadBoundary(u0,u1,S0,S1,X):
         x2.append(temp/(2*c))
     return x1,x2
 
-''' Read input values '''
+# Read input values
 X = Xread()
 Y = Yread()
 
-''' Normalize '''
+# Normalize
 X_mean = np.mean(X, axis=0)
 X_std = np.std(X, axis=0)
 X = (X-X_mean)/X_std
@@ -69,12 +71,15 @@ def QuadBoundary(u0,u1,S0,S1,X):
 	else:
 		X_zero.append([X.tolist()[i][0], X.tolist()[i][1]])
 
+# Calculate Variables
 phi = float(len(X_one))/(len(X_one) + len(X_zero))
 mean_0 = np.sum(np.matrix(X_zero), axis=0) / len(X_zero)
 mean_1 = np.sum(np.matrix(X_one), axis=0) / len(X_one)
+print 'Phi = ', phi,
+print 'mean_0 = ', mean_0
+print 'mean_1 = ', mean_1
 
-print phi, mean_0, mean_1
-
+# Generate Sigma Matrices
 sigma = np.zeros((2, 2))
 sigma_0 = np.zeros((2, 2))
 sigma_1 = np.zeros((2, 2))
 
@@ -75,25 +75,25 @@ def gradient_descent(X, Y):
 def linear(Theta, x):
 	return Theta.item(1)*x + Theta.item(0)
 
-''' Read input values '''
+# Read input values
 X = Xread()
 Y = Yread()
 
-''' Normalize '''
+# Normalize
 X_mean = np.mean(X, axis=0)
 X_std = np.std(X, axis=0)
 X = (X-X_mean)/X_std
 X = np.c_[np.ones((X.shape[0], 1)), X]
 
-''' Perform Gradient Descent '''
+# Perform Gradient Descent
 FinalTheta = gradient_descent(X, Y)
 
-''' Print Output '''
+# Print Output
 print 'Analytical Solution\n', analytical_solution(X,Y)
 print 'Gradient Decent Solution\n', FinalTheta
 print 'Iterations used = ', iteration
 
-''' 2D plot of the hypothesis function '''
+### 2D plot of the hypothesis function ###
 X_plot = [item[1] for item in X.tolist()]
 Y_plot = [item[0] for item in Y.tolist()]
 x = np.arange(min(X_plot)-1, max(X_plot)+1, 0.1)
@@ -109,7 +109,7 @@ def linear(Theta, x):
 plt.title('House Prices')
 plt.show()
 
-''' 3D plot of the J(theta) function '''
+### 3D plot of the J(theta) function ###
 # Returns the value of J(theta)
 def create_J_plot(Theta_0, Theta_1):
 	Theta = np.matrix([[Theta_0],[Theta_1]])
@@ -120,7 +120,7 @@ def create_J_plot(Theta_0, Theta_1):
 ax = fig.add_subplot(111, projection='3d')
 ax.set_zlim(-100, 200)
 
-# # Plot the 3D curve
+# Plot the 3D curve
 A = []; B = []; C = []
 theta_0_plot = np.arange(-15, 20, 0.5)
 theta_1_plot = np.arange(-15, 20, 0.5)
 
@@ -48,21 +48,17 @@ def norm(newTheta, Theta):
 def boundary(x, Theta):
 	return (-Theta.item(1)*x-Theta.item(0))/Theta.item(2)
 
-''' Read input values '''
+# Read input values
 X = Xread()
 Y = Yread()
 
-print X
-
-''' Normalize '''
+# Normalize
 X_mean = np.mean(X, axis=0)
 X_std = np.std(X, axis=0)
 X = (X-X_mean)/X_std
 X = np.c_[np.ones((X.shape[0], 1)), X]
 
-print X
-
-''' Calculate Boundary using Newton's Method '''
+# Calculate Boundary using Newton's Method
 Theta = initialize_theta(X.shape[1])
 while(True):
 	iteration += 1
@@ -73,7 +69,7 @@ def boundary(x, Theta):
 print 'Theta \n', Theta
 print 'Iterations = ', iteration
 
-''' Create 2D Plot of points and classification boundary '''
+### Create 2D Plot of points and classification boundary ###
 # Create two lists based on classification
 X_one = []
 X_zero = []
 
@@ -0,0 +1,154 @@
+'''
+===========
+Naive Bayes
+===========
+
+'''
+
+import math
+import numpy as np
+
+''' Read from Input File '''
+def readFile(fileName):
+	'''
+	Parse a whitespace-delimited corpus file into an (n, 2) object array.
+
+	Each non-blank line is split on whitespace: the first token is the
+	category label and the remaining tokens are the words of the document.
+
+	Returns an array whose row i is [[label], [word, word, ...]], so the
+	label is read as row[0][0] and the word list as row[1].  Blank lines
+	are skipped.
+	'''
+	rows = []
+	# The context manager closes the file even if parsing fails.
+	with open(fileName) as handle:
+		for line in handle:
+			tokens = line.split()
+			if not tokens:
+				continue
+			rows.append((tokens[:1], tokens[1:]))
+
+	# Assign cell by cell so numpy stores each list as a single object
+	# instead of expanding it into extra array dimensions.
+	table = np.empty((len(rows), 2), dtype=object)
+	for index, (label, words) in enumerate(rows):
+		table[index, 0] = label
+		table[index, 1] = words
+	return table
+
+''' Remove Duplicate Values '''
+def remove_duplicates(values):
+	'''
+	Flatten an iterable of iterables and drop repeated items.
+
+	Returns a list holding every distinct item once, in order of first
+	appearance across the flattened input.
+	'''
+	flattened = [item for group in values for item in group]
+	known = set()
+	unique = []
+	for item in flattened:
+		if item in known:
+			continue
+		known.add(item)
+		unique.append(item)
+	return unique
+
+''' Get index of category '''
+def map_category(mapping, str):
+	'''
+	Return the index stored in `mapping` for the category label `str`.
+
+	Raises KeyError when the label is not a key of `mapping`.  The second
+	parameter keeps its original name, which shadows the builtin `str`
+	inside this function only.
+	'''
+	index = mapping[str]
+	return index
+
+''' Get index of word '''
+def map_word(dictionary, str):
+	'''
+	Return the index stored in `dictionary` for the word `str`.
+
+	Returns -1 when the word is not a key of `dictionary`.  Only the
+	missing-key case is treated as "unknown word"; any other failure
+	(for example a `dictionary` that is not a mapping) propagates instead
+	of being reported as -1.
+	'''
+	try:
+		return dictionary[str]
+	except KeyError:
+		return -1
+
+''' Train Model '''
+def train(categories, dictionary, inputs):
+	'''
+	Estimate smoothed per-category word probabilities from `inputs`.
+
+	categories -- mapping from category label to row index
+	dictionary -- mapping from word to column index
+	inputs     -- rows of [[label], [word, ...]] as produced by readFile
+
+	Returns (values, word_counts):
+	values[c][w][0] is 1 + the number of times word w occurs in category c,
+	values[c][w][1] is values[c][w][0] divided by word_counts[c][0];
+	word_counts[c][0] is len(dictionary) + the number of counted words in
+	category c, and word_counts[c][1] is the number of documents in c.
+	'''
+	cats = len(categories)
+	dicts = len(dictionary)
+
+	# Every count starts at 1 and every denominator at the dictionary size
+	# (add-one smoothing), so no probability is ever zero.
+	values = np.zeros((cats, dicts, 2))
+	values[:, :, 0] = 1
+	word_counts = np.zeros((cats, 2), dtype=int)
+	word_counts[:, 0] = dicts
+
+	for i, curr in enumerate(inputs):
+		print(i)
+		cat_index = map_category(categories, curr[0][0])
+		counted = 0
+		for word in curr[1]:
+			word_index = map_word(dictionary, word)
+			# -1 means "not in the dictionary"; used as an index it would
+			# wrap around and credit the last dictionary word instead.
+			if word_index == -1:
+				continue
+			values[cat_index, word_index, 0] += 1
+			counted += 1
+		word_counts[cat_index, 0] += counted
+		word_counts[cat_index, 1] += 1
+
+	# Turn counts into probabilities, one denominator per category row.
+	totals = word_counts[:, 0].reshape(-1, 1).astype(float)
+	values[:, :, 1] = values[:, :, 0] / totals
+	return values, word_counts
+
+''' Test Model '''
+def classify(trained, categories, dictionary, classifiers, tests):
+	'''
+	Predict a category for every row of `tests`.
+
+	trained     -- array from train(); trained[c][w][1] is the probability
+	               of word w in category c
+	categories  -- mapping from category label to index
+	dictionary  -- mapping from word to index
+	classifiers -- per-category [word total, document count] rows from train()
+	tests       -- rows of [[label], [word, ...]] as produced by readFile
+
+	Returns an integer array with one row per test document:
+	[index of the true category, index of the predicted category].
+	The prediction maximises the sum of the log word probabilities plus
+	the log of the category's document count; words that are not in
+	`dictionary` are ignored.
+	'''
+	# Integer dtype so indices of any width are stored exactly (a string
+	# array of width 1 would truncate indices >= 10 to their first digit).
+	output = np.zeros((len(tests), 2), dtype=int)
+
+	# Logs are loop-invariant, so take them once for all documents.
+	log_likelihood = np.log(trained[:, :, 1])
+	log_prior = np.log(np.asarray(classifiers, dtype=float).reshape(-1, 2)[:, 1])
+
+	for counter, test in enumerate(tests):
+		print(counter)
+		cat_index = map_category(categories, test[0][0])
+		# Resolve each word once rather than once per category.
+		indices = [map_word(dictionary, word) for word in test[1]]
+		known = np.array([k for k in indices if k != -1], dtype=int)
+		scores = log_likelihood[:, known].sum(axis=1) + log_prior
+		output[counter, 0] = cat_index
+		output[counter, 1] = scores.argmax()
+	return output
+
+# Read Inputs
+print "Reading Training Data"
+inputs = readFile('nb_data/r8-train-all-terms-new.txt')
+print "Reading Test Data"
+tests = readFile('nb_data/r8-test-all-terms-new.txt')
+
+# Remove duplicates: distinct labels and distinct words of the training set
+categories = remove_duplicates(inputs[:,0])
+dictionary = remove_duplicates(inputs[:,1])
+
+# Create hash maps (label -> index, word -> index)
+mycats = dict(zip(categories, np.arange(0, len(categories), 1)))
+mydict = dict(zip(dictionary, np.arange(0, len(dictionary), 1)))
+
+# Train the data
+print "Training Data"
+trained_values, classifiers = train(mycats, mydict, inputs)
+
+# Test the data
+print "Testing Data"
+category = classify(trained_values, mycats, mydict, classifiers, tests)
+
+# Calculate the accuracy of the test input
+# Each row of `category` is [true index, predicted index].
+for line in category:
+	print line
+count = len(category)
+correct = sum(1 for line in category if line[0] == line[1])
+print "accuracy = ", float(correct) / float(count)
+
+# Create the confusion matrix: M[i][j] counts documents of true category i
+# that were predicted as category j.
+m = len(categories)
+M = [[0.0] * m for _ in xrange(m)]
+for line in category:
+	i = int(line[0])
+	j = int(line[1])
+	M[i][j] += 1
+N = np.matrix(M)
+S = np.sum(N, axis=1)
+
+# Create accuracy confusion matrix: divide every row by its total
+for i in xrange(m):
+	total = float(S.item(i))
+	# A category with no test documents has an all-zero row; leave it as
+	# is rather than dividing by zero.
+	if total == 0:
+		continue
+	for j in xrange(m):
+		N[i,j] = float(N.item((i, j))) / total
+
+
+