Skip to content

Commit 6973cce

Browse files
Naive Bayes and SVM added
1 parent cf9828f commit 6973cce

File tree

19 files changed

+16759
-27
lines changed

19 files changed

+16759
-27
lines changed

Gaussian Discriminant Analysis/GDA.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def Xread():
1919
def Yread():
2020
return np.matrix([[(line.strip())] for line in open('y.dat')])
2121

22+
''' Generate Linear Boundary '''
2223
def LinBoundary(u0,u1,S,X):
2324
Mat1 = u0*inv(S) - u1*inv(S)
2425
Mat2 = inv(S)*(u0.T - u1.T)
@@ -33,6 +34,7 @@ def LinBoundary(u0,u1,S,X):
3334
x2.append(temp)
3435
return x1,x2
3536

37+
''' Generate Quadratic Boundary '''
3638
def QuadBoundary(u0,u1,S0,S1,X):
3739
Mat1 = inv(S0) - inv(S1)
3840
Mat2 = (-2)*(inv(S0) * u0.T - inv(S1) *u1.T)
@@ -51,11 +53,11 @@ def QuadBoundary(u0,u1,S0,S1,X):
5153
x2.append(temp/(2*c))
5254
return x1,x2
5355

54-
''' Read input values '''
56+
# Read input values
5557
X = Xread()
5658
Y = Yread()
5759

58-
''' Normalize '''
60+
# Normalize
5961
X_mean = np.mean(X, axis=0)
6062
X_std = np.std(X, axis=0)
6163
X = (X-X_mean)/X_std
@@ -69,12 +71,15 @@ def QuadBoundary(u0,u1,S0,S1,X):
6971
else:
7072
X_zero.append([X.tolist()[i][0], X.tolist()[i][1]])
7173

74+
# Calculate Variables
7275
phi = float(len(X_one))/(len(X_one) + len(X_zero))
7376
mean_0 = np.sum(np.matrix(X_zero), axis=0) / len(X_zero)
7477
mean_1 = np.sum(np.matrix(X_one), axis=0) / len(X_one)
78+
print 'Phi = ', phi,
79+
print 'mean_0 = ', mean_0
80+
print 'mean_1 = ', mean_1
7581

76-
print phi, mean_0, mean_1
77-
82+
# Generate Sigma Matrices
7883
sigma = np.zeros((2, 2))
7984
sigma_0 = np.zeros((2, 2))
8085
sigma_1 = np.zeros((2, 2))

Linear Regression/linear_regression.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,25 +75,25 @@ def gradient_descent(X, Y):
7575
def linear(Theta, x):
7676
return Theta.item(1)*x + Theta.item(0)
7777

78-
''' Read input values '''
78+
# Read input values
7979
X = Xread()
8080
Y = Yread()
8181

82-
''' Normalize '''
82+
# Normalize
8383
X_mean = np.mean(X, axis=0)
8484
X_std = np.std(X, axis=0)
8585
X = (X-X_mean)/X_std
8686
X = np.c_[np.ones((X.shape[0], 1)), X]
8787

88-
''' Perform Gradient Descent '''
88+
# Perform Gradient Descent
8989
FinalTheta = gradient_descent(X, Y)
9090

91-
''' Print Output '''
91+
# Print Output
9292
print 'Analytical Solution\n', analytical_solution(X,Y)
9393
print 'Gradient Decent Solution\n', FinalTheta
9494
print 'Iterations used = ', iteration
9595

96-
''' 2D plot of the hypothesis function '''
96+
### 2D plot of the hypothesis function ###
9797
X_plot = [item[1] for item in X.tolist()]
9898
Y_plot = [item[0] for item in Y.tolist()]
9999
x = np.arange(min(X_plot)-1, max(X_plot)+1, 0.1)
@@ -109,7 +109,7 @@ def linear(Theta, x):
109109
plt.title('House Prices')
110110
plt.show()
111111

112-
''' 3D plot of the J(theta) function '''
112+
### 3D plot of the J(theta) function ###
113113
# Returns the value of J(theta)
114114
def create_J_plot(Theta_0, Theta_1):
115115
Theta = np.matrix([[Theta_0],[Theta_1]])
@@ -120,7 +120,7 @@ def create_J_plot(Theta_0, Theta_1):
120120
ax = fig.add_subplot(111, projection='3d')
121121
ax.set_zlim(-100, 200)
122122

123-
# # Plot the 3D curve
123+
# Plot the 3D curve
124124
A = []; B = []; C = []
125125
theta_0_plot = np.arange(-15, 20, 0.5)
126126
theta_1_plot = np.arange(-15, 20, 0.5)

Logistic Regression/Logistic_Regression.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,21 +48,17 @@ def norm(newTheta, Theta):
4848
def boundary(x, Theta):
4949
return (-Theta.item(1)*x-Theta.item(0))/Theta.item(2)
5050

51-
''' Read input values '''
51+
# Read input values
5252
X = Xread()
5353
Y = Yread()
5454

55-
print X
56-
57-
''' Normalize '''
55+
# Normalize
5856
X_mean = np.mean(X, axis=0)
5957
X_std = np.std(X, axis=0)
6058
X = (X-X_mean)/X_std
6159
X = np.c_[np.ones((X.shape[0], 1)), X]
6260

63-
print X
64-
65-
''' Calculate Boundary using Newton's Method '''
61+
# Calculate Boundary using Newton's Method
6662
Theta = initialize_theta(X.shape[1])
6763
while(True):
6864
iteration += 1
@@ -73,7 +69,7 @@ def boundary(x, Theta):
7369
print 'Theta \n', Theta
7470
print 'Iterations = ', iteration
7571

76-
''' Create 2D Plot of points and classification boundary '''
72+
### Create 2D Plot of points and classification boundary ###
7773
# Create two lists based on classification
7874
X_one = []
7975
X_zero = []

Naive Bayes/naive_bayes.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
'''
2+
===========
3+
Naive Bayes
4+
===========
5+
6+
'''
7+
8+
import math
9+
import numpy as np
10+
11+
''' Read from Input File '''
12+
def readFile(fileName):
    '''Read a whitespace-delimited corpus file into an (n, 2) object array.

    The first token of every line is taken as the category label and the
    remaining tokens as the words of that document.

    Parameters
    ----------
    fileName : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Object array of shape (n, 2). Column 0 holds a one-element list
        containing the label; column 1 holds the list of words (possibly
        empty). Blank lines are skipped.
    '''
    rows = []
    # Context manager so the handle is closed even if parsing fails.
    with open(fileName) as handle:
        for line in handle:
            tokens = line.split()
            if not tokens:
                continue
            rows.append((tokens[:1], tokens[1:]))

    # Fill cell by cell: handing ragged nested lists to np.array() either
    # raises or produces a differently shaped array depending on how many
    # words each line happens to have.
    table = np.empty((len(rows), 2), dtype=object)
    for i, (label, words) in enumerate(rows):
        table[i, 0] = label
        table[i, 1] = words
    return table
24+
25+
''' Remove Duplicate Values '''
26+
def remove_duplicates(values):
    '''Flatten an iterable of iterables, keeping first occurrences only.

    Parameters
    ----------
    values : iterable of iterables
        Groups of hashable items (for example one word list per document).

    Returns
    -------
    list
        Every distinct item, in the order it was first encountered.
    '''
    output = []
    seen = set()
    # Single pass: no intermediate flattened copy of all items is built.
    for group in values:
        for item in group:
            if item not in seen:
                seen.add(item)
                output.append(item)
    return output
37+
38+
''' Get index of category '''
39+
def map_category(mapping, str):
    '''Look up the index stored in `mapping` for the category label `str`.

    Raises KeyError when the label is not a key of `mapping`.
    '''
    index = mapping[str]
    return index
41+
42+
''' Get index of word '''
43+
def map_word(dictionary, str):
    '''Return the index stored for the word `str`, or -1 if it is unknown.

    Parameters
    ----------
    dictionary : dict
        Maps word -> index.
    str : hashable
        Word to look up.

    Returns
    -------
    The value stored under `str`, or -1 when `str` is not a key.
    '''
    try:
        return dictionary[str]
    except KeyError:
        # Only a missing key means "unknown word"; any other error (for
        # example an unhashable argument) is a caller bug and propagates.
        return -1
49+
50+
''' Train Model '''
51+
def train(categories, dictionary, inputs):
    '''Estimate add-one-smoothed word probabilities for every category.

    Parameters
    ----------
    categories : dict
        Maps category label -> row index.
    dictionary : dict
        Maps word -> column index.
    inputs : sequence
        Training documents. Each item is read as item[0][0] for the label
        and item[1] for the iterable of words.

    Returns
    -------
    values : numpy.ndarray, shape (len(categories), len(dictionary), 2)
        values[c, w, 0] is 1 plus the number of occurrences of word w in
        category c; values[c, w, 1] is that count divided by
        word_counts[c, 0].
    word_counts : numpy.ndarray of int, shape (len(categories), 2)
        Column 0 is len(dictionary) plus the number of counted words seen
        in the category; column 1 is the number of documents in it.

    Raises
    ------
    KeyError
        If a document's label is not a key of `categories`.
    '''
    cats = len(categories)
    dicts = len(dictionary)

    # Every count starts at 1 and every denominator at the vocabulary size.
    values = np.empty((cats, dicts, 2))
    values[:, :, 0] = 1
    values[:, :, 1] = 0

    word_counts = np.zeros((cats, 2), dtype=int)
    word_counts[:, 0] = dicts

    for i, curr in enumerate(inputs):
        print(i)  # progress indicator: index of the document being processed
        cat_index = categories[curr[0][0]]
        counted = 0
        for word in curr[1]:
            word_index = dictionary.get(word, -1)
            if word_index == -1:
                # Skip words outside the dictionary; indexing with -1 would
                # silently credit the last dictionary column instead.
                continue
            values[cat_index, word_index, 0] += 1
            counted += 1
        word_counts[cat_index, 0] += counted
        word_counts[cat_index, 1] += 1

    # Normalise each category's counts by that category's denominator.
    values[:, :, 1] = values[:, :, 0] / word_counts[:, :1].astype(float)
    return values, word_counts
79+
80+
''' Test Model '''
81+
def classify(trained, categories, dictionary, classifiers, tests):
    '''Predict a category index for every test document.

    For each category the score is the log of classifiers[i][1] plus the
    sum of log(trained[i][w][1]) over the document's words that are present
    in `dictionary`; the category with the highest score is predicted.

    Parameters
    ----------
    trained : indexable
        trained[i][w][1] is the probability of word index w in category i.
    categories : dict
        Maps category label -> index.
    dictionary : dict
        Maps word -> index. Words that are not keys are ignored.
    classifiers : sequence
        One row per category; element [1] of each row is used as the
        category weight and must be positive.
    tests : sequence
        Documents. Each item is read as item[0][0] for the true label and
        item[1] for the iterable of words.

    Returns
    -------
    numpy.ndarray
        Array of strings with one row per document:
        [true category index, predicted category index]. The result is an
        empty array when `tests` is empty.

    Raises
    ------
    KeyError
        If a document's label is not a key of `categories`.
    '''
    # The per-category weight does not depend on the document.
    log_priors = [math.log(float(row[1])) for row in classifiers]

    rows = []
    for counter, test in enumerate(tests):
        print(counter)  # progress indicator: index of the document
        cat_index = categories[test[0][0]]

        # Resolve word indices once per document, not once per category.
        known = []
        for word in test[1]:
            word_index = dictionary.get(word, -1)
            if word_index != -1:
                known.append(word_index)

        scores = []
        for i, log_prior in enumerate(log_priors):
            score = log_prior
            for word_index in known:
                score += math.log(float(trained[i][word_index][1]))
            scores.append(score)

        # Stored as full strings so multi-digit indices are not truncated
        # to their first character.
        rows.append([str(cat_index), str(int(np.argmax(scores)))])
    return np.array(rows)
104+
105+
# Read Inputs
print("Reading Training Data")
inputs = readFile('nb_data/r8-train-all-terms-new.txt')
print("Reading Test Data")
tests = readFile('nb_data/r8-test-all-terms-new.txt')

# Remove duplicates: distinct labels and distinct words of the training set
categories = remove_duplicates(inputs[:, 0])
dictionary = remove_duplicates(inputs[:, 1])

# Create hash maps from label / word to its position in the lists above
mycats = dict(zip(categories, np.arange(0, len(categories), 1)))
mydict = dict(zip(dictionary, np.arange(0, len(dictionary), 1)))

# Train the data
print("Training Data")
trained_values, classifiers = train(mycats, mydict, inputs)

# Test the data
print("Testing Data")
category = classify(trained_values, mycats, mydict, classifiers, tests)

# Calculate the accuracy of the test input.
# Each row of `category` is [true index, predicted index].
count = 0
correct = 0
for line in category:
    print(line)
    count = count + 1
    if line[0] == line[1]:
        correct = correct + 1
# An empty test set would otherwise divide by zero.
accuracy = float(correct) / float(count) if count else 0.0
print("accuracy =  %s" % accuracy)

# Create the confusion matrix: M[i][j] counts documents of true category i
# that were predicted as category j.
m = len(categories)
M = [[0.0] * m for _ in range(m)]
for line in category:
    i = int(line[0])
    j = int(line[1])
    M[i][j] += 1
N = np.matrix(M)
S = np.sum(N, axis=1)

# Create accuracy confusion matrix: divide every row by its total.
for i in range(m):
    row_total = float(S.item(i))
    if row_total == 0:
        # Category never occurs in the test set; leave its row at zero
        # instead of dividing by zero.
        continue
    for j in range(m):
        N[i, j] = float(N.item((i, j))) / row_total
152+
153+
154+

Naive Bayes/nb_data/.DS_Store

6 KB
Binary file not shown.

0 commit comments

Comments
 (0)