-
Notifications
You must be signed in to change notification settings - Fork 1
/
SVM_main_code.py
106 lines (91 loc) · 4.28 KB
/
SVM_main_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 9 17:36:31 2017
@author: Abdullah Mobeen
"""
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn import metrics, svm, cross_validation
import time
def normalize(data):
"""Takes as input the name of the file storing data. Returns the feature vectors
and the vector of labels"""
features = []
labels = []
with open(data,'r') as file:
all_features = file.readlines()
for i in range(len(all_features)):
features.append((all_features[i].split(',')[1:]))
labels.append((all_features[i].split(',')[0]))
for i in range(len(labels)):
labels[i] = int(labels[i])
for j in range(len(features[0])):
features[i][j] = (((2*(int(features[i][j])))/255)-1)
features = sp.array(features)
labels = sp.array(labels)
return features,labels
def classifier(train, test,C,gamma):
"""Uses Support Vector Machine to obtain the optimal hyperplane for Multiple Classes.
Then tests the optimal hyperplane for new data in mnist_test.txt.
Prints the results: errors committed, total images analyzed, classifier, and confusion matrix"""
classifier = svm.SVC(C=C,gamma=gamma)
features_train,labels_train = normalize(train)
features_test,labels_test = normalize(test)
classifier.fit(features_train,labels_train)
predicted = classifier.predict(features_test)
error = 0
total = 0
for i in range(len(predicted)):
if int(predicted[i]) != int(labels_test[i]):
error+=1
total+=1
print("Total images analysed:", len(labels_test),'\n',"Errors committed:",error)
print()
print("Error Percentage:",(error/len(predicted))*100,'%')
print()
print("Classification report for classifier %s:\n%s\n"
% (classifier, metrics.classification_report(labels_test, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(labels_test, predicted))
def cross_val(C,gamma):
"""Takes as input the value of C and gamma. Performs cross validation
with 5 folds and returns the mean scores and mean test-errors"""
global features,labels
classifier = svm.SVC(C=C,gamma=gamma)
scores = cross_validation.cross_val_score(classifier,features,labels,cv=5)
scores_mean = np.mean(scores)
test_error = ((1-scores_mean)*100)
print("Using the Value of C =",C,"We get: Cross Val Error =",scores_mean,"and Test Error % =", test_error)
print()
print("Now computing cross validation results for the next value of C. Please wait.")
print()
return test_error,scores_mean
if __name__ == '__main__':
features_1, labels_1 = normalize('mnist_train.txt')
features_2, labels_2 = normalize('mnist_test.txt')
features = np.concatenate((features_1,features_2))
labels = np.concatenate((labels_1,labels_2))
classifier('mnist_train.txt','mnist_test.txt',1,(1/len(features_1[0]))) #Also prints the confusion matrix. Concept understood from the sckit website example.
print()
print("Now doing the cross validation and plotting the graph to find the optimal value of C, it might take some time")
print()
print("Doing various tests on classifier function for the value of gamma, its optimal value has been found to be 1/200")
start_time = time.time()
print()
test_errors = list()
scores = list()
C = [0.1,0.5,1,2,3,5,10,30] #Multiple arbitrary values for C
for c in C:
errors,sc = cross_val(c,1/200) #Experimenting with classification function gave the optimal gamma = 1/200.
test_errors.append(errors)
scores.append(sc)
print("Mean cross validation scores using the respective values of C =",C,":",scores)
print()
print("Mean test errors (%) using the respective values of C =",C,":",test_errors)
plt.plot(C,test_errors,'-b',label = 'Mean Test Errors against values of C')
plt.legend(loc = 'upper right')
end_time = time.time()
print()
print("Total time taken:",round(end_time-start_time,2),'seconds')
print()
print("From the graph, it has been found that C = 3 and gamma = 1/200 gives the least error")