Skip to content

Commit

Permalink
Subject-wise LDA, subject-wise scaling and PCA, renaming folders.
Browse files Browse the repository at this point in the history
Results for LDA have been added to the LDA file in the comments section.
new pca outputs have been stored in the training/pca_data_v2 and testing/pca_data_v2 folders

The dimensions of the PCA output are not the same for each subject.
  • Loading branch information
iankurgarg committed Apr 24, 2017
1 parent 416c4de commit bfb1ef8
Show file tree
Hide file tree
Showing 20 changed files with 55,842 additions and 10,494 deletions.
3,505 changes: 0 additions & 3,505 deletions data/Testing/pca_subject1.csv

This file was deleted.

3,505 changes: 3,505 additions & 0 deletions data/testing/pca_data_v1/pca_subject1.csv

Large diffs are not rendered by default.

6,946 changes: 3,473 additions & 3,473 deletions data/Testing/pca_subject2.csv → data/testing/pca_data_v1/pca_subject2.csv

Large diffs are not rendered by default.

6,978 changes: 3,489 additions & 3,489 deletions data/Testing/pca_subject3.csv → data/testing/pca_data_v1/pca_subject3.csv

Large diffs are not rendered by default.

3,505 changes: 3,505 additions & 0 deletions data/testing/pca_data_v2/pca_subject1.csv

Large diffs are not rendered by default.

3,473 changes: 3,473 additions & 0 deletions data/testing/pca_data_v2/pca_subject2.csv

Large diffs are not rendered by default.

3,489 changes: 3,489 additions & 0 deletions data/testing/pca_data_v2/pca_subject3.csv

Large diffs are not rendered by default.

3,505 changes: 3,504 additions & 1 deletion data/testing/test_subject1_psd04.csv
100644 → 100755

Large diffs are not rendered by default.

10,529 changes: 10,529 additions & 0 deletions data/training/pca_data_v2/pca_subject1.csv

Large diffs are not rendered by default.

10,401 changes: 10,401 additions & 0 deletions data/training/pca_data_v2/pca_subject2.csv

Large diffs are not rendered by default.

10,289 changes: 10,289 additions & 0 deletions data/training/pca_data_v2/pca_subject3.csv

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions python/featureSelection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer



def removeCorrelation(input_data):
    """Return input_data without the two highly correlated columns (24, 72).

    The column indices were determined offline; see rough-sanket.py and the
    R analysis for the derivation.
    """
    correlated_cols = [24, 72]
    kept = list(input_data.columns)
    for column in correlated_cols:
        kept.remove(column)
    return input_data[kept]


def runPCA(input_data, test, d):
    """Fit PCA on the training features and project train and test data.

    Parameters
    ----------
    input_data : DataFrame whose last column is the class label.
    test : DataFrame of test features (no label column).
    d : int, number of principal components to keep.

    Returns
    -------
    (trainDataAfterPCA, testDataAfterPCA) : the train frame carries the d
    components plus a trailing 'Class' column; the test frame holds only
    the d components.
    """
    input_data = removeCorrelation(input_data)
    test = removeCorrelation(test)

    # NOTE(review): Normalizer rescales each *sample row* to unit norm; the
    # commit message says "subject wise scaling", which would normally mean
    # per-feature standardization (StandardScaler) -- confirm intent.
    normZ = Normalizer()
    scaledX = normZ.fit_transform(input_data.iloc[:, :-1])
    scaledTestX = normZ.transform(test)

    pca = PCA()
    pcaX = pd.DataFrame(pca.fit_transform(scaledX))
    # Parenthesized print works under both Python 2 and Python 3.
    print('Approx 98% variance explained by ' + str(d) + ' features: '
          + str(pca.explained_variance_ratio_[:d].sum()))

    # Series.reshape was removed from pandas; go through numpy instead.
    # Building the frame from a plain array also gives it a fresh 0..N-1
    # index, which keeps the axis=1 concat below positionally aligned with
    # pcaX (input_data is a concat of sessions and has duplicate indices).
    trainY = pd.DataFrame(np.asarray(input_data.iloc[:, -1]).reshape(-1, 1),
                          columns=['Class'])

    trainDataAfterPCA = pd.concat([pcaX.iloc[:, :d], trainY], axis=1)

    testDataAfterPCA = pd.DataFrame(pca.transform(scaledTestX)).iloc[:, :d]

    return trainDataAfterPCA, testDataAfterPCA


def mainPCA():
    """Run subject-wise PCA and write the v2 feature files to disk."""
    # Number of principal components retained for each subject.
    dims = {1: 40, 2: 50, 3: 60}
    results = {}

    # Fit/transform every subject first, then write, so that nothing is
    # written unless all three PCA runs succeed.
    for subject in (1, 2, 3):
        sessions = [pd.read_csv('../data/Training/train_subject%d_psd0%d.csv'
                                % (subject, s), header=None)
                    for s in (1, 2, 3)]
        train = pd.concat(sessions, axis=0)
        test = pd.read_csv('../data/Testing/test_subject%d_psd04.csv'
                           % subject, header=None)
        results[subject] = runPCA(train, test, dims[subject])

    for subject in (1, 2, 3):
        results[subject][0].to_csv('../data/Training/pca_data_v2/pca_subject%d.csv'
                                   % subject, index=False)

    for subject in (1, 2, 3):
        results[subject][1].to_csv('../data/Testing/pca_data_v2/pca_subject%d.csv'
                                   % subject, index=False)


if __name__ == '__main__':
    mainPCA()
95 changes: 95 additions & 0 deletions python/linearDiscriminantAnalysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import pandas as pd
import numpy as np
import sys
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score


def LDA(input_data, test, actual):
    """Train an LDA classifier and report CV and held-out test accuracy.

    Parameters
    ----------
    input_data : DataFrame whose last column is the class label.
    test : DataFrame of test-set features.
    actual : true labels for `test`.

    Returns
    -------
    float : test-set accuracy (new, backward-compatible; callers may ignore it).
    """
    model = LinearDiscriminantAnalysis()
    features, labels = input_data.iloc[:, :-1], input_data.iloc[:, -1]
    # 10-fold cross validation estimate on the training set.
    scores = cross_val_score(model, features, labels, cv=10)
    # Parenthesized print works under both Python 2 and Python 3.
    print('Cross Validation Accuracy = ' + str(scores.mean()))
    model.fit(features, labels)
    pred = model.predict(test)
    accuracy = sum(pred == actual) / float(len(actual))
    print('Test Accuracy for the subject is = ' + str(accuracy))
    return accuracy


def mainRawData():
    """Run LDA per subject on the raw (untransformed) PSD features."""
    train, test, actual = {}, {}, {}

    # Training data: three recording sessions per subject, stacked row-wise.
    for subject in (1, 2, 3):
        sessions = [pd.read_csv('../data/Training/train_subject%d_psd0%d.csv'
                                % (subject, s), header=None)
                    for s in (1, 2, 3)]
        train[subject] = pd.concat(sessions, axis=0)

    # Test data and ground-truth labels (labels live in the first column).
    for subject in (1, 2, 3):
        test[subject] = pd.read_csv('../data/Testing/test_subject%d_psd04.csv'
                                    % subject, header=None)
        labels = pd.read_csv('../data/Testing/ActualLables/labels_subject%d_psd.csv'
                             % subject, header=None)
        actual[subject] = labels[0]

    for subject in (1, 2, 3):
        LDA(train[subject], test[subject], actual[subject])

# Results:
#Cross Validation Accuracy = 0.69484335286
#Test Accuracy for the subject is = 0.714611872146
#
#Cross Validation Accuracy = 0.539059371463
#Test Accuracy for the subject is = 0.581221198157
#
#Cross Validation Accuracy = 0.461690546146
#Test Accuracy for the subject is = 0.491685779817

def mainPCAData():
    """Run LDA per subject on the v2 PCA-transformed features."""
    train, test, actual = {}, {}, {}

    for subject in (1, 2, 3):
        train[subject] = pd.read_csv('../data/Training/pca_data_v2/pca_subject%d.csv'
                                     % subject)

    # Test features plus ground-truth labels (labels in the first column).
    for subject in (1, 2, 3):
        test[subject] = pd.read_csv('../data/Testing/pca_data_v2/pca_subject%d.csv'
                                    % subject)
        labels = pd.read_csv('../data/Testing/ActualLables/labels_subject%d_psd.csv'
                             % subject, header=None)
        actual[subject] = labels[0]

    for subject in (1, 2, 3):
        LDA(train[subject], test[subject], actual[subject])

# Results:
#Cross Validation Accuracy = 0.719154247244
#Test Accuracy for the subject is = 0.732591324201
#Cross Validation Accuracy = 0.565501871535
#Test Accuracy for the subject is = 0.613479262673
#Cross Validation Accuracy = 0.482587359569
#Test Accuracy for the subject is = 0.505447247706

# Script entry point: evaluates LDA on the PCA-transformed data by default.
if __name__ == '__main__':
    mainPCAData()
39 changes: 13 additions & 26 deletions python/rough_ankur.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,7 @@
print accuracy



# Linear Discriminant Analysis

#input_data = pd.read_csv('../data/')
input_data = pd.concat([d1, d2, d3], axis=0)
model = LinearDiscriminantAnalysis()
scores = cross_val_score(model, input_data.iloc[:,:-1], input_data.iloc[:,-1], cv = 10)
print scores.mean()

#model.fit(input_data.iloc[:,:-1], input_data.iloc[:,-1])
pred = model.predict(d3.iloc[:,:-1])
actual = d3.iloc[:,-1]
accuracy = sum(pred == actual)/float(len(actual))
print accuracy


d1 = pd.read_csv('../data/train_subject2_psd01.csv',header=None)
d2 = pd.read_csv('../data/train_subject2_psd02.csv',header=None)
d3 = pd.read_csv('../data/train_subject2_psd03.csv',header=None)

d1 = pd.read_csv('../data/train_subject3_psd01.csv',header=None)
d2 = pd.read_csv('../data/train_subject3_psd02.csv',header=None)
d3 = pd.read_csv('../data/train_subject3_psd03.csv',header=None)
# Random Forest



Expand Down Expand Up @@ -122,6 +100,10 @@
from sklearn import preprocessing

train = pd.read_csv('../data/Training/uncorrelated_training_data.csv')
test1_uncor = pd.read_csv('../data/Testing/uncorrelated_subject1_data.csv')
test2_uncor = pd.read_csv('../data/Testing/uncorrelated_subject2_data.csv')
test3_uncor = pd.read_csv('../data/Testing/uncorrelated_subject3_data.csv')

vec = list(train.columns)
train = train[vec[:-1]]
#preprocessing.scale(temp,axis=1,copy=False)
Expand All @@ -136,8 +118,13 @@
test2_pca = pd.DataFrame(test2_pca)
test3_pca = pd.DataFrame(test3_pca)

test1_pca.iloc[:,:40].to_csv('../Data/Testing/pca_subject1.csv', index=False)
test2_pca.iloc[:,:40].to_csv('../Data/Testing/pca_subject2.csv', index=False)
test3_pca.iloc[:,:40].to_csv('../Data/Testing/pca_subject3.csv', index=False)
test1_pca.iloc[:,:75].to_csv('../Data/Testing/pca_subject1.csv', index=False)
test2_pca.iloc[:,:75].to_csv('../Data/Testing/pca_subject2.csv', index=False)
test3_pca.iloc[:,:75].to_csv('../Data/Testing/pca_subject3.csv', index=False)






#

0 comments on commit bfb1ef8

Please sign in to comment.