Split train/test in process.py; normalize with train stats only

lazyprogrammer · lazyprogrammer · commit a45a3b25140c · 2018-01-02T22:41:36.000-05:00
diff --git a/ann_logistic_extra/ann_predict.py b/ann_logistic_extra/ann_predict.py
@@ -7,7 +7,7 @@
 import numpy as np
 from process import get_data
 
-X, Y = get_data()
+X, Y, _, _ = get_data()
 
 # randomly initialize weights
 M = 5
diff --git a/ann_logistic_extra/ann_train.py b/ann_logistic_extra/ann_train.py
@@ -17,19 +17,13 @@ def y2indicator(y, K):
         ind[i, y[i]] = 1
     return ind
 
-X, Y = get_data()
-X, Y = shuffle(X, Y)
-Y = Y.astype(np.int32)
-M = 5
-D = X.shape[1]
-K = len(set(Y))
-
-# create train and test sets
-Xtrain = X[:-100]
-Ytrain = Y[:-100]
+Xtrain, Ytrain, Xtest, Ytest = get_data()
+D = Xtrain.shape[1]
+K = len(set(Ytrain) | set(Ytest))
+M = 5 # num hidden units
+
+# convert to indicator
 Ytrain_ind = y2indicator(Ytrain, K)
-Xtest = X[-100:]
-Ytest = Y[-100:]
 Ytest_ind = y2indicator(Ytest, K)
 
 # randomly initialize weights
diff --git a/ann_logistic_extra/logistic_predict.py b/ann_logistic_extra/logistic_predict.py
@@ -7,7 +7,7 @@
 import numpy as np
 from process import get_binary_data
 
-X, Y = get_binary_data()
+X, Y, _, _ = get_binary_data()
 
 # randomly initialize weights
 D = X.shape[1]
diff --git a/ann_logistic_extra/logistic_softmax_train.py b/ann_logistic_extra/logistic_softmax_train.py
@@ -17,18 +17,12 @@ def y2indicator(y, K):
         ind[i, y[i]] = 1
     return ind
 
-X, Y = get_data()
-X, Y = shuffle(X, Y)
-Y = Y.astype(np.int32)
-D = X.shape[1]
-K = len(set(Y))
-
-# create train and test sets
-Xtrain = X[:-100]
-Ytrain = Y[:-100]
+Xtrain, Ytrain, Xtest, Ytest = get_data()
+D = Xtrain.shape[1]
+K = len(set(Ytrain) | set(Ytest))
+
+# convert to indicator
 Ytrain_ind = y2indicator(Ytrain, K)
-Xtest = X[-100:]
-Ytest = Y[-100:]
 Ytest_ind = y2indicator(Ytest, K)
 
 # randomly initialize weights
diff --git a/ann_logistic_extra/logistic_train.py b/ann_logistic_extra/logistic_train.py
@@ -10,17 +10,11 @@
 from sklearn.utils import shuffle
 from process import get_binary_data
 
-X, Y = get_binary_data()
-X, Y = shuffle(X, Y)
-
-# create train and test sets
-Xtrain = X[:-100]
-Ytrain = Y[:-100]
-Xtest = X[-100:]
-Ytest = Y[-100:]
+# get the data
+Xtrain, Ytrain, Xtest, Ytest = get_binary_data()
 
 # randomly initialize weights
-D = X.shape[1]
+D = Xtrain.shape[1]
 W = np.random.randn(D)
 b = 0 # bias term
 
diff --git a/ann_logistic_extra/process.py b/ann_logistic_extra/process.py
@@ -15,43 +15,62 @@
 # one-hot categorical columns
 
 def get_data():
-    df = pd.read_csv(dir_path + '/ecommerce_data.csv')
+  df = pd.read_csv(dir_path + '/ecommerce_data.csv')
 
-    # just in case you're curious what's in it
-    # df.head()
+  # just in case you're curious what's in it
+  # df.head()
 
-    # easier to work with numpy array
-    data = df.as_matrix()
+  # easier to work with numpy array
+  data = df.as_matrix()
 
-    X = data[:,:-1]
-    Y = data[:,-1]
+  # shuffle the rows in place so the train/test split below is random
+  np.random.shuffle(data)
 
-    # normalize columns 1 and 2
-    X[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std()
-    X[:,2] = (X[:,2] - X[:,2].mean()) / X[:,2].std()
+  # split features and labels
+  X = data[:,:-1]
+  Y = data[:,-1].astype(np.int32)
 
-    # create a new matrix X2 with the correct number of columns
-    N, D = X.shape
-    X2 = np.zeros((N, D+3))
-    X2[:,0:(D-1)] = X[:,0:(D-1)] # non-categorical
+  # one-hot encode the categorical data
+  # create a new matrix X2 with the correct number of columns
+  N, D = X.shape
+  X2 = np.zeros((N, D+3))
+  X2[:,0:(D-1)] = X[:,0:(D-1)] # non-categorical
 
-    # one-hot
-    for n in range(N):
-        t = int(X[n,D-1])
-        X2[n,t+D-1] = 1
+  # one-hot
+  for n in range(N):
+      t = int(X[n,D-1])
+      X2[n,t+D-1] = 1
 
-    # method 2
-    # Z = np.zeros((N, 4))
-    # Z[np.arange(N), X[:,D-1].astype(np.int32)] = 1
-    # # assign: X2[:,-4:] = Z
-    # assert(np.abs(X2[:,-4:] - Z).sum() < 1e-10)
+  # method 2
+  # Z = np.zeros((N, 4))
+  # Z[np.arange(N), X[:,D-1].astype(np.int32)] = 1
+  # # assign: X2[:,-4:] = Z
+  # assert(np.abs(X2[:,-4:] - Z).sum() < 1e-10)
 
-    return X2, Y
+  # assign X2 back to X, since we don't need original anymore
+  X = X2
+
+  # split train and test: the last 100 rows are held out as the test set
+  Xtrain = X[:-100]
+  Ytrain = Y[:-100]
+  Xtest = X[-100:]
+  Ytest = Y[-100:]
+
+  # normalize columns 1 and 2 in both splits using the training set's mean and std
+  for i in (1, 2):
+    m = Xtrain[:,i].mean()
+    s = Xtrain[:,i].std()
+    Xtrain[:,i] = (Xtrain[:,i] - m) / s
+    Xtest[:,i] = (Xtest[:,i] - m) / s
+
+  return Xtrain, Ytrain, Xtest, Ytest
 
 
 def get_binary_data():
-    # return only the data from the first 2 classes
-    X, Y = get_data()
-    X2 = X[Y <= 1]
-    Y2 = Y[Y <= 1]
-    return X2, Y2
+  # return only the data from the first 2 classes
+  Xtrain, Ytrain, Xtest, Ytest = get_data()
+  X2train = Xtrain[Ytrain <= 1]
+  Y2train = Ytrain[Ytrain <= 1]
+  X2test = Xtest[Ytest <= 1]
+  Y2test = Ytest[Ytest <= 1]
+  return X2train, Y2train, X2test, Y2test