|
15 | 15 | # one-hot categorical columns
|
16 | 16 |
|
17 | 17 | def get_data():
|
18 |
| - df = pd.read_csv(dir_path + '/ecommerce_data.csv') |
| 18 | + df = pd.read_csv(dir_path + '/ecommerce_data.csv') |
19 | 19 |
|
20 |
| - # just in case you're curious what's in it |
21 |
| - # df.head() |
| 20 | + # just in case you're curious what's in it |
| 21 | + # df.head() |
22 | 22 |
|
23 |
| - # easier to work with numpy array |
24 |
| - data = df.as_matrix() |
| 23 | + # easier to work with numpy array |
| 24 | + data = df.as_matrix() |
25 | 25 |
|
26 |
| - X = data[:,:-1] |
27 |
| - Y = data[:,-1] |
| 26 | + # shuffle it |
| 27 | + np.random.shuffle(data) |
28 | 28 |
|
29 |
| - # normalize columns 1 and 2 |
30 |
| - X[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std() |
31 |
| - X[:,2] = (X[:,2] - X[:,2].mean()) / X[:,2].std() |
| 29 | + # split features and labels |
| 30 | + X = data[:,:-1] |
| 31 | + Y = data[:,-1].astype(np.int32) |
32 | 32 |
|
33 |
| - # create a new matrix X2 with the correct number of columns |
34 |
| - N, D = X.shape |
35 |
| - X2 = np.zeros((N, D+3)) |
36 |
| - X2[:,0:(D-1)] = X[:,0:(D-1)] # non-categorical |
| 33 | + # one-hot encode the categorical data |
| 34 | + # create a new matrix X2 with the correct number of columns |
| 35 | + N, D = X.shape |
| 36 | + X2 = np.zeros((N, D+3)) |
| 37 | + X2[:,0:(D-1)] = X[:,0:(D-1)] # non-categorical |
37 | 38 |
|
38 |
| - # one-hot |
39 |
| - for n in range(N): |
40 |
| - t = int(X[n,D-1]) |
41 |
| - X2[n,t+D-1] = 1 |
| 39 | + # one-hot |
| 40 | + for n in range(N): |
| 41 | + t = int(X[n,D-1]) |
| 42 | + X2[n,t+D-1] = 1 |
42 | 43 |
|
43 |
| - # method 2 |
44 |
| - # Z = np.zeros((N, 4)) |
45 |
| - # Z[np.arange(N), X[:,D-1].astype(np.int32)] = 1 |
46 |
| - # # assign: X2[:,-4:] = Z |
47 |
| - # assert(np.abs(X2[:,-4:] - Z).sum() < 1e-10) |
| 44 | + # method 2 |
| 45 | + # Z = np.zeros((N, 4)) |
| 46 | + # Z[np.arange(N), X[:,D-1].astype(np.int32)] = 1 |
| 47 | + # # assign: X2[:,-4:] = Z |
| 48 | + # assert(np.abs(X2[:,-4:] - Z).sum() < 1e-10) |
48 | 49 |
|
49 |
| - return X2, Y |
| 50 | + # assign X2 back to X, since we don't need original anymore |
| 51 | + X = X2 |
| 52 | + |
| 53 | + # split train and test |
| 54 | + Xtrain = X[:-100] |
| 55 | + Ytrain = Y[:-100] |
| 56 | + Xtest = X[-100:] |
| 57 | + Ytest = Y[-100:] |
| 58 | + |
| 59 | + # normalize columns 1 and 2 |
| 60 | + for i in (1, 2): |
| 61 | + m = Xtrain[:,i].mean() |
| 62 | + s = Xtrain[:,i].std() |
| 63 | + Xtrain[:,i] = (Xtrain[:,i] - m) / s |
| 64 | + Xtest[:,i] = (Xtest[:,i] - m) / s |
| 65 | + |
| 66 | + return Xtrain, Ytrain, Xtest, Ytest |
50 | 67 |
|
51 | 68 |
|
52 | 69 | def get_binary_data():
|
53 |
| - # return only the data from the first 2 classes |
54 |
| - X, Y = get_data() |
55 |
| - X2 = X[Y <= 1] |
56 |
| - Y2 = Y[Y <= 1] |
57 |
| - return X2, Y2 |
| 70 | + # return only the data from the first 2 classes |
| 71 | + Xtrain, Ytrain, Xtest, Ytest = get_data() |
| 72 | + X2train = Xtrain[Ytrain <= 1] |
| 73 | + Y2train = Ytrain[Ytrain <= 1] |
| 74 | + X2test = Xtest[Ytest <= 1] |
| 75 | + Y2test = Ytest[Ytest <= 1] |
| 76 | + return X2train, Y2train, X2test, Y2test |
0 commit comments