forked from lazyprogrammer/machine_learning_examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess.py
47 lines (35 loc) · 1.04 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import numpy as np
import pandas as pd
# normalize numerical columns
# one-hot categorical columns
def get_data():
    """Load the e-commerce dataset and return model-ready features and targets.

    Reads ``ecommerce_data.csv`` from the current working directory. The last
    column is used as the target and every preceding column as a feature.
    Feature columns 1 and 2 are standardized (zero mean, unit standard
    deviation), and the last feature column, which holds an integer category
    code in ``0..3``, is expanded into four one-hot indicator columns.

    Returns:
        tuple: ``(X2, Y)`` where ``X2`` is a float array of shape
        ``(N, D + 3)`` (``D`` being the number of raw feature columns) and
        ``Y`` is the length-``N`` target array.
    """
    n_categories = 4  # the categorical column takes the values 0, 1, 2, 3

    df = pd.read_csv('ecommerce_data.csv')

    # Easier to work with a numpy array. ``DataFrame.as_matrix()`` was removed
    # in pandas 1.0, so use ``.values``. Casting to float also guarantees the
    # in-place normalization below is not truncated by an integer dtype.
    data = df.values.astype(np.float64)

    X = data[:, :-1]
    Y = data[:, -1]

    # Normalize the two numerical columns (1 and 2).
    for col in (1, 2):
        X[:, col] = (X[:, col] - X[:, col].mean()) / X[:, col].std()

    # Build the output matrix: the non-categorical columns are copied as-is,
    # and the single categorical column is widened to ``n_categories`` columns.
    N, D = X.shape
    X2 = np.zeros((N, D - 1 + n_categories))
    X2[:, :D - 1] = X[:, :D - 1]

    # One-hot encode: row n gets a 1 in column (D - 1) + category(n).
    categories = X[:, D - 1].astype(np.int32)
    X2[np.arange(N), (D - 1) + categories] = 1

    return X2, Y
def get_binary_data():
    """Return the processed dataset restricted to target classes 0 and 1.

    Calls ``get_data()`` and keeps only the rows whose target value is
    less than or equal to 1.

    Returns:
        tuple: ``(X2, Y2)``, the filtered feature matrix and target array.
    """
    features, targets = get_data()
    # Build the row mask once and apply it to both arrays.
    keep = targets <= 1
    return features[keep], targets[keep]