
Commit 54a700e

ensembles class
1 parent 7bf9117 commit 54a700e

File tree: 9 files changed, +863 -0 lines changed

supervised_class2/adaboost.py

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
# https://www.udemy.com/machine-learning-in-python-random-forest-adaboost
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from rf_classification import get_data


class AdaBoost:
    def __init__(self, M):
        self.M = M

    def fit(self, X, Y):
        self.models = []
        self.alphas = []

        N, _ = X.shape
        W = np.ones(N) / N  # start with uniform sample weights

        for m in range(self.M):
            # decision stumps (depth-1 trees) are the standard AdaBoost base learner
            tree = DecisionTreeClassifier(max_depth=1)
            tree.fit(X, Y, sample_weight=W)
            P = tree.predict(X)

            # weighted error and the corresponding model weight
            err = W.dot(P != Y)
            alpha = 0.5*(np.log(1 - err) - np.log(err))

            W = W*np.exp(-alpha*Y*P)  # vectorized form
            W = W / W.sum()  # normalize so it sums to 1

            self.models.append(tree)
            self.alphas.append(alpha)

    def predict(self, X):
        # NOT like the scikit-learn API
        # we want accuracy and exponential loss for plotting purposes
        N, _ = X.shape
        FX = np.zeros(N)
        for alpha, tree in zip(self.alphas, self.models):
            FX += alpha*tree.predict(X)
        return np.sign(FX), FX

    def score(self, X, Y):
        # NOT like the scikit-learn API
        # we want accuracy and exponential loss for plotting purposes
        P, FX = self.predict(X)
        L = np.exp(-Y*FX).mean()
        return np.mean(P == Y), L


if __name__ == '__main__':

    X, Y = get_data()
    Y[Y == 0] = -1  # make the targets -1,+1
    Ntrain = int(0.8*len(X))
    Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
    Xtest, Ytest = X[Ntrain:], Y[Ntrain:]

    T = 200
    train_errors = np.empty(T)
    test_losses = np.empty(T)
    test_errors = np.empty(T)
    for num_trees in range(T):
        if num_trees == 0:
            train_errors[num_trees] = None
            test_errors[num_trees] = None
            test_losses[num_trees] = None
            continue
        if num_trees % 20 == 0:
            print(num_trees)

        model = AdaBoost(num_trees)
        model.fit(Xtrain, Ytrain)
        acc, loss = model.score(Xtest, Ytest)
        acc_train, _ = model.score(Xtrain, Ytrain)
        train_errors[num_trees] = 1 - acc_train
        test_errors[num_trees] = 1 - acc
        test_losses[num_trees] = loss

        if num_trees == T - 1:
            print("final train error:", 1 - acc_train)
            print("final test error:", 1 - acc)

    plt.plot(test_errors, label='test errors')
    plt.plot(test_losses, label='test losses')
    plt.legend()
    plt.show()

    plt.plot(train_errors, label='train errors')
    plt.plot(test_errors, label='test errors')
    plt.legend()
    plt.show()
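
Aside (not part of the commit): a minimal sketch of a cross-check against scikit-learn's built-in AdaBoostClassifier, assuming the same get_data() split defined above; depth-1 stumps are already its default base learner, so only n_estimators needs to match.

# Hypothetical cross-check (assumes Xtrain, Ytrain, Xtest, Ytest from the script above)
from sklearn.ensemble import AdaBoostClassifier

ref = AdaBoostClassifier(n_estimators=200)  # default base learner is a depth-1 tree
ref.fit(Xtrain, Ytrain)
print("sklearn AdaBoost test accuracy:", ref.score(Xtest, Ytest))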
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
# https://www.udemy.com/machine-learning-in-python-random-forest-adaboost
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from util import plot_decision_boundary

np.random.seed(10)

# create the data
N = 500
D = 2
X = np.random.randn(N, D)

# 2 gaussians
# sep = 1.5
# X[:N//2] += np.array([sep, sep])
# X[N//2:] += np.array([-sep, -sep])
# Y = np.array([0]*(N//2) + [1]*(N//2))

# noisy XOR
sep = 2
X[:125] += np.array([sep, sep])
X[125:250] += np.array([sep, -sep])
X[250:375] += np.array([-sep, -sep])
X[375:] += np.array([-sep, sep])
Y = np.array([0]*125 + [1]*125 + [0]*125 + [1]*125)

# plot the data
plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
plt.show()

# lone decision tree
model = DecisionTreeClassifier()
model.fit(X, Y)
print("score for 1 tree:", model.score(X, Y))

# plot data with boundary
plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
plot_decision_boundary(X, model)
plt.show()


# create the bagged model
class BaggedTreeClassifier:
    def __init__(self, B):
        self.B = B

    def fit(self, X, Y):
        N = len(X)
        self.models = []
        for b in range(self.B):
            # bootstrap sample: draw N points with replacement
            idx = np.random.choice(N, size=N, replace=True)
            Xb = X[idx]
            Yb = Y[idx]

            model = DecisionTreeClassifier(max_depth=2)
            model.fit(Xb, Yb)
            self.models.append(model)

    def predict(self, X):
        # no need to keep a dictionary since we are doing binary classification
        predictions = np.zeros(len(X))
        for model in self.models:
            predictions += model.predict(X)
        return np.round(predictions / self.B)

    def score(self, X, Y):
        P = self.predict(X)
        return np.mean(Y == P)


model = BaggedTreeClassifier(200)
model.fit(X, Y)

print("score for bagged model:", model.score(X, Y))

# plot data with boundary
plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
plot_decision_boundary(X, model)
plt.show()
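
Aside (not part of the commit): the hand-rolled BaggedTreeClassifier can be sanity-checked against scikit-learn's BaggingClassifier, assuming the same X, Y, the same depth-2 base trees, and the same 200 bootstrap rounds.

# Hypothetical cross-check (assumes X, Y from the script above)
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag = BaggingClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=200)
bag.fit(X, Y)
print("sklearn bagged score:", bag.score(X, Y))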
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# https://www.udemy.com/machine-learning-in-python-random-forest-adaboost
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import shuffle


# create the data
T = 100
x_axis = np.linspace(0, 2*np.pi, T)
y_axis = np.sin(x_axis)

# get the training data
N = 30
idx = np.random.choice(T, size=N, replace=False)
Xtrain = x_axis[idx].reshape(N, 1)
Ytrain = y_axis[idx]

# try a lone decision tree
model = DecisionTreeRegressor()
model.fit(Xtrain, Ytrain)
prediction = model.predict(x_axis.reshape(T, 1))
print("score for 1 tree:", model.score(x_axis.reshape(T, 1), y_axis))

# plot the lone decision tree's predictions
plt.plot(x_axis, prediction)
plt.plot(x_axis, y_axis)
plt.show()

# now try bagging
class BaggedTreeRegressor:
    def __init__(self, B):
        self.B = B

    def fit(self, X, Y):
        N = len(X)
        self.models = []
        for b in range(self.B):
            # bootstrap sample: draw N points with replacement
            idx = np.random.choice(N, size=N, replace=True)
            Xb = X[idx]
            Yb = Y[idx]

            model = DecisionTreeRegressor()
            model.fit(Xb, Yb)
            self.models.append(model)

    def predict(self, X):
        # average the base trees' predictions
        predictions = np.zeros(len(X))
        for model in self.models:
            predictions += model.predict(X)
        return predictions / self.B

    def score(self, X, Y):
        # R^2, the same definition scikit-learn uses
        d1 = Y - self.predict(X)
        d2 = Y - Y.mean()
        return 1 - d1.dot(d1) / d2.dot(d2)


model = BaggedTreeRegressor(200)
model.fit(Xtrain, Ytrain)
print("score for bagged tree:", model.score(x_axis.reshape(T, 1), y_axis))
prediction = model.predict(x_axis.reshape(T, 1))

# plot the bagged regressor's predictions
plt.plot(x_axis, prediction)
plt.plot(x_axis, y_axis)
plt.show()
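
Aside (not part of the commit): a minimal sketch comparing against scikit-learn's BaggingRegressor on the same training set; the score it prints is the same R^2 that BaggedTreeRegressor.score computes by hand.

# Hypothetical cross-check (assumes Xtrain, Ytrain, x_axis, y_axis, T from the script above)
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

bag = BaggingRegressor(DecisionTreeRegressor(), n_estimators=200)
bag.fit(Xtrain, Ytrain)
print("sklearn bagged R^2:", bag.score(x_axis.reshape(T, 1), y_axis))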
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
# https://www.udemy.com/machine-learning-in-python-random-forest-adaboost
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

NUM_DATASETS = 50
NOISE_VARIANCE = 0.5
MAX_POLY = 12
N = 25
Ntrain = int(0.9*N)

np.random.seed(2)

# make a dataset with x^D, x^(D-1), ..., x^0
def make_poly(x, D):
    N = len(x)
    X = np.empty((N, D+1))
    for d in range(D+1):
        X[:,d] = x**d
        if d > 1:
            # standardize the higher-order columns
            X[:,d] = (X[:,d] - X[:,d].mean()) / X[:,d].std()
    return X

def f(X):
    return np.sin(X)


x_axis = np.linspace(-np.pi, np.pi, 100)
y_axis = f(x_axis)

# plot the data
# plt.plot(x_axis, y_axis)
# plt.show()

# f(x) = sin(x) from x = [-pi, +pi]
X = np.linspace(-np.pi, np.pi, N)
np.random.shuffle(X)
f_X = f(X)

# just need to do this once
Xpoly = make_poly(X, MAX_POLY)

# arrays to store all the scores
train_scores = np.zeros((NUM_DATASETS, MAX_POLY))
test_scores = np.zeros((NUM_DATASETS, MAX_POLY))
# squared_biases = np.zeros((NUM_DATASETS, MAX_POLY))
# test_predictions = np.zeros((N - Ntrain, NUM_DATASETS, MAX_POLY))
train_predictions = np.zeros((Ntrain, NUM_DATASETS, MAX_POLY))
prediction_curves = np.zeros((100, NUM_DATASETS, MAX_POLY))

# create the model
model = LinearRegression()

for k in range(NUM_DATASETS):
    # a fresh noisy dataset drawn around the same true function
    Y = f_X + np.random.randn(N)*NOISE_VARIANCE

    Xtrain = Xpoly[:Ntrain]
    Ytrain = Y[:Ntrain]

    Xtest = Xpoly[Ntrain:]
    Ytest = Y[Ntrain:]

    for d in range(MAX_POLY):
        # columns 0..d+1 give a polynomial of degree d+1
        model.fit(Xtrain[:,:d+2], Ytrain)
        predictions = model.predict(Xpoly[:,:d+2])

        # debug
        x_axis_poly = make_poly(x_axis, d+1)
        prediction_axis = model.predict(x_axis_poly)
        # plt.plot(x_axis, prediction_axis)
        # plt.show()

        prediction_curves[:,k,d] = prediction_axis

        train_prediction = predictions[:Ntrain]
        test_prediction = predictions[Ntrain:]

        train_predictions[:,k,d] = train_prediction  # use this to calculate bias/variance later

        train_score = mse(train_prediction, Ytrain)
        test_score = mse(test_prediction, Ytest)

        train_scores[k,d] = train_score
        test_scores[k,d] = test_score

# show all prediction curves for each polynomial degree
# along with the mean curve
for d in range(MAX_POLY):
    for k in range(NUM_DATASETS):
        plt.plot(x_axis, prediction_curves[:,k,d], color='green', alpha=0.5)
    plt.plot(x_axis, prediction_curves[:,:,d].mean(axis=1), color='blue', linewidth=2.0)
    plt.title("All curves for degree = %d" % (d+1))
    plt.show()

# calculate the squared bias
avg_train_prediction = np.zeros((Ntrain, MAX_POLY))
squared_bias = np.zeros(MAX_POLY)
f_Xtrain = f_X[:Ntrain]
for d in range(MAX_POLY):
    for i in range(Ntrain):
        avg_train_prediction[i,d] = train_predictions[i,:,d].mean()
    squared_bias[d] = ((avg_train_prediction[:,d] - f_Xtrain)**2).mean()

# calculate the variance
variances = np.zeros((Ntrain, MAX_POLY))
for d in range(MAX_POLY):
    for i in range(Ntrain):
        delta = train_predictions[i,:,d] - avg_train_prediction[i,d]
        variances[i,d] = delta.dot(delta) / NUM_DATASETS  # average over the dataset replications
variance = variances.mean(axis=0)

# make bias-variance plots
degrees = np.arange(MAX_POLY) + 1
best_degree = np.argmin(test_scores.mean(axis=0)) + 1
plt.plot(degrees, squared_bias, label='squared bias')
plt.plot(degrees, variance, label='variance')
plt.plot(degrees, test_scores.mean(axis=0), label='test scores')
plt.plot(degrees, squared_bias + variance, label='squared bias + variance')
plt.axvline(x=best_degree, linestyle='--', label='best complexity')
plt.legend()
plt.show()

# train score vs test score
plt.plot(degrees, train_scores.mean(axis=0), label='train scores')
plt.plot(degrees, test_scores.mean(axis=0), label='test scores')
plt.axvline(x=best_degree, linestyle='--', label='best complexity')
plt.legend()
plt.show()
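
Aside (not part of the commit): the curves above correspond to the usual decomposition of expected squared error at a point, E[(y - fhat(x))^2] = bias^2 + variance + sigma^2, where sigma is the noise standard deviation. A rough numeric check under the assumption sigma^2 = NOISE_VARIANCE**2 (the script multiplies standard normal noise by NOISE_VARIANCE); note squared_bias and variance are measured on the training inputs while test_scores use held-out points, so the two columns only agree approximately.

# Rough check (assumes the arrays computed above)
noise_var = NOISE_VARIANCE**2
mean_test_mse = test_scores.mean(axis=0)
for d in range(MAX_POLY):
    print("degree %d: test MSE %.3f vs bias^2 + variance + noise %.3f" % (
        d + 1, mean_test_mse[d], squared_bias[d] + variance[d] + noise_var))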
