
Commit 54a700e

ensembles class
1 parent 7bf9117 commit 54a700e

File tree: 9 files changed, +863 -0 lines changed

supervised_class2/adaboost.py

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
# https://www.udemy.com/machine-learning-in-python-random-forest-adaboost
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from rf_classification import get_data


class AdaBoost:
    def __init__(self, M):
        self.M = M

    def fit(self, X, Y):
        self.models = []
        self.alphas = []

        N, _ = X.shape
        W = np.ones(N) / N  # start with uniform sample weights

        for m in range(self.M):
            # decision stumps (depth-1 trees) are the standard AdaBoost base learner
            tree = DecisionTreeClassifier(max_depth=1)
            tree.fit(X, Y, sample_weight=W)
            P = tree.predict(X)

            # weighted error and the corresponding model weight
            err = W.dot(P != Y)
            alpha = 0.5*(np.log(1 - err) - np.log(err))

            W = W*np.exp(-alpha*Y*P)  # vectorized form
            W = W / W.sum()  # normalize so it sums to 1

            self.models.append(tree)
            self.alphas.append(alpha)

    def predict(self, X):
        # NOT like the scikit-learn API
        # we want accuracy and exponential loss for plotting purposes
        N, _ = X.shape
        FX = np.zeros(N)
        for alpha, tree in zip(self.alphas, self.models):
            FX += alpha*tree.predict(X)
        return np.sign(FX), FX

    def score(self, X, Y):
        # NOT like the scikit-learn API
        # we want accuracy and exponential loss for plotting purposes
        P, FX = self.predict(X)
        L = np.exp(-Y*FX).mean()
        return np.mean(P == Y), L


if __name__ == '__main__':

    X, Y = get_data()
    Y[Y == 0] = -1  # make the targets -1,+1
    Ntrain = int(0.8*len(X))
    Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
    Xtest, Ytest = X[Ntrain:], Y[Ntrain:]

    T = 200
    train_errors = np.empty(T)
    test_losses = np.empty(T)
    test_errors = np.empty(T)
    for num_trees in range(T):
        if num_trees == 0:
            train_errors[num_trees] = None
            test_errors[num_trees] = None
            test_losses[num_trees] = None
            continue
        if num_trees % 20 == 0:
            print(num_trees)

        model = AdaBoost(num_trees)
        model.fit(Xtrain, Ytrain)
        acc, loss = model.score(Xtest, Ytest)
        acc_train, _ = model.score(Xtrain, Ytrain)
        train_errors[num_trees] = 1 - acc_train
        test_errors[num_trees] = 1 - acc
        test_losses[num_trees] = loss

        if num_trees == T - 1:
            print("final train error:", 1 - acc_train)
            print("final test error:", 1 - acc)

    plt.plot(test_errors, label='test errors')
    plt.plot(test_losses, label='test losses')
    plt.legend()
    plt.show()

    plt.plot(train_errors, label='train errors')
    plt.plot(test_errors, label='test errors')
    plt.legend()
    plt.show()
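
Aside (not part of the commit): a minimal sketch of a cross-check against scikit-learn's built-in AdaBoostClassifier, assuming the same get_data() split defined above; depth-1 stumps are already its default base learner, so only n_estimators needs to match.

# Hypothetical cross-check (assumes Xtrain, Ytrain, Xtest, Ytest from the script above)
from sklearn.ensemble import AdaBoostClassifier

ref = AdaBoostClassifier(n_estimators=200)  # default base learner is a depth-1 tree
ref.fit(Xtrain, Ytrain)
print("sklearn AdaBoost test accuracy:", ref.score(Xtest, Ytest))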
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
# https://www.udemy.com/machine-learning-in-python-random-forest-adaboost
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from util import plot_decision_boundary

np.random.seed(10)

# create the data
N = 500
D = 2
X = np.random.randn(N, D)

# 2 gaussians
# sep = 1.5
# X[:N//2] += np.array([sep, sep])
# X[N//2:] += np.array([-sep, -sep])
# Y = np.array([0]*(N//2) + [1]*(N//2))

# noisy XOR
sep = 2
X[:125] += np.array([sep, sep])
X[125:250] += np.array([sep, -sep])
X[250:375] += np.array([-sep, -sep])
X[375:] += np.array([-sep, sep])
Y = np.array([0]*125 + [1]*125 + [0]*125 + [1]*125)

# plot the data
plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
plt.show()

# lone decision tree
model = DecisionTreeClassifier()
model.fit(X, Y)
print("score for 1 tree:", model.score(X, Y))

# plot data with boundary
plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
plot_decision_boundary(X, model)
plt.show()


# create the bagged model
class BaggedTreeClassifier:
    def __init__(self, B):
        self.B = B

    def fit(self, X, Y):
        N = len(X)
        self.models = []
        for b in range(self.B):
            # bootstrap sample: draw N points with replacement
            idx = np.random.choice(N, size=N, replace=True)
            Xb = X[idx]
            Yb = Y[idx]

            model = DecisionTreeClassifier(max_depth=2)
            model.fit(Xb, Yb)
            self.models.append(model)

    def predict(self, X):
        # no need to keep a dictionary since we are doing binary classification
        predictions = np.zeros(len(X))
        for model in self.models:
            predictions += model.predict(X)
        return np.round(predictions / self.B)

    def score(self, X, Y):
        P = self.predict(X)
        return np.mean(Y == P)


model = BaggedTreeClassifier(200)
model.fit(X, Y)

print("score for bagged model:", model.score(X, Y))

# plot data with boundary
plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
plot_decision_boundary(X, model)
plt.show()
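
Aside (not part of the commit): the hand-rolled BaggedTreeClassifier can be sanity-checked against scikit-learn's BaggingClassifier, assuming the same X, Y, the same depth-2 base trees, and the same 200 bootstrap rounds.

# Hypothetical cross-check (assumes X, Y from the script above)
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag = BaggingClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=200)
bag.fit(X, Y)
print("sklearn bagged score:", bag.score(X, Y))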
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# https://www.udemy.com/machine-learning-in-python-random-forest-adaboost
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import shuffle


# create the data
T = 100
x_axis = np.linspace(0, 2*np.pi, T)
y_axis = np.sin(x_axis)

# get the training data
N = 30
idx = np.random.choice(T, size=N, replace=False)
Xtrain = x_axis[idx].reshape(N, 1)
Ytrain = y_axis[idx]

# try a lone decision tree
model = DecisionTreeRegressor()
model.fit(Xtrain, Ytrain)
prediction = model.predict(x_axis.reshape(T, 1))
print("score for 1 tree:", model.score(x_axis.reshape(T, 1), y_axis))

# plot the lone decision tree's predictions
plt.plot(x_axis, prediction)
plt.plot(x_axis, y_axis)
plt.show()

# now try bagging
class BaggedTreeRegressor:
    def __init__(self, B):
        self.B = B

    def fit(self, X, Y):
        N = len(X)
        self.models = []
        for b in range(self.B):
            # bootstrap sample: draw N points with replacement
            idx = np.random.choice(N, size=N, replace=True)
            Xb = X[idx]
            Yb = Y[idx]

            model = DecisionTreeRegressor()
            model.fit(Xb, Yb)
            self.models.append(model)

    def predict(self, X):
        # average the base trees' predictions
        predictions = np.zeros(len(X))
        for model in self.models:
            predictions += model.predict(X)
        return predictions / self.B

    def score(self, X, Y):
        # R^2, the same definition scikit-learn uses
        d1 = Y - self.predict(X)
        d2 = Y - Y.mean()
        return 1 - d1.dot(d1) / d2.dot(d2)


model = BaggedTreeRegressor(200)
model.fit(Xtrain, Ytrain)
print("score for bagged tree:", model.score(x_axis.reshape(T, 1), y_axis))
prediction = model.predict(x_axis.reshape(T, 1))

# plot the bagged regressor's predictions
plt.plot(x_axis, prediction)
plt.plot(x_axis, y_axis)
plt.show()
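
Aside (not part of the commit): a minimal sketch comparing against scikit-learn's BaggingRegressor on the same training set; the score it prints is the same R^2 that BaggedTreeRegressor.score computes by hand.

# Hypothetical cross-check (assumes Xtrain, Ytrain, x_axis, y_axis, T from the script above)
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

bag = BaggingRegressor(DecisionTreeRegressor(), n_estimators=200)
bag.fit(Xtrain, Ytrain)
print("sklearn bagged R^2:", bag.score(x_axis.reshape(T, 1), y_axis))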
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
# https://www.udemy.com/machine-learning-in-python-random-forest-adaboost
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

NUM_DATASETS = 50
NOISE_VARIANCE = 0.5
MAX_POLY = 12
N = 25
Ntrain = int(0.9*N)

np.random.seed(2)

# make a dataset with x^D, x^(D-1), ..., x^0
def make_poly(x, D):
    N = len(x)
    X = np.empty((N, D+1))
    for d in range(D+1):
        X[:,d] = x**d
        if d > 1:
            # standardize the higher-order columns
            X[:,d] = (X[:,d] - X[:,d].mean()) / X[:,d].std()
    return X

def f(X):
    return np.sin(X)


x_axis = np.linspace(-np.pi, np.pi, 100)
y_axis = f(x_axis)

# plot the data
# plt.plot(x_axis, y_axis)
# plt.show()

# f(x) = sin(x) from x = [-pi, +pi]
X = np.linspace(-np.pi, np.pi, N)
np.random.shuffle(X)
f_X = f(X)

# just need to do this once
Xpoly = make_poly(X, MAX_POLY)

# arrays to store all the scores
train_scores = np.zeros((NUM_DATASETS, MAX_POLY))
test_scores = np.zeros((NUM_DATASETS, MAX_POLY))
# squared_biases = np.zeros((NUM_DATASETS, MAX_POLY))
# test_predictions = np.zeros((N - Ntrain, NUM_DATASETS, MAX_POLY))
train_predictions = np.zeros((Ntrain, NUM_DATASETS, MAX_POLY))
prediction_curves = np.zeros((100, NUM_DATASETS, MAX_POLY))

# create the model
model = LinearRegression()

for k in range(NUM_DATASETS):
    # a fresh noisy dataset drawn around the same true function
    Y = f_X + np.random.randn(N)*NOISE_VARIANCE

    Xtrain = Xpoly[:Ntrain]
    Ytrain = Y[:Ntrain]

    Xtest = Xpoly[Ntrain:]
    Ytest = Y[Ntrain:]

    for d in range(MAX_POLY):
        # columns 0..d+1 give a polynomial of degree d+1
        model.fit(Xtrain[:,:d+2], Ytrain)
        predictions = model.predict(Xpoly[:,:d+2])

        # debug
        x_axis_poly = make_poly(x_axis, d+1)
        prediction_axis = model.predict(x_axis_poly)
        # plt.plot(x_axis, prediction_axis)
        # plt.show()

        prediction_curves[:,k,d] = prediction_axis

        train_prediction = predictions[:Ntrain]
        test_prediction = predictions[Ntrain:]

        train_predictions[:,k,d] = train_prediction  # use this to calculate bias/variance later

        train_score = mse(train_prediction, Ytrain)
        test_score = mse(test_prediction, Ytest)

        train_scores[k,d] = train_score
        test_scores[k,d] = test_score

# show all prediction curves for each polynomial degree
# along with the mean curve
for d in range(MAX_POLY):
    for k in range(NUM_DATASETS):
        plt.plot(x_axis, prediction_curves[:,k,d], color='green', alpha=0.5)
    plt.plot(x_axis, prediction_curves[:,:,d].mean(axis=1), color='blue', linewidth=2.0)
    plt.title("All curves for degree = %d" % (d+1))
    plt.show()

# calculate the squared bias
avg_train_prediction = np.zeros((Ntrain, MAX_POLY))
squared_bias = np.zeros(MAX_POLY)
f_Xtrain = f_X[:Ntrain]
for d in range(MAX_POLY):
    for i in range(Ntrain):
        avg_train_prediction[i,d] = train_predictions[i,:,d].mean()
    squared_bias[d] = ((avg_train_prediction[:,d] - f_Xtrain)**2).mean()

# calculate the variance
variances = np.zeros((Ntrain, MAX_POLY))
for d in range(MAX_POLY):
    for i in range(Ntrain):
        delta = train_predictions[i,:,d] - avg_train_prediction[i,d]
        variances[i,d] = delta.dot(delta) / NUM_DATASETS  # average over the dataset replications
variance = variances.mean(axis=0)

# make bias-variance plots
degrees = np.arange(MAX_POLY) + 1
best_degree = np.argmin(test_scores.mean(axis=0)) + 1
plt.plot(degrees, squared_bias, label='squared bias')
plt.plot(degrees, variance, label='variance')
plt.plot(degrees, test_scores.mean(axis=0), label='test scores')
plt.plot(degrees, squared_bias + variance, label='squared bias + variance')
plt.axvline(x=best_degree, linestyle='--', label='best complexity')
plt.legend()
plt.show()

# train score vs test score
plt.plot(degrees, train_scores.mean(axis=0), label='train scores')
plt.plot(degrees, test_scores.mean(axis=0), label='test scores')
plt.axvline(x=best_degree, linestyle='--', label='best complexity')
plt.legend()
plt.show()
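
Aside (not part of the commit): the curves above correspond to the usual decomposition of expected squared error at a point, E[(y - fhat(x))^2] = bias^2 + variance + sigma^2, where sigma is the noise standard deviation. A rough numeric check under the assumption sigma^2 = NOISE_VARIANCE**2 (the script multiplies standard normal noise by NOISE_VARIANCE); note squared_bias and variance are measured on the training inputs while test_scores use held-out points, so the two columns only agree approximately.

# Rough check (assumes the arrays computed above)
noise_var = NOISE_VARIANCE**2
mean_test_mse = test_scores.mean(axis=0)
for d in range(MAX_POLY):
    print("degree %d: test MSE %.3f vs bias^2 + variance + noise %.3f" % (
        d + 1, mean_test_mse[d], squared_bias[d] + variance[d] + noise_var))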
