@@ -14,19 +14,42 @@
 from util import relu, error_rate, getKaggleMNIST, init_weights
 
 
+def T_shared_zeros_like32(p):
+    # p is a Theano shared variable itself
+    return theano.shared(np.zeros_like(p.get_value(), dtype=np.float32))
+
+def momentum_updates(cost, params, mu, learning_rate):
+    # velocity (momentum) terms, one per parameter
+    dparams = [T_shared_zeros_like32(p) for p in params]
+
+    updates = []
+    grads = T.grad(cost, params)
+    for p, dp, g in zip(params, dparams, grads):
+        dp_update = mu*dp - learning_rate*g
+        p_update = p + dp_update
+
+        updates.append((dp, dp_update))
+        updates.append((p, p_update))
+    return updates
+
+
 class AutoEncoder(object):
     def __init__(self, M, an_id):
         self.M = M
         self.id = an_id
 
     def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False):
+        # cast hyperparameters to float32
+        mu = np.float32(mu)
+        learning_rate = np.float32(learning_rate)
+
         N, D = X.shape
         n_batches = N // batch_sz
 
         W0 = init_weights((D, self.M))
         self.W = theano.shared(W0, 'W_%s' % self.id)
-        self.bh = theano.shared(np.zeros(self.M), 'bh_%s' % self.id)
-        self.bo = theano.shared(np.zeros(D), 'bo_%s' % self.id)
+        self.bh = theano.shared(np.zeros(self.M, dtype=np.float32), 'bh_%s' % self.id)
+        self.bo = theano.shared(np.zeros(D, dtype=np.float32), 'bo_%s' % self.id)
         self.params = [self.W, self.bh, self.bo]
         self.forward_params = [self.W, self.bh]
 
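For readers of the diff: momentum_updates implements classical momentum, v <- mu*v - learning_rate*grad(cost) followed by p <- p + v, with a single T.grad(cost, params) call for the whole parameter list. A minimal usage sketch, assuming only the helper and imports above (the toy parameter, cost, and hyperparameter values are made up for illustration):

    import numpy as np
    import theano
    import theano.tensor as T

    # toy parameter vector; the quadratic cost below has its minimum at w = [2, 2]
    w = theano.shared(np.array([5.0, -3.0], dtype=np.float32), 'w')
    cost = ((w - np.float32(2))**2).sum()
    updates = momentum_updates(cost, [w], np.float32(0.9), np.float32(0.1))
    step = theano.function(inputs=[], outputs=cost, updates=updates)
    for _ in range(100):
        step()
    print(w.get_value())  # approaches [2. 2.]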
@@ -61,18 +84,17 @@ def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False):
             outputs=cost,
         )
 
-        updates = [
-            (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
-        ] + [
-            (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
-        ]
+
+
+        updates = momentum_updates(cost, self.params, mu, learning_rate)
         train_op = theano.function(
             inputs=[X_in],
             updates=updates,
         )
 
         costs = []
         print("training autoencoder: %s" % self.id)
+        print("epochs to do:", epochs)
         for i in range(epochs):
             print("epoch:", i)
             X = shuffle(X)
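Note that this swap is behavior-preserving: Theano evaluates every update expression against the old shared values, so the removed p + mu*dp - learning_rate*T.grad(cost, p) equals the helper's p + dp_update. The helper also differentiates the cost once per parameter instead of twice (the old comprehensions called T.grad(cost, p) in both lists).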
@@ -117,9 +139,22 @@ def __init__(self, hidden_layer_sizes, UnsupervisedModel=AutoEncoder):
             count += 1
 
 
-    def fit(self, X, Y, Xtest, Ytest, pretrain=True, learning_rate=0.01, mu=0.99, reg=0.1, epochs=1, batch_sz=100):
+    def fit(self, X, Y, Xtest, Ytest,
+            pretrain=True,
+            train_head_only=False,
+            learning_rate=0.1,
+            mu=0.99,
+            reg=0.0,
+            epochs=1,
+            batch_sz=100):
+
+        # cast hyperparameters to float32
+        learning_rate = np.float32(learning_rate)
+        mu = np.float32(mu)
+        reg = np.float32(reg)
+
         # greedy layer-wise training of autoencoders
-        pretrain_epochs = 1
+        pretrain_epochs = 2
         if not pretrain:
             pretrain_epochs = 0
 
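The explicit float32 casts matter because a float64 hyperparameter (e.g. a numpy float64 scalar) multiplied into the graph upcasts the update expression to float64, and Theano will not assign a float64 result back to a float32 shared variable. A hypothetical two-liner showing the pitfall the casts avoid:

    s = theano.shared(np.float32(1.0))
    print((s * np.float64(0.9)).dtype)  # 'float64': no longer matches s's dtype in an update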
@@ -135,38 +170,27 @@ def fit(self, X, Y, Xtest, Ytest, pretrain=True, learning_rate=0.01, mu=0.99, reg=0.1, epochs=1, batch_sz=100):
         K = len(set(Y))
         W0 = init_weights((self.hidden_layers[-1].M, K))
         self.W = theano.shared(W0, "W_logreg")
-        self.b = theano.shared(np.zeros(K), "b_logreg")
+        self.b = theano.shared(np.zeros(K, dtype=np.float32), "b_logreg")
 
         self.params = [self.W, self.b]
-        for ae in self.hidden_layers:
-            self.params += ae.forward_params
-
-        # for momentum
-        self.dW = theano.shared(np.zeros(W0.shape), "dW_logreg")
-        self.db = theano.shared(np.zeros(K), "db_logreg")
-        self.dparams = [self.dW, self.db]
-        for ae in self.hidden_layers:
-            self.dparams += ae.forward_dparams
+        if not train_head_only:
+            for ae in self.hidden_layers:
+                self.params += ae.forward_params
 
         X_in = T.matrix('X_in')
         targets = T.ivector('Targets')
         pY = self.forward(X_in)
 
-        # squared_magnitude = [(p*p).sum() for p in self.params]
-        # reg_cost = T.sum(squared_magnitude)
-        cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))  # + reg*reg_cost
+        squared_magnitude = [(p*p).sum() for p in self.params]
+        reg_cost = T.sum(squared_magnitude)
+        cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets])) + reg*reg_cost
         prediction = self.predict(X_in)
         cost_predict_op = theano.function(
             inputs=[X_in, targets],
             outputs=[cost, prediction],
         )
 
-        updates = [
-            (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
-        ] + [
-            (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
-        ]
-        # updates = [(p, p - learning_rate*T.grad(cost, p)) for p in self.params]
+        updates = momentum_updates(cost, self.params, mu, learning_rate)
         train_op = theano.function(
             inputs=[X_in, targets],
             updates=updates,
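With the regularizer switched on, the supervised objective becomes cost = -mean(log pY[target]) + reg * sum_p sum(p**2), taken over exactly the parameters being trained. Since the new default is reg=0.0, callers get the old unregularized behavior unless they opt in; and when train_head_only=True, self.params holds only W_logreg and b_logreg, so both the gradient updates and the penalty leave the pretrained autoencoder weights untouched.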
@@ -209,7 +233,8 @@ def main():
     # dnn.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=3)
     # vs
     dnn = DNN([1000, 750, 500])
-    dnn.fit(Xtrain, Ytrain, Xtest, Ytest, pretrain=False, epochs=10)
+    dnn.fit(Xtrain, Ytrain, Xtest, Ytest, pretrain=True, train_head_only=False, epochs=3)
+    # note: try training the head only too! what does that mean?
 
 
 def test_single_autoencoder():
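(Answering the note above: passing train_head_only=True freezes the pretrained autoencoder weights and trains only the logistic-regression output layer, i.e. the stacked autoencoders act as a fixed feature extractor for a linear classifier.)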
@@ -239,5 +264,5 @@ def test_single_autoencoder():
 
 
 if __name__ == '__main__':
-    # main()
-    test_single_autoencoder()
+    main()
+    # test_single_autoencoder()