
Commit cd73081: dropout update
Parent: 713c3dc

2 files changed: +40 -18 lines

ann_class2/dropout_tensorflow.py (34 additions, 12 deletions)
@@ -1,3 +1,8 @@
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
 # For the class Data Science: Practical Deep Learning Concepts in Theano and TensorFlow
 # https://deeplearningcourses.com/c/data-science-deep-learning-in-theano-tensorflow
 # https://www.udemy.com/data-science-deep-learning-in-theano-tensorflow
@@ -13,7 +18,7 @@ class HiddenLayer(object):
     def __init__(self, M1, M2):
         self.M1 = M1
         self.M2 = M2
-        W = np.random.randn(M1, M2) / np.sqrt(2.0 / M1)
+        W = np.random.randn(M1, M2) * np.sqrt(2.0 / M1)
         b = np.zeros(M2)
         self.W = tf.Variable(W.astype(np.float32))
         self.b = tf.Variable(b.astype(np.float32))
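The substance of this hunk is the initialization fix: dividing by np.sqrt(2.0 / M1) actually multiplies the weights by sqrt(M1 / 2), which inflates them whenever M1 > 2, while He initialization for ReLU layers calls for a standard deviation of sqrt(2 / M1). The same correction is applied to the output-layer weights in a later hunk. A quick numpy check of the resulting scales (illustrative only, not part of the commit):

import numpy as np

M1, M2 = 500, 300  # example fan-in and fan-out
W_old = np.random.randn(M1, M2) / np.sqrt(2.0 / M1)  # buggy: std ~ sqrt(M1/2) ~ 15.8
W_new = np.random.randn(M1, M2) * np.sqrt(2.0 / M1)  # He init: std ~ 0.063
print(W_old.std(), W_new.std())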
@@ -28,7 +33,7 @@ def __init__(self, hidden_layer_sizes, p_keep):
         self.hidden_layer_sizes = hidden_layer_sizes
         self.dropout_rates = p_keep

-    def fit(self, X, Y, lr=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=100, split=True, print_every=20):
+    def fit(self, X, Y, lr=1e-4, mu=0.9, decay=0.9, epochs=15, batch_sz=100, split=True, print_every=20):
         # make a validation set
         X, Y = shuffle(X, Y)
         X = X.astype(np.float32)
@@ -48,7 +53,7 @@ def fit(self, X, Y, lr=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=100, split=Tr
             h = HiddenLayer(M1, M2)
             self.hidden_layers.append(h)
             M1 = M2
-        W = np.random.randn(M1, K) / np.sqrt(M1)
+        W = np.random.randn(M1, K) * np.sqrt(2.0 / M1)
         b = np.zeros(K)
         self.W = tf.Variable(W.astype(np.float32))
         self.b = tf.Variable(b.astype(np.float32))
@@ -71,44 +76,61 @@ def fit(self, X, Y, lr=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=100, split=Tr
         )
         train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost)
         # train_op = tf.train.MomentumOptimizer(lr, momentum=mu).minimize(cost)
+        # train_op = tf.train.AdamOptimizer(lr).minimize(cost)
         prediction = self.predict(inputs)

-        n_batches = N / batch_sz
+        # validation cost will be calculated separately since nothing will be dropped
+        test_logits = self.forward_test(inputs)
+        test_cost = tf.reduce_mean(
+            tf.nn.sparse_softmax_cross_entropy_with_logits(
+                logits=test_logits,
+                labels=labels
+            )
+        )
+
+        n_batches = N // batch_sz
         costs = []
         init = tf.global_variables_initializer()
         with tf.Session() as session:
             session.run(init)
-            for i in xrange(epochs):
-                print "epoch:", i, "n_batches:", n_batches
+            for i in range(epochs):
+                print("epoch:", i, "n_batches:", n_batches)
                 X, Y = shuffle(X, Y)
-                for j in xrange(n_batches):
+                for j in range(n_batches):
                     Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
                     Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

                     session.run(train_op, feed_dict={inputs: Xbatch, labels: Ybatch})

                     if j % print_every == 0:
-                        c = session.run(cost, feed_dict={inputs: Xvalid, labels: Yvalid})
+                        c = session.run(test_cost, feed_dict={inputs: Xvalid, labels: Yvalid})
                         p = session.run(prediction, feed_dict={inputs: Xvalid})
                         costs.append(c)
                         e = error_rate(Yvalid, p)
-                        print "i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e
+                        print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

         plt.plot(costs)
         plt.show()

     def forward(self, X):
-        # no need to define different functions for train and predict
-        # tf.nn.dropout takes care of the differences for us
+        # tf.nn.dropout scales inputs by 1/p_keep
+        # therefore, during test time, we don't have to scale anything
         Z = X
         Z = tf.nn.dropout(Z, self.dropout_rates[0])
         for h, p in zip(self.hidden_layers, self.dropout_rates[1:]):
             Z = h.forward(Z)
             Z = tf.nn.dropout(Z, p)
         return tf.matmul(Z, self.W) + self.b

+    def forward_test(self, X):
+        Z = X
+        # no dropout at test time: tf.nn.dropout already rescaled by 1/p_keep during training
+        for h in self.hidden_layers:
+            Z = h.forward(Z)
+        return tf.matmul(Z, self.W) + self.b
+
     def predict(self, X):
-        pY = self.forward(X)
+        pY = self.forward_test(X)
         return tf.argmax(pY, 1)
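The heart of this hunk is the split between forward (training, with dropout) and forward_test (evaluation, without), plus a separate test_cost graph so validation cost is measured with nothing dropped. This works because tf.nn.dropout implements inverted dropout: at train time each unit is kept with probability p_keep and the survivors are scaled up by 1/p_keep, so activations already have the same expectation the untouched test-time network produces. A minimal numpy sketch of that equivalence (illustrative, not from the commit):

import numpy as np

p_keep = 0.8
Z = np.random.randn(1000000) + 3.0  # some activations

# train time (inverted dropout): drop units, rescale survivors by 1/p_keep
mask = np.random.rand(Z.shape[0]) < p_keep
Z_train = Z * mask / p_keep

# test time: the activations are used as-is, with no mask and no rescaling
print(Z.mean(), Z_train.mean())  # expectations match up to sampling noise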

ann_class2/dropout_theano.py (6 additions, 6 deletions)
@@ -20,7 +20,7 @@ def __init__(self, M1, M2, an_id):
         self.id = an_id
         self.M1 = M1
         self.M2 = M2
-        W = np.random.randn(M1, M2) / np.sqrt(2.0 / M1)
+        W = np.random.randn(M1, M2) * np.sqrt(2.0 / M1)
         b = np.zeros(M2)
         self.W = theano.shared(W, 'W_%s' % self.id)
         self.b = theano.shared(b, 'b_%s' % self.id)
@@ -56,7 +56,7 @@ def fit(self, X, Y, learning_rate=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=10
             self.hidden_layers.append(h)
             M1 = M2
             count += 1
-        W = np.random.randn(M1, K) / np.sqrt(M1)
+        W = np.random.randn(M1, K) * np.sqrt(2.0 / M1)
         b = np.zeros(K)
         self.W = theano.shared(W, 'W_logreg')
         self.b = theano.shared(b, 'b_logreg')
@@ -111,11 +111,11 @@ def fit(self, X, Y, learning_rate=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=10
         prediction = self.predict(thX)
         cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost_predict, prediction])

-        n_batches = N / batch_sz
+        n_batches = N // batch_sz
         costs = []
-        for i in xrange(epochs):
+        for i in range(epochs):
             X, Y = shuffle(X, Y)
-            for j in xrange(n_batches):
+            for j in range(n_batches):
                 Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
                 Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]
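These are the same Python 2 to 3 fixes as in the TensorFlow file: xrange no longer exists, and true division would make n_batches a float, which range() rejects. Floor division keeps it an integer; a quick illustration (not part of the commit):

N, batch_sz = 1050, 100
print(N / batch_sz)   # 10.5 -- true division returns a float in Python 3
print(N // batch_sz)  # 10   -- floor division returns an int, safe for range()
# range(N / batch_sz) raises TypeError: 'float' object cannot be interpreted as an integer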

@@ -125,7 +125,7 @@ def fit(self, X, Y, learning_rate=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=10
                 c, p = cost_predict_op(Xvalid, Yvalid)
                 costs.append(c)
                 e = error_rate(Yvalid, p)
-                print "i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e
+                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

         if show_fig:
             plt.plot(costs)
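For context on how these classes are driven: the constructor takes hidden_layer_sizes and p_keep, where p_keep[0] is the keep probability for the input layer and the remaining entries pair up with the hidden layers, as forward() shows. A hedged usage sketch; the class name ANN and the data variables are assumptions, since neither appears in this diff:

# hypothetical usage, assuming the class is named ANN and Xtrain/Ytrain are loaded elsewhere
model = ANN(hidden_layer_sizes=[500, 300], p_keep=[0.8, 0.5, 0.5])
# keep 80% of input units and 50% of each hidden layer's units during training
model.fit(Xtrain, Ytrain, epochs=15)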
