@@ -35,7 +35,7 @@ def __init__(self, hidden_layer_sizes, p_keep):
         self.hidden_layer_sizes = hidden_layer_sizes
         self.dropout_rates = p_keep

-    def fit(self, X, Y, learning_rate=1e-6, mu=0.99, decay=0.999, epochs=300, batch_sz=100, show_fig=False):
+    def fit(self, X, Y, learning_rate=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=100, show_fig=False):
         # make a validation set
         X, Y = shuffle(X, Y)
         X = X.astype(np.float32)
@@ -66,12 +66,6 @@ def fit(self, X, Y, learning_rate=1e-6, mu=0.99, decay=0.999, epochs=300, batch_
         for h in self.hidden_layers:
             self.params += h.params

-        # for momentum
-        dparams = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]
-
-        # for rmsprop
-        cache = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]
-
         # set up theano functions and variables
         thX = T.matrix('X')
         thY = T.ivector('Y')
@@ -80,12 +74,23 @@ def fit(self, X, Y, learning_rate=1e-6, mu=0.99, decay=0.999, epochs=300, batch_
         # this cost is for training
         cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))

+        # gradients wrt each param
+        grads = T.grad(cost, self.params)
+
+        # for momentum
+        dparams = [theano.shared(np.zeros_like(p.get_value())) for p in self.params]
+
+        # for rmsprop
+        cache = [theano.shared(np.ones_like(p.get_value())) for p in self.params]
+
+        new_cache = [decay*c + (1 - decay)*g*g for p, c, g in zip(self.params, cache, grads)]
+        new_dparams = [mu*dp - learning_rate*g/T.sqrt(new_c + 1e-10) for p, new_c, dp, g in zip(self.params, new_cache, dparams, grads)]
         updates = [
-            (c, decay*c + (1 - decay)*T.grad(cost, p)*T.grad(cost, p)) for p, c in zip(self.params, cache)
+            (c, new_c) for c, new_c in zip(cache, new_cache)
         ] + [
-            (p, p + mu*dp - learning_rate*T.grad(cost, p)/T.sqrt(c + 1e-10)) for p, c, dp in zip(self.params, cache, dparams)
+            (dp, new_dp) for dp, new_dp in zip(dparams, new_dparams)
         ] + [
-            (dp, mu*dp - learning_rate*T.grad(cost, p)/T.sqrt(c + 1e-10)) for p, c, dp in zip(self.params, cache, dparams)
+            (p, p + new_dp) for p, new_dp in zip(self.params, new_dparams)
         ]

         # momentum only
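For reference, a minimal standalone sketch (not part of this commit) of the same RMSprop-with-momentum step on a single NumPy array. The function name, argument names, and defaults below are illustrative only, but the update order mirrors the corrected code above: refresh the cache, compute the new velocity from it, then apply that velocity to the parameter.

import numpy as np

def rmsprop_momentum_step(p, g, cache, dp, learning_rate=1e-4, mu=0.9, decay=0.9, eps=1e-10):
    # running average of the squared gradient (plays the role of `cache`)
    cache = decay * cache + (1 - decay) * g * g
    # momentum-smoothed step scaled by the RMS of recent gradients (plays the role of `dparams`)
    dp = mu * dp - learning_rate * g / np.sqrt(cache + eps)
    # apply the freshly computed velocity, as the new update list does
    p = p + dp
    return p, cache, dp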