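update momentum: reuse saved initial weights for all three runs, rename LL_*/CR_* lists to losses_*/errors_*, simplify Nesterov update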

lazyprogrammer · lazyprogrammer · commit eee6fee523cb · 2017-12-05T02:01:07.000-05:00
diff --git a/ann_class2/momentum.py b/ann_class2/momentum.py
@@ -50,10 +50,15 @@ def main():
     W2 = np.random.randn(M, K) / np.sqrt(M)
     b2 = np.zeros(K)
 
+    # save initial weights
+    W1_0 = W1.copy()
+    b1_0 = b1.copy()
+    W2_0 = W2.copy()
+    b2_0 = b2.copy()
+
     # 1. batch
-    # cost = -16
-    LL_batch = []
-    CR_batch = []
+    losses_batch = []
+    errors_batch = []
     for i in range(max_iter):
         for j in range(n_batches):
             Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
@@ -68,26 +73,25 @@ def main():
             b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
 
             if j % print_period == 0:
-                # calculate just for LL
                 pY, _ = forward(Xtest, W1, b1, W2, b2)
-                ll = cost(pY, Ytest_ind)
-                LL_batch.append(ll)
-                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
+                l = cost(pY, Ytest_ind)
+                losses_batch.append(l)
+                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
 
-                err = error_rate(pY, Ytest)
-                CR_batch.append(err)
-                print("Error rate:", err)
+                e = error_rate(pY, Ytest)
+                errors_batch.append(e)
+                print("Error rate:", e)
 
     pY, _ = forward(Xtest, W1, b1, W2, b2)
     print("Final error rate:", error_rate(pY, Ytest))
 
     # 2. batch with momentum
-    W1 = np.random.randn(D, M) / np.sqrt(D)
-    b1 = np.zeros(M)
-    W2 = np.random.randn(M, K) / np.sqrt(M)
-    b2 = np.zeros(K)
-    LL_momentum = []
-    CR_momentum = []
+    W1 = W1_0.copy()
+    b1 = b1_0.copy()
+    W2 = W2_0.copy()
+    b2 = b2_0.copy()
+    losses_momentum = []
+    errors_momentum = []
     mu = 0.9
     dW2 = 0
     db2 = 0
@@ -99,100 +103,92 @@ def main():
             Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
             pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
 
+            # gradients
+            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
+            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
+            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
+            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
+
+            # update velocities
+            dW2 = mu*dW2 - lr*gW2
+            db2 = mu*db2 - lr*gb2
+            dW1 = mu*dW1 - lr*gW1
+            db1 = mu*db1 - lr*gb1
+
             # updates
-            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
             W2 += dW2
-            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
             b2 += db2
-            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
             W1 += dW1
-            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
             b1 += db1
 
             if j % print_period == 0:
-                # calculate just for LL
                 pY, _ = forward(Xtest, W1, b1, W2, b2)
-                # print "pY:", pY
-                ll = cost(pY, Ytest_ind)
-                LL_momentum.append(ll)
-                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
-
-                err = error_rate(pY, Ytest)
-                CR_momentum.append(err)
-                print("Error rate:", err)
+                l = cost(pY, Ytest_ind)
+                losses_momentum.append(l)
+                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
+
+                e = error_rate(pY, Ytest)
+                errors_momentum.append(e)
+                print("Error rate:", e)
     pY, _ = forward(Xtest, W1, b1, W2, b2)
     print("Final error rate:", error_rate(pY, Ytest))
 
 
     # 3. batch with Nesterov momentum
-    W1 = np.random.randn(D, M) / np.sqrt(D)
-    b1 = np.zeros(M)
-    W2 = np.random.randn(M, K) / np.sqrt(M)
-    b2 = np.zeros(K)
-    LL_nest = []
-    CR_nest = []
+    W1 = W1_0.copy()
+    b1 = b1_0.copy()
+    W2 = W2_0.copy()
+    b2 = b2_0.copy()
+
+    losses_nesterov = []
+    errors_nesterov = []
+
     mu = 0.9
-    # alternate version uses dW
-    # dW2 = 0
-    # db2 = 0
-    # dW1 = 0
-    # db1 = 0
     vW2 = 0
     vb2 = 0
     vW1 = 0
     vb1 = 0
     for i in range(max_iter):
         for j in range(n_batches):
-            # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1)))
-            # dW(t) = mu*dW(t-1) + g(t)
-            # W(t) = W(t-1) - mu*dW(t)
-            W1_tmp = W1 - lr*mu*vW1
-            b1_tmp = b1 - lr*mu*vb1
-            W2_tmp = W2 - lr*mu*vW2
-            b2_tmp = b2 - lr*mu*vb2
-
             Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
             Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
-            # pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
-            pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp)
+            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
 
             # updates
-            # dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
-            # W2 += dW2
-            # db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
-            # b2 += db2
-            # dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
-            # W1 += dW1
-            # db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
-            # b1 += db1
-            vW2 = mu*vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg*W2_tmp
-            W2 -= lr*vW2
-            vb2 = mu*vb2 + derivative_b2(Ybatch, pYbatch) + reg*b2_tmp
-            b2 -= lr*vb2
-            vW1 = mu*vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg*W1_tmp
-            W1 -= lr*vW1
-            vb1 = mu*vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg*b1_tmp
-            b1 -= lr*vb1
+            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
+            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
+            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
+            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
+
+            # v update
+            vW2 = mu*vW2 - lr*gW2
+            vb2 = mu*vb2 - lr*gb2
+            vW1 = mu*vW1 - lr*gW1
+            vb1 = mu*vb1 - lr*gb1
+
+            # param update
+            W2 += mu*vW2 - lr*gW2
+            b2 += mu*vb2 - lr*gb2
+            W1 += mu*vW1 - lr*gW1
+            b1 += mu*vb1 - lr*gb1
 
             if j % print_period == 0:
-                # calculate just for LL
                 pY, _ = forward(Xtest, W1, b1, W2, b2)
-                # print "pY:", pY
-                ll = cost(pY, Ytest_ind)
-                LL_nest.append(ll)
-                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
-
-                err = error_rate(pY, Ytest)
-                CR_nest.append(err)
-                print("Error rate:", err)
+                l = cost(pY, Ytest_ind)
+                losses_nesterov.append(l)
+                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
+
+                e = error_rate(pY, Ytest)
+                errors_nesterov.append(e)
+                print("Error rate:", e)
     pY, _ = forward(Xtest, W1, b1, W2, b2)
     print("Final error rate:", error_rate(pY, Ytest))
 
 
 
-    plt.plot(LL_batch, label="batch")
-    plt.plot(LL_momentum, label="momentum")
-    plt.plot(LL_nest, label="nesterov")
+    plt.plot(losses_batch, label="batch")
+    plt.plot(losses_momentum, label="momentum")
+    plt.plot(losses_nesterov, label="nesterov")
     plt.legend()
     plt.show()