Commit 0130

wuziheng committed Jan 31, 2018
1 parent dde25aa commit eb30ec9
Showing 14 changed files with 296 additions and 167 deletions.
295 changes: 176 additions & 119 deletions .idea/workspace.xml

Large diffs are not rendered by default.

36 changes: 33 additions & 3 deletions README.md
@@ -58,6 +58,8 @@

------

*Target3*: implement different activation functions (the relu family and the sigmoid family) and compare them:

* Added shuffling to the images read in train_epoch
* Implemented the activation functions relu, leaky-relu, sigmoid, tanh, elu, prelu
* Completed grad_check for the activation functions; sigmoid really does suffer from vanishing gradients, so with the initial 1e-5 learning rate convergence was extremely slow, and in the actual tests it was raised to 1e-3 (a minimal sketch of such a gradient check follows this list)
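A minimal sketch of the kind of numerical check meant above (illustrative only, not the repo's actual grad_check code; `grad_check_activation` is a made-up helper name): compare an activation's analytic derivative against a centered finite difference.

```python
import numpy as np

def grad_check_activation(f, f_grad, x, eps=1e-5):
    # centered finite difference vs. analytic gradient, elementwise relative error
    numeric = (f(x + eps) - f(x - eps)) / (2 * eps)
    analytic = f_grad(x)
    rel_err = np.abs(numeric - analytic) / np.maximum(1e-8, np.abs(numeric) + np.abs(analytic))
    return rel_err.max()

sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
sigmoid_grad = lambda z: sigmoid(z) * (1.0 - sigmoid(z))
print(grad_check_activation(sigmoid, sigmoid_grad, np.random.randn(100)))  # ~1e-10 or smaller
```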
@@ -82,11 +84,39 @@

Since the hyperparameters were not carefully tuned, we do not compare accuracy curves here and instead look only at the convergence speed of the different activation functions:

1. The left figure shows that tanh and sigmoid, even with a larger learning rate, still converge noticeably slower than relu
2. The right figure shows that the various relu variants have no qualitative difference in convergence speed; even leaky-relu with different alpha values differs little. Still, one can just about argue that leaky-relu is slightly better than relu.
1. The left figure shows that tanh and sigmoid still exhibit fairly clear gradient vanishing even though the network has only 2 layers; even with a larger learning rate they converge noticeably slower than relu
2. The right figure shows that the various relu variants have no qualitative difference in convergence speed; even leaky-relu with different alpha values (alpha=0.01, 0.001) differs little. Still, one can just about argue that leaky-relu is slightly better than relu.

<center>

<img src="fig/activation-loss1.jpg" style="zoom:100%"/> <img src="fig/activation-loss2.jpg" style="zoom:100%"/>

</center>
</center>


**2018.01.30**

------

Due to force majeure (~~had to run experiments and write up a paper for my girlfriend~~), work was interrupted for a few days. The plan is to add `method` and `initializer` attributes to the Variable class, for global control of how variables are optimized and initialized (at this point it really feels like a graph class should be abstracted out, with Variable and Operator both abstracted into Node, so that global operations such as initialization, optimizer selection, differentiation from a source, evaluation at a sink, and a global apply_gradient can all become methods of the graph).
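Purely as an illustration of the graph abstraction described above (nothing here is implemented in this commit; every name is hypothetical), one possible shape:

```python
class Graph(object):
    """Hypothetical container that owns every Variable/Operator node."""

    def __init__(self):
        self.nodes = {}  # name -> node

    def add(self, node):
        self.nodes[node.name] = node
        return node

    def set_method(self, method='sgd', **kwargs):
        # one call configures the optimizer for every learnable Variable
        for node in self.nodes.values():
            if getattr(node, 'learnable', False):
                getattr(node, 'set_method_' + method)(**kwargs)

    def apply_gradient(self, learning_rate, decay_rate, batch_size):
        for node in self.nodes.values():
            if getattr(node, 'learnable', False):
                node.apply_gradient(learning_rate=learning_rate,
                                    decay_rate=decay_rate,
                                    batch_size=batch_size)
```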

The original version implemented only SGD and MSRA. Planned additions:

* method: compare sgd, momentum, Nesterov, Adam, RMSProp, etc. (standard update rules are sketched right after this list)
* initializer: MSRA, Xavier, Zeros, Const
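For reference, the update rules behind these methods, in the form the tensor/Variable.py changes in this commit implement them (g is the batch-averaged gradient diff/batch_size; the separate weight-decay step data *= (1 - decay_rate) is omitted here):

```latex
% g: batch-averaged gradient, \eta: learning rate, \mu: momentum
\begin{aligned}
\text{SGD:}\quad      & \theta \leftarrow \theta - \eta\, g \\
\text{Momentum:}\quad & v \leftarrow \mu v + g, \qquad \theta \leftarrow \theta - \eta\, v \\
\text{NGA (Nesterov, as approximated here):}\quad
                      & v \leftarrow \mu v + g + \mu\,(g - g_{\mathrm{prev}}), \qquad \theta \leftarrow \theta - \eta\, v \\
\text{Adam:}\quad     & m \leftarrow \beta_1 m + (1-\beta_1)\, g, \qquad v \leftarrow \beta_2 v + (1-\beta_2)\, g^2, \\
                      & \theta \leftarrow \theta - \eta\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}} \cdot \frac{m}{\sqrt{v + \epsilon}}
\end{aligned}
```

RMSProp is still only on the to-do list above; it has no counterpart in this commit.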

P.S.: Noticed that in earlier versions apply_gradient never divided diff by batch_size; that division is added starting with this version, so convergence speed is no longer explicitly tied to batch_size (this still deserves some thought, since from a per-epoch point of view not dividing is actually more stable). Accordingly, the initial learning_rate was raised 50x. Also added util.learning_rate_exponential_decay so the learning rate decays automatically.
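A quick arithmetic check on that 50x bump (a sketch, using the hyperparameters from the table below): with batch_size 64, dividing the accumulated diff by batch_size shrinks each step by roughly 64x, so the old and new effective step sizes stay comparable.

```python
old_step_scale = 1e-5        # old learning_rate, diff not divided by batch_size
new_step_scale = 5e-4 / 64   # new learning_rate, diff divided by batch_size
print("%g vs %g" % (old_step_scale, new_step_scale))  # 1e-05 vs 7.8125e-06
```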

| *learning_rate* | *batch_size* | *decay_rate* | *decay_epoch* | *initializer* |
| :-------------: | :----------: | :----------: | :-----------: | :-----------: |
| 5e-4 | 64 | 0.1 | 5 | MSRA |

| version | | | | |
| :------: | ---- | ---- | ---- | ---- |
| SGD | | | | |
| Momentum | | | | |
| Nesterov | | | | |
| Adam | | | | |

Here we likewise compared several methods under LRELU. Since initialization is random and the network is very shallow, the differences do not look particularly pronounced, certainly not as clear-cut as the gaps between activations. Testing on a deeper network may follow later.
37 changes: 23 additions & 14 deletions lenet_tensor.py
@@ -2,14 +2,15 @@
import tensor.Variable as var
import tensor.Operator as op
import tensor.Activation as activation
from tensor.util import learning_rate_exponential_decay
import plot

import time
import struct
from glob import glob
import os

VERSION = 'TENSOR_SGD_PRELU'
VERSION = 'TENSOR_Adagrad_RELU'


def load_mnist(path, kind='train'):
@@ -34,18 +35,25 @@ def load_mnist(path, kind='train'):

def inference(x, output_num):
conv1_out = op.Conv2D((5, 5, 1, 12), input_variable=x, name='conv1', padding='VALID').output_variables
relu1_out = activation.Prelu(input_variable=conv1_out, name='Prelu1').output_variables
relu1_out = activation.Relu(input_variable=conv1_out, name='relu1').output_variables
pool1_out = op.MaxPooling(ksize=2, input_variable=relu1_out, name='pool1').output_variables

conv2_out = op.Conv2D((3, 3, 12, 24), input_variable=pool1_out, name='conv2').output_variables
relu2_out = activation.Prelu(input_variable=conv2_out, name='Prelu2').output_variables
relu2_out = activation.Relu(input_variable=conv2_out, name='relu2').output_variables
pool2_out = op.MaxPooling(ksize=2, input_variable=relu2_out, name='pool2').output_variables

fc_out = op.FullyConnect(output_num=output_num, input_variable=pool2_out, name='fc').output_variables
return fc_out


batch_size = 64
global_step = 0
# set method
for k in var.GLOBAL_VARIABLE_SCOPE:
s = var.GLOBAL_VARIABLE_SCOPE[k]
if isinstance(s, var.Variable) and s.learnable:
s.set_method_adagrad()

img_placeholder = var.Variable((batch_size, 28, 28, 1), 'input')
label_placeholder = var.Variable([batch_size, 1], 'label')

@@ -63,10 +71,16 @@ def inference(x, output_num):

with open('logs/%s_log.txt'%VERSION, 'wb') as logf:
for epoch in range(20):
learning_rate = 1e-5
# random shuffle
order = np.arange(images.shape[0])
np.random.shuffle(order)
_images = images[order]
_labels = labels[order]

# batch
batch_loss = 0
batch_acc = 0

val_acc = 0
val_loss = 0

@@ -75,13 +89,9 @@ def inference(x, output_num):
train_loss = 0

for i in range(images.shape[0] / batch_size):
# feed
# random shuffle
order = np.arange(images.shape[0])
np.random.shuffle(order)
_images = images[order]
_labels = labels[order]
learning_rate = learning_rate_exponential_decay(1e-4, global_step, 0.1, 5000)

# feed
img_placeholder.data = _images[i * batch_size:(i + 1) * batch_size].reshape([batch_size, 28, 28, 1])
label_placeholder.data = _labels[i * batch_size:(i + 1) * batch_size]

@@ -103,9 +113,10 @@ def inference(x, output_num):
for k in var.GLOBAL_VARIABLE_SCOPE:
s = var.GLOBAL_VARIABLE_SCOPE[k]
if isinstance(s, var.Variable) and s.learnable:
s.apply_gradient(learning_rate=learning_rate, decay_rate=0.0004)
s.apply_gradient(learning_rate=learning_rate, decay_rate=0.0004, batch_size=batch_size)
if isinstance(s, var.Variable):
s.diff = np.zeros(s.shape)
global_step += 1


if i % 50 == 0 and i!= 0:
@@ -114,17 +125,15 @@ def inference(x, output_num):
i, batch_acc / float(
batch_size), batch_loss / batch_size, learning_rate)
logf.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + \
" %s epoch: %d , batch: %5d , avg_batch_acc: %.4f avg_batch_loss: %.4f learning_rate %f" % (VERSION,epoch,
" %s epoch: %d , batch: %5d , avg_batch_acc: %.4f avg_batch_loss: %.4f learning_rate %f\n" % (VERSION,epoch,
i, batch_acc / float(
batch_size), batch_loss / batch_size, learning_rate))
loss_collect.append(batch_loss / batch_size)
acc_collect.append(batch_acc / float(batch_size))


batch_loss = 0
batch_acc = 0


print time.strftime("%Y-%m-%d %H:%M:%S",
time.localtime()) + " epoch: %5d , train_acc: %.4f avg_train_loss: %.4f" % (
epoch, train_acc / float(int(images.shape[0]/batch_size)*batch_size), train_loss / images.shape[0])
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file modified plot.pyc
Binary file not shown.
26 changes: 0 additions & 26 deletions tensor/Operator.py
@@ -262,32 +262,6 @@ def backward(self):
return


# class Relu(Operator):
# def __init__(self, input_variable=Variable, name=str):
# self.input_variables = input_variable
# self.output_variables = Variable(self.input_variables.shape, name='out', scope=name)
# Operator.__init__(self, name, self.input_variables, self.output_variables)
#
# def forward(self):
# if self.wait_forward:
# for parent in self.parent:
# GLOBAL_VARIABLE_SCOPE[parent].eval()
# self.output_variables.data = np.maximum(self.input_variables.data, 0)
# self.wait_forward = False
# return
# else:
# pass
#
# def backward(self):
# if self.wait_forward:
# pass
# else:
# for child in self.child:
# GLOBAL_VARIABLE_SCOPE[child].diff_eval()
# self.input_variables.diff = self.output_variables.diff
# self.output_variables.diff[self.input_variables.data < 0] = 0
# self.wait_forward = True
# return


class SoftmaxLoss(Operator):
61 changes: 56 additions & 5 deletions tensor/Variable.py
@@ -1,12 +1,16 @@
import numpy as np
from util import initializer
import math

if 'GLOBAL_VARIABLE_SCOPE' not in globals():
# global GLOBAL_VARIABLE_SCOPE
GLOBAL_VARIABLE_SCOPE = {}


class Variable(object):
initial = 'MSRA'
method = 'SGD'

def __init__(self, shape=list, name=str, scope='', grad=True, learnable=False, init='MSRA'):
if scope != '':
self.scope = scope if scope[-1] == '/' else scope + '/'
@@ -25,7 +29,8 @@ def __init__(self, shape=list, name=str, scope='', grad=True, learnable=False, init='MSRA'):
raise Exception("Variable name: %s shape is not list of int"%self.name)

self.shape = shape
self.data = initializer(shape, init)
self.data = initializer(shape, self.initial)

self.child = []
self.parent = []

@@ -50,12 +55,58 @@ def diff_eval(self):

return self.diff

def apply_gradient(self, learning_rate=float, decay_rate=float, method='SGD'):
def apply_gradient(self, learning_rate=float, decay_rate=float, batch_size=1):
self.data *= (1 - decay_rate)
if method == 'SGD':
if self.method == 'SGD':
learning_rate = learning_rate
self.data -= learning_rate*self.diff
self.diff *= 0
self.data -= (learning_rate*self.diff/batch_size)
self.diff *= 0

elif self.method == 'Momentum':
self.mtmp = self.momentum * self.mtmp + self.diff/batch_size
self.data -= learning_rate * self.mtmp
self.diff *= 0

elif self.method == 'NGA':
self.mtmp = self.momentum * self.mtmp + self.diff / batch_size + self.momentum*(self.diff-self.lastdiff)/batch_size
self.data -= learning_rate * self.mtmp
self.lastdiff = self.diff
self.diff *= 0

elif self.method == 'Adam':
self.t += 1
learning_rate_t = learning_rate * math.sqrt(1 - pow(self.beta2, self.t)) / (1 - pow(self.beta1, self.t))
self.m_t = self.beta1 * self.m_t + (1 - self.beta1) * self.diff / batch_size
self.v_t = self.beta2 * self.v_t + (1 - self.beta2) * ((self.diff / batch_size) ** 2)
self.data -= learning_rate_t * self.m_t / (self.v_t + self.epsilon) ** 0.5
self.diff *= 0

else:
raise Exception('No apply_gradient method: %s'%self.method)

def set_method_sgd(self):
self.method = 'SGD'

def set_method_momentum(self, momentum=0.9):
self.method = 'Momentum'
self.momentum = momentum
self.mtmp = np.zeros(self.diff.shape)

def set_method_nga(self,momentum=0.9):
self.method = 'NGA'
self.lastdiff = np.zeros(self.diff.shape)
self.momentum= momentum
self.mtmp = np.zeros(self.diff.shape)

def set_method_adam(self, beta1=0.9, beta2=0.999, epsilon=1e-8):
self.method = 'Adam'
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.m_t = np.zeros(self.diff.shape)
self.v_t = np.zeros(self.diff.shape)
self.t = 0



def get_by_name(name):
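A minimal usage sketch of the new per-variable optimizer API (assumed from the diff above, not part of the commit; the demo shape and gradient are made up):

```python
import numpy as np
import tensor.Variable as var

# a learnable Variable picks up the class-level initializer ('MSRA' by default)
w = var.Variable([128, 10], 'w_demo', grad=True, learnable=True)
w.set_method_adam(beta1=0.9, beta2=0.999, epsilon=1e-8)

# pretend one backward pass accumulated gradients over a batch of 64
w.diff = np.random.randn(128, 10)
w.apply_gradient(learning_rate=5e-4, decay_rate=0.0004, batch_size=64)
```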
Binary file modified tensor/Variable.pyc
Binary file not shown.
8 changes: 8 additions & 0 deletions tensor/util.py
@@ -14,4 +14,12 @@ def initializer(shape, method):
return np.random.standard_normal(shape) / weights_scale


def learning_rate_exponential_decay(learning_rate, global_step, decay_rate=0.1, decay_steps=5000):
'''
Applies exponential decay to learning rate
decayed_learning_rate = learning_rate * decay_rate ^ (global_step/decay_steps)
:return: learning rate decayed by step
'''

decayed_learning_rate = learning_rate * pow(decay_rate,float(global_step/decay_steps))
return decayed_learning_rate
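A quick sketch of how the new helper behaves with its defaults (note that with the Python 2 integer division used above, the decay is a staircase rather than a smooth exponential):

```python
from tensor.util import learning_rate_exponential_decay

for step in (0, 4999, 5000, 10000):
    lr = learning_rate_exponential_decay(1e-4, step, decay_rate=0.1, decay_steps=5000)
    print("step %5d -> lr %g" % (step, lr))
# step     0 -> lr 0.0001
# step  4999 -> lr 0.0001
# step  5000 -> lr 1e-05
# step 10000 -> lr 1e-06
```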
