Commit 0130

wuziheng committed Jan 31, 2018
1 parent dde25aa commit eb30ec9
Showing 14 changed files with 296 additions and 167 deletions.
295 changes: 176 additions & 119 deletions .idea/workspace.xml

Large diffs are not rendered by default.

36 changes: 33 additions & 3 deletions README.md
@@ -58,6 +58,8 @@

------

*Target3*: implement different activation functions (the relu family and the sigmoid family) and compare them:

* Added shuffling to the images read in train_epoch
* Implemented the activation functions relu, leaky-relu, sigmoid, tanh, elu, prelu
* Completed grad_check for the activation functions; sigmoid really does suffer from vanishing gradients, so with the initial 1e-5 learning rate convergence was extremely slow, and in the actual tests it was raised to 1e-3 (a minimal sketch of such a gradient check follows this list)
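A minimal sketch of the kind of numerical check meant above (illustrative only, not the repo's actual grad_check code; `grad_check_activation` is a made-up helper name): compare an activation's analytic derivative against a centered finite difference.

```python
import numpy as np

def grad_check_activation(f, f_grad, x, eps=1e-5):
    # centered finite difference vs. analytic gradient, elementwise relative error
    numeric = (f(x + eps) - f(x - eps)) / (2 * eps)
    analytic = f_grad(x)
    rel_err = np.abs(numeric - analytic) / np.maximum(1e-8, np.abs(numeric) + np.abs(analytic))
    return rel_err.max()

sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
sigmoid_grad = lambda z: sigmoid(z) * (1.0 - sigmoid(z))
print(grad_check_activation(sigmoid, sigmoid_grad, np.random.randn(100)))  # ~1e-10 or smaller
```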
@@ -82,11 +84,39 @@

Since the hyperparameters were not carefully tuned, we do not compare accuracy curves here and instead look only at the convergence speed of the different activation functions:

1. The left figure shows that tanh and sigmoid, even with a larger learning rate, still converge noticeably slower than relu
2. The right figure shows that the various relu variants have no qualitative difference in convergence speed; even leaky-relu with different alpha values differs little. Still, one can just about argue that leaky-relu is slightly better than relu.
1. The left figure shows that tanh and sigmoid still exhibit fairly clear gradient vanishing even though the network has only 2 layers; even with a larger learning rate they converge noticeably slower than relu
2. The right figure shows that the various relu variants have no qualitative difference in convergence speed; even leaky-relu with different alpha values (alpha=0.01, 0.001) differs little. Still, one can just about argue that leaky-relu is slightly better than relu.

<center>

<img src="fig/activation-loss1.jpg" style="zoom:100%"/> <img src="fig/activation-loss2.jpg" style="zoom:100%"/>

</center>
</center>


**2018.01.30**

------

Due to force majeure (~~had to run experiments and write up a paper for my girlfriend~~), work was interrupted for a few days. The plan is to add `method` and `initializer` attributes to the Variable class, for global control of how variables are optimized and initialized (at this point it really feels like a graph class should be abstracted out, with Variable and Operator both abstracted into Node, so that global operations such as initialization, optimizer selection, differentiation from a source, evaluation at a sink, and a global apply_gradient can all become methods of the graph).
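Purely as an illustration of the graph abstraction described above (nothing here is implemented in this commit; every name is hypothetical), one possible shape:

```python
class Graph(object):
    """Hypothetical container that owns every Variable/Operator node."""

    def __init__(self):
        self.nodes = {}  # name -> node

    def add(self, node):
        self.nodes[node.name] = node
        return node

    def set_method(self, method='sgd', **kwargs):
        # one call configures the optimizer for every learnable Variable
        for node in self.nodes.values():
            if getattr(node, 'learnable', False):
                getattr(node, 'set_method_' + method)(**kwargs)

    def apply_gradient(self, learning_rate, decay_rate, batch_size):
        for node in self.nodes.values():
            if getattr(node, 'learnable', False):
                node.apply_gradient(learning_rate=learning_rate,
                                    decay_rate=decay_rate,
                                    batch_size=batch_size)
```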

The original version implemented only SGD and MSRA. Planned additions:

* method: compare sgd, momentum, Nesterov, Adam, RMSProp, etc. (standard update rules are sketched right after this list)
* initializer: MSRA, Xavier, Zeros, Const
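For reference, the update rules behind these methods, in the form the tensor/Variable.py changes in this commit implement them (g is the batch-averaged gradient diff/batch_size; the separate weight-decay step data *= (1 - decay_rate) is omitted here):

```latex
% g: batch-averaged gradient, \eta: learning rate, \mu: momentum
\begin{aligned}
\text{SGD:}\quad      & \theta \leftarrow \theta - \eta\, g \\
\text{Momentum:}\quad & v \leftarrow \mu v + g, \qquad \theta \leftarrow \theta - \eta\, v \\
\text{NGA (Nesterov, as approximated here):}\quad
                      & v \leftarrow \mu v + g + \mu\,(g - g_{\mathrm{prev}}), \qquad \theta \leftarrow \theta - \eta\, v \\
\text{Adam:}\quad     & m \leftarrow \beta_1 m + (1-\beta_1)\, g, \qquad v \leftarrow \beta_2 v + (1-\beta_2)\, g^2, \\
                      & \theta \leftarrow \theta - \eta\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}} \cdot \frac{m}{\sqrt{v + \epsilon}}
\end{aligned}
```

RMSProp is still only on the to-do list above; it has no counterpart in this commit.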

P.S.: Noticed that in earlier versions apply_gradient never divided diff by batch_size; that division is added starting with this version, so convergence speed is no longer explicitly tied to batch_size (this still deserves some thought, since from a per-epoch point of view not dividing is actually more stable). Accordingly, the initial learning_rate was raised 50x. Also added util.learning_rate_exponential_decay so the learning rate decays automatically.
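A quick arithmetic check on that 50x bump (a sketch, using the hyperparameters from the table below): with batch_size 64, dividing the accumulated diff by batch_size shrinks each step by roughly 64x, so the old and new effective step sizes stay comparable.

```python
old_step_scale = 1e-5        # old learning_rate, diff not divided by batch_size
new_step_scale = 5e-4 / 64   # new learning_rate, diff divided by batch_size
print("%g vs %g" % (old_step_scale, new_step_scale))  # 1e-05 vs 7.8125e-06
```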

| *learning_rate* | *batch_size* | *decay_rate* | *decay_epoch* | *initializer* |
| :-------------: | :----------: | :----------: | :-----------: | :-----------: |
| 5e-4 | 64 | 0.1 | 5 | MSRA |

| version | | | | |
| :------: | ---- | ---- | ---- | ---- |
| SGD | | | | |
| Momentum | | | | |
| Nesterov | | | | |
| Adam | | | | |

Here we likewise compared several methods under LRELU. Since initialization is random and the network is very shallow, the differences do not look particularly pronounced, certainly not as clear-cut as the gaps between activations. Testing on a deeper network may follow later.
37 changes: 23 additions & 14 deletions lenet_tensor.py
@@ -2,14 +2,15 @@
import tensor.Variable as var
import tensor.Operator as op
import tensor.Activation as activation
from tensor.util import learning_rate_exponential_decay
import plot

import time
import struct
from glob import glob
import os

VERSION = 'TENSOR_SGD_PRELU'
VERSION = 'TENSOR_Adagrad_RELU'


def load_mnist(path, kind='train'):
@@ -34,18 +35,25 @@ def load_mnist(path, kind='train'):

def inference(x, output_num):
conv1_out = op.Conv2D((5, 5, 1, 12), input_variable=x, name='conv1', padding='VALID').output_variables
relu1_out = activation.Prelu(input_variable=conv1_out, name='Prelu1').output_variables
relu1_out = activation.Relu(input_variable=conv1_out, name='relu1').output_variables
pool1_out = op.MaxPooling(ksize=2, input_variable=relu1_out, name='pool1').output_variables

conv2_out = op.Conv2D((3, 3, 12, 24), input_variable=pool1_out, name='conv2').output_variables
relu2_out = activation.Prelu(input_variable=conv2_out, name='Prelu2').output_variables
relu2_out = activation.Relu(input_variable=conv2_out, name='relu2').output_variables
pool2_out = op.MaxPooling(ksize=2, input_variable=relu2_out, name='pool2').output_variables

fc_out = op.FullyConnect(output_num=output_num, input_variable=pool2_out, name='fc').output_variables
return fc_out


batch_size = 64
global_step = 0
# set method
for k in var.GLOBAL_VARIABLE_SCOPE:
s = var.GLOBAL_VARIABLE_SCOPE[k]
if isinstance(s, var.Variable) and s.learnable:
s.set_method_adagrad()

img_placeholder = var.Variable((batch_size, 28, 28, 1), 'input')
label_placeholder = var.Variable([batch_size, 1], 'label')

@@ -63,10 +71,16 @@ def inference(x, output_num):

with open('logs/%s_log.txt'%VERSION, 'wb') as logf:
for epoch in range(20):
learning_rate = 1e-5
# random shuffle
order = np.arange(images.shape[0])
np.random.shuffle(order)
_images = images[order]
_labels = labels[order]

# batch
batch_loss = 0
batch_acc = 0

val_acc = 0
val_loss = 0

@@ -75,13 +89,9 @@ def inference(x, output_num):
train_loss = 0

for i in range(images.shape[0] / batch_size):
# feed
# random shuffle
order = np.arange(images.shape[0])
np.random.shuffle(order)
_images = images[order]
_labels = labels[order]
learning_rate = learning_rate_exponential_decay(1e-4, global_step, 0.1, 5000)

# feed
img_placeholder.data = _images[i * batch_size:(i + 1) * batch_size].reshape([batch_size, 28, 28, 1])
label_placeholder.data = _labels[i * batch_size:(i + 1) * batch_size]

@@ -103,9 +113,10 @@ def inference(x, output_num):
for k in var.GLOBAL_VARIABLE_SCOPE:
s = var.GLOBAL_VARIABLE_SCOPE[k]
if isinstance(s, var.Variable) and s.learnable:
s.apply_gradient(learning_rate=learning_rate, decay_rate=0.0004)
s.apply_gradient(learning_rate=learning_rate, decay_rate=0.0004, batch_size=batch_size)
if isinstance(s, var.Variable):
s.diff = np.zeros(s.shape)
global_step += 1


if i % 50 == 0 and i!= 0:
@@ -114,17 +125,15 @@ def inference(x, output_num):
i, batch_acc / float(
batch_size), batch_loss / batch_size, learning_rate)
logf.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + \
" %s epoch: %d , batch: %5d , avg_batch_acc: %.4f avg_batch_loss: %.4f learning_rate %f" % (VERSION,epoch,
" %s epoch: %d , batch: %5d , avg_batch_acc: %.4f avg_batch_loss: %.4f learning_rate %f\n" % (VERSION,epoch,
i, batch_acc / float(
batch_size), batch_loss / batch_size, learning_rate))
loss_collect.append(batch_loss / batch_size)
acc_collect.append(batch_acc / float(batch_size))


batch_loss = 0
batch_acc = 0


print time.strftime("%Y-%m-%d %H:%M:%S",
time.localtime()) + " epoch: %5d , train_acc: %.4f avg_train_loss: %.4f" % (
epoch, train_acc / float(int(images.shape[0]/batch_size)*batch_size), train_loss / images.shape[0])
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file modified plot.pyc
Binary file not shown.
26 changes: 0 additions & 26 deletions tensor/Operator.py
@@ -262,32 +262,6 @@ def backward(self):
return


# class Relu(Operator):
# def __init__(self, input_variable=Variable, name=str):
# self.input_variables = input_variable
# self.output_variables = Variable(self.input_variables.shape, name='out', scope=name)
# Operator.__init__(self, name, self.input_variables, self.output_variables)
#
# def forward(self):
# if self.wait_forward:
# for parent in self.parent:
# GLOBAL_VARIABLE_SCOPE[parent].eval()
# self.output_variables.data = np.maximum(self.input_variables.data, 0)
# self.wait_forward = False
# return
# else:
# pass
#
# def backward(self):
# if self.wait_forward:
# pass
# else:
# for child in self.child:
# GLOBAL_VARIABLE_SCOPE[child].diff_eval()
# self.input_variables.diff = self.output_variables.diff
# self.output_variables.diff[self.input_variables.data < 0] = 0
# self.wait_forward = True
# return


class SoftmaxLoss(Operator):
61 changes: 56 additions & 5 deletions tensor/Variable.py
@@ -1,12 +1,16 @@
import numpy as np
from util import initializer
import math

if 'GLOBAL_VARIABLE_SCOPE' not in globals():
# global GLOBAL_VARIABLE_SCOPE
GLOBAL_VARIABLE_SCOPE = {}


class Variable(object):
initial = 'MSRA'
method = 'SGD'

def __init__(self, shape=list, name=str, scope='', grad=True, learnable=False, init='MSRA'):
if scope != '':
self.scope = scope if scope[-1] == '/' else scope + '/'
@@ -25,7 +29,8 @@ def __init__(self, shape=list, name=str, scope='', grad=True, learnable=False, init='MSRA'):
raise Exception("Variable name: %s shape is not list of int"%self.name)

self.shape = shape
self.data = initializer(shape, init)
self.data = initializer(shape, self.initial)

self.child = []
self.parent = []

@@ -50,12 +55,58 @@ def diff_eval(self):

return self.diff

def apply_gradient(self, learning_rate=float, decay_rate=float, method='SGD'):
def apply_gradient(self, learning_rate=float, decay_rate=float, batch_size=1):
self.data *= (1 - decay_rate)
if method == 'SGD':
if self.method == 'SGD':
learning_rate = learning_rate
self.data -= learning_rate*self.diff
self.diff *= 0
self.data -= (learning_rate*self.diff/batch_size)
self.diff *= 0

elif self.method == 'Momentum':
self.mtmp = self.momentum * self.mtmp + self.diff/batch_size
self.data -= learning_rate * self.mtmp
self.diff *= 0

elif self.method == 'NGA':
self.mtmp = self.momentum * self.mtmp + self.diff / batch_size + self.momentum*(self.diff-self.lastdiff)/batch_size
self.data -= learning_rate * self.mtmp
self.lastdiff = self.diff
self.diff *= 0

elif self.method == 'Adam':
self.t += 1
learning_rate_t = learning_rate * math.sqrt(1 - pow(self.beta2, self.t)) / (1 - pow(self.beta1, self.t))
self.m_t = self.beta1 * self.m_t + (1 - self.beta1) * self.diff / batch_size
self.v_t = self.beta2 * self.v_t + (1 - self.beta2) * ((self.diff / batch_size) ** 2)
self.data -= learning_rate_t * self.m_t / (self.v_t + self.epsilon) ** 0.5
self.diff *= 0

else:
raise Exception('No apply_gradient method: %s'%self.method)

def set_method_sgd(self):
self.method = 'SGD'

def set_method_momentum(self, momentum=0.9):
self.method = 'Momentum'
self.momentum = momentum
self.mtmp = np.zeros(self.diff.shape)

def set_method_nga(self,momentum=0.9):
self.method = 'NGA'
self.lastdiff = np.zeros(self.diff.shape)
self.momentum= momentum
self.mtmp = np.zeros(self.diff.shape)

def set_method_adam(self, beta1=0.9, beta2=0.999, epsilon=1e-8):
self.method = 'Adam'
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.m_t = np.zeros(self.diff.shape)
self.v_t = np.zeros(self.diff.shape)
self.t = 0



def get_by_name(name):
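A minimal usage sketch of the new per-variable optimizer API (assumed from the diff above, not part of the commit; the demo shape and gradient are made up):

```python
import numpy as np
import tensor.Variable as var

# a learnable Variable picks up the class-level initializer ('MSRA' by default)
w = var.Variable([128, 10], 'w_demo', grad=True, learnable=True)
w.set_method_adam(beta1=0.9, beta2=0.999, epsilon=1e-8)

# pretend one backward pass accumulated gradients over a batch of 64
w.diff = np.random.randn(128, 10)
w.apply_gradient(learning_rate=5e-4, decay_rate=0.0004, batch_size=64)
```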
Binary file modified tensor/Variable.pyc
Binary file not shown.
8 changes: 8 additions & 0 deletions tensor/util.py
@@ -14,4 +14,12 @@ def initializer(shape, method):
return np.random.standard_normal(shape) / weights_scale


def learning_rate_exponential_decay(learning_rate, global_step, decay_rate=0.1, decay_steps=5000):
'''
Applies exponential decay to learning rate
decayed_learning_rate = learning_rate * decay_rate ^ (global_step/decay_steps)
:return: learning rate decayed by step
'''

decayed_learning_rate = learning_rate * pow(decay_rate,float(global_step/decay_steps))
return decayed_learning_rate
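A quick sketch of how the new helper behaves with its defaults (note that with the Python 2 integer division used above, the decay is a staircase rather than a smooth exponential):

```python
from tensor.util import learning_rate_exponential_decay

for step in (0, 4999, 5000, 10000):
    lr = learning_rate_exponential_decay(1e-4, step, decay_rate=0.1, decay_steps=5000)
    print("step %5d -> lr %g" % (step, lr))
# step     0 -> lr 0.0001
# step  4999 -> lr 0.0001
# step  5000 -> lr 1e-05
# step 10000 -> lr 1e-06
```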
