@@ -35,7 +35,7 @@ def __init__(self, hidden_layer_sizes, p_keep):
         self.hidden_layer_sizes = hidden_layer_sizes
         self.dropout_rates = p_keep

-    def fit(self, X, Y, learning_rate=1e-6, mu=0.99, decay=0.999, epochs=300, batch_sz=100, show_fig=False):
+    def fit(self, X, Y, learning_rate=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=100, show_fig=False):
         # make a validation set
         X, Y = shuffle(X, Y)
         X = X.astype(np.float32)
@@ -66,12 +66,6 @@ def fit(self, X, Y, learning_rate=1e-6, mu=0.99, decay=0.999, epochs=300, batch_
         for h in self.hidden_layers:
             self.params += h.params

-        # for momentum
-        dparams = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]
-
-        # for rmsprop
-        cache = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]
-
         # set up theano functions and variables
         thX = T.matrix('X')
         thY = T.ivector('Y')
@@ -80,12 +74,23 @@ def fit(self, X, Y, learning_rate=1e-6, mu=0.99, decay=0.999, epochs=300, batch_
         # this cost is for training
         cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))

+        # gradients wrt each param
+        grads = T.grad(cost, self.params)
+
+        # for momentum
+        dparams = [theano.shared(np.zeros_like(p.get_value())) for p in self.params]
+
+        # for rmsprop
+        cache = [theano.shared(np.ones_like(p.get_value())) for p in self.params]
+
+        new_cache = [decay*c + (1 - decay)*g*g for p, c, g in zip(self.params, cache, grads)]
+        new_dparams = [mu*dp - learning_rate*g/T.sqrt(new_c + 1e-10) for p, new_c, dp, g in zip(self.params, new_cache, dparams, grads)]
         updates = [
-            (c, decay*c + (1 - decay)*T.grad(cost, p)*T.grad(cost, p)) for p, c in zip(self.params, cache)
+            (c, new_c) for c, new_c in zip(cache, new_cache)
         ] + [
-            (p, p + mu*dp - learning_rate*T.grad(cost, p)/T.sqrt(c + 1e-10)) for p, c, dp in zip(self.params, cache, dparams)
+            (dp, new_dp) for dp, new_dp in zip(dparams, new_dparams)
         ] + [
-            (dp, mu*dp - learning_rate*T.grad(cost, p)/T.sqrt(c + 1e-10)) for p, c, dp in zip(self.params, cache, dparams)
+            (p, p + new_dp) for p, new_dp in zip(self.params, new_dparams)
         ]

         # momentum only
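For reference, a minimal standalone sketch (not part of this commit) of the same RMSprop-with-momentum step on a single NumPy array. The function name, argument names, and defaults below are illustrative only, but the update order mirrors the corrected code above: refresh the cache, compute the new velocity from it, then apply that velocity to the parameter.

import numpy as np

def rmsprop_momentum_step(p, g, cache, dp, learning_rate=1e-4, mu=0.9, decay=0.9, eps=1e-10):
    # running average of the squared gradient (plays the role of `cache`)
    cache = decay * cache + (1 - decay) * g * g
    # momentum-smoothed step scaled by the RMS of recent gradients (plays the role of `dparams`)
    dp = mu * dp - learning_rate * g / np.sqrt(cache + eps)
    # apply the freshly computed velocity, as the new update list does
    p = p + dp
    return p, cache, dp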