@@ -483,15 +483,13 @@ class Adam(Optimizer):
     clip_gradient : float, optional
         clip gradient in range [-clip_gradient, clip_gradient]
     """
-    def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8,
+    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                  decay_factor=(1 - 1e-8), **kwargs):
         super(Adam, self).__init__(learning_rate=learning_rate, **kwargs)
         self.beta1 = beta1
         self.beta2 = beta2
         self.epsilon = epsilon
         self.decay_factor = decay_factor
-        self.time = 0
-        self.time_first_index = None
 
     def create_state(self, index, weight):
         """Create additional optimizer state: mean, variance
@@ -502,7 +500,6 @@ def create_state(self, index, weight):
             The weight data
 
         """
-        self.time_first_index = None  # time is incremented only on the first index
         return (zeros(weight.shape, weight.context, dtype=weight.dtype),  # mean
                 zeros(weight.shape, weight.context, dtype=weight.dtype))  # variance
 
@@ -528,37 +525,25 @@ def update(self, index, weight, grad, state):
         lr = self._get_lr(index)
         self._update_count(index)
 
+        t = self._index_update_count[index]
         mean, variance = state
 
-        # increment time only when the first parameters is called
-        if self.time_first_index is None:
-            self.time_first_index = index
-            self.time = 0  # all parameters share the same time
-        elif self.time_first_index == index:
-            self.time += 1
+        grad *= self.rescale_grad
+        if self.clip_gradient is not None:
+            clip(grad, -self.clip_gradient, self.clip_gradient, out=grad)
 
-        t1 = self.time + 1
-        learning_rate = (lr *
-                         math.sqrt(1. - self.beta2**t1) /
-                         (1. - self.beta1**t1))
-        beta_1t = self.beta1 * self.decay_factor**(t1 - 1)
+        mean[:] = self.beta1 * mean + (1. - self.beta1) * grad
+        variance[:] = self.beta2 * variance + (1. - self.beta2) * grad * grad
 
-        grad = grad * self.rescale_grad
-        if self.clip_gradient is not None:
-            grad = clip(grad, -self.clip_gradient, self.clip_gradient)
+        coef1 = 1. - self.beta1**t
+        coef2 = 1. - self.beta2**t
+        lr *= math.sqrt(coef2) / coef1
+
+        weight[:] -= lr * mean / (sqrt(variance) + self.epsilon)
 
-        mean_t = beta_1t * mean + (1. - beta_1t) * grad
-        variance_t = (self.beta2 * variance +
-                      (1. - self.beta2) * grad * grad)
-        step = (learning_rate * mean_t /
-                (sqrt(variance_t) + self.epsilon))
         wd = self._get_wd(index)
         if wd > 0.:
-            step += lr * wd * weight
-
-        weight[:] += -step
-        mean[:] = mean_t
-        variance[:] = variance_t
+            weight[:] -= (lr * wd) * weight
 
 @register
 class AdaGrad(Optimizer):
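For reference, the replacement update folds both Adam bias-correction terms into the effective step size instead of maintaining a shared time counter across parameters. The following is a minimal NumPy sketch of that rule, assuming plain arrays stand in for NDArrays; the function name adam_step and its argument names are illustrative, not MXNet API.

# Minimal sketch of the new update rule (assumed NumPy stand-in, not MXNet NDArray ops).
import math
import numpy as np

def adam_step(weight, grad, mean, variance, t,
              lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
              wd=0.0, rescale_grad=1.0, clip_gradient=None):
    grad = grad * rescale_grad
    if clip_gradient is not None:
        grad = np.clip(grad, -clip_gradient, clip_gradient)
    # exponential moving averages of the gradient and the squared gradient
    mean[:] = beta1 * mean + (1. - beta1) * grad
    variance[:] = beta2 * variance + (1. - beta2) * grad * grad
    # fold both bias-correction terms into the effective step size
    lr_t = lr * math.sqrt(1. - beta2**t) / (1. - beta1**t)
    weight[:] -= lr_t * mean / (np.sqrt(variance) + epsilon)
    # weight decay applied as a separate shrinkage step, as in the new code
    if wd > 0.:
        weight[:] -= (lr_t * wd) * weight
    return weight, mean, variance

Here t is the per-parameter update count, which replaces the removed self.time / self.time_first_index bookkeeping.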