Improve log, save origin yaml, and fix adan #272
@@ -1,31 +1,12 @@
# Copyright 2020-2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""adan"""
from __future__ import absolute_import

import mindspore as ms
from mindspore import ops
from mindspore.common import dtype as mstype
from mindspore.common.api import ms_function
from mindspore.common.tensor import Tensor
from mindspore.nn.optim.optimizer import Optimizer, opt_init_args_register
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P

_adan_opt = C.MultitypeFuncGraph("adan_opt")
_scaler_one = Tensor(1, mstype.int32)
_scaler_ten = Tensor(10, mstype.float32)
_adan_opt = ops.MultitypeFuncGraph("adan_opt")


@_adan_opt.register(
@@ -73,67 +54,55 @@ def _update_run_op(
    Returns:
        Tensor, the new value of v after updating.
    """
    op_cast = P.Cast()
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    success = True

    # if global_step == 0.0: # init
    # TODO: use global_step==0 as the condition to init prev_gradient as gradient
    # if (F.reduce_min(prev_gradient) == 0.0) and (F.reduce_max(prev_gradient) == 0.0):
    if F.reduce_sum(prev_gradient) == 0.0:
        success = F.depend(success, F.assign(prev_gradient, gradient))

    # TODO: is casting needed?
    op_mul = ops.Mul()
    op_square = ops.Square()
    op_sqrt = ops.Sqrt()
    op_cast = ops.Cast()
    op_reshape = ops.Reshape()
    op_shape = ops.Shape()

    success = ms.Tensor(True, dtype=ms.bool_)

    if ops.reduce_sum(prev_gradient) == 0.0:
        success = ops.depend(success, ops.assign(prev_gradient, gradient))

    param_fp32 = op_cast(param, mstype.float32)
    m_fp32 = op_cast(m, mstype.float32)
    v_fp32 = op_cast(v, mstype.float32)
    n_fp32 = op_cast(n, mstype.float32)
    gradient_fp32 = op_cast(gradient, mstype.float32)
    prev_gradient_fp32 = op_cast(prev_gradient, mstype.float32)

    next_m = op_mul(F.tuple_to_array((1.0,)) - beta1, m_fp32) + op_mul(beta1, gradient_fp32)
    next_m = op_mul(ops.tuple_to_array((1.0,)) - beta1, m_fp32) + op_mul(beta1, gradient_fp32)

    next_v = op_mul(F.tuple_to_array((1.0,)) - beta2, v_fp32) + op_mul(beta2, gradient_fp32 - prev_gradient_fp32)
    next_v = op_mul(ops.tuple_to_array((1.0,)) - beta2, v_fp32) + op_mul(beta2, gradient_fp32 - prev_gradient_fp32)

    next_n = op_mul(F.tuple_to_array((1.0,)) - beta3, n_fp32) + op_mul(
        beta3, op_square(gradient + op_mul(F.tuple_to_array((1.0,)) - beta2, gradient_fp32 - prev_gradient_fp32))
    next_n = op_mul(ops.tuple_to_array((1.0,)) - beta3, n_fp32) + op_mul(
        beta3, op_square(gradient + op_mul(ops.tuple_to_array((1.0,)) - beta2, gradient_fp32 - prev_gradient_fp32))
    )

    lr_t = lr / (eps + op_sqrt(next_n))

    update = next_m + op_mul(F.tuple_to_array((1.0,)) - beta2, next_v)

    # if decay_flag:
    # update = op_mul(weight_decay, param_fp32) + update
    update = next_m + op_mul(ops.tuple_to_array((1.0,)) - beta2, next_v)

    next_param = param_fp32 - op_reshape(op_mul(lr_t, update), op_shape(param_fp32))

    next_param = next_param / (Tensor(1.0, mstype.float32) + op_mul(weight_decay, lr_t))

    success = F.depend(success, F.assign(param, op_cast(next_param, F.dtype(param))))
    success = F.depend(success, F.assign(m, op_cast(next_m, F.dtype(m))))
    success = F.depend(success, F.assign(v, op_cast(next_v, F.dtype(v))))
    success = F.depend(success, F.assign(n, op_cast(next_n, F.dtype(n))))
    success = F.depend(success, F.assign(prev_gradient, gradient))
    success = ops.depend(success, ops.assign(param, op_cast(next_param, ops.dtype(param))))
    success = ops.depend(success, ops.assign(m, op_cast(next_m, ops.dtype(m))))
    success = ops.depend(success, ops.assign(v, op_cast(next_v, ops.dtype(v))))
    success = ops.depend(success, ops.assign(n, op_cast(next_n, ops.dtype(n))))
    success = ops.depend(success, ops.assign(prev_gradient, gradient))

    return op_cast(next_param, F.dtype(param))
    return op_cast(next_param, ops.dtype(param))

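For reference while reviewing, the update rule implemented by the new ops-based lines above can be read off the code as follows (notation mine: θ is the parameter, g_t the current gradient, g_{t-1} the stored prev_gradient, η the learning rate, λ the weight decay):

$$
\begin{aligned}
m_t &= (1-\beta_1)\,m_{t-1} + \beta_1\, g_t \\
v_t &= (1-\beta_2)\,v_{t-1} + \beta_2\,(g_t - g_{t-1}) \\
n_t &= (1-\beta_3)\,n_{t-1} + \beta_3\,\bigl(g_t + (1-\beta_2)(g_t - g_{t-1})\bigr)^2 \\
\eta_t &= \frac{\eta}{\varepsilon + \sqrt{n_t}}, \qquad
\theta_t = \frac{\theta_{t-1} - \eta_t\,\bigl(m_t + (1-\beta_2)\,v_t\bigr)}{1 + \lambda\,\eta_t}
\end{aligned}
$$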
def _check_param_value(beta1, beta2, eps, use_locking, prim_name):
def _check_param_value(beta1, beta2, eps, prim_name):
    """Check the type of inputs."""
    assert isinstance(beta1, float), f"For '{prim_name}', the type of 'beta1' must be 'float', but got type '{type(beta1).__name__}'."
    assert isinstance(beta2, float), f"For '{prim_name}', the type of 'beta2' must be 'float', but got type '{type(beta2).__name__}'."
    assert isinstance(eps, float), f"For '{prim_name}', the type of 'eps' must be 'float', but got type '{type(eps).__name__}'."
    assert 0.0 < beta1 < 1.0, f"For '{prim_name}', the range of 'beta1' must be (0.0, 1.0), but got {beta1}."
    assert 0.0 < beta2 < 1.0, f"For '{prim_name}', the range of 'beta2' must be (0.0, 1.0), but got {beta2}."
    assert eps > 0, f"For '{prim_name}', the 'eps' must be positive, but got {eps}."
    assert isinstance(use_locking, bool), f"For '{prim_name}', the type of 'use_locking' must be 'bool', but got type '{type(use_locking).__name__}'."
    assert isinstance(beta1, float) and 0 <= beta1 <= 1.0, f"For {prim_name}, beta1 should be between 0 and 1"

Review comment: Update the docstring of function

    assert isinstance(beta2, float) and 0 <= beta2 <= 1.0, f"For {prim_name}, beta2 should be between 0 and 1"
    assert isinstance(eps, float) and eps > 0, f"For {prim_name}, eps should be bigger than 0"

class Adan(Optimizer):

@@ -153,28 +122,26 @@ def __init__(
        beta3=0.99,
        eps=1e-8,
        use_locking=False,
        weight_decay=1e-6,
        weight_decay=0.0,
        loss_scale=1.0,
    ):
        super().__init__(
            learning_rate, params, weight_decay=weight_decay, loss_scale=loss_scale
        )  # The optimizer's inherited weight decay is blocked; weight decay is applied in this file instead.

        _check_param_value(beta1, beta2, eps, use_locking, self.cls_name)
        _check_param_value(beta1, beta2, eps, self.cls_name)
        assert isinstance(use_locking, bool), f"For {self.cls_name}, use_locking should be bool"

        self.beta1 = Tensor(beta1, mstype.float32)
        self.beta2 = Tensor(beta2, mstype.float32)
        self.beta3 = Tensor(beta3, mstype.float32)
        # self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power")
        # self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power")
        # self.beta3_power = Parameter(initializer(1, [1], mstype.float32), name="beta3_power")

        self.eps = Tensor(eps, mstype.float32)
        self.use_locking = use_locking
        self.moment1 = self._parameters.clone(prefix="moment1", init="zeros")  # m
        self.moment2 = self._parameters.clone(prefix="moment2", init="zeros")  # v
        self.moment3 = self._parameters.clone(prefix="moment3", init="zeros")  # n
        self.prev_gradient = self._parameters.clone(prefix="prev_gradient", init="zeros")
        # print('prev g: ', type(self.prev_gradient))

        self.weight_decay = Tensor(weight_decay, mstype.float32)

@@ -184,29 +151,23 @@ def construct(self, gradients):
        moment1 = self.moment1
        moment2 = self.moment2
        moment3 = self.moment3
        # vhat = self.vhat

        gradients = self.flatten_gradients(gradients)
        # gradients = self.decay_weight(gradients)  # we decay weight in adan_opt func
        gradients = self.gradients_centralization(gradients)
        gradients = self.scale_grad(gradients)
        gradients = self._grad_sparse_indices_deduplicate(gradients)
        lr = self.get_lr()
        # weight_decay = self.get_weight_decay()

        # if self.global_step == 0:
        # success = F.depend(True, F.assign(self.prev_gradient, gradients))

        # TODO: distributed training is currently not supported
        success = self.map_(
            F.partial(_adan_opt, self.beta1, self.beta2, self.beta3, self.eps, lr, self.weight_decay),
            ops.partial(_adan_opt, self.beta1, self.beta2, self.beta3, self.eps, lr, self.weight_decay),
            params,
            moment1,
            moment2,
            moment3,
            gradients,
            self.prev_gradient,
        )
        # params, moment1, moment2, moment3, gradients, self.prev_gradient, self.global_step)

        return success

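For anyone trying the updated optimizer locally, here is a minimal usage sketch. The import path is an assumption about where this file lives in the repository, and the toy network, data, and hyperparameters are made up for illustration.

# Minimal sketch (not part of the PR): wiring the Adan class from the diff above
# into a MindSpore training step. The module path below is a guess and may differ.
import numpy as np
import mindspore as ms
from mindspore import nn

from mindocr.optim.adan import Adan  # hypothetical import path

net = nn.Dense(16, 4)                # toy network standing in for a real model
loss_fn = nn.MSELoss()
optimizer = Adan(params=net.trainable_params(), learning_rate=1e-3, weight_decay=0.0)

def forward_fn(data, label):
    return loss_fn(net(data), label)

# Differentiate the forward function w.r.t. the optimizer's parameters.
grad_fn = ms.value_and_grad(forward_fn, None, optimizer.parameters)

def train_step(data, label):
    loss, grads = grad_fn(data, label)
    optimizer(grads)                 # runs Adan.construct(gradients)
    return loss

data = ms.Tensor(np.random.randn(8, 16), ms.float32)
label = ms.Tensor(np.random.randn(8, 4), ms.float32)
print(train_step(data, label))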
@@ -5,6 +5,7 @@ addict
matplotlib
addict
numpy
shutils

Review comment: this is not

imgaug>=0.4.0
tqdm>=4.64.1
opencv-python-headless>=3.4.18.65

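On the `shutils` entry: Python's standard-library module is spelled `shutil` and ships with the interpreter, so if that is all the code needs, no requirements entry is required. A quick check, under the assumption that only standard-library file utilities are wanted:

# Sanity check (assumption: the code only needs the standard-library shutil,
# not a third-party distribution named "shutils"). shutil imports without any
# pip install because it is part of CPython.
import shutil

print(shutil.__name__)           # -> "shutil"
print(shutil.which("python3"))   # locate an executable on PATH, one of shutil's helpers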
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if the number of samples in the evaluation set is a prime number, then the batch size will be set to 1. This can significantly increase evaluation time. Can we just set
drop_remainder
toFalse
for the evaluation set and leave the batch size as it is?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
agree. The time consumption of run batch size = 1 is usually longer than the time running with two different batch sizes, the compiling time of model due to the different batch size is negligible
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Even when we set
drop_remainder
to False, the last batch will be padded tobatch_size
, leading to an inaccurate evaluation result.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I remember it is not padded, the remainder will be output with different batch size.