Add loss average meter #285
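This PR replaces the list-based loss tracking in `EvalSaveCallback` (`mindocr/utils/callbacks.py`) with a new `AverageMeter` (`mindocr/utils/misc.py`): each training step folds its reduced loss into a running sum and count, step logs report the latest loss value, and the epoch summary reports the running average, without storing one Tensor per step.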

Merged: 1 commit, May 12, 2023
11 changes: 6 additions & 5 deletions mindocr/utils/callbacks.py
@@ -11,6 +11,7 @@
 from mindspore.train.callback._callback import Callback, _handle_loss
 from .visualize import draw_bboxes, show_imgs, recover_image
 from .recorder import PerfRecorder
+from .misc import AverageMeter

 __all__ = ['Evaluator', 'EvalSaveCallback']

@@ -207,7 +208,7 @@ def __init__(self,
         self.epoch_start_time = time.time()
         self.step_start_time = time.time()

-        self._losses = []
+        self._loss_avg_meter = AverageMeter()

         self._reduce_sum = ms.ops.AllReduce()
         self._device_num = device_num
@@ -231,15 +232,15 @@ def on_train_step_end(self, run_context):
         data_sink_mode = cb_params.dataset_sink_mode
         cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1

-        self._losses.append(self._loss_reduce(loss))
+        self._loss_avg_meter.update(self._loss_reduce(loss))

         if not data_sink_mode and cur_step_in_epoch % self.log_interval == 0:
             opt = cb_params.train_network.optimizer
             learning_rate = opt.learning_rate
             cur_lr = learning_rate(opt.global_step - 1).asnumpy().squeeze()
             per_step_time = (time.time() - self.step_start_time) * 1000 / self.log_interval
             fps = self.batch_size * 1000 / per_step_time
-            loss = self._losses[-1].asnumpy()
+            loss = self._loss_avg_meter.val.asnumpy()
             msg = f"epoch: [{cur_epoch}/{cb_params.epoch_num}] step: [{cur_step_in_epoch}/{cb_params.batch_num}], " \
                   f"loss: {loss:.6f}, lr: {cur_lr:.6f}, per step time: {per_step_time:.3f} ms, fps: {fps:.2f} img/s"
             self.logger(msg)
@@ -251,7 +252,7 @@ def on_train_epoch_begin(self, run_context):
        Args:
            run_context (RunContext): Include some information of the model.
        """
-        self._losses.clear()
+        self._loss_avg_meter.reset()
        self.epoch_start_time = time.time()
        self.step_start_time = time.time()

@@ -265,7 +266,7 @@ def on_train_epoch_end(self, run_context):
         cb_params = run_context.original_args()
         cur_epoch = cb_params.cur_epoch_num
         train_time = (time.time() - self.epoch_start_time)
-        train_loss = ms.ops.stack(self._losses).mean().asnumpy()
+        train_loss = self._loss_avg_meter.avg.asnumpy()

         epoch_time = (time.time() - self.epoch_start_time)
         per_step_time = epoch_time * 1000 / cb_params.batch_num
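For context, here is a minimal sketch (not part of the PR; it assumes `mindocr` is importable, otherwise paste in the `AverageMeter` class from the new file below) showing that the running average kept by `AverageMeter` matches the mean previously computed over the stored list, while holding only a few scalars instead of one Tensor per step:

```python
import mindspore as ms
from mindspore import Tensor

from mindocr.utils.misc import AverageMeter  # added by this PR

losses = [Tensor(x, dtype=ms.float32) for x in (0.9, 0.7, 0.5)]

# Before: every step's loss was kept and reduced at epoch end.
old_epoch_loss = ms.ops.stack(losses).mean().asnumpy()

# After: each step folds its loss into a running sum/count.
meter = AverageMeter()
for loss in losses:
    meter.update(loss)

# The callback calls update() with the default n=1 per step, so the
# running average equals the old stack-and-mean result.
assert abs(float(meter.avg.asnumpy()) - float(old_epoch_loss)) < 1e-6
print(meter.val.asnumpy())  # 0.5, latest step loss (used in step logs)
print(meter.avg.asnumpy())  # 0.7, running mean (used in the epoch summary)
```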
21 changes: 21 additions & 0 deletions mindocr/utils/misc.py
@@ -0,0 +1,21 @@
+import mindspore as ms
+from mindspore import Tensor
+
+
+class AverageMeter:
+    """Computes and stores the average and current value"""
+
+    def __init__(self) -> None:
+        self.reset()
+
+    def reset(self) -> None:
+        self.val = Tensor(0.0, dtype=ms.float32)
+        self.avg = Tensor(0.0, dtype=ms.float32)
+        self.sum = Tensor(0.0, dtype=ms.float32)
+        self.count = Tensor(0.0, dtype=ms.float32)
+
+    def update(self, val: Tensor, n: int = 1) -> None:
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
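Illustrative usage of the new class (not from the PR): the `n` argument weights each update, which matters when the tracked values are per-sample means over batches of different sizes.

```python
import mindspore as ms
from mindspore import Tensor

from mindocr.utils.misc import AverageMeter  # the class added above

meter = AverageMeter()
meter.update(Tensor(0.8, dtype=ms.float32), n=32)  # mean loss over a 32-sample batch
meter.update(Tensor(0.6, dtype=ms.float32), n=16)  # mean loss over a 16-sample batch

print(meter.val)    # 0.6, the most recent value
print(meter.count)  # 48.0
print(meter.avg)    # (0.8*32 + 0.6*16) / 48 ≈ 0.7333, the sample-weighted mean
```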