Skip to content

add logger module #180

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions configs/det/dbnet/db_r50_icdar15.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ system:
distribute: False
amp_level: 'O0'
seed: 42
log_interval: 10
val_while_train: True
drop_overflow_update: False

Expand Down
1 change: 1 addition & 0 deletions configs/det/dbnet/db_r50_synthtext.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ system:
distribute: True
amp_level: 'O0'
seed: 42
log_interval: 100
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

与默认值相同,可删去。

val_while_train: False
drop_overflow_update: False

Expand Down
1 change: 1 addition & 0 deletions configs/rec/crnn/crnn_icdar15.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ system:
distribute: False
amp_level: 'O0'
seed: 42
log_interval: 100
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

与默认值相同,可删去。

val_while_train: True
drop_overflow_update: True

Expand Down
1 change: 1 addition & 0 deletions configs/rec/crnn/crnn_resnet34.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ system:
distribute: True
amp_level: 'O3'
seed: 42
log_interval: 100
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

与默认值相同,可删去。

val_while_train: True
drop_overflow_update: False

Expand Down
1 change: 1 addition & 0 deletions configs/rec/crnn/crnn_vgg7.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ system:
distribute: True
amp_level: 'O3'
seed: 42
log_interval: 100
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

与默认值相同,可删去。

val_while_train: True
drop_overflow_update: False

Expand Down
141 changes: 46 additions & 95 deletions mindocr/utils/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class Evaluator:
Args:
metric:
"""

def __init__(self, network, loss_fn=None, postprocessor=None, metrics=None, visualize=False, verbose=False,
**kwargs):
self.net = network
Expand All @@ -27,7 +28,8 @@ def __init__(self, network, loss_fn=None, postprocessor=None, metrics=None, visu
self.metrics = metrics if isinstance(metrics, List) else [metrics]
self.metric_names = []
for m in metrics:
assert hasattr(m, 'metric_names') and isinstance(m.metric_names, List), f'Metric object must contain `metric_names` attribute to indicate the metric names as a List type, but not found in {m.__class__.__name__}'
assert hasattr(m, 'metric_names') and isinstance(m.metric_names,
List), f'Metric object must contain `metric_names` attribute to indicate the metric names as a List type, but not found in {m.__class__.__name__}'
self.metric_names += m.metric_names

self.visualize = visualize
Expand Down Expand Up @@ -110,24 +112,31 @@ class EvalSaveCallback(Callback):
loader (Dataset): dataloader
saving_config (dict):
"""

def __init__(self,
network,
loader=None,
loss_fn=None,
postprocessor=None,
metrics=None,
rank_id=0,
logger=None,
batch_size=20,
ckpt_save_dir='./',
main_indicator='hmean',
val_interval=1,
val_start_epoch=1,
log_interval=1,
):
self.rank_id = rank_id
self.is_main_device = rank_id in [0, None]
self.loader_eval = loader
self.network = network
self.logger = print if logger is None else logger.info
self.val_interval = val_interval
self.val_start_epoch = val_start_epoch
self.log_interval = log_interval
self.batch_size = batch_size
if self.loader_eval is not None:
self.net_evaluator = Evaluator(network, loss_fn, postprocessor, metrics)
self.main_indicator = main_indicator
Expand All @@ -140,9 +149,9 @@ def __init__(self,
if not os.path.exists(ckpt_save_dir):
os.makedirs(ckpt_save_dir)

self._losses = list()
self.last_epoch_end_time = time.time()

def on_train_step_begin(self, run_context):
self.epoch_start_time = time.time()
self.step_start_time = time.time()

def on_train_step_end(self, run_context):
Expand All @@ -161,14 +170,28 @@ def on_train_step_end(self, run_context):
# TODO: need to stop gradient here ?
self._losses.append(loss.asnumpy())

if not data_sink_mode and cur_step_in_epoch % self.log_interval == 0:
opt = cb_params.train_network.optimizer
learning_rate = opt.learning_rate
cur_lr = learning_rate(opt.global_step - 1).asnumpy()
per_step_time = (time.time() - self.step_start_time) * 1000 / self.log_interval
fps = self.batch_size * 1000 / per_step_time
loss = np.average(self._losses)
msg = "epoch: [%s/%s] step: [%s/%s], loss: %.6f, lr: %.6f, per step time: %.3f ms, fps: %.2f img/s" % (
cur_epoch, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num,
loss, cur_lr, per_step_time, fps)
self.logger(msg)
self.step_start_time = time.time()

def on_train_epoch_begin(self, run_context):
"""
Called before each epoch beginning.
Args:
run_context (RunContext): Include some information of the model.
"""
self._losses = []
self._losses.clear()
self.epoch_start_time = time.time()
self.step_start_time = time.time()

def on_train_epoch_end(self, run_context):
"""
Expand All @@ -181,23 +204,25 @@ def on_train_epoch_end(self, run_context):
loss = cb_params.net_outputs
cur_epoch = cb_params.cur_epoch_num
train_time = (time.time() - self.epoch_start_time)
train_loss = np.average(self._losses) # TODO: aggregate training loss for multiple cards
train_loss = np.average(self._losses) # TODO: aggregate training loss for multiple cards

print(
f"Epoch: {cur_epoch}, "
f"loss:{train_loss:.5f}, training time:{train_time:.3f}s"
)
epoch_time = (time.time() - self.epoch_start_time)
per_step_time = epoch_time * 1000 / cb_params.batch_num
fps = 1000 * self.batch_size / per_step_time
msg = "epoch: [%s/%s], loss: %.6f, epoch time: %.3f s, per step time: %.3f ms, fps: %.2f img/s" % (
cur_epoch, cb_params.epoch_num, train_loss, epoch_time, per_step_time, fps)
self.logger(msg)

eval_done = False
if (self.loader_eval is not None):
if self.loader_eval is not None:
if cur_epoch >= self.val_start_epoch and (cur_epoch - self.val_start_epoch) % self.val_interval == 0:
eval_start = time.time()
measures = self.net_evaluator.eval(self.loader_eval)
eval_done = True
if self.is_main_device:
perf = measures[self.main_indicator]
eval_time = time.time() - eval_start
print(f'Performance: {measures}, eval time: {eval_time}')
self.logger(f'Performance: {measures}, eval time: {eval_time}')
else:
measures = {m_name: None for m_name in self.net_evaluator.metric_names}
eval_time = 0
Expand All @@ -209,104 +234,30 @@ def on_train_epoch_end(self, run_context):
if self.is_main_device:
# save best models
if (self.main_indicator == 'train_loss' and perf < self.best_perf) \
or (self.main_indicator != 'train_loss' and eval_done and perf > self.best_perf): # when val_while_train enabled, only find best checkpoint after eval done.
self.best_perf = perf
save_checkpoint(self.network, os.path.join(self.ckpt_save_dir, 'best.ckpt'))
print(f'=> Best {self.main_indicator}: {self.best_perf}, checkpoint saved.')
or (
self.main_indicator != 'train_loss' and eval_done and perf > self.best_perf): # when val_while_train enabled, only find best checkpoint after eval done.
self.best_perf = perf
save_checkpoint(self.network, os.path.join(self.ckpt_save_dir, 'best.ckpt'))
self.logger(f'=> Best {self.main_indicator}: {self.best_perf}, checkpoint saved.')

# record results
if cur_epoch == 1:
if self.loader_eval is not None:
perf_columns = ['loss'] + list(measures.keys()) + ['train_time', 'eval_time']
else:
perf_columns = ['loss', 'train_time']
self.rec = PerfRecorder(self.ckpt_save_dir, metric_names=perf_columns) # record column names
perf_columns = ['loss', 'train_time']
self.rec = PerfRecorder(self.ckpt_save_dir, metric_names=perf_columns) # record column names

if self.loader_eval is not None:
epoch_perf_values = [cur_epoch, train_loss] + list(measures.values()) + [train_time, eval_time]
else:
epoch_perf_values = [cur_epoch, train_loss, train_time]
self.rec.add(*epoch_perf_values) # record column values
self.rec.add(*epoch_perf_values) # record column values

tot_time = time.time()-self.last_epoch_end_time
tot_time = time.time() - self.last_epoch_end_time
self.last_epoch_end_time = time.time()
print(f'Total time cost for epoch {cur_epoch}: {tot_time}')

def on_train_end(self, run_context):
if self.is_main_device:
self.rec.save_curves() # save performance curve figure
print(f'=> Best {self.main_indicator}: {self.best_perf} \nTraining completed!')


class LossCallBack(Callback):
    """
    Monitor the loss in training.

    If the loss is NaN or Inf, training is terminated with a ValueError.

    Args:
        epoch_size: total number of training epochs (used in log messages).
        batch_size: samples processed per step, used to compute fps.
        logger: logger object exposing an ``info`` method.
        per_print_times: log a progress message every N steps.
        global_steps: initial global step count (non-zero when resuming).
    """

    def __init__(self, epoch_size, batch_size, logger, per_print_times=1, global_steps=0):
        super(LossCallBack, self).__init__()
        self.epoch_size = epoch_size
        self.batch_size = batch_size
        self.logger = logger
        self.global_steps = global_steps
        self._per_print_times = per_print_times
        self.step_start_time = time.time()
        self.epoch_start_time = time.time()

    def _handle_loss(self, net_outputs):
        """Extract a scalar loss from the network outputs.

        Returns None when the outputs match no recognized shape (the previous
        version left ``loss`` unbound in that case, raising UnboundLocalError).
        """
        loss = None
        if isinstance(net_outputs, (tuple, list)):
            if isinstance(net_outputs[0], ms.Tensor) and isinstance(net_outputs[0].asnumpy(), np.ndarray):
                loss = net_outputs[0].asnumpy()
        elif isinstance(net_outputs, ms.Tensor) and isinstance(net_outputs.asnumpy(), np.ndarray):
            # fixed typo: previously called the non-existent `asumpy()`
            loss = float(np.mean(net_outputs.asnumpy()))
        return loss

    def on_train_epoch_begin(self, run_context):
        """Reset the epoch and step timers at the start of each epoch."""
        self.epoch_start_time = time.time()
        self.step_start_time = time.time()

    def on_train_step_end(self, run_context):
        """Check the loss for NaN/Inf and periodically log step statistics."""
        cb_params = run_context.original_args()
        loss = self._handle_loss(cb_params.net_outputs)
        # assumes the second network output carries the overflow flag — TODO confirm with the train step cell
        overflow = str(cb_params.net_outputs[1].asnumpy())
        cur_epoch = cb_params.cur_epoch_num
        data_sink_mode = cb_params.get('dataset_sink_mode', False)
        if not data_sink_mode:
            cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
            if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
                raise ValueError(
                    "epoch: {} step: {}. Invalid loss, terminating training.".format(cur_epoch, cur_step_in_epoch))

            if cur_step_in_epoch % self._per_print_times == 0:
                per_step_time = (time.time() - self.step_start_time) * 1000 / self._per_print_times
                fps = self.batch_size * 1000 / per_step_time
                lr = cb_params.train_network.optimizer.learning_rate
                cur_lr = lr(ms.Tensor(self.global_steps)).asnumpy()
                msg = "epoch: [%s/%s] step: [%s/%s], loss: %.6f, overflow: %s, lr: %.6f, per step time: %.3f ms, fps: %.2f img/s" % (
                    cur_epoch, self.epoch_size, cur_step_in_epoch, cb_params.batch_num,
                    loss, overflow, cur_lr, per_step_time, fps)
                self.logger.info(msg)
                self.step_start_time = time.time()
        # NOTE(review): advanced once per step so the lr lookup above stays
        # aligned with the optimizer's global step — confirm placement vs. sink mode
        self.global_steps += 1

    def on_train_epoch_end(self, run_context):
        """Log epoch-level loss and throughput statistics."""
        cb_params = run_context.original_args()
        loss = self._handle_loss(cb_params.net_outputs)
        cur_epoch_num = cb_params.cur_epoch_num
        epoch_time = time.time() - self.epoch_start_time
        per_step_time = epoch_time * 1000 / cb_params.batch_num
        fps = 1000 * self.batch_size / per_step_time
        msg = 'epoch: [%s/%s] loss: %.6f, epoch time: %.3f s, per step time: %.3f ms, fps: %.2f' % (
            cur_epoch_num, self.epoch_size, loss, epoch_time, per_step_time, fps)
        self.logger.info(msg)


class ResumeCallback(Callback):
    """Callback that offsets the reported epoch counter when training
    resumes from a checkpoint, so logs continue from the correct epoch."""

    def __init__(self, start_epoch_num=0):
        super(ResumeCallback, self).__init__()
        # number of epochs already completed before this run started
        self.start_epoch_num = start_epoch_num

    def on_train_epoch_begin(self, run_context):
        # shift the framework's epoch number by the resume offset
        run_context.original_args().cur_epoch_num += self.start_epoch_num
self.logger(f'=> Best {self.main_indicator}: {self.best_perf} \nTraining completed!')
65 changes: 65 additions & 0 deletions mindocr/utils/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""Custom Logger."""
import os
import sys
import logging


class Logger(logging.Logger):
    """
    Custom logger that writes INFO-level records to stdout on the main
    device and, once ``setup_logging_file`` is called, to a per-rank file.

    Args:
        logger_name: String. Logger name.
        rank: Integer. Rank id of the current device; used to name the log file.
        is_main_device: Bool. If True, attach a stdout handler so this
            process also logs to the console.
        log_fn: Optional initial log-file path; overwritten by
            ``setup_logging_file``.
    """
    def __init__(self, logger_name, rank=0, is_main_device=False, log_fn=None):
        super().__init__(logger_name)
        self.rank = rank
        self.log_fn = log_fn
        if is_main_device:
            console = logging.StreamHandler(sys.stdout)
            console.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
            console.setFormatter(formatter)
            self.addHandler(console)

    def setup_logging_file(self, log_dir):
        """Attach a FileHandler writing to ``log_dir/log_<rank>.txt``,
        creating the directory if needed."""
        # exist_ok=True makes a prior os.path.exists check unnecessary (and race-free)
        os.makedirs(log_dir, exist_ok=True)
        log_name = 'log_%s.txt' % self.rank
        self.log_fn = os.path.join(log_dir, log_name)
        fh = logging.FileHandler(self.log_fn)
        fh.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
        fh.setFormatter(formatter)
        self.addHandler(fh)

    def info(self, msg, *args, **kwargs):
        """Log *msg* at INFO level (mirrors the base-class implementation)."""
        if self.isEnabledFor(logging.INFO):
            self._log(logging.INFO, msg, args, **kwargs)

    def save_args(self, args):
        """Log every attribute of an argparse-style namespace, one per line."""
        self.info('Args:')
        args_dict = vars(args)
        for key in args_dict.keys():
            self.info('--> %s: %s', key, args_dict[key])
        self.info('')

    def important_info(self, msg, *args, **kwargs):
        """Log *msg* wrapped in an eye-catching asterisk banner (rank 0 only)."""
        if self.isEnabledFor(logging.INFO) and self.rank == 0:
            line_width = 2
            important_msg = '\n'
            important_msg += ('*'*70 + '\n')*line_width
            important_msg += ('*'*line_width + '\n')*2
            important_msg += '*'*line_width + ' '*8 + msg + '\n'
            important_msg += ('*'*line_width + '\n')*2
            important_msg += ('*'*70 + '\n')*line_width
            self.info(important_msg, *args, **kwargs)


def get_logger(log_dir, rank, is_main_device):
    """Build the 'mindocr' logger for this rank and point its file output at *log_dir*."""
    mindocr_logger = Logger('mindocr', rank, is_main_device)
    mindocr_logger.setup_logging_file(log_dir)
    return mindocr_logger
21 changes: 21 additions & 0 deletions tests/ut/test_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import sys
sys.path.append('.')

import pytest
import yaml
from addict import Dict
from mindocr.utils.logger import get_logger


@pytest.mark.parametrize('task', ['det', 'rec'])
def test_logger(task):
    """Smoke-test get_logger against the det and rec sample configs."""
    # Mapping raises KeyError on an unknown task instead of silently leaving
    # config_fp unbound as the old if/elif chain could.
    config_fps = {
        'det': 'configs/det/dbnet/db_r50_icdar15.yaml',
        'rec': 'configs/rec/crnn/crnn_icdar15.yaml',
    }
    config_fp = config_fps[task]

    with open(config_fp) as fp:
        cfg = Dict(yaml.safe_load(fp))

    logger = get_logger(cfg.train.ckpt_save_dir, rank=0, is_main_device=True)
    # main device gets a console handler plus the per-rank file handler
    assert logger is not None
    assert len(logger.handlers) >= 1
Loading