Add log util (#230)
* Add log util

* Replace all occurrences of print and logging with the new logger

* Address formatting

* Disable log propagation to avoid duplicate log messages
chunyang-wen authored Jun 4, 2020
1 parent 2a1c5db commit e1ad880
Showing 16 changed files with 290 additions and 214 deletions.
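The shared `logger` imported throughout the diff comes from the new `deepspeed.pt.log_utils` module, whose own contents are not expanded in this excerpt. A minimal sketch of such a utility, assuming it exposes a module-level `logger` with a single stream handler and, per the commit message, propagation disabled to avoid duplicate records, could look like this (names such as `_create_logger` are illustrative, not taken from the commit):

```python
# Illustrative sketch of a log utility; the actual log_utils.py in this commit may differ.
import logging
import sys


def _create_logger(name="DeepSpeed", level=logging.INFO):
    """Build a named logger with one stream handler and no propagation."""
    log = logging.getLogger(name)
    log.setLevel(level)
    # Disable propagation so records are not re-emitted by the root logger,
    # which would otherwise produce duplicate log lines.
    log.propagate = False
    if not log.handlers:  # avoid stacking handlers if the module is imported twice
        handler = logging.StreamHandler(stream=sys.stdout)
        handler.setFormatter(
            logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
        log.addHandler(handler)
    return log


# Shared instance used as: from deepspeed.pt.log_utils import logger
logger = _create_logger()
```

With a singleton like this, every `logger.info(...)` call in the files below writes through one handler, and the logging level can be changed in a single place instead of hunting down `print(..., flush=True)` calls.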
12 changes: 7 additions & 5 deletions deepspeed/__init__.py
@@ -5,6 +5,7 @@
from deepspeed.pt.deepspeed_light import DeepSpeedLight
from deepspeed.pt.deepspeed_light import ADAM_OPTIMIZER, LAMB_OPTIMIZER
from deepspeed.pt.deepspeed_lr_schedules import add_tuning_arguments
+from deepspeed.pt.log_utils import logger
from deepspeed.pt.deepspeed_cuda import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
from deepspeed.pt.deepspeed_config import DeepSpeedConfig

@@ -82,11 +83,12 @@ def initialize(args,
* ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or
if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``.
"""
print("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
__version__,
__git_hash__,
__git_branch__),
flush=True)
logger.info(
"DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
__version__,
__git_hash__,
__git_branch__),
)

engine = DeepSpeedLight(args=args,
model=model,
68 changes: 37 additions & 31 deletions deepspeed/pt/deepspeed_checkpointing.py
@@ -22,6 +22,7 @@
from deepspeed.pt.deepspeed_timer import SynchronizedWallClockTimer as Timers
import torch.distributed as dist
from deepspeed.pt.deepspeed_config import DeepSpeedConfig
+from deepspeed.pt.log_utils import logger

#DeepSpeed Checkpointing Enabled or Disabled
deepspeed_checkpointing_enabled = False
@@ -58,20 +59,23 @@ def see_memory_usage(message, force=False):
return
#dist.barrier()
if dist.get_rank() == 0:
-print(message)
-print("Memory Allocated ",
-torch.cuda.memory_allocated() / (1024 * 1024 * 1024),
-"GigaBytes")
-print("Max Memory Allocated ",
-torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),
-"GigaBytes")
-print("Cache Allocated ",
-torch.cuda.memory_cached() / (1024 * 1024 * 1024),
-"GigaBytes")
-print("Max cache Allocated ",
-torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
-"GigaBytes")
-print(" ")
+logger.info(message)
+logger.info(
+"Memory Allocated %s GigaBytes",
+torch.cuda.memory_allocated() / (1024 * 1024 * 1024),
+)
+logger.info(
+"Max Memory Allocated %s GigaBytes",
+torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),
+)
+logger.info(
+"Cache Allocated %s GigaBytes",
+torch.cuda.memory_cached() / (1024 * 1024 * 1024),
+)
+logger.info(
+"Max cache Allocated %s GigaBytes",
+torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
+)
#input("Press Any Key To Continue ..")


@@ -240,15 +244,16 @@ def model_parallel_cuda_manual_seed(seed):
data_parallel_seed = seed

if torch.distributed.get_rank() == 0:
-print('> initializing model parallel cuda seeds on global rank {}, '
-'model parallel rank {}, and data parallel rank {} with '
-'model parallel seed: {} and data parallel seed: {}'.format(
-torch.distributed.get_rank(),
-mpu.get_model_parallel_rank(),
-mpu.get_data_parallel_rank(),
-model_parallel_seed,
-data_parallel_seed),
-flush=True)
+logger.info(
+'> initializing model parallel cuda seeds on global rank {}, '
+'model parallel rank {}, and data parallel rank {} with '
+'model parallel seed: {} and data parallel seed: {}'.format(
+torch.distributed.get_rank(),
+mpu.get_model_parallel_rank(),
+mpu.get_data_parallel_rank(),
+model_parallel_seed,
+data_parallel_seed),
+)
_CUDA_RNG_STATE_TRACKER.reset()
# Set the default state.
torch.cuda.manual_seed(data_parallel_seed)
@@ -348,15 +353,15 @@ def forward(ctx, run_function, *args):
if cuda_device is None:
see_memory_usage("First Forward Begining", force=True)
if dist.get_rank() == 0:
print(f"Activation Checkpointing Information")
print(
logger.info(f"Activation Checkpointing Information")
logger.info(
f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {PA_TO_CPU}"
)
print(
logger.info(
f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers"
)
print(f"----Synchronization {SYNCHRONIZE}")
print(f"----Profiling {PROFILE_TIME}")
logger.info(f"----Synchronization {SYNCHRONIZE}")
logger.info(f"----Profiling {PROFILE_TIME}")

cuda_device = torch.cuda.current_device()
transport_stream = torch.cuda.Stream(device=cuda_device)
@@ -464,7 +469,7 @@ def forward(ctx, run_function, *args):
else:
new_args.append(size)
#if dist.get_rank() == 0:
# print (f"The stored tensor is {contiguous_size} and orginal one is {size} ")
# logger.info(f"The stored tensor is {contiguous_size} and orginal one is {size} ")

ctx.save_for_backward(*new_args)
else:
@@ -562,7 +567,8 @@ def partition_activations_in_checkpoint(partition_activation):
global PARTITION_ACTIVATIONS
PARTITION_ACTIVATIONS = partition_activation
if dist.get_rank() == 0:
print(f"**************Partition Activations {PARTITION_ACTIVATIONS}************")
logger.info(
f"**************Partition Activations {PARTITION_ACTIVATIONS}************")


def set_num_layers(nlayers):
@@ -601,7 +607,7 @@ def _configure_using_config_file(deepspeed_config):
PA_TO_CPU, SYNCHRONIZE, PROFILE_TIME

config = DeepSpeedConfig(deepspeed_config).activation_checkpointing_config
-print(config.repr())
+logger.info(config.repr())
PARTITION_ACTIVATIONS = config.partition_activations
CONTIGUOUS_CHECKPOINTING = config.contiguous_memory_optimization
num_layers = config.number_checkpoints
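A detail worth noting in `deepspeed_checkpointing.py` above is that some calls switch from eager `.format()` strings to lazy `%s`-style arguments. Both styles work with the standard `logging` API; the lazy form defers string interpolation until the record is actually emitted, which avoids the formatting cost when the level is disabled. A small comparison (the value below is a placeholder; the real code reads `torch.cuda.memory_allocated()`):

```python
import logging

logger = logging.getLogger("DeepSpeed")

GB = 1024 * 1024 * 1024
allocated = 3 * GB  # placeholder; the real code queries torch.cuda.memory_allocated()

# Eager: the message string is built even if INFO records are filtered out.
logger.info("Memory Allocated {} GigaBytes".format(allocated / GB))

# Lazy: interpolation happens only if the record is actually handled.
logger.info("Memory Allocated %s GigaBytes", allocated / GB)
```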
16 changes: 8 additions & 8 deletions deepspeed/pt/deepspeed_config.py
@@ -4,13 +4,13 @@
"""

import torch
-import logging
import json
from deepspeed.pt.deepspeed_constants import *
from deepspeed.pt.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, DELAYED_SHIFT, MIN_LOSS_SCALE
from deepspeed.pt.deepspeed_config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys
from deepspeed.pt.deepspeed_zero_config import DeepSpeedZeroConfig
from deepspeed.pt.deepspeed_checkpointing_config import DeepSpeedActivationCheckpointingConfig
+from deepspeed.pt.log_utils import logger

TENSOR_CORE_ALIGN_SIZE = 8
ADAM_OPTIMIZER = 'adam'
@@ -407,7 +407,7 @@ def _set_batch_related_parameters(self):
assert False, \
'Either train_batch_size or micro_batch_per_gpu needs to be provided'

-print(
+logger.info(
f' After Train batch {self.train_batch_size} micro_batch {self.train_micro_batch_size_per_gpu} and grad_acc {self.gradient_accumulation_steps}'
)

@@ -421,13 +421,13 @@ def _do_sanity_check(self):
self._do_warning_check()

def print(self, name):
-print('{}:'.format(name), flush=True)
+logger.info('{}:'.format(name))
for arg in sorted(vars(self)):
if arg != '_param_dict':
dots = '.' * (29 - len(arg))
-print(' {} {} {}'.format(arg, dots, getattr(self, arg)), flush=True)
+logger.info(' {} {} {}'.format(arg, dots, getattr(self, arg)))

-print(' json = {}'.format(
+logger.info(' json = {}'.format(
json.dumps(self._param_dict,
sort_keys=True,
indent=4,
@@ -449,7 +449,7 @@ def _do_warning_check(self):

vocabulary_size = self._param_dict.get(VOCABULARY_SIZE, VOCABULARY_SIZE_DEFAULT)
if vocabulary_size and vocabulary_size % TENSOR_CORE_ALIGN_SIZE != 0:
-logging.warning(
+logger.warning(
"DeepSpeedConfig: vocabulary size {} is not aligned to {}, may import tensor core utilization."
.format(vocabulary_size,
TENSOR_CORE_ALIGN_SIZE))
@@ -458,12 +458,12 @@
MAX_GRAD_NORM in self.optimizer_params.keys() and \
self.optimizer_params[MAX_GRAD_NORM] > 0:
if fp16_enabled:
-logging.warning(
+logger.warning(
'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper'
.format(MAX_GRAD_NORM,
self.optimizer_params[MAX_GRAD_NORM]))
else:
-logging.warning(
+logger.warning(
'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero'
.format(self.optimizer_params[MAX_GRAD_NORM]))
self.optimizer_params[MAX_GRAD_NORM] = 0.0
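In `deepspeed_config.py`, warnings previously sent through the root logger via `logging.warning(...)` now go to the shared named logger. The practical difference, sketched under the assumption that `log_utils` attaches its own handler and sets `propagate = False`, is that each warning is handled exactly once by the DeepSpeed handler rather than by the root logger's default configuration:

```python
import logging
import sys

# Root-logger path: logging.warning() implicitly calls basicConfig() if the
# root logger has no handlers, then emits through a default stderr handler.
logging.warning("vocabulary size %d is not aligned to %d", 50000, 8)

# Named-logger path (as in the new log_utils): the record is handled once,
# by the handler attached to the "DeepSpeed" logger, and never propagates up.
ds_logger = logging.getLogger("DeepSpeed")
ds_logger.propagate = False
if not ds_logger.handlers:
    ds_logger.addHandler(logging.StreamHandler(stream=sys.stdout))
ds_logger.warning("vocabulary size %d is not aligned to %d", 50000, 8)
```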
24 changes: 13 additions & 11 deletions deepspeed/pt/deepspeed_launch.py
@@ -10,6 +10,8 @@
from collections import defaultdict
from argparse import ArgumentParser, REMAINDER

+from deepspeed.pt.log_utils import logger
+

def parse_args():
parser = ArgumentParser(description="DeepSpeed distributed training launch"
@@ -59,24 +61,24 @@ def main():

for k in current_env.keys():
if "NCCL" in k:
-print(args.node_rank, k, current_env[k], flush=True)
+logger.info("%s %s %s", args.node_rank, k, current_env[k])

world_info = None
assert args.world_info != "None", "must provide world info dict"
world_info = base64.urlsafe_b64decode(args.world_info)
world_info = json.loads(world_info)

print("WORLD INFO DICT: {}".format(world_info), flush=True)
logger.info("WORLD INFO DICT: {}".format(world_info))
node_list = list(world_info.keys())
args.nnodes = len(node_list)
local_node = node_list[args.node_rank]
local_gpu_ids = world_info[local_node]
num_local_procs = len(local_gpu_ids)
print("nnodes={}, num_local_procs={}, node_rank={}".format(
args.nnodes,
num_local_procs,
args.node_rank),
flush=True)
logger.info(
"nnodes={}, num_local_procs={}, node_rank={}".format(args.nnodes,
num_local_procs,
args.node_rank),
)

global_rank_mapping = defaultdict(list)
curr_global_rank = 0
@@ -87,11 +89,11 @@
for gid in gids:
global_rank_mapping[node_id].append(curr_global_rank)
curr_global_rank += 1
print("global_rank_mapping={}".format(global_rank_mapping), flush=True)
print("dist_world_size={}".format(dist_world_size), flush=True)
logger.info("global_rank_mapping={}".format(global_rank_mapping))
logger.info("dist_world_size={}".format(dist_world_size))
current_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, local_gpu_ids))
print("Setting CUDA_VISIBLE_DEVICES={}".format(current_env["CUDA_VISIBLE_DEVICES"]),
flush=True)
logger.info("Setting CUDA_VISIBLE_DEVICES={}".format(
current_env["CUDA_VISIBLE_DEVICES"]))
exclusion_counts_per_node = None

# set PyTorch distributed related environmental variables
(diffs for the remaining 12 changed files, including the new deepspeed/pt/log_utils.py, are not loaded in this excerpt)
