Add log util (#230)
* Add log util

* Replace all occurrences of print and logging with the new logger

* Address formatting

* Disable log propagation to avoid duplicate log messages
chunyang-wen authored Jun 4, 2020
1 parent 2a1c5db commit e1ad880
Showing 16 changed files with 290 additions and 214 deletions.
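The shared `logger` imported throughout the diff comes from the new `deepspeed.pt.log_utils` module, whose own contents are not expanded in this excerpt. A minimal sketch of such a utility, assuming it exposes a module-level `logger` with a single stream handler and, per the commit message, propagation disabled to avoid duplicate records, could look like this (names such as `_create_logger` are illustrative, not taken from the commit):

```python
# Illustrative sketch of a log utility; the actual log_utils.py in this commit may differ.
import logging
import sys


def _create_logger(name="DeepSpeed", level=logging.INFO):
    """Build a named logger with one stream handler and no propagation."""
    log = logging.getLogger(name)
    log.setLevel(level)
    # Disable propagation so records are not re-emitted by the root logger,
    # which would otherwise produce duplicate log lines.
    log.propagate = False
    if not log.handlers:  # avoid stacking handlers if the module is imported twice
        handler = logging.StreamHandler(stream=sys.stdout)
        handler.setFormatter(
            logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
        log.addHandler(handler)
    return log


# Shared instance used as: from deepspeed.pt.log_utils import logger
logger = _create_logger()
```

With a singleton like this, every `logger.info(...)` call in the files below writes through one handler, and the logging level can be changed in a single place instead of hunting down `print(..., flush=True)` calls.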
12 changes: 7 additions & 5 deletions deepspeed/__init__.py
@@ -5,6 +5,7 @@
from deepspeed.pt.deepspeed_light import DeepSpeedLight
from deepspeed.pt.deepspeed_light import ADAM_OPTIMIZER, LAMB_OPTIMIZER
from deepspeed.pt.deepspeed_lr_schedules import add_tuning_arguments
+from deepspeed.pt.log_utils import logger
from deepspeed.pt.deepspeed_cuda import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
from deepspeed.pt.deepspeed_config import DeepSpeedConfig

@@ -82,11 +83,12 @@ def initialize(args,
* ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or
if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``.
"""
print("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
__version__,
__git_hash__,
__git_branch__),
flush=True)
logger.info(
"DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
__version__,
__git_hash__,
__git_branch__),
)

engine = DeepSpeedLight(args=args,
model=model,
68 changes: 37 additions & 31 deletions deepspeed/pt/deepspeed_checkpointing.py
@@ -22,6 +22,7 @@
from deepspeed.pt.deepspeed_timer import SynchronizedWallClockTimer as Timers
import torch.distributed as dist
from deepspeed.pt.deepspeed_config import DeepSpeedConfig
+from deepspeed.pt.log_utils import logger

#DeepSpeed Checkpointing Enabled or Disabled
deepspeed_checkpointing_enabled = False
@@ -58,20 +59,23 @@ def see_memory_usage(message, force=False):
return
#dist.barrier()
if dist.get_rank() == 0:
-print(message)
-print("Memory Allocated ",
-torch.cuda.memory_allocated() / (1024 * 1024 * 1024),
-"GigaBytes")
-print("Max Memory Allocated ",
-torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),
-"GigaBytes")
-print("Cache Allocated ",
-torch.cuda.memory_cached() / (1024 * 1024 * 1024),
-"GigaBytes")
-print("Max cache Allocated ",
-torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
-"GigaBytes")
-print(" ")
+logger.info(message)
+logger.info(
+"Memory Allocated %s GigaBytes",
+torch.cuda.memory_allocated() / (1024 * 1024 * 1024),
+)
+logger.info(
+"Max Memory Allocated %s GigaBytes",
+torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),
+)
+logger.info(
+"Cache Allocated %s GigaBytes",
+torch.cuda.memory_cached() / (1024 * 1024 * 1024),
+)
+logger.info(
+"Max cache Allocated %s GigaBytes",
+torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
+)
#input("Press Any Key To Continue ..")


@@ -240,15 +244,16 @@ def model_parallel_cuda_manual_seed(seed):
data_parallel_seed = seed

if torch.distributed.get_rank() == 0:
-print('> initializing model parallel cuda seeds on global rank {}, '
-'model parallel rank {}, and data parallel rank {} with '
-'model parallel seed: {} and data parallel seed: {}'.format(
-torch.distributed.get_rank(),
-mpu.get_model_parallel_rank(),
-mpu.get_data_parallel_rank(),
-model_parallel_seed,
-data_parallel_seed),
-flush=True)
+logger.info(
+'> initializing model parallel cuda seeds on global rank {}, '
+'model parallel rank {}, and data parallel rank {} with '
+'model parallel seed: {} and data parallel seed: {}'.format(
+torch.distributed.get_rank(),
+mpu.get_model_parallel_rank(),
+mpu.get_data_parallel_rank(),
+model_parallel_seed,
+data_parallel_seed),
+)
_CUDA_RNG_STATE_TRACKER.reset()
# Set the default state.
torch.cuda.manual_seed(data_parallel_seed)
@@ -348,15 +353,15 @@ def forward(ctx, run_function, *args):
if cuda_device is None:
see_memory_usage("First Forward Begining", force=True)
if dist.get_rank() == 0:
print(f"Activation Checkpointing Information")
print(
logger.info(f"Activation Checkpointing Information")
logger.info(
f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {PA_TO_CPU}"
)
print(
logger.info(
f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers"
)
print(f"----Synchronization {SYNCHRONIZE}")
print(f"----Profiling {PROFILE_TIME}")
logger.info(f"----Synchronization {SYNCHRONIZE}")
logger.info(f"----Profiling {PROFILE_TIME}")

cuda_device = torch.cuda.current_device()
transport_stream = torch.cuda.Stream(device=cuda_device)
@@ -464,7 +469,7 @@ def forward(ctx, run_function, *args):
else:
new_args.append(size)
#if dist.get_rank() == 0:
# print (f"The stored tensor is {contiguous_size} and orginal one is {size} ")
# logger.info(f"The stored tensor is {contiguous_size} and orginal one is {size} ")

ctx.save_for_backward(*new_args)
else:
@@ -562,7 +567,8 @@ def partition_activations_in_checkpoint(partition_activation):
global PARTITION_ACTIVATIONS
PARTITION_ACTIVATIONS = partition_activation
if dist.get_rank() == 0:
print(f"**************Partition Activations {PARTITION_ACTIVATIONS}************")
logger.info(
f"**************Partition Activations {PARTITION_ACTIVATIONS}************")


def set_num_layers(nlayers):
@@ -601,7 +607,7 @@ def _configure_using_config_file(deepspeed_config):
PA_TO_CPU, SYNCHRONIZE, PROFILE_TIME

config = DeepSpeedConfig(deepspeed_config).activation_checkpointing_config
-print(config.repr())
+logger.info(config.repr())
PARTITION_ACTIVATIONS = config.partition_activations
CONTIGUOUS_CHECKPOINTING = config.contiguous_memory_optimization
num_layers = config.number_checkpoints
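A detail worth noting in `deepspeed_checkpointing.py` above is that some calls switch from eager `.format()` strings to lazy `%s`-style arguments. Both styles work with the standard `logging` API; the lazy form defers string interpolation until the record is actually emitted, which avoids the formatting cost when the level is disabled. A small comparison (the value below is a placeholder; the real code reads `torch.cuda.memory_allocated()`):

```python
import logging

logger = logging.getLogger("DeepSpeed")

GB = 1024 * 1024 * 1024
allocated = 3 * GB  # placeholder; the real code queries torch.cuda.memory_allocated()

# Eager: the message string is built even if INFO records are filtered out.
logger.info("Memory Allocated {} GigaBytes".format(allocated / GB))

# Lazy: interpolation happens only if the record is actually handled.
logger.info("Memory Allocated %s GigaBytes", allocated / GB)
```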
16 changes: 8 additions & 8 deletions deepspeed/pt/deepspeed_config.py
@@ -4,13 +4,13 @@
"""

import torch
-import logging
import json
from deepspeed.pt.deepspeed_constants import *
from deepspeed.pt.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, DELAYED_SHIFT, MIN_LOSS_SCALE
from deepspeed.pt.deepspeed_config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys
from deepspeed.pt.deepspeed_zero_config import DeepSpeedZeroConfig
from deepspeed.pt.deepspeed_checkpointing_config import DeepSpeedActivationCheckpointingConfig
+from deepspeed.pt.log_utils import logger

TENSOR_CORE_ALIGN_SIZE = 8
ADAM_OPTIMIZER = 'adam'
@@ -407,7 +407,7 @@ def _set_batch_related_parameters(self):
assert False, \
'Either train_batch_size or micro_batch_per_gpu needs to be provided'

-print(
+logger.info(
f' After Train batch {self.train_batch_size} micro_batch {self.train_micro_batch_size_per_gpu} and grad_acc {self.gradient_accumulation_steps}'
)

@@ -421,13 +421,13 @@ def _do_sanity_check(self):
self._do_warning_check()

def print(self, name):
-print('{}:'.format(name), flush=True)
+logger.info('{}:'.format(name))
for arg in sorted(vars(self)):
if arg != '_param_dict':
dots = '.' * (29 - len(arg))
-print(' {} {} {}'.format(arg, dots, getattr(self, arg)), flush=True)
+logger.info(' {} {} {}'.format(arg, dots, getattr(self, arg)))

-print(' json = {}'.format(
+logger.info(' json = {}'.format(
json.dumps(self._param_dict,
sort_keys=True,
indent=4,
@@ -449,7 +449,7 @@ def _do_warning_check(self):

vocabulary_size = self._param_dict.get(VOCABULARY_SIZE, VOCABULARY_SIZE_DEFAULT)
if vocabulary_size and vocabulary_size % TENSOR_CORE_ALIGN_SIZE != 0:
-logging.warning(
+logger.warning(
"DeepSpeedConfig: vocabulary size {} is not aligned to {}, may import tensor core utilization."
.format(vocabulary_size,
TENSOR_CORE_ALIGN_SIZE))
@@ -458,12 +458,12 @@
MAX_GRAD_NORM in self.optimizer_params.keys() and \
self.optimizer_params[MAX_GRAD_NORM] > 0:
if fp16_enabled:
-logging.warning(
+logger.warning(
'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper'
.format(MAX_GRAD_NORM,
self.optimizer_params[MAX_GRAD_NORM]))
else:
-logging.warning(
+logger.warning(
'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero'
.format(self.optimizer_params[MAX_GRAD_NORM]))
self.optimizer_params[MAX_GRAD_NORM] = 0.0
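In `deepspeed_config.py`, warnings previously sent through the root logger via `logging.warning(...)` now go to the shared named logger. The practical difference, sketched under the assumption that `log_utils` attaches its own handler and sets `propagate = False`, is that each warning is handled exactly once by the DeepSpeed handler rather than by the root logger's default configuration:

```python
import logging
import sys

# Root-logger path: logging.warning() implicitly calls basicConfig() if the
# root logger has no handlers, then emits through a default stderr handler.
logging.warning("vocabulary size %d is not aligned to %d", 50000, 8)

# Named-logger path (as in the new log_utils): the record is handled once,
# by the handler attached to the "DeepSpeed" logger, and never propagates up.
ds_logger = logging.getLogger("DeepSpeed")
ds_logger.propagate = False
if not ds_logger.handlers:
    ds_logger.addHandler(logging.StreamHandler(stream=sys.stdout))
ds_logger.warning("vocabulary size %d is not aligned to %d", 50000, 8)
```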
24 changes: 13 additions & 11 deletions deepspeed/pt/deepspeed_launch.py
@@ -10,6 +10,8 @@
from collections import defaultdict
from argparse import ArgumentParser, REMAINDER

+from deepspeed.pt.log_utils import logger
+

def parse_args():
parser = ArgumentParser(description="DeepSpeed distributed training launch"
@@ -59,24 +61,24 @@ def main():

for k in current_env.keys():
if "NCCL" in k:
-print(args.node_rank, k, current_env[k], flush=True)
+logger.info("%s %s %s", args.node_rank, k, current_env[k])

world_info = None
assert args.world_info != "None", "must provide world info dict"
world_info = base64.urlsafe_b64decode(args.world_info)
world_info = json.loads(world_info)

print("WORLD INFO DICT: {}".format(world_info), flush=True)
logger.info("WORLD INFO DICT: {}".format(world_info))
node_list = list(world_info.keys())
args.nnodes = len(node_list)
local_node = node_list[args.node_rank]
local_gpu_ids = world_info[local_node]
num_local_procs = len(local_gpu_ids)
print("nnodes={}, num_local_procs={}, node_rank={}".format(
args.nnodes,
num_local_procs,
args.node_rank),
flush=True)
logger.info(
"nnodes={}, num_local_procs={}, node_rank={}".format(args.nnodes,
num_local_procs,
args.node_rank),
)

global_rank_mapping = defaultdict(list)
curr_global_rank = 0
@@ -87,11 +89,11 @@
for gid in gids:
global_rank_mapping[node_id].append(curr_global_rank)
curr_global_rank += 1
print("global_rank_mapping={}".format(global_rank_mapping), flush=True)
print("dist_world_size={}".format(dist_world_size), flush=True)
logger.info("global_rank_mapping={}".format(global_rank_mapping))
logger.info("dist_world_size={}".format(dist_world_size))
current_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, local_gpu_ids))
print("Setting CUDA_VISIBLE_DEVICES={}".format(current_env["CUDA_VISIBLE_DEVICES"]),
flush=True)
logger.info("Setting CUDA_VISIBLE_DEVICES={}".format(
current_env["CUDA_VISIBLE_DEVICES"]))
exclusion_counts_per_node = None

# set PyTorch distributed related environmental variables
(diffs for the remaining 12 changed files, including the new deepspeed/pt/log_utils.py, are not loaded in this excerpt)
