DeepSpeed Communication Profiling and Logging #2012

Merged · 54 commits · merged Jul 25, 2022

Changes from 1 commit

Commits (54)
867a853
Staging comms v1 (#301)
Quentin-Anthony May 27, 2022
c93fcfe
Delete stage1.py
awan-10 May 27, 2022
7f8ca01
Delete distributed.py
awan-10 May 27, 2022
977ee32
revert deepspeed/__init__.py logging calls
Quentin-Anthony May 28, 2022
68eb9f4
Delete test.py
Quentin-Anthony May 28, 2022
54796bb
Update comments and move custom comm ops to internal functions
Quentin-Anthony May 28, 2022
c06c72d
Merge branch 'staging-comms-next' of https://github.com/microsoft/Dee…
Quentin-Anthony May 28, 2022
f070a0c
Remove unnecessary print and update backend description
Quentin-Anthony May 28, 2022
9976681
Relax assertion to allow Megatron-DeepSpeed MoE to use ZeRO 1
Quentin-Anthony May 31, 2022
09063a3
Simplify ZeRO stage 1 check for previous commit
Quentin-Anthony May 31, 2022
656b415
Remove misleading world_size prints
Quentin-Anthony May 31, 2022
2e7129c
Add commslogger class, and introduce rough prototype comms logging
Quentin-Anthony Jun 1, 2022
0023b3e
Clean up logger
Quentin-Anthony Jun 1, 2022
e55c8e9
Add more robust arg checks
Quentin-Anthony Jun 3, 2022
31c7dcf
Add labels to common collective calls for logger
Quentin-Anthony Jun 3, 2022
8e23f50
Add more annotations
Quentin-Anthony Jun 3, 2022
7998350
Fix up log_summary_new and fix logging bug for barrier
Quentin-Anthony Jun 7, 2022
227874e
Clean up arg sweep logic and add isend/irecv
Quentin-Anthony Jun 7, 2022
27c38f9
Merge branch 'master' into staging-comms-logging-v1
Quentin-Anthony Jun 13, 2022
26e15ae
Clean up logging branch
Quentin-Anthony Jun 13, 2022
3aa3e38
Unify naming and fix circular import
Quentin-Anthony Jun 13, 2022
d2561dc
Fix deepspeed comm imports for logging.py
Quentin-Anthony Jun 13, 2022
c85f3c1
Added comms config support, removed some log names
Quentin-Anthony Jun 14, 2022
f70addb
Add comms config file
Quentin-Anthony Jun 14, 2022
a153331
Add pydantic to requirements
Quentin-Anthony Jun 14, 2022
351f384
Add configure non-op to old torch
Quentin-Anthony Jun 14, 2022
bcb3afd
Update logging call for old torch
Quentin-Anthony Jun 14, 2022
2f8320a
Add log_name placeholder args for old torch
Quentin-Anthony Jun 14, 2022
95aa7d8
Add basic verbosity setup
Quentin-Anthony Jun 15, 2022
93d1a31
Complete verbosity setup
Quentin-Anthony Jun 18, 2022
4a6236d
move comms logging to separate file and clean up
Quentin-Anthony Jun 18, 2022
393c90a
Change debug message design
Quentin-Anthony Jun 25, 2022
527d1c8
refactor debug helper and clean up
Quentin-Anthony Jun 25, 2022
40482a8
Refactor a bit and clean up prints
Quentin-Anthony Jun 25, 2022
a6beecf
Merge branch 'master' into staging-comms-logging-v1
Quentin-Anthony Jun 25, 2022
9343f87
config docs, remove old log_summary func, fix imports
Quentin-Anthony Jun 25, 2022
c07bc13
Finished docs, added import, fixed non-debug calls
Quentin-Anthony Jun 25, 2022
f5fd1f2
Ran pre-commit
Quentin-Anthony Jun 25, 2022
1b31798
Removed old comments
Quentin-Anthony Jun 25, 2022
298349d
Updated fn signatures for torch1.2
Quentin-Anthony Jun 27, 2022
102ae1d
Remove lingering prof arg
Quentin-Anthony Jun 27, 2022
2185f16
Merge branch 'master' into staging-comms-logging-v1
jeffra Jun 29, 2022
4faf3b9
Update logging tutorial
Quentin-Anthony Jun 29, 2022
6381187
Removed unnecessary imports and cleaned up comments
Quentin-Anthony Jun 30, 2022
56dbd71
Take master's cleaner comms init logic
Quentin-Anthony Jun 30, 2022
ae524f0
Fixed bw calculations and made all logging calls blocking
Quentin-Anthony Jul 20, 2022
19bcf79
Added comms logging synch disclaimer
Quentin-Anthony Jul 20, 2022
b9cb4d3
Merge branch 'master' into staging-comms-logging-v1
Quentin-Anthony Jul 21, 2022
c6925a1
Added using_mpi flag for logging
Quentin-Anthony Jul 22, 2022
5a0715c
Formatting
Quentin-Anthony Jul 22, 2022
b4449a2
Merge branch 'master' of https://github.com/microsoft/DeepSpeed into …
Quentin-Anthony Jul 22, 2022
b648979
Merge branch 'master' into staging-comms-logging-v1
Quentin-Anthony Jul 22, 2022
9357a16
Merge branch 'master' into staging-comms-logging-v1
Quentin-Anthony Jul 25, 2022
c85e323
Merge branch 'master' into staging-comms-logging-v1
Quentin-Anthony Jul 25, 2022
config docs, remove old log_summary func, fix imports
Quentin-Anthony committed Jun 25, 2022
commit 9343f8789b413a56519beeaaf3c03d1c6e96f92c
25 changes: 1 addition & 24 deletions deepspeed/comm/comm.py
@@ -127,7 +127,6 @@ def log_wrapper(*args, **kwargs):
                 msg_size = get_msg_size_from_args(func, *args, **kwargs)
                 log_name = get_debug_log_name(func_args, comms_logger.debug)
                 timers(log_name).start()
-                #timers(func_args['log_name']).start()
         # Return the op, then stop the op's timer
         try:
             return func(*args, **kwargs)
@@ -137,10 +136,6 @@ def log_wrapper(*args, **kwargs):
                         'log_name' in kwargs
                         and kwargs['log_name'] in comms_logger.prof_ops):
                     log_name = get_debug_log_name(func_args, comms_logger.debug)
-                    #timers(func_args['log_name']).stop()
-                    # need temp var since 'elapsed' resets events
-                    #time_elapsed = timers(func_args['log_name']).elapsed(reset=False)
-                    #comms_logger.append(func_args['log_name'], time_elapsed, msg_size)
                     timers(log_name).stop()
                     # need temp var since 'elapsed' resets events
                     time_elapsed = timers(log_name).elapsed(reset=False)
@@ -149,24 +144,6 @@
     return log_wrapper


-def log_summary(coll_names, ranks=None):
-    global cdb
-    if coll_names == ['all']:
-        coll_names = timers.get_timers()
-    timers.log(names=coll_names, reset=False)
-    # Populate records for averaging and remove empty ones
-    #for name in coll_names:
-    #    print(timers(name).elapsed(reset=False))
-    # Calculate average dict
-    coll_means = timers.get_mean(coll_names, reset=False)
-    # Print averages
-    for coll, mean in coll_means.items():
-        string = f"rank={cdb.get_rank()} avg time (ms)" + " | {}: {:.2f}".format(
-            coll,
-            mean / 1000.0)
-        log_dist(string, ranks=ranks or [0])
-
-
 # For compatibility with torch distributed's init_process_group, we shall retain the signature from PyTorch code.
 # DeepSpeed NCCL/MPI backend may not need all these params as we will have our own implementation.
 # Please read full torch.distributed API docs from https://pytorch.org/docs/stable/distributed.html
@@ -481,7 +458,7 @@ def barrier(group=None, prof=False, log_name='barrier', debug=get_caller_func())
     return cdb.barrier()


-def log_summary_new():
+def log_summary():
     global cdb
     barrier(log_name='log_summary_barrier')
     if cdb.get_rank() == 0:
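For readers following the diff, `log_wrapper` implements a simple decorator pattern: start a timer keyed by the op's log name, run the collective, then stop the timer and record the elapsed time together with the message size. Below is a minimal, self-contained sketch of that pattern; the class and function names are illustrative stand-ins, not DeepSpeed's actual internals.

```python
import time
import functools


class SketchCommsLogger:
    """Illustrative stand-in for a comms logger: stores (elapsed, msg_size) records per op name."""

    def __init__(self):
        self.records = {}

    def append(self, log_name, elapsed, msg_size):
        self.records.setdefault(log_name, []).append((elapsed, msg_size))


comms_logger = SketchCommsLogger()


def timed_op(func):
    """Wrap a communication op so its duration and message size get recorded."""

    @functools.wraps(func)
    def log_wrapper(*args, log_name=None, msg_size=0, **kwargs):
        name = log_name or func.__name__
        start = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            # Record even when the op raises, mirroring the try/finally in the diff above
            comms_logger.append(name, time.perf_counter() - start, msg_size)

    return log_wrapper


@timed_op
def fake_all_reduce(data):
    return sum(data)  # placeholder for a real collective


fake_all_reduce([1, 2, 3], log_name="all_reduce", msg_size=3)
print(comms_logger.records)
```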
2 changes: 1 addition & 1 deletion deepspeed/comm/constants.py
@@ -33,7 +33,7 @@

 # comms logger profile all ops signal
 COMMS_LOGGER_PROF_ALL = "prof_all"
-COMMS_LOGGER_PROF_ALL_DEFAULT = False
+COMMS_LOGGER_PROF_ALL_DEFAULT = True

 # comms logger show all ops signal
 COMMS_LOGGER_DEBUG = "debug"
2 changes: 0 additions & 2 deletions deepspeed/runtime/engine.py
@@ -264,12 +264,10 @@ def __init__(

         self._set_distributed_vars(args)

-
         dist.configure(self._config)

         self.monitor = MonitorMaster(self._config.monitor_config)

-
         see_memory_usage(
             f"DeepSpeed Engine: Before configure distributed model",
             force=self.memory_breakdown(),
6 changes: 2 additions & 4 deletions deepspeed/utils/logging.py
@@ -3,8 +3,6 @@
 import os
 import math

-from deepspeed import comm as dist
-
 log_levels = {
     "debug": logging.DEBUG,
     "info": logging.INFO,
@@ -48,7 +46,7 @@ def create_logger(name=None, level=logging.INFO):


 def log_dist(message, ranks=None, level=logging.INFO):
-    import deepspeed.comm as dist
+    from deepspeed import comm as dist
     """Log message when one of following condition meets

     + not dist.is_initialized()
@@ -72,7 +70,7 @@


 def print_json_dist(message, ranks=None, path=None):
-    import deepspeed.comm as dist
+    from deepspeed import comm as dist
     """Print message when one of following condition meets

     + not dist.is_initialized()
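The in-function import above appears to be there to avoid a circular import: `deepspeed.comm` itself uses this logging module, so pulling `deepspeed.comm` in at the top of `logging.py` would create a cycle at module load time (see the earlier commit "Unify naming and fix circular import"). A minimal sketch of the deferred-import pattern, with illustrative module names rather than DeepSpeed's real layout:

```python
# --- comms.py (illustrative) ----------------------------------------------
# Imports the logging helper at module load time:
#
#     from logging_utils import log_dist
#
#     def get_rank():
#         return 0
#
# --- logging_utils.py -------------------------------------------------------
# Must not import `comms` at module level, or the two modules would form a cycle.

import logging

logger = logging.getLogger(__name__)


def log_dist(message, ranks=None):
    # Deferred import: only resolved when the function runs, after both modules
    # have finished loading, so the circular dependency never bites.
    from comms import get_rank  # stands in for `from deepspeed import comm as dist`
    if ranks is None or get_rank() in ranks:
        logger.info("[rank %d] %s", get_rank(), message)
```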
1 change: 1 addition & 0 deletions docs/_config.yml
@@ -49,6 +49,7 @@ collections:
       - mixture-of-experts-nlg.md
       - mixture-of-experts-inference.md
       - monitor.md
+      - comms-logging.md
       - one-cycle.md
       - onebit-adam.md
       - zero-one-adam.md
4 changes: 4 additions & 0 deletions docs/_data/navigation.yml
@@ -63,6 +63,8 @@ lnav:
         url: /docs/config-json/#sparse-attention
       - title: 'Monitoring'
         url: /docs/config-json/#monitoring-module-tensorboard-wandb-csv
+      - title: 'Communication Logging'
+        url: /docs/config-json/#communication-logging
   - title: 'Tutorials'
     url: /tutorials/
     children:
@@ -102,6 +104,8 @@ lnav:
         url: /tutorials/MoQ-tutorial/
       - title: 'Monitoring'
         url: /tutorials/monitor
+      - title: 'Communication Logging'
+        url: /tutorials/comms-logging
       - title: 'One-Cycle Schedule'
         url: /tutorials/one-cycle/
       - title: 'One-Bit Adam'
29 changes: 29 additions & 0 deletions docs/_pages/config-json.md
@@ -1044,3 +1044,32 @@ Example of <i>**csv_monitor**</i> configuration:
"job_name": "train_bert"
}
```

### Communication Logging


DeepSpeed provides a flexible communication logging tool that can automatically detect and record communication operations launched via `deepspeed.comm`. Once the logs are populated, they can be summarized with `deepspeed.comm.log_summary()`. For more details and example usage, see the [tutorial](/tutorials/comms-logging/).


<i>**comms_logger**</i>: [dictionary]

| Fields | Value |Default |
| ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- |
| enabled | Whether communication logging is enabled. | `false` |
| verbose | Whether to immediately print every communication operation. | `false` |
| prof_all | Whether to profile all operations. | `true` |
| debug | Appends the caller function to each communication operation's `log_name`. | `false` |
| prof_ops | A list of communication operations to log (only the specified ops will be profiled). | `[]` |


Example of <i>**comms_logger**</i> configuration:

```json
"comms_logger": {
"enabled": true,
"verbose": false,
"prof_all": true,
"debug": false,
"prof_ops": ["all_reduce", "custom_all_reduce_name"]
}
```
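
As a rough end-to-end sketch (assuming a training script that already uses `deepspeed.initialize`, with `model` and `data_loader` defined elsewhere, and a `ds_config.json` containing the `comms_logger` block above), the logger is driven entirely by the config and summarized once at the end of training with `deepspeed.comm.log_summary()`:

```python
import deepspeed
import deepspeed.comm as dist

# ds_config.json is assumed to contain the "comms_logger" block shown above
model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                                      model_parameters=model.parameters(),
                                                      config="ds_config.json")

for step, batch in enumerate(data_loader):
    loss = model_engine(batch)
    model_engine.backward(loss)
    model_engine.step()

# Print a summary of the communication ops collected so far (latencies, message sizes)
dist.log_summary()
```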