@@ -7,6 +7,7 @@
 import os
 import tempfile
 import time
+import warnings
 from collections import Counter
 import torch
 from fvcore.common.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer
@@ -31,6 +32,7 @@
     "AutogradProfiler",
     "EvalHook",
     "PreciseBN",
+    "TorchProfiler",
 ]
 
 
@@ -268,45 +270,59 @@ def load_state_dict(self, state_dict):
         self.scheduler.load_state_dict(state_dict)
 
 
-class AutogradProfiler(HookBase):
+class TorchProfiler(HookBase):
     """
-    A hook which runs `torch.autograd.profiler.profile`.
+    A hook which runs `torch.profiler.profile`.
 
     Examples:
     ::
-        hooks.AutogradProfiler(
-            lambda trainer: trainer.iter > 10 and trainer.iter < 20, self.cfg.OUTPUT_DIR
+        hooks.TorchProfiler(
+            lambda trainer: 10 < trainer.iter < 20, self.cfg.OUTPUT_DIR
         )
 
     The above example will run the profiler for iteration 10~20 and dump
     results to ``OUTPUT_DIR``. We did not profile the first few iterations
     because they are typically slower than the rest.
-    The result files can be loaded in the ``chrome://tracing`` page in chrome browser.
-
-    Note:
-        When used together with NCCL on older version of GPUs,
-        autograd profiler may cause deadlock because it unnecessarily allocates
-        memory on every device it sees. The memory management calls, if
-        interleaved with NCCL calls, lead to deadlock on GPUs that do not
-        support ``cudaLaunchCooperativeKernelMultiDevice``.
+    The result files can be loaded in the ``chrome://tracing`` page in the Chrome browser,
+    and the TensorBoard visualization can be viewed with
+    ``tensorboard --logdir OUTPUT_DIR/log``.
     """
 
-    def __init__(self, enable_predicate, output_dir, *, use_cuda=True):
+    def __init__(self, enable_predicate, output_dir, *, activities=None, save_tensorboard=True):
         """
         Args:
             enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
                 and returns whether to enable the profiler.
                 It will be called once every step, and can be used to select which steps to profile.
             output_dir (str): the output directory to dump tracing files.
-            use_cuda (bool): same as in `torch.autograd.profiler.profile`.
+            activities (iterable): same as in `torch.profiler.profile`.
+            save_tensorboard (bool): whether to save TensorBoard visualizations at ``(output_dir)/log/``.
         """
         self._enable_predicate = enable_predicate
-        self._use_cuda = use_cuda
+        self._activities = activities
         self._output_dir = output_dir
+        self._save_tensorboard = save_tensorboard
 
     def before_step(self):
         if self._enable_predicate(self.trainer):
-            self._profiler = torch.autograd.profiler.profile(use_cuda=self._use_cuda)
+            if self._save_tensorboard:
+                on_trace_ready = torch.profiler.tensorboard_trace_handler(
+                    os.path.join(
+                        self._output_dir,
+                        "log",
+                        "profiler-tensorboard-iter{}".format(self.trainer.iter),
+                    )
+                )
+            else:
+                on_trace_ready = None
+            self._profiler = torch.profiler.profile(
+                activities=self._activities,
+                on_trace_ready=on_trace_ready,
+                record_shapes=True,
+                profile_memory=True,
+                with_stack=True,
+                with_flops=True,
+            )
             self._profiler.__enter__()
         else:
             self._profiler = None
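
# --- Illustrative usage (not part of this diff) ---
# A minimal sketch of registering the new hook on a detectron2-style trainer.
# DefaultTrainer, register_hooks(), get_cfg(), and OUTPUT_DIR are the usual
# detectron2 pieces; the iteration window is an arbitrary example value.
from detectron2.config import get_cfg
from detectron2.engine import DefaultTrainer, hooks

cfg = get_cfg()  # plus your model/dataset settings
trainer = DefaultTrainer(cfg)
trainer.register_hooks(
    [
        # The predicate is evaluated once per step, so this profiles
        # iterations 11-19 (strict inequalities on both ends).
        hooks.TorchProfiler(lambda t: 10 < t.iter < 20, cfg.OUTPUT_DIR)
    ]
)
trainer.train()
# Traces can then be inspected via chrome://tracing or, per the docstring
# above, `tensorboard --logdir OUTPUT_DIR/log` (torch-tb-profiler plugin).
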
@@ -332,6 +348,51 @@ def after_step(self):
             f.write(content)
 
 
+class AutogradProfiler(TorchProfiler):
+    """
+    A hook which runs `torch.autograd.profiler.profile`.
+
+    Examples:
+    ::
+        hooks.AutogradProfiler(
+            lambda trainer: 10 < trainer.iter < 20, self.cfg.OUTPUT_DIR
+        )
+
+    The above example will run the profiler for iteration 10~20 and dump
+    results to ``OUTPUT_DIR``. We did not profile the first few iterations
+    because they are typically slower than the rest.
+    The result files can be loaded in the ``chrome://tracing`` page in the Chrome browser.
+
+    Note:
+        When used together with NCCL on older versions of GPUs,
+        autograd profiler may cause deadlock because it unnecessarily allocates
+        memory on every device it sees. The memory management calls, if
+        interleaved with NCCL calls, lead to deadlock on GPUs that do not
+        support ``cudaLaunchCooperativeKernelMultiDevice``.
+    """
+
+    def __init__(self, enable_predicate, output_dir, *, use_cuda=True):
+        """
+        Args:
+            enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
+                and returns whether to enable the profiler.
+                It will be called once every step, and can be used to select which steps to profile.
+            output_dir (str): the output directory to dump tracing files.
+            use_cuda (bool): same as in `torch.autograd.profiler.profile`.
+        """
+        warnings.warn("AutogradProfiler has been deprecated in favor of TorchProfiler.")
+        self._enable_predicate = enable_predicate
+        self._use_cuda = use_cuda
+        self._output_dir = output_dir
+
+    def before_step(self):
+        if self._enable_predicate(self.trainer):
+            self._profiler = torch.autograd.profiler.profile(use_cuda=self._use_cuda)
+            self._profiler.__enter__()
+        else:
+            self._profiler = None
+
+
 class EvalHook(HookBase):
     """
     Run an evaluation function periodically, and at the end of training.
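
# --- Standalone illustration (not part of this diff) ---
# A self-contained sketch of the torch.profiler configuration that
# TorchProfiler.before_step builds above; the toy model, tensor sizes,
# and output directory are made-up example values.
import os
import torch

net = torch.nn.Linear(128, 64)
inp = torch.randn(32, 128)
out_dir = "./profiler-demo"

with torch.profiler.profile(
    activities=None,  # default: CPU activity, plus CUDA when available
    on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(out_dir, "log")),
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    with_flops=True,
):
    net(inp).sum().backward()  # one profiled training-like step
# View with: tensorboard --logdir ./profiler-demo/log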