
Commit 2ec4f69

Yi Wang authored and facebook-github-bot committed
[DDP Comm Hook] Do not expose hook_then_optimizer as a public method (pytorch#62532)
Summary:
Pull Request resolved: pytorch#62532

This method is not stable at this time, so avoid releasing it when the DDP communication hook feature is released as a stable feature.

ghstack-source-id: 134787831

Test Plan:
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_ddp_hook_with_optimizer_parity
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_hook_then_optimizer_nccl

Reviewed By: rohan-varma

Differential Revision: D30031222

fbshipit-source-id: e03a8e13fee5116a5ddd724eb76316ee98f2a676
1 parent b161ac5 commit 2ec4f69

File tree

3 files changed: 15 additions, 8 deletions

test/distributed/test_c10d_nccl.py

Lines changed: 2 additions & 2 deletions
@@ -1607,12 +1607,12 @@ def _test_hook_then_optimizer(self, gradient_as_bucket_view=False):
         sgd_lr = 1e-2
         sgd_momentum = 0.9
         sgd_weight_decay = 0.01
-        opt_hook_state = default.OptimizerHookState(
+        opt_hook_state = default._OptimizerHookState(
             _FunctionalSGD, sgd_lr, momentum=sgd_momentum, weight_decay=sgd_weight_decay
         )
         gpu_model = self._gpu_model_with_ddp_comm_hook(
             process_group,
-            default.hook_then_optimizer(hook, opt_hook_state),
+            default._hook_then_optimizer(hook, opt_hook_state),
             gradient_as_bucket_view,
             hook_state,
         )

torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py

Lines changed: 11 additions & 4 deletions
@@ -70,7 +70,7 @@ def decompress(fut):
     return fut.then(decompress)
 
 
-class OptimizerHookState(object):
+class _OptimizerHookState(object):
     """
     Holds state for running optimizer in-line after DDP communication hook.
     Currently contains only optimizer class which must have a method `step_param`.
@@ -93,11 +93,18 @@ def __init__(
         )
 
 
-def hook_then_optimizer(
+# TODO: Add an example to use such a wrapper.
+def _hook_then_optimizer(
     hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]],
-    optimizer_state: OptimizerHookState,
+    optimizer_state: _OptimizerHookState,
 ) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
-    """Runs optimizer in a functional fashion after DDP communication hook."""
+    r"""
+    Runs optimizer in a functional fashion after DDP communication hook.
+
+    .. warning ::
+        This API is experimental and subject to change.
+    """
+
 
     def hook_then_optimizer_wrapper(
         hook_state, bucket: dist.GradBucket
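
The committed code itself leaves a "TODO: Add an example to use such a wrapper." Below is a minimal sketch, not part of this commit, of how the renamed private helpers fit together, modeled on the test changes in this diff. The `_FunctionalSGD` import path and the one-GPU-per-rank device mapping are assumptions, not something this commit specifies.

# Sketch only: wires the (now private) optimizer-in-hook helpers together the
# way the updated tests do. Assumes torch.distributed is already initialized
# with a NCCL process group and one GPU per rank.
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks as default
# Import path for the functional SGD optimizer is an assumption; it may differ
# across PyTorch versions.
from torch.distributed.optim.functional_sgd import _FunctionalSGD
from torch.nn.parallel import DistributedDataParallel as DDP

rank = dist.get_rank()
model = DDP(nn.Linear(10, 10).to(rank), device_ids=[rank])

# Holds the functional optimizer (which must expose `step_param`) that the
# wrapped hook steps after each bucket's communication completes.
opt_hook_state = default._OptimizerHookState(
    _FunctionalSGD, 1e-2, momentum=0.9, weight_decay=0.01
)

# state=None lets allreduce_hook fall back to the default process group.
model.register_comm_hook(
    None, default._hook_then_optimizer(default.allreduce_hook, opt_hook_state)
)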

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 2 additions & 2 deletions
@@ -3865,15 +3865,15 @@ def _test_ddp_hook_with_optimizer_parity(
 
         # Register hook that runs allreduce + functional SGD step.
         allreduce_hook = default.allreduce_hook
-        opt_hook_state = default.OptimizerHookState(
+        opt_hook_state = default._OptimizerHookState(
             _FunctionalSGD,
             sgd_lr,
             momentum=sgd_momentum,
             weight_decay=sgd_weight_decay,
         )
         ddp_model_with_optimizer_hook.register_comm_hook(
             None,
-            default.hook_then_optimizer(allreduce_hook, opt_hook_state),
+            default._hook_then_optimizer(allreduce_hook, opt_hook_state),
         )
         # Create DDP model with no hook that does optimizer after
         # backward.
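
For context, the test touched here (`_test_ddp_hook_with_optimizer_parity`) compares the hook-fused optimizer against a plain DDP model that steps its optimizer after backward. The following is a rough, hypothetical sketch of that parity check, not the actual test code; the model, optimizer, and input arguments are placeholders.

# Hypothetical, simplified parity check: the model whose comm hook fuses
# allreduce + functional SGD should track a reference model that runs an
# ordinary optimizer.step() after backward.
import torch

def check_parity(hooked_model, reference_model, reference_optimizer, inp, num_iters=10):
    for _ in range(num_iters):
        # Hooked model: the SGD step happens inside the communication hook.
        hooked_model.zero_grad()
        hooked_model(inp).sum().backward()

        # Reference model: plain backward, then an explicit optimizer step.
        reference_optimizer.zero_grad()
        reference_model(inp).sum().backward()
        reference_optimizer.step()

        # Both models should end up with (nearly) identical parameters.
        for p_hook, p_ref in zip(hooked_model.parameters(), reference_model.parameters()):
            assert torch.allclose(p_hook, p_ref)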
