
Commit adc9a1e

rohan-varma authored and pytorchmergebot committed

Enforce explicit ProcessGroup passed into DefaultState (pytorch#84105)

We would prefer to enforce that users pass an explicit ProcessGroup into these state objects when using comm hooks with FSDP, so that it is clear, and easily debuggable, over which processes communication takes place.

Pull Request resolved: pytorch#84105
Approved by: https://github.com/mrshenli, https://github.com/zhaojuanmao

1 parent 092fe71 commit adc9a1e

File tree

2 files changed: +7 −5 lines changed

test/distributed/fsdp/test_fsdp_comm_hooks.py

Lines changed: 3 additions & 2 deletions

@@ -7,6 +7,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import distributed as dist
+from torch.distributed.distributed_c10d import _get_default_group
 from torch.distributed.algorithms._comm_hooks import default_hooks
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp import MixedPrecision
@@ -423,7 +424,7 @@ def test_fp16_hook(
         sharding_strategy: Optional[ShardingStrategy]
     ):

-        state = default_hooks.LowPrecisionState(process_group=None)
+        state = default_hooks.LowPrecisionState(process_group=_get_default_group())
         hook = default_hooks.fp16_compress_hook

         self._check_low_precision_hook(state, hook, sharding_strategy, torch.float16, has_wrapping)
@@ -450,7 +451,7 @@ def test_bf16_hook(
         sharding_strategy: Optional[ShardingStrategy]
     ):

-        state = default_hooks.LowPrecisionState(process_group=None)
+        state = default_hooks.LowPrecisionState(process_group=_get_default_group())
         hook = default_hooks.bf16_compress_hook

         self._check_low_precision_hook(state, hook, sharding_strategy, torch.bfloat16, has_wrapping)

torch/distributed/algorithms/_comm_hooks/default_hooks.py

Lines changed: 4 additions & 3 deletions

@@ -1,7 +1,6 @@
 import functools
 import torch
 import torch.distributed as dist
-from torch.distributed import distributed_c10d


 class DefaultState(object):
@@ -22,9 +21,11 @@ class DefaultState(object):

     def __init__(
         self,
-        process_group
+        process_group: dist.ProcessGroup
     ):
-        self.process_group = process_group if process_group is not None else distributed_c10d._get_default_group()
+        if process_group is None:
+            raise ValueError(f"Expected to pass in an explicit ProcessGroup to {self}.")
+        self.process_group = process_group
         self.world_size = dist.get_world_size(process_group)
         # Setting two factors `self.gradient_predivide_factor`
         # and `self.gradient_postdivide_factor` to avoid underflow and overflow
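
For context, a minimal sketch of the contract after this change, using only names that appear in the diff above. It assumes the process group has already been initialized (e.g. via dist.init_process_group) before _get_default_group() is called:

    import torch.distributed as dist
    from torch.distributed.distributed_c10d import _get_default_group
    from torch.distributed.algorithms._comm_hooks import default_hooks

    # Passing None no longer falls back silently to the default group;
    # it now raises before any communication state is set up.
    try:
        default_hooks.DefaultState(process_group=None)
    except ValueError as e:
        print(e)  # Expected to pass in an explicit ProcessGroup to ...

    # Callers must be explicit, even when they want the default (world) group.
    state = default_hooks.LowPrecisionState(process_group=_get_default_group())
    hook = default_hooks.fp16_compress_hook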
