Commit 73b33de

Andrew Gu authored and pytorchmergebot committed
[FSDP] Include buffers in ignored_modules
Pull Request resolved: pytorch#76784
Approved by: https://github.com/rohan-varma
1 parent 33fabe9 commit 73b33de

3 files changed: +150 −74 lines changed

test/distributed/fsdp/test_fsdp_state_dict.py

Lines changed: 24 additions & 13 deletions
@@ -52,6 +52,7 @@

 INNER_SHAPE = [4, 4]
 OUTER_SHAPE = [4, 5]
+BUFFER_SHAPE = [5, 5]

 _SUPPORTED_STATE_DICT_IMPLS = ["state_dict", "local_state_dict"]

@@ -63,12 +64,14 @@


 class Model(Module):
-    def __init__(self, wrap_fsdp):
+    def __init__(self, wrap_fsdp, register_buffer=False):
         super().__init__()
         self.inner = Linear(*INNER_SHAPE)
         if wrap_fsdp:
             self.inner = FSDP(self.inner)
         self.outer = Linear(*OUTER_SHAPE)
+        if register_buffer:
+            self.outer.register_buffer("buffer", torch.randn(BUFFER_SHAPE))

     def forward(self, x):
         # Forward twice.
@@ -444,34 +447,42 @@ def test_wrong_state_dict_config(self):

     @skip_if_lt_x_gpu(2)
     def test_state_dict_with_ignored_modules(self):
-        # Initialize an FSDP-wrapped model with an ignored module
-        model = Model(wrap_fsdp=True).cuda()
+        # Initialize an FSDP-wrapped model with an ignored module that includes
+        # both parameters and a buffer
+        model = Model(wrap_fsdp=True, register_buffer=True).cuda()
         ignored_modules = [model.outer]
-        ignored_param_to_param_name = {
+        ignored_tensor_to_tensor_name = {
             model.outer.bias: "outer.bias", model.outer.weight: "outer.weight",
+            model.outer.buffer: "outer.buffer",
         }
         fsdp_model = FSDP(model, ignored_modules=ignored_modules)
         with FSDP.state_dict_type(fsdp_model, StateDictType.FULL_STATE_DICT):
-            sd = fsdp_model.state_dict()
-
+            sd1 = fsdp_model.state_dict()
         with FSDP.summon_full_params(fsdp_model):
             fsdp_params = deepcopy(list(fsdp_model.parameters()))
         # Check that the ignored parameters are not cloned
-
-        for param, param_name in ignored_param_to_param_name.items():
-            self.assertTrue(param_name in sd)
-            self.assertEqual(param.data_ptr(), sd[param_name].data_ptr())
+        for tensor, tensor_name in ignored_tensor_to_tensor_name.items():
+            self.assertTrue(tensor_name in sd1)
+            self.assertEqual(tensor.data_ptr(), sd1[tensor_name].data_ptr())
         # Check that the state dict can be loaded into a non-wrapped version of
         # the model
-        nonwrapped_model = Model(wrap_fsdp=False).cuda()
+        nonwrapped_model = Model(wrap_fsdp=False, register_buffer=True).cuda()
         for param in nonwrapped_model.parameters():
             with torch.no_grad():
                 param.zero_()
-
-        nonwrapped_model.load_state_dict(sd)
+        nonwrapped_model.load_state_dict(sd1)
         local_params = list(nonwrapped_model.parameters())
         for fsdp_param, local_param in zip(fsdp_params, local_params):
             self.assertEqual(fsdp_param, local_param)
+        # Check that if we save a state dict again, the ignored parameters and
+        # buffers still have the same data pointer
+        with FSDP.state_dict_type(fsdp_model, StateDictType.FULL_STATE_DICT):
+            sd2 = fsdp_model.state_dict()
+        for tensor, tensor_name in ignored_tensor_to_tensor_name.items():
+            self.assertTrue(tensor_name in sd1)  # check again just in case
+            self.assertTrue(tensor_name in sd2)
+            self.assertEqual(tensor.data_ptr(), sd2[tensor_name].data_ptr())
+            self.assertEqual(sd1[tensor_name].data_ptr(), sd2[tensor_name].data_ptr())


 instantiate_parametrized_tests(TestFSDPStateDict)
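The assertions above hinge on one property: because ignored parameters and buffers are never flattened or sharded, the full state dict can simply reference them rather than clone them. A minimal sketch of that check outside the test harness (assuming an already-constructed fsdp_model whose ignored module owns a buffer, and that StateDictType is importable as in this test file):

# Sketch only; mirrors the assertions above, not code from the commit.
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType

def check_ignored_tensors_alias(fsdp_model, ignored_named_tensors):
    # ignored_named_tensors maps a tensor to its expected state-dict key,
    # e.g. {model.outer.buffer: "outer.buffer"}.
    with FSDP.state_dict_type(fsdp_model, StateDictType.FULL_STATE_DICT):
        sd = fsdp_model.state_dict()
    for tensor, name in ignored_named_tensors.items():
        assert name in sd
        # Same storage: the ignored tensor was neither sharded nor cloned.
        assert tensor.data_ptr() == sd[name].data_ptr()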

torch/distributed/fsdp/_utils.py

Lines changed: 29 additions & 4 deletions
@@ -1,9 +1,8 @@
-from typing import Dict, List, Tuple, Union, Any, Callable, Set
-from torch.nn.utils.rnn import PackedSequence
+from collections import OrderedDict
+from typing import Any, Callable, Dict, List, Set, Tuple, Union

 import torch
-
-from collections import OrderedDict
+from torch.nn.utils.rnn import PackedSequence

 """Useful functions to deal with tensor types with other python container types."""

@@ -56,3 +55,29 @@ def _replace_by_prefix(
         new_key = new_prefix + key[len(old_prefix) :]
         state_dict[new_key] = state_dict[key]
         del state_dict[key]
+
+
+def _apply_to_modules(
+    root_module: torch.nn.Module,
+    module_fn: Callable,
+    return_fn: Callable,
+    *args,
+    **kwargs,
+):
+    """
+    Performs a pre-order traversal of the modules in the hierarchy rooted at
+    ``root_module``, applying ``module_fn`` at each module and finally
+    returning a value using ``return_fn``. The traversal constructs the full
+    module prefix name (e.g. "module.submodule." just like in model state dict)
+    and makes that available to ``module_fn``.
+    """
+    def f(module: torch.nn.Module, prefix: str, *args, **kwargs):
+        # Call the module function before recursing over children (pre-order)
+        module_fn(module, prefix, *args, **kwargs)
+        for submodule_name, submodule in module.named_children():
+            if submodule is not None:
+                new_prefix = prefix + submodule_name + "."
+                f(submodule, new_prefix, *args, **kwargs)
+
+    f(root_module, "", *args, **kwargs)
+    return return_fn(*args, **kwargs)
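The docstring of _apply_to_modules describes a pre-order walk that hands each module its state-dict-style prefix. A small usage sketch (hypothetical helper, not part of this commit) that collects every parameter's fully prefixed name:

# Hypothetical usage of _apply_to_modules: gather prefixed parameter names.
import torch
from torch.distributed.fsdp._utils import _apply_to_modules

def collect_param_names(root: torch.nn.Module) -> set:
    def module_fn(module, prefix, names):
        # `prefix` is "" for the root and "sub.child." for nested modules.
        for param_name, _ in module.named_parameters(recurse=False):
            names.add(prefix + param_name)

    def return_fn(names):
        return names

    # The accumulator set is threaded through as *args to both callbacks.
    return _apply_to_modules(root, module_fn, return_fn, set())

For a model with submodules inner and outer (as in the test above), this returns {"inner.weight", "inner.bias", "outer.weight", "outer.bias"}.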

torch/distributed/fsdp/fully_sharded_data_parallel.py

Lines changed: 97 additions & 57 deletions
@@ -40,12 +40,6 @@
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.nn.parameter import Parameter

-from .flatten_params_wrapper import (
-    FLAT_PARAM,
-    FPW_MODULE,
-    FlatParameter,
-    FlattenParamsWrapper,
-)
 from ._optim_utils import (
     _broadcast_pos_dim_tensor_states,
     _broadcast_processed_optim_state_dict,
@@ -56,15 +50,21 @@
     _process_pos_dim_tensor_state,
     _unflatten_optim_state,
 )
-from ._utils import _apply_to_tensors, _replace_by_prefix
+from ._utils import _apply_to_modules, _apply_to_tensors, _replace_by_prefix
+from .flatten_params_wrapper import (
+    FLAT_PARAM,
+    FPW_MODULE,
+    FlatParameter,
+    FlattenParamsWrapper,
+)
 from .wrap import _recursive_wrap

 if TYPE_CHECKING:
     from collections import OrderedDict  # noqa: F401

 _TORCHDISTX_AVAIL = True
 try:
-    from torchdistx import fake, deferred_init
+    from torchdistx import deferred_init, fake
 except ImportError:
     _TORCHDISTX_AVAIL = False

@@ -490,10 +490,10 @@ class FullyShardedDataParallel(nn.Module):
             accuracy during model training. If ``None``, no mixed precision is applied.
             (Default: ``None``)
         ignored_modules (Optional[Iterable[torch.nn.Module]]): Modules whose
-            own parameters and child modules' parameters are ignored by this
-            instance. None of the modules directly in ``ignored_modules``
-            should be :class:`FullyShardedDataParallel` instances, and any
-            child modules that are already-constructed
+            own parameters and child modules' parameters and buffers are
+            ignored by this instance. None of the modules directly in
+            ``ignored_modules`` should be :class:`FullyShardedDataParallel`
+            instances, and any child modules that are already-constructed
             :class:`FullyShardedDataParallel` instances will not be ignored if
             they are nested under this instance. This argument may be used to
             avoid sharding specific parameters when using an
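In user-facing terms, the amended docstring means a buffer registered on an ignored module is now treated like that module's parameters. A brief illustrative sketch (module names are made up, not from the commit):

# Illustrative only: head's weight, bias, and running_stat buffer are all
# excluded from FSDP flattening and sharding because head is ignored.
# Assumes torch.distributed has been initialized and the module is on GPU.
import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.core = nn.Linear(8, 8)
        self.head = nn.Linear(8, 4)
        self.head.register_buffer("running_stat", torch.zeros(4))

def wrap(net: Net) -> FSDP:
    return FSDP(net, ignored_modules=[net.head])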
@@ -549,16 +549,7 @@ def __init__(
         # Save the ignored modules and their parameters, including the
         # parameter names, which are needed to filter the model state dict
         self._ignored_modules = self._get_ignored_modules(ignored_modules)
-        ignored_params = self._get_ignored_params(self._ignored_modules)
-        param_to_unflat_param_names = _get_param_to_unflat_param_names(module)
-        self._ignored_param_to_param_name = {}
-        for param in ignored_params:
-            unflat_param_names = param_to_unflat_param_names[param]
-            assert len(unflat_param_names) == 1, \
-                "Only `FlatParameter`s can map to >1 unflattened parameter " \
-                "name, and `_get_ignored_params()` should have excluded " \
-                "them; check `_get_param_to_unflat_param_names()`"
-            self._ignored_param_to_param_name[param] = unflat_param_names[0]
+        ignored_params = self._get_ignored_parameters()
         # if auto_wrap_policy is specified, submodules should not be
         # already wrapped, otherwise we'd attempt to double wrap them resulting
         # in errors.
@@ -776,19 +767,67 @@ def _get_ignored_modules(
             )
         return ignored_modules

-    def _get_ignored_params(
-        self,
-        ignored_modules: Set[torch.nn.Module],
-    ) -> Set[torch.nn.Parameter]:
-        """
-        Returns the parameters of the modules in ``ignored_modules`` as a
+    def _get_ignored_parameters(self) -> Set[torch.nn.Parameter]:
+        """Returns the parameters of the modules in ``ignored_modules`` as a
         :class:`set`, excluding any :class:`FlatParameter` s.
         """
+        assert hasattr(self, "_ignored_modules"), \
+            "Expects `self._ignored_modules` to be initialized"
         return set(
-            p for m in ignored_modules for p in m.parameters()
+            p for m in self._ignored_modules for p in m.parameters()
             if not isinstance(p, FlatParameter)
         )

+    def _get_ignored_named_tensors(
+        self,
+        ignored_modules: Set[torch.nn.Module],
+        named_tensor_fn: Callable,
+    ) -> Set[Tuple[str, torch.Tensor]]:
+        """
+        This performs a module walk to get the full parameter and buffer names
+        depending on ``named_tensor_fn``, which should either be
+        ``named_parameters()`` or ``named_buffers()`. We require a separate
+        :meth:`_get_ignored_parameters` that does not use this module walk
+        since that method needs to be called in the FSDP constructor before any
+        wrapping occurs, which means that we cannot start a module walk from
+        ``self`` as in this method.
+        """
+        def module_fn(module, prefix, ignored_named_tensors, ignored_modules):
+            if module in ignored_modules:
+                assert not isinstance(module, FullyShardedDataParallel) and \
+                    not isinstance(module, FlattenParamsWrapper), \
+                    "Ignoring FSDP modules is meaningless since their " \
+                    "parameters are not flattened into this FSDP module anyway"
+                for param_name, param in named_tensor_fn(module):
+                    prefixed_param_name = clean_param_name(prefix + param_name)
+                    ignored_named_tensors.add((prefixed_param_name, param))
+
+        def return_fn(ignored_named_tensors, *args):
+            return ignored_named_tensors
+
+        ignored_named_tensors = set()
+        return _apply_to_modules(
+            self, module_fn, return_fn, ignored_named_tensors, ignored_modules,
+        )
+
+    def _get_ignored_named_parameters(self) -> Set[Tuple[str, torch.Tensor]]:
+        """Returns the named parameters of the modules in ``ignored_modules``,
+        excluding any :class:`FlatParameter` s."""
+        assert hasattr(self, "_ignored_modules"), \
+            "Expects `self._ignored_modules` to be initialized"
+        return self._get_ignored_named_tensors(
+            self._ignored_modules, lambda m: m.named_parameters(recurse=False),
+        )
+
+    def _get_ignored_named_buffers(self) -> Set[Tuple[str, torch.Tensor]]:
+        """Returns the named buffers of the modules in ``ignored_modules``,
+        excluding any :class:`FlatParameter` s."""
+        assert hasattr(self, "_ignored_modules"), \
+            "Expects `self._ignored_modules` to be initialized"
+        return self._get_ignored_named_tensors(
+            self._ignored_modules, lambda m: m.named_buffers(recurse=False),
+        )
+
     @classmethod
     def _check_wrapped(cls, begin_module, check_fn, err_fn):
         for _, mod in begin_module.named_modules():
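For the Model used in the updated test (an ignored outer Linear with a registered buffer), the new helpers would produce name/tensor pairs whose names already match full-state-dict keys; roughly the following (illustrative, not executed in the commit):

# fsdp_model._get_ignored_named_parameters()
#   -> {("outer.weight", <Parameter>), ("outer.bias", <Parameter>)}
# fsdp_model._get_ignored_named_buffers()
#   -> {("outer.buffer", <Tensor>)}
# The names are cleaned of FSDP wrapper prefixes via clean_param_name, so they
# line up with the keys filtered in _full_post_state_dict_hook below.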
@@ -1496,12 +1535,14 @@ def _full_post_state_dict_hook(
         if not state_dict:
             return state_dict

-        ignored_param_names = set(self._ignored_param_to_param_name.values())
+        ignored_named_params = self._get_ignored_named_parameters()
+        ignored_named_buffers = self._get_ignored_named_buffers()
+        ignored_names = set(n for n, _ in ignored_named_params)
+        ignored_names.update(n for n, _ in ignored_named_buffers)
         for key in state_dict:
-            # Do not need to clone ignored parameters since they are not
-            # sharded
-            clean_param_name = key.replace(FSDP_WRAPPED_MODULE + ".", "").replace(FPW_MODULE + ".", "")
-            if clean_param_name in ignored_param_names:
+            # Do not need to clone ignored parameters and buffers since they
+            # are not sharded
+            if clean_param_name(key) in ignored_names:
                 continue
             # Due to recursive call of summon_full_params, avoid unnecessary
             # reclone of tensors in case they have already been cloned.
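The hook change above relies on ignored tensors being unsharded, so they can pass through without cloning. A standalone approximation of the filtering step (a sketch only; the actual hook mutates state_dict in place under summon_full_params):

# Sketch: skip cloning for keys whose cleaned name belongs to an ignored
# parameter or buffer; clone everything else so it outlives the summoned
# full parameters.
def filter_and_clone(state_dict, ignored_names, clean_param_name):
    out = {}
    for key, tensor in state_dict.items():
        if clean_param_name(key) in ignored_names:
            out[key] = tensor          # never sharded, safe to reference
        else:
            out[key] = tensor.clone()  # sharded storage is freed afterwards
    return out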
@@ -2547,11 +2588,7 @@ def _finalize_params(fsdp_module: FullyShardedDataParallel) -> None:
             if isinstance(m, FullyShardedDataParallel):
                 _finalize_params(m)
                 m._pre_backward_hook_has_run = False
-                if any(
-                    p not in self._ignored_param_to_param_name
-                    and p.requires_grad
-                    for p in m.parameters()
-                ):
+                if any(p.requires_grad for p in m.parameters()):
                     # Check if the module has params and if any of them has
                     # the `requires_grad` field set. If `requires_grad=False` for
                     # all the params, the post_backward hook will not fire and the
@@ -3477,25 +3514,19 @@ def _get_param_to_unflat_param_names(
         model (torch.nn.Module): Root module (which may or may not be a
             :class:`FullyShardedDataParallel` instance).
     """
-    param_to_unflat_param_names: Dict[torch.nn.Parameter, List[str]] = {}
-
-    def clean_param_name(prefix, param_info):
+    def _clean_param_name(prefix, param_info):
         """This replicates the parameter name cleaning logic in model state
         dict but avoids gathering any parameters."""
-        name = prefix + param_info.module_name + "." + param_info.param_name
-        # FSDP full parameter names may not have both (i.e. `FSDP_PREFIX`), so
-        # we call `replace()` twice separately
-        name = name.replace(FSDP_WRAPPED_MODULE + ".", "")
-        name = name.replace(FPW_MODULE + ".", "")
+        name = clean_param_name(
+            prefix + param_info.module_name + "." + param_info.param_name
+        )
         return name

-    def f(param_to_unflat_param_names, module: torch.nn.Module, prefix: str):
-        # For FSDP modules, only add the entry when considering the contained
-        # `FlattenParamsWrapper` to avoid duplication
+    def module_fn(module, prefix, param_to_unflat_param_names):
         if not isinstance(module, FullyShardedDataParallel):
             for param_name, param in module.named_parameters(recurse=False):
                 prefixed_param_names = [
-                    clean_param_name(prefix, param_info)
+                    _clean_param_name(prefix, param_info)
                     for param_info in param._param_infos
                 ] if isinstance(param, FlatParameter) else [prefix + param_name]
                 # If this parameter has already been visited, then it is a
@@ -3504,13 +3535,13 @@ def f(param_to_unflat_param_names, module: torch.nn.Module, prefix: str):
                 if not is_shared_param:
                     param_to_unflat_param_names[param] = prefixed_param_names

-        for submodule_name, submodule in module.named_children():
-            if submodule is not None:
-                new_prefix = prefix + submodule_name + "."
-                f(param_to_unflat_param_names, submodule, new_prefix)
+    def return_fn(param_to_unflat_param_names):
+        return param_to_unflat_param_names

-    f(param_to_unflat_param_names, model, "")
-    return param_to_unflat_param_names
+    param_to_unflat_param_names: Dict[torch.nn.Parameter, List[str]] = {}
+    return _apply_to_modules(
+        model, module_fn, return_fn, param_to_unflat_param_names,
+    )


 def _get_param_to_param_name(
@@ -3550,3 +3581,12 @@ def _get_param_name_to_param(
     """Constructs the inverse mapping of :meth:`_get_param_to_param_name`."""
     param_to_param_name = _get_param_to_param_name(model)
     return dict(zip(param_to_param_name.values(), param_to_param_name.keys()))
+
+
+def clean_param_name(param_name: str) -> str:
+    """Cleans the parameter name by removing any FSDP-related prefixes."""
+    # FSDP full parameter names may not have both (i.e. `FSDP_PREFIX`), so we
+    # call `replace()` twice separately
+    param_name = param_name.replace(FSDP_WRAPPED_MODULE + ".", "")
+    param_name = param_name.replace(FPW_MODULE + ".", "")
+    return param_name
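Since clean_param_name strips every occurrence of the wrapper prefixes, state-dict keys produced under FSDP compare equal to the unwrapped model's keys. Illustrative behavior (not code from the commit):

# clean_param_name(FSDP_WRAPPED_MODULE + "." + FPW_MODULE + "." + "outer.bias")
#   -> "outer.bias"
# clean_param_name("outer.bias") -> "outer.bias"   (already clean; unchanged)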
