Changes from all commits (54 commits)
ba66f96
Basic skeletal code for XLAFSDP support for PyTorch Trainer
gkroiz Oct 8, 2023
26238d8
fix import issue
gkroiz Oct 8, 2023
53703e2
Fixed leaking env vars in tests + minor cleanup
gkroiz Oct 8, 2023
bbed985
Merge branch 'master' into pytorch_xla_fsdp
gkroiz Oct 8, 2023
0132427
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 8, 2023
d4ae668
accidentally deleted line in merge
gkroiz Oct 8, 2023
b538f43
fix mypy error
gkroiz Oct 8, 2023
9778505
Minor fixes
carmocca Oct 9, 2023
4098016
Reuse code
carmocca Oct 9, 2023
5ca74df
More minor fixes
carmocca Oct 9, 2023
4bcb2e9
Reorder methods to reduce diff with Fabric
carmocca Oct 9, 2023
957866b
Go through precision plugin
carmocca Oct 9, 2023
222896c
Consistent with regular FSDP
carmocca Oct 9, 2023
cd70e0d
minor fixes, limited checkpointing support, testing for trainer.test …
gkroiz Oct 9, 2023
d7d78f1
mypy fix
gkroiz Oct 9, 2023
2f51e4e
Merge branch 'master' into pytorch_xla_fsdp
gkroiz Oct 10, 2023
7a680e3
consistent setup optimizer
carmocca Oct 10, 2023
46927f4
Merge branch 'master' into pytorch_xla_fsdp
carmocca Oct 10, 2023
c4e295a
Import pbar force fn
carmocca Oct 10, 2023
0cd0cb6
Launched and overrides
carmocca Oct 10, 2023
e0b353f
Multiple optimizers should be fine?
carmocca Oct 10, 2023
60eecd1
Test fixes
carmocca Oct 10, 2023
1150e90
Minor FSDP fixes
carmocca Oct 10, 2023
1ae4957
Fix
carmocca Oct 10, 2023
ef05276
Other xla strategies too
carmocca Oct 10, 2023
1cc6d84
More fixes
carmocca Oct 10, 2023
64ed7fb
Fixes
carmocca Oct 10, 2023
42ea9ac
Restore minor snippet with full checkpointing
gkroiz Oct 10, 2023
be1b509
ignores
carmocca Oct 10, 2023
f9ba3f0
Fixes
carmocca Oct 10, 2023
196960c
mypy
carmocca Oct 10, 2023
fdf21d3
_is_sharded
carmocca Oct 10, 2023
483b93d
Bring over changes from #18774
carmocca Oct 10, 2023
f4c1ba7
Optimizer fix
carmocca Oct 10, 2023
22629f4
Restore checkpoint after setup
carmocca Oct 10, 2023
10253f7
load_checkpoint
carmocca Oct 10, 2023
e391d38
Warning
carmocca Oct 11, 2023
f92de54
CHANGELOG
carmocca Oct 11, 2023
e6fd2ce
Additional changes for Trainer XLAFSDP strategy ckpting
gkroiz Oct 11, 2023
1305faa
add assertions for mypy
gkroiz Oct 11, 2023
bd8fb12
test manual wrap separately from ckpting test
gkroiz Oct 11, 2023
58d43e2
syntax changes in strings from FSDP to XLAFSDP
gkroiz Oct 11, 2023
c264801
Minor fix to tests
gkroiz Oct 11, 2023
3263d98
Merge branch 'master' into pytorch_xla_fsdp
gkroiz Oct 11, 2023
82f9f2f
Merge branch 'master' into pytorch_xla_fsdp
carmocca Oct 11, 2023
80c3faf
Add manual wrapping guard for fabric xlafsdp
carmocca Oct 11, 2023
8f775f4
mypy
carmocca Oct 11, 2023
e5b695a
Merge branch 'master' into pytorch_xla_fsdp
carmocca Oct 11, 2023
7bf9904
Apply suggestions from code review
gkroiz Oct 11, 2023
a00f8fe
Merge branch 'master' into pytorch_xla_fsdp
gkroiz Oct 11, 2023
4280cf4
Apply formatting suggestions
gkroiz Oct 11, 2023
ddda55f
[XLAFSDP] add test for automatic strategy selection
gkroiz Oct 11, 2023
904b50d
fix `test_tpu_invalid_raises` test
gkroiz Oct 11, 2023
0c79282
only run test_xla_fsdp_automatic_strategy_selection when on TPU
gkroiz Oct 11, 2023
14 changes: 8 additions & 6 deletions src/lightning/fabric/strategies/xla_fsdp.py
@@ -475,7 +475,8 @@ def _save_checkpoint_shard(
# convert the state
if isinstance(obj, Module) and isinstance(obj, XLAFSDP):
converted = obj.state_dict()
- # add shard_metadata to state
+ # add shard_metadata to state. this format is defined by
+ # https://github.com/pytorch/xla/blob/v2.1.0/torch_xla/distributed/fsdp/state_dict_utils.py#L122-L125
Review comment (Contributor):
I wonder what happens if you save a state where the key for the model is not "model"

fabric.save(path, {"banana": model})

This would be totally valid in any other setting, but I think here it would fail since the XLA format expects these keys.

Reply (Contributor Author):
iirc this won't work when trying to consolidate the checkpoint using consolidate_sharded_model_checkpoints https://github.com/Lightning-AI/lightning/pull/18746/files/e5b695aff64bc37ae1a67fba4aac4981200eecfd#diff-3908a573abf00ae5f37061f214f2a3c2616b6591e0c96206b9f48b4c7ab49ea4R457. I'm not sure how this works for individual shards.

converted_state["shard_metadata"] = obj.get_shard_metadata()
elif isinstance(obj, Optimizer):
converted = obj.state_dict()
@@ -566,11 +567,7 @@ def load_checkpoint(
if len(loaded_metadata_keys):
for key in loaded_metadata_keys:
metadata[key] = sharded_ckpt[key]

# remove "shard_metadata" that is loaded in
if "shard_metadata" in metadata:
metadata.pop("shard_metadata")

metadata.pop("shard_metadata", None)
return metadata

if self._state_dict_type == "full":
@@ -591,6 +588,11 @@
)
if "model" not in state or not isinstance(model := state["model"], torch.nn.Module):
raise NotImplementedError("XLAFSDP only supports a single model instance with 'model' as the key.")
+ if any(isinstance(mod, XLAFSDP) for mod in model.modules()):
+     raise ValueError(
+         "`XLAFSDPStrategy` does not support loading full model checkpoint"
+         " if the model or any submodules are manually wrapped."
+     )
full_ckpt = torch.load(path)
model.load_state_dict(full_ckpt.pop("model"), strict=strict)
return full_ckpt
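The thread above concerns the save format for sharded checkpoints: each rank's file must carry the converted module state under "model" plus the "shard_metadata" entry added in this hunk, because torch_xla's consolidation helper looks those keys up when merging shards. Below is a minimal sketch of that downstream step, assuming torch_xla v2.1; the file paths and suffix pattern are illustrative, and this is not Lightning code.

```python
# Sketch only: merging per-rank XLAFSDP shard files into one full checkpoint.
# Assumes each shard file contains the "model" and "shard_metadata" keys referenced above.
from torch_xla.distributed.fsdp import consolidate_sharded_model_checkpoints

# Expected layout of every per-rank shard file (simplified):
# {"model": <sharded state_dict>, "shard_metadata": <module.get_shard_metadata()>, ...}

consolidate_sharded_model_checkpoints(
    ckpt_prefix="checkpoints/epoch=0-step=100",  # hypothetical common prefix of the rank files
    ckpt_suffix="_rank-*-of-*.pth",              # pattern matching the per-rank suffixes
    save_path="checkpoints/consolidated.ckpt",   # single merged checkpoint written here
)
```

Saving the module under another key (the `"banana"` case above) would break this lookup, which is why the strategy pins the `"model"` key.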
1 change: 1 addition & 0 deletions src/lightning/pytorch/CHANGELOG.md
@@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added support for the `max_size_cycle|max_size|min_size` iteration modes during evaluation ([#17163](https://github.com/Lightning-AI/lightning/pull/17163))
- Added support for the TPU-v4 architecture ([#17227](https://github.com/Lightning-AI/lightning/pull/17227))
- Added support for XLA's new PJRT runtime ([#17352](https://github.com/Lightning-AI/lightning/pull/17352))
+ - Added support for Fully Sharded Data Parallel (FSDP) training with XLA ([#18746](https://github.com/Lightning-AI/lightning/pull/18746))
- Check for invalid TPU device inputs ([#17227](https://github.com/Lightning-AI/lightning/pull/17227))
- Added `XLAStrategy(sync_module_states=bool)` to control whether to broadcast the parameters to all devices ([#17522](https://github.com/Lightning-AI/lightning/pull/17522))
- Added support for multiple optimizer parameter groups when using the FSDP strategy ([#17309](https://github.com/Lightning-AI/lightning/pull/17309))
24 changes: 21 additions & 3 deletions src/lightning/pytorch/plugins/precision/xla.py
@@ -13,9 +13,10 @@
# limitations under the License.
Review comment (Contributor):
Could we extend the tests in plugins/precision/test_xla.py in a meaningful way?

import os
from functools import partial
- from typing import Any, Callable
+ from typing import Any, Callable, Union

import torch
+ from torch.optim import Optimizer
from typing_extensions import get_args

import lightning.pytorch as pl
@@ -59,6 +60,9 @@ def __init__(self, precision: _PRECISION_INPUT = "32-true") -> None:
else:
self._desired_dtype = torch.float32

+ # boolean flag for simplicity over an entirely new class
+ self._using_fsdp = False
+
def optimizer_step( # type: ignore[override]
self,
optimizer: Optimizable,
@@ -68,7 +72,8 @@ def optimizer_step( # type: ignore[override]
) -> Any:
import torch_xla.core.xla_model as xm

- closure = partial(self._xla_wrap_closure, optimizer, closure)
+ if not self._using_fsdp:
+     closure = partial(self._reduce_gradients, optimizer, closure)
closure = partial(self._wrap_closure, model, optimizer, closure)
closure_result = optimizer.step(closure=closure, **kwargs)
xm.mark_step()
@@ -87,9 +92,22 @@ def teardown(self) -> None:
os.environ.pop("XLA_USE_BF16", None)
os.environ.pop("XLA_USE_F16", None)

- def _xla_wrap_closure(self, optimizer: Optimizable, closure: Callable[[], Any]) -> Any:
+ def _reduce_gradients(self, optimizer: Optimizable, closure: Callable[[], Any]) -> Any:
import torch_xla.core.xla_model as xm

closure_result = closure()
xm.reduce_gradients(optimizer)
return closure_result

+ def clip_grad_by_norm(self, optimizer: Optimizer, clip_val: Union[int, float]) -> None:
+     if self._using_fsdp:
+         # Not supported by us because we need a module reference, this would need to go through the Strategy
+         # as in Fabric
+         raise NotImplementedError("XLA's FSDP strategy does not support to clip gradients by norm.")
+     return super().clip_grad_by_value(optimizer, clip_val)
+
+ def clip_grad_by_value(self, optimizer: Optimizer, clip_val: Union[int, float]) -> None:
+     if self._using_fsdp:
+         # Not supported by XLA
+         raise NotImplementedError("XLA's FSDP strategy does not support to clip gradients by value.")
+     return super().clip_grad_by_value(optimizer, clip_val)
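The `optimizer_step` change above wraps the training closure so that, on plain XLA, gradients are all-reduced across replicas before the optimizer consumes the closure result; the new `_using_fsdp` flag skips that wrapping because XLAFSDP reduces gradients during its own backward pass. The following standalone sketch illustrates the pattern under the assumption that torch_xla is installed; the function and argument names are illustrative, not the plugin's actual code.

```python
# Sketch of the closure-wrapping pattern from the diff above (illustrative only).
from typing import Any, Callable

import torch_xla.core.xla_model as xm
from torch.optim import Optimizer


def xla_optimizer_step(optimizer: Optimizer, closure: Callable[[], Any], using_fsdp: bool) -> Any:
    def reduce_then_run() -> Any:
        result = closure()               # forward + backward, populates the .grad buffers
        xm.reduce_gradients(optimizer)   # all-reduce gradients across XLA replicas
        return result

    # XLAFSDP already reduces gradients internally, so only wrap the closure for plain XLA
    step_closure = closure if using_fsdp else reduce_then_run
    output = optimizer.step(closure=step_closure)
    xm.mark_step()                       # flush the lazily recorded XLA graph
    return output
```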
1 change: 1 addition & 0 deletions src/lightning/pytorch/strategies/__init__.py
@@ -23,6 +23,7 @@
from lightning.pytorch.strategies.single_xla import SingleDeviceXLAStrategy # noqa: F401
from lightning.pytorch.strategies.strategy import Strategy
from lightning.pytorch.strategies.xla import XLAStrategy # noqa: F401
+ from lightning.pytorch.strategies.xla_fsdp import XLAFSDPStrategy  # noqa: F401

StrategyRegistry = _StrategyRegistry()
_register_classes(StrategyRegistry, "register_strategies", sys.modules[__name__], Strategy)
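With `XLAFSDPStrategy` exported here, the strategy becomes selectable on the Trainer. The hedged usage sketch below assumes the `"xla_fsdp"` alias and the `state_dict_type` argument carry over from the Fabric counterpart; treat both as assumptions rather than the final Trainer API.

```python
# Hedged usage sketch for the newly exported strategy (arguments assumed from the Fabric version).
import lightning.pytorch as pl
from lightning.pytorch.strategies import XLAFSDPStrategy

# select by registered alias (assumed to be "xla_fsdp", matching Fabric)
trainer = pl.Trainer(accelerator="tpu", devices=8, strategy="xla_fsdp")

# or construct the strategy explicitly to control the checkpoint format
trainer = pl.Trainer(
    accelerator="tpu",
    devices=8,
    strategy=XLAFSDPStrategy(state_dict_type="sharded"),  # kwarg assumed from the Fabric strategy
)
```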
8 changes: 5 additions & 3 deletions src/lightning/pytorch/strategies/fsdp.py
@@ -58,6 +58,7 @@
from lightning.fabric.utilities.optimizer import _optimizers_to_device
from lightning.fabric.utilities.seed import reset_seed
from lightning.fabric.utilities.types import _PATH, ReduceOp
+ from lightning.fabric.utilities.warnings import PossibleUserWarning
from lightning.pytorch.core.optimizer import LightningOptimizer
from lightning.pytorch.plugins.precision import PrecisionPlugin
from lightning.pytorch.plugins.precision.fsdp import FSDPPrecisionPlugin
@@ -66,7 +67,7 @@
from lightning.pytorch.strategies.strategy import TBroadcast
from lightning.pytorch.trainer.states import TrainerFn
from lightning.pytorch.utilities.model_helpers import is_overridden
- from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn
+ from lightning.pytorch.utilities.rank_zero import rank_zero_only, rank_zero_warn

if TYPE_CHECKING:
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision, ShardingStrategy
@@ -311,9 +312,10 @@ def setup(self, trainer: "pl.Trainer") -> None:

if is_overridden("configure_sharded_model", self.lightning_module):
# legacy: we don't skip setup with the `configure_model` alternative
- rank_zero_info(
+ rank_zero_warn(
"You have overridden `LightningModule.configure_sharded_model` hook. It will assume that all the layers"
" are already wrapped for sharding and won't wrap the entire model using `FullyShardedDataParallel`."
" are already wrapped for sharding and won't wrap the entire model using `FullyShardedDataParallel`.",
category=PossibleUserWarning,
)
else:
self.model = self._setup_model(self.model)
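The upgraded warning above targets LightningModules that override the legacy `configure_sharded_model` hook and wrap submodules themselves, in which case the strategy leaves the top-level module unwrapped. The hedged sketch below shows the kind of module that would now trigger the `PossibleUserWarning`; the layer names and sizes are illustrative and it is not taken from the PR's tests.

```python
# Illustrative module for the warning path above.
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

import lightning.pytorch as pl


class ManuallyWrappedModel(pl.LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.backbone = torch.nn.Linear(32, 32)
        self.head = torch.nn.Linear(32, 2)

    def configure_sharded_model(self) -> None:  # legacy hook checked in the diff above
        # wrapping a submodule manually signals that all sharding is handled here,
        # so the strategy will not wrap the whole model again
        self.backbone = FSDP(self.backbone)
```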
2 changes: 1 addition & 1 deletion src/lightning/pytorch/strategies/launchers/xla.py
@@ -49,7 +49,7 @@ class _XLALauncher(_MultiProcessingLauncher):

"""

def __init__(self, strategy: "pl.strategies.XLAStrategy") -> None:
def __init__(self, strategy: Union["pl.strategies.XLAStrategy", "pl.strategies.XLAFSDPStrategy"]) -> None:
if not _XLA_AVAILABLE:
raise ModuleNotFoundError(str(_XLA_AVAILABLE))
super().__init__(strategy=strategy, start_method="fork")