
Commit 48cebc9

[FSDP] Remove redundant GPU memory restore and improve code style (THUDM#658)
1 parent dfbf5f3 commit 48cebc9
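
In short: the old load_ref_model snapshotted the actor's parameters to CPU and then restored them onto the GPU in a finally block, even though the reference checkpoint is loaded entirely on the CPU (device_map="cpu") and never touches GPU memory. The commit drops that redundant restore and folds _load_cpu_state_dict into update_gpu_params_dict. A hypothetical caller-side sketch of how these methods fit together (not code from the repo; only the method names appear in the diff below, the wrapper function and its arguments are illustrative):

import torch

def compute_with_ref_weights(actor, ref_path: str) -> None:
    """Hypothetical flow; only the method names come from this commit's diff."""
    actor_weights: dict[str, torch.Tensor] = {}
    actor.update_cpu_params_dict(actor_weights)          # snapshot the actor's params to CPU
    actor.load_ref_model(ref_path)                       # cache reference weights in actor.weights["ref"]
    actor.update_gpu_params_dict(actor.weights["ref"])   # shard/copy the reference weights onto the FSDP model
    # ... compute reference log-probs here ...
    actor.update_gpu_params_dict(actor_weights)          # restore the actor's own weights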

2 files changed: +50 -53 lines


slime/backends/fsdp_utils/actor.py

Lines changed: 47 additions & 51 deletions
@@ -1,5 +1,4 @@
 from argparse import Namespace
-from collections.abc import Mapping
 from contextlib import nullcontext
 from itertools import accumulate

@@ -669,65 +668,20 @@ def update_gpu_params_dict(self, params_dict: dict[str, torch.Tensor]) -> None:
 
         Parameters:
             params_dict: Source mapping from parameter names to CPU tensors.
-        """
-        self._load_cpu_state_dict(params_dict)
-        torch.cuda.synchronize()
-
-    def load_ref_model(self, ref_load_path: str | None) -> None:
-        """Load reference model weights once and cache them on CPU.
 
-        Parameters:
-            ref_load_path: Path to a directory containing a HF checkpoint. If
-                None, a ValueError is raised.
+        Note:
+            This method handles both regular Tensors and DTensors. For DTensors,
+            it properly distributes the full tensor according to FSDP sharding.
         """
-        if ref_load_path is None:
-            raise ValueError("ref_load_path must be provided when loading reference model")
-
-        current_weights = {}
-        self.update_cpu_params_dict(current_weights)
-
-        try:
-            import os
-
-            if os.path.isdir(ref_load_path):
-                temp_ref_model = AutoModelForCausalLM.from_pretrained(
-                    ref_load_path,
-                    trust_remote_code=True,
-                    torch_dtype=torch.bfloat16,
-                    device_map="cpu",
-                )
-
-                ref_state_dict = temp_ref_model.state_dict()
-                self.weights["ref"] = {}
-
-                for name, tensor in ref_state_dict.items():
-                    actor_tensor = current_weights.get(name)
-                    target_dtype = actor_tensor.dtype if actor_tensor is not None else tensor.dtype
-                    cpu_tensor = tensor.detach().to(device="cpu", dtype=target_dtype, copy=True)
-                    self.weights["ref"][name] = cpu_tensor.pin_memory()
-
-                del temp_ref_model
-                torch.cuda.empty_cache()
-            else:
-                raise NotImplementedError(f"Loading from checkpoint file {ref_load_path} not yet implemented")
-
-            print("Reference model parameters loaded and stored in CPU memory")
-
-        finally:
-            self.update_gpu_params_dict(current_weights)
-
-    @torch.no_grad()
-    def _load_cpu_state_dict(self, full_state_dict: Mapping[str, torch.Tensor]) -> None:
-        """Load a CPU full-state dict into the model, handling DTensor shards."""
-
+        # Cache parameter and buffer maps for efficiency
         if not hasattr(self, "_fsdp_param_map"):
             self._fsdp_param_map = dict(self.model.named_parameters())
             self._fsdp_buffer_map = dict(self.model.named_buffers())
 
         param_map = self._fsdp_param_map
         buffer_map = self._fsdp_buffer_map
 
-        for name, src in full_state_dict.items():
+        for name, src in params_dict.items():
             if not torch.is_tensor(src):
                 continue
 
@@ -753,8 +707,50 @@ def _load_cpu_state_dict(self, full_state_dict: Mapping[str, torch.Tensor]) -> None:
                 )
                 dst_tensor.copy_(distributed)
             else:
+                # Regular tensor: just move to GPU
                 dst_tensor.copy_(src_tensor.to(device=dst_tensor.device, non_blocking=True))
 
+        torch.cuda.synchronize()
+
+    def load_ref_model(self, ref_load_path: str | None) -> None:
+        """Load reference model weights once and cache them on CPU.
+
+        Parameters:
+            ref_load_path: Path to a directory containing a HF checkpoint. If
+                None, a ValueError is raised.
+        """
+        if ref_load_path is None:
+            raise ValueError("ref_load_path must be provided when loading reference model")
+
+        import os
+
+        if os.path.isdir(ref_load_path):
+            # Get actor weights for dtype matching
+            actor_weights = {}
+            self.update_cpu_params_dict(actor_weights)
+
+            temp_ref_model = AutoModelForCausalLM.from_pretrained(
+                ref_load_path,
+                trust_remote_code=True,
+                torch_dtype=torch.bfloat16,
+                device_map="cpu",
+            )
+            ref_state_dict = temp_ref_model.state_dict()
+            self.weights["ref"] = {}
+
+            for name, tensor in ref_state_dict.items():
+                actor_tensor = actor_weights.get(name)
+                target_dtype = actor_tensor.dtype if actor_tensor is not None else tensor.dtype
+                cpu_tensor = tensor.detach().to(device="cpu", dtype=target_dtype, copy=True)
+                self.weights["ref"][name] = cpu_tensor.pin_memory()
+
+            del temp_ref_model
+            torch.cuda.empty_cache()
+        else:
+            raise NotImplementedError(f"Loading from checkpoint file {ref_load_path} not yet implemented")
+
+        print("Reference model parameters loaded and stored in CPU memory")
+
 
 def selective_log_softmax_raw(logits: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
     """Fused version of the common `log_softmax -> gather` operation.

tests/test_qwen3-0.6B_fsdp_distributed.sh

Lines changed: 3 additions & 2 deletions
@@ -15,6 +15,7 @@ set -ex
 # will prevent ray from buffering stdout/stderr
 export PYTHONBUFFERED=16
 
+
 CKPT_ARGS=(
    --hf-checkpoint /root/Qwen3-0.6B
    --ref-load /root/Qwen3-0.6B
@@ -80,9 +81,9 @@ ray job submit --address="http://127.0.0.1:8265" \
    }' \
    -- python3 train.py \
    --actor-num-nodes 1 \
-   --actor-num-gpus-per-node 4 \
-   --colocate \
+   --actor-num-gpus-per-node 2 \
    --train-backend fsdp \
+   --rollout-num-gpus 2 \
    ${CKPT_ARGS[@]} \
    ${ROLLOUT_ARGS[@]} \
    ${OPTIMIZER_ARGS[@]} \
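
Note on the test change: the script drops --colocate, so training and rollout no longer share GPUs; instead the FSDP actor gets 2 GPUs (1 node x 2 per node) and rollout gets 2 GPUs via --rollout-num-gpus, presumably keeping the total at the same 4 GPUs the previous colocated setup used.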
