
Commit 63b2e09

update tests, make init_weights an explicit method for user to call
1 parent 7689c13 commit 63b2e09

3 files changed: +19 -3 lines changed


autoparallel/api.py

Lines changed: 13 additions & 2 deletions
@@ -418,9 +418,20 @@ def apply_placement(self, sharding_placement=None):
                 **sharded_weights_with_fqns,
                 **sharded_buffers_with_fqns,
             }
-            with stateless._reparametrize_module(self.model, sharded_params_buffers):
-                self.model.init_weights()
+
+            def init_weights():
+                with stateless._reparametrize_module(
+                    self.model, sharded_params_buffers
+                ):
+                    self.model.init_weights()
+
+        else:
+            init_weights = None
 
         self.parallel_model = self.parallel_model_fn(sharded_weights, sharded_buffers)
+        # assign an init_weights method onto the output mod.
+        # all it does is sneakily run the original user mod's init_weights method,
+        # but with our new DTensor sharded params attached to the user module.
+        self.parallel_model.init_weights = init_weights
 
         return self.parallel_model
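For readers unfamiliar with the reparametrization trick used above, here is a minimal, self-contained sketch, illustrative only and not the library code: ToyModel and the plain tensors stand in for the user module and the sharded DTensor parameters. It shows what the attached closure does: temporarily swap the user module's parameters for externally held tensors via torch.nn.utils.stateless._reparametrize_module (a private PyTorch helper), then call the module's own init_weights so the initialization writes into those tensors instead of the originals.

import torch
import torch.nn as nn
from torch.nn.utils import stateless


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)

    def init_weights(self):
        nn.init.ones_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)


model = ToyModel()

# stand-ins for the sharded parameters; in autoparallel these would be DTensors
new_params = {
    "linear.weight": torch.empty(4, 4),
    "linear.bias": torch.empty(4),
}

# while reparametrized, attribute access on the module resolves to the swapped-in
# tensors, so init_weights() fills new_params rather than the module's own weights
with stateless._reparametrize_module(model, new_params):
    model.init_weights()

print(new_params["linear.weight"].mean())  # 1.0 after the in-place init above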

examples/example_autoparallel.py

Lines changed: 3 additions & 0 deletions
@@ -100,6 +100,9 @@ def input_fn():
     sharding_placement = autop.optimize_placement()
     parallel_mod = autop.apply_placement(sharding_placement)
 
+    # run weight init on our sharded DTensor params
+    parallel_mod.init_weights()
+
     # now let's run it
     x = (torch.rand(bs // mesh.shape[0], seq_len, dim1, device="cuda"),)
     out = parallel_mod(*x)
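With this change, apply_placement() no longer runs weight initialization as a side effect; the example instead calls parallel_mod.init_weights() explicitly, after placement and before the first forward pass. Note that, per the else branch in api.py above, the attribute can also be set to None (presumably when the wrapped model does not define its own init_weights), in which case there is nothing to call.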

examples/example_llama3.py

Lines changed: 3 additions & 1 deletion
@@ -475,7 +475,6 @@ def __init__(self, model_args: TransformerModelArgs):
             self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args)
         self.norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
         self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
-        self.init_weights()
 
     def init_weights(
         self,
@@ -628,6 +627,9 @@ def input_fn():
     print(f"Took {time.time() - t:.2f} s")
     parallel_mod = autop.apply_placement(sharding_placement)
 
+    # run weight init on our sharded DTensor params
+    parallel_mod.init_weights()
+
     # now let's run it
     x = (
         torch.randint(
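The deletion of self.init_weights() from the Transformer constructor is the counterpart of the API change: construction no longer initializes weights eagerly, so initialization runs exactly once, later, against the sharded DTensor parameters. A rough sketch of the deferred-init pattern (ToyTransformer is illustrative, not the example's class; the autoparallel calls in the comments follow the example above):

import torch.nn as nn


class ToyTransformer(nn.Module):
    def __init__(self, dim: int, vocab_size: int):
        super().__init__()
        self.norm = nn.RMSNorm(dim)
        self.output = nn.Linear(dim, vocab_size, bias=False)
        # no self.init_weights() here: the caller decides when, and against
        # which parameter storage, initialization happens

    def init_weights(self):
        nn.init.trunc_normal_(self.output.weight, std=0.02)


# eager path: plain tensors, initialize right after construction
model = ToyTransformer(dim=64, vocab_size=1000)
model.init_weights()

# autoparallel path (sketch): apply_placement() captures init_weights and
# re-exposes it on the parallel module, to be called on the sharded params:
#   parallel_mod = autop.apply_placement(sharding_placement)
#   parallel_mod.init_weights()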
