pytorch · Andrei-Aksionov · Jun 1, 2025 · Jun 1, 2025 · Jun 1, 2025 · Jun 1, 2025
diff --git a/recipes/__init__.py b/recipes/__init__.py
@@ -16,8 +16,8 @@
 # Since we don't want the tests to to incorrectly assume that recipes are
 # importable, we have to explicitly raise an error here.
 
-raise ModuleNotFoundError(
-    "The torchtune recipes directory isn't a package and you should not import anything from here. "
-    "Refer to our docs for detailed instructions on how to use recipes: "
-    "https://pytorch.org/torchtune/main/deep_dives/recipe_deepdive.html"
-)
+# raise ModuleNotFoundError(
+#     "The torchtune recipes directory isn't a package and you should not import anything from here. "
+#     "Refer to our docs for detailed instructions on how to use recipes: "
+#     "https://pytorch.org/torchtune/main/deep_dives/recipe_deepdive.html"
+# )
diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml
@@ -64,14 +64,15 @@ optimizer:
 lr_scheduler:
   _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
   num_warmup_steps: 100
+optimizer_in_bwd: False  # True saves memory. Requires gradient_accumulation_steps=1
 
 loss:
   _component_: torchtune.modules.loss.LinearCrossEntropyLoss
 
 # Training
 epochs: 1
 max_steps_per_epoch: null
-gradient_accumulation_steps: 8  # Use to increase effective batch size
+gradient_accumulation_steps: 1  # Use to increase effective batch size
 clip_grad_norm: null
 compile: False  # torch.compile the model + loss, True increases speed + decreases memory
 

diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml
@@ -66,6 +66,7 @@ optimizer:
 lr_scheduler:
   _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
   num_warmup_steps: 100
+optimizer_in_bwd: False  # True saves memory. Requires gradient_accumulation_steps=1
 
 loss:
   _component_: torchtune.modules.loss.LinearCrossEntropyLoss

diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
@@ -262,7 +262,7 @@ def setup(self, cfg: DictConfig) -> None:
             enable_activation_checkpointing=self._enable_activation_checkpointing,
             enable_activation_offloading=self._enable_activation_offloading,
             compile_model=self._compile,
-            model_state_dict=ckpt_dict[training.MODEL_KEY],
+            checkpoint_dict=ckpt_dict,
         )
         self._tokenizer = config.instantiate(cfg.tokenizer)
         self._logger.info("Tokenizer is initialized from file.")
@@ -286,10 +286,7 @@ def setup(self, cfg: DictConfig) -> None:
             # progress.
             if self._enable_async_checkpointing:
                 try:
-                    ckpt_dict = self._checkpoint_client.load_distributed_checkpoint(
-                        self._model,
-                        self.optimizer,
-                    )
+                    ckpt_dict = self._load_distributed_checkpoint()
                 except Exception as e:
                     self._logger.warning(
                         f"Failed to load distributed checkpoint: {e}. Training will start from the base checkpoint."
@@ -350,6 +347,12 @@ def setup(self, cfg: DictConfig) -> None:
         # if cfg is missing profiler key or if `cfg.profiler.enabled = False`
         self._profiler = self._setup_profiler(cfg.get(PROFILER_KEY, None))
 
+    def _load_distributed_checkpoint(self) -> dict[str, Any]:
+        """Delegate distributed-checkpoint loading to the checkpoint client."""
+        client = self._checkpoint_client
+        load_kwargs = {"model": self._model, "optimizer": self.optimizer}
+        return client.load_distributed_checkpoint(**load_kwargs)
+
     def _setup_profiler(
         self, cfg_profiler: Optional[DictConfig] = None
     ) -> Union[torch.profiler.profile, DummyProfiler]:
@@ -388,7 +391,7 @@ def _setup_model(
         enable_activation_checkpointing: bool,
         enable_activation_offloading: bool,
         compile_model: bool,
-        model_state_dict: dict[str, Any],
+        checkpoint_dict: dict[str, Any],
     ) -> nn.Module:
         """
         Set up the model including enabling activation checkpointing.
@@ -404,7 +407,7 @@ def _setup_model(
                 model, auto_wrap_policy={modules.TransformerSelfAttentionLayer}
             )
 
-        model.load_state_dict(model_state_dict)
+        model.load_state_dict(checkpoint_dict[training.MODEL_KEY])
 
         # Validate model was loaded in with the expected dtype.
         training.validate_expected_param_dtype(