pytorch · ebsmothers · Oct 29, 2024 · Oct 29, 2024 · Oct 30, 2024 · Oct 30, 2024
diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
@@ -130,7 +130,8 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # _is_rank_zero is used primarily for logging. In the future, the logger
         # should directly take care of this
-        _, rank = training.get_world_size_and_rank()
+        self._world_size, rank = training.get_world_size_and_rank()
+        self._rank = rank
         self._is_rank_zero = rank == 0
 
         # Training cfg
@@ -631,7 +632,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        _, rank = training.get_world_size_and_rank()
+        self._world_size, rank = training.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         if not self._optimizer_in_bwd:
@@ -697,15 +698,24 @@ def train(self) -> None:
                 # Compute loss
                 # Loss is normalized by default so we multiply by the number of tokens
                 # This way we can normalize by the total number of tokens if we're accumulating gradients
-                running_loss += self._loss_fn(logits, labels) * current_num_tokens
+                current_loss = self._loss_fn(logits, labels) * current_num_tokens
 
                 # free logits otherwise it peaks backward memory
                 del logits
 
+                running_loss += current_loss
+
+                if (idx + 1) % self._gradient_accumulation_steps != 0:
+                    with training.no_sync(self._model):
+                        current_loss.backward()
+                else:
+                    current_loss.backward()
+
                 # Step with optimizer
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
-                    loss = running_loss / num_tokens
-                    loss.backward()
+                    local_num_tokens = num_tokens.detach().clone()
+                    torch.distributed.all_reduce(num_tokens)
+                    training.scale_grads(self._model, self._world_size / num_tokens)
                     if self._clip_grad_norm is not None:
                         if self._optimizer_in_bwd:
                             raise NotImplementedError(
@@ -722,7 +732,7 @@ def train(self) -> None:
                     # Update the number of steps when the weights are updated
                     self.global_step += 1
 
-                    loss_to_log = loss.item()
+                    loss_to_log = running_loss.item() / local_num_tokens
                     pbar.update(1)
                     pbar.set_description(
                         f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"
@@ -743,7 +753,8 @@ def train(self) -> None:
                                     else self._optim_ckpt_wrapper
                                 ),
                             ),
-                            "tokens_per_second_per_gpu": num_tokens / time_per_step,
+                            "tokens_per_second_per_gpu": local_num_tokens
+                            / time_per_step,
                         }
                         if self._log_peak_memory_stats:
                             log_dict.update(

diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
@@ -641,12 +641,13 @@ def train(self) -> None:
 
                 # Loss is normalized by default so we multiply by the number of tokens
                 # This way we can normalize by the total number of tokens if we're accumulating gradients
-                running_loss += self._loss_step(batch) * current_num_tokens
+                current_loss = self._loss_step(batch) * current_num_tokens
+                running_loss += current_loss
+                current_loss.backward()
 
                 # Step with optimizer
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
-                    loss = running_loss / num_tokens
-                    loss.backward()
+                    training.scale_grads(self._model, 1 / num_tokens)
                     if self._clip_grad_norm is not None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
@@ -661,7 +662,7 @@ def train(self) -> None:
                         self._lr_scheduler.step()
                     self.global_step += 1
 
-                    loss_to_log = loss.item()
+                    loss_to_log = running_loss.item() / num_tokens
                     pbar.update(1)
                     pbar.set_description(
                         f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"

diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
@@ -704,15 +704,14 @@ def train(self) -> None:
                     class_loss, kd_loss = self._loss_step(batch)
                     running_class_loss += class_loss * current_num_tokens
                     running_kd_loss += kd_loss * current_num_tokens
+                    current_loss = (
+                        (1 - self._kd_ratio) * class_loss + self._kd_ratio * kd_loss
+                    ) * current_num_tokens
+                    current_loss.backward()
 
                     # Step with optimizer
                     if (idx + 1) % self._gradient_accumulation_steps == 0:
-                        class_loss = running_class_loss / num_tokens
-                        kd_loss = running_kd_loss / num_tokens
-                        loss = (
-                            1 - self._kd_ratio
-                        ) * class_loss + self._kd_ratio * kd_loss
-                        loss.backward()
+                        training.scale_grads(self._model, 1 / num_tokens)
                         if self._clip_grad_norm is not None:
                             grad_norm = torch.nn.utils.clip_grad_norm_(
                                 self._model.parameters(),
@@ -724,8 +723,8 @@ def train(self) -> None:
                         # Update the number of steps when the weights are updated
                         self.global_step += 1
 
-                        class_loss_to_log = class_loss.item()
-                        kd_loss_to_log = kd_loss.item()
+                        class_loss_to_log = running_class_loss.item() / num_tokens
+                        kd_loss_to_log = running_kd_loss.item() / num_tokens
                         loss_to_log = (
                             1 - self._kd_ratio
                         ) * class_loss_to_log + self._kd_ratio * kd_loss_to_log

diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
@@ -797,15 +797,17 @@ def train(self) -> None:
                 # Compute loss
                 # Loss is normalized by default so we multiply by the number of tokens
                 # This way we can normalize by the total number of tokens if we're accumulating gradients
-                running_loss += self._loss_fn(logits, labels) * current_num_tokens
+                current_loss = self._loss_fn(logits, labels) * current_num_tokens
 
                 # free logits otherwise it peaks backward memory
                 del logits
 
+                running_loss += current_loss
+                current_loss.backward()
+
                 # Step with optimizer
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
-                    loss = running_loss / num_tokens
-                    loss.backward()
+                    training.scale_grads(self._model, 1 / num_tokens)
                     if self._clip_grad_norm is not None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
@@ -818,7 +820,7 @@ def train(self) -> None:
                     # Update the number of steps when the weights are updated
                     self.global_step += 1
 
-                    loss_to_log = loss.item()
+                    loss_to_log = running_loss.item() / num_tokens
                     pbar.update(1)
                     pbar.set_description(
                         f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"

diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
@@ -694,12 +694,13 @@ def train(self) -> None:
 
                     # Loss is normalized by default so we multiply by the number of tokens
                     # This way we can normalize by the total number of tokens if we're accumulating gradients
-                    running_loss += self._loss_step(batch) * current_num_tokens
+                    current_loss = self._loss_step(batch) * current_num_tokens
+                    running_loss += current_loss
+                    current_loss.backward()
 
                     # Step with optimizer
                     if (idx + 1) % self._gradient_accumulation_steps == 0:
-                        loss = running_loss / num_tokens
-                        loss.backward()
+                        training.scale_grads(self._model, 1 / num_tokens)
                         if self._clip_grad_norm is not None:
                             grad_norm = torch.nn.utils.clip_grad_norm_(
                                 self._model.parameters(),
@@ -711,7 +712,7 @@ def train(self) -> None:
                         # Update the number of steps when the weights are updated
                         self.global_step += 1
 
-                        loss_to_log = loss.item()
+                        loss_to_log = running_loss.item() / num_tokens
                         pbar.update(1)
                         pbar.set_description(
                             f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"

diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
@@ -692,22 +692,25 @@ def train(self) -> None:
                     logits = logits.reshape(-1, logits.size(-1))
 
                 # Compute loss
-                running_loss += self._loss_fn(logits, labels) * current_num_tokens
+                current_loss = self._loss_fn(logits, labels) * current_num_tokens
+
                 # free logits otherwise it peaks backward memory
                 del logits
 
+                running_loss += current_loss
+                current_loss.backward()
+
                 # Step with optimizer
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
-                    loss = running_loss / num_tokens
-                    loss.backward()
+                    training.scale_grads(self._model, 1 / num_tokens)
 
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
 
                     # Update the number of steps when the weights are updated
                     self.global_step += 1
 
-                    loss_to_log = loss.item()
+                    loss_to_log = running_loss.item() / num_tokens
                     pbar.update(1)
                     pbar.set_description(
                         f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"

diff --git a/torchtune/training/__init__.py b/torchtune/training/__init__.py
@@ -18,11 +18,13 @@
     load_from_full_model_state_dict,
     load_from_full_optimizer_state_dict,
     lora_fsdp_wrap_policy,
+    no_sync,
     prepare_model_for_fsdp_with_meta_device,
     set_torch_num_threads,
     shard_model,
     validate_no_params_on_meta_device,
 )
+from torchtune.training._grad_scaler import scale_grads
 from torchtune.training._profiler import (
     DEFAULT_PROFILE_DIR,
     DEFAULT_PROFILER_ACTIVITIES,
@@ -132,4 +134,6 @@
     "NoOpManager",
     "OffloadActivations",
     "FormattedCheckpointFiles",
+    "scale_grads",
+    "no_sync",
 ]
diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py
@@ -5,10 +5,22 @@
 # LICENSE file in the root directory of this source tree.
 
 
+import contextlib
 import logging
 import os
 from itertools import chain
-from typing import Any, Callable, cast, Dict, List, Optional, Set, Tuple, Type
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    Generator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+)
 
 import torch
 import torch.distributed as dist
@@ -679,3 +691,28 @@ def shard_model(
 
     # Finally shard the entire model to account for any stragglers
     fully_shard(model, **fsdp_kwargs)
+
+
+@contextlib.contextmanager
+def no_sync(model: nn.Module) -> Generator[None, None, None]:
+    """
+    Temporarily disable gradient synchronization for ``model``.
+
+    Calls ``model.set_requires_gradient_sync(False)`` on entry and
+    ``model.set_requires_gradient_sync(True)`` on exit, including when the
+    wrapped code raises. Synchronization is always switched back on at exit,
+    whatever its state was beforehand.
+
+    Args:
+        model (nn.Module): module exposing ``set_requires_gradient_sync``
+            (presumably an FSDP2-sharded module -- confirm against callers).
+
+    Yields:
+        None: control returns to the ``with`` body while sync is disabled.
+    """
+    toggle_sync = model.set_requires_gradient_sync
+    toggle_sync(False)
+    try:
+        yield
+    finally:
+        toggle_sync(True)
diff --git a/torchtune/training/_grad_scaler.py b/torchtune/training/_grad_scaler.py
@@ -0,0 +1,31 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+
+
+def scale_grads(m: nn.Module, scaler: torch.Tensor) -> None:
+    """
+    Multiply every existing gradient of ``m`` by ``scaler``, in place.
+
+    Parameters whose ``grad`` is ``None`` are skipped.
+
+    Args:
+        m (nn.Module): module whose parameter gradients are scaled.
+        scaler (torch.Tensor): multiplicative factor. If it lives on a
+            different device than a gradient it is moved to that gradient's
+            device first, so that e.g. CPU-resident gradients can be scaled
+            by a factor computed on an accelerator.
+    """
+    for p in m.parameters():
+        if p.grad is None:
+            continue
+        # Reassigning ``scaler`` keeps the moved copy, so the transfer happens
+        # once per device change rather than once per parameter.
+        if isinstance(scaler, torch.Tensor) and scaler.device != p.grad.device:
+            scaler = scaler.to(p.grad.device)
+        p.grad *= scaler