pytorch · ebsmothers · Oct 29, 2024 · Oct 29, 2024 · Oct 30, 2024 · Oct 30, 2024
diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
@@ -152,26 +152,10 @@ def __init__(self, cfg: DictConfig) -> None:
         self._gradient_accumulation_steps = cfg.gradient_accumulation_steps
         self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False)
 
-        # activation checkpointing/offloading
-        self._enable_activation_checkpointing = cfg.get(
-            "enable_activation_checkpointing", False
-        )
-        self._enable_activation_offloading = cfg.get(
-            "enable_activation_offloading", False
-        )
-        if self._enable_activation_offloading:
-            if self._device.type != "cuda":
-                raise RuntimeError(
-                    "enable_activation_offloading should only be True when training on CUDA"
-                )
-            if not self._enable_activation_checkpointing:
-                raise RuntimeError(
-                    "enable_activation_offloading should only be True when enable_activation_checkpointing is True"
-                )
-        elif self._enable_activation_checkpointing:
-            log.info(
-                "Hint: enable_activation_checkpointing is True, but enable_activation_offloading isn't. "
-                "Enabling activation offloading should reduce memory further."
+        if self._gradient_accumulation_steps > 1 and self._optimizer_in_bwd:
+            raise RuntimeError(
+                "Gradient accumulation is not supported with optimizer in bwd. "
+                "Please set gradient_accumulation_steps=1, or optimizer_in_bwd=False."
             )
 
         # activation checkpointing/offloading
@@ -720,7 +704,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        _, rank = training.get_world_size_and_rank()
+        world_size, rank = training.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         if not self._optimizer_in_bwd:
@@ -787,15 +771,31 @@ def train(self) -> None:
                 # Compute loss
                 # Loss is normalized by default so we multiply by the number of tokens
                 # This way we can normalize by the total number of tokens if we're accumulating gradients
-                running_loss += self._loss_fn(logits, labels) * current_num_tokens
+                current_loss = self._loss_fn(logits, labels) * current_num_tokens
 
                 # free logits otherwise it peaks backward memory
                 del logits
 
+                running_loss += current_loss
+
+                # For optimizer in backward, we need to normalize before calling backward
+                # This case and gradient accumulation are mutually exclusive
+                if self._optimizer_in_bwd:
+                    torch.distributed.all_reduce(num_tokens)
+                    torch.distributed.all_reduce(running_loss)
+                    current_loss = current_loss / num_tokens
+
+                current_loss.backward()
+
                 # Step with optimizer
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
-                    loss = running_loss / num_tokens
-                    loss.backward()
+                    if not self._optimizer_in_bwd:
+                        # Get total number of tokens across all ranks to normalize gradients
+                        torch.distributed.all_reduce(num_tokens)
+                        # This will ensure that the logged loss matches what we're optimizing
+                        torch.distributed.all_reduce(running_loss)
+                        # Manually scale the gradients from unnormalized loss by total # of tokens
+                        training.scale_grads(self._model, 1 / num_tokens)
                     if self._clip_grad_norm is not None:
                         if self._optimizer_in_bwd:
                             raise NotImplementedError(
@@ -812,7 +812,7 @@ def train(self) -> None:
                     # Update the number of steps when the weights are updated
                     self.global_step += 1
 
-                    loss_to_log = loss.item()
+                    loss_to_log = running_loss.item() / num_tokens
                     pbar.update(1)
                     pbar.set_description(
                         f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"
@@ -833,7 +833,8 @@ def train(self) -> None:
                                     else self._optim_ckpt_wrapper
                                 ),
                             ),
-                            "tokens_per_second_per_gpu": num_tokens / time_per_step,
+                            "tokens_per_second_per_gpu": num_tokens
+                            / (time_per_step * world_size),
                         }
                         if self._log_peak_memory_stats:
                             log_dict.update(

diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
@@ -686,12 +686,13 @@ def train(self) -> None:
 
                 # Loss is normalized by default so we multiply by the number of tokens
                 # This way we can normalize by the total number of tokens if we're accumulating gradients
-                running_loss += self._loss_step(batch) * current_num_tokens
+                current_loss = self._loss_step(batch) * current_num_tokens
+                running_loss += current_loss
+                current_loss.backward()
 
                 # Step with optimizer
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
-                    loss = running_loss / num_tokens
-                    loss.backward()
+                    training.scale_grads(self._model, 1 / num_tokens)
                     if self._clip_grad_norm is not None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
@@ -706,7 +707,7 @@ def train(self) -> None:
                         self._lr_scheduler.step()
                     self.global_step += 1
 
-                    loss_to_log = loss.item()
+                    loss_to_log = running_loss.item() / num_tokens
                     pbar.update(1)
                     pbar.set_description(
                         f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"

diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py
@@ -821,7 +821,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        _, rank = training.get_world_size_and_rank()
+        world_size, rank = training.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         self._optimizer.zero_grad()
@@ -857,7 +857,7 @@ def train(self) -> None:
                 ):
                     torch.cuda.memory._record_memory_history()
 
-                batch = {k: v.to(self._device) for k, v in batch.items()}
+                utils.batch_to_device(batch, self._device)
 
                 # Calculate the number of unmasked tokens in the current batch
                 # and increment the total number of tokens seen in the step
@@ -869,13 +869,22 @@ def train(self) -> None:
                 class_loss, kd_loss = self._loss_step(batch)
                 running_class_loss += class_loss * current_num_tokens
                 running_kd_loss += kd_loss * current_num_tokens
+                current_loss = (
+                    (1 - self._kd_ratio) * class_loss + self._kd_ratio * kd_loss
+                ) * current_num_tokens  # undo mean; scale_grads divides by N later
+                current_loss.backward()
 
                 # Step with optimizer
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
-                    class_loss = running_class_loss / num_tokens
-                    kd_loss = running_kd_loss / num_tokens
-                    loss = (1 - self._kd_ratio) * class_loss + self._kd_ratio * kd_loss
-                    loss.backward()
+                    # Get total number of tokens across all ranks to normalize gradients
+                    torch.distributed.all_reduce(num_tokens)
+                    # This will ensure that the logged loss matches what we're optimizing
+                    torch.distributed.all_reduce(running_class_loss)
+                    torch.distributed.all_reduce(running_kd_loss)
+                    # Manually scale the gradients from unnormalized loss by total # of tokens
+                    training.scale_grads(self._model, 1 / num_tokens)
+                    class_loss_to_log = running_class_loss.item() / num_tokens
+                    kd_loss_to_log = running_kd_loss.item() / num_tokens
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
                     self._lr_scheduler.step()
@@ -903,7 +912,8 @@ def train(self) -> None:
                             "class_loss": class_loss_to_log,
                             "kd_loss": kd_loss_to_log,
                             "lr": self._optimizer.param_groups[0]["lr"],
-                            "tokens_per_second_per_gpu": num_tokens / time_per_step,
+                            "tokens_per_second_per_gpu": num_tokens
+                            / (time_per_step * world_size),
                         }
                         if self._log_peak_memory_stats:
                             log_dict.update(

diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
@@ -704,15 +704,14 @@ def train(self) -> None:
                     class_loss, kd_loss = self._loss_step(batch)
                     running_class_loss += class_loss * current_num_tokens
                     running_kd_loss += kd_loss * current_num_tokens
+                    current_loss = (
+                        (1 - self._kd_ratio) * class_loss + self._kd_ratio * kd_loss
+                    ) * current_num_tokens  # undo mean; scale_grads divides by N later
+                    current_loss.backward()
 
                     # Step with optimizer
                     if (idx + 1) % self._gradient_accumulation_steps == 0:
-                        class_loss = running_class_loss / num_tokens
-                        kd_loss = running_kd_loss / num_tokens
-                        loss = (
-                            1 - self._kd_ratio
-                        ) * class_loss + self._kd_ratio * kd_loss
-                        loss.backward()
+                        training.scale_grads(self._model, 1 / num_tokens)
                         if self._clip_grad_norm is not None:
                             grad_norm = torch.nn.utils.clip_grad_norm_(
                                 self._model.parameters(),
@@ -724,8 +723,8 @@ def train(self) -> None:
                         # Update the number of steps when the weights are updated
                         self.global_step += 1
 
-                        class_loss_to_log = class_loss.item()
-                        kd_loss_to_log = kd_loss.item()
+                        class_loss_to_log = running_class_loss.item() / num_tokens
+                        kd_loss_to_log = running_kd_loss.item() / num_tokens
                         loss_to_log = (
                             1 - self._kd_ratio
                         ) * class_loss_to_log + self._kd_ratio * kd_loss_to_log

diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
@@ -748,7 +748,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        _, rank = training.get_world_size_and_rank()
+        world_size, rank = training.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         self._optimizer.zero_grad()
@@ -812,15 +812,22 @@ def train(self) -> None:
                 # Compute loss
                 # Loss is normalized by default so we multiply by the number of tokens
                 # This way we can normalize by the total number of tokens if we're accumulating gradients
-                running_loss += self._loss_fn(logits, labels) * current_num_tokens
+                current_loss = self._loss_fn(logits, labels) * current_num_tokens
 
                 # free logits otherwise it peaks backward memory
                 del logits
 
+                running_loss += current_loss
+                current_loss.backward()
+
                 # Step with optimizer
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
-                    loss = running_loss / num_tokens
-                    loss.backward()
+                    # Get total number of tokens across all ranks to normalize gradients
+                    torch.distributed.all_reduce(num_tokens)
+                    # This will ensure that the logged loss matches what we're optimizing
+                    torch.distributed.all_reduce(running_loss)
+                    # Manually scale the gradients from unnormalized loss by total # of tokens
+                    training.scale_grads(self._model, 1 / num_tokens)
                     if self._clip_grad_norm is not None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
@@ -833,7 +840,7 @@ def train(self) -> None:
                     # Update the number of steps when the weights are updated
                     self.global_step += 1
 
-                    loss_to_log = loss.item()
+                    loss_to_log = running_loss.item() / num_tokens
                     pbar.update(1)
                     pbar.set_description(
                         f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"
@@ -848,7 +855,8 @@ def train(self) -> None:
                         log_dict = {
                             "loss": loss_to_log,
                             "lr": self._optimizer.param_groups[0]["lr"],
-                            "tokens_per_second_per_gpu": num_tokens / time_per_step,
+                            "tokens_per_second_per_gpu": num_tokens
+                            / (time_per_step * world_size),
                         }
                         if self._log_peak_memory_stats:
                             log_dict.update(

diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
@@ -692,12 +692,13 @@ def train(self) -> None:
 
                     # Loss is normalized by default so we multiply by the number of tokens
                     # This way we can normalize by the total number of tokens if we're accumulating gradients
-                    running_loss += self._loss_step(batch) * current_num_tokens
+                    current_loss = self._loss_step(batch) * current_num_tokens
+                    running_loss += current_loss
+                    current_loss.backward()
 
                     # Step with optimizer
                     if (idx + 1) % self._gradient_accumulation_steps == 0:
-                        loss = running_loss / num_tokens
-                        loss.backward()
+                        training.scale_grads(self._model, 1 / num_tokens)
                         if self._clip_grad_norm is not None:
                             grad_norm = torch.nn.utils.clip_grad_norm_(
                                 self._model.parameters(),
@@ -709,7 +710,7 @@ def train(self) -> None:
                         # Update the number of steps when the weights are updated
                         self.global_step += 1
 
-                        loss_to_log = loss.item()
+                        loss_to_log = running_loss.item() / num_tokens
                         pbar.update(1)
                         pbar.set_description(
                             f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"

diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
@@ -599,8 +599,7 @@ def train(self) -> None:
         """
         # clean up before training begins
         training.cleanup_before_training()
-
-        _, rank = training.get_world_size_and_rank()
+        world_size, rank = training.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         self._optimizer.zero_grad()
@@ -668,18 +667,16 @@ def train(self) -> None:
 
                 # Calculate the number of unmasked tokens in the current batch
                 # and increment the total number of tokens seen in the step
+
+                utils.batch_to_device(batch, self._device)
+
                 current_num_tokens = (
                     batch["labels"] != self._loss_fn.ignore_index
                 ).sum()
                 num_tokens += current_num_tokens
+                labels = batch.pop("labels")
 
-                labels = labels.to(self._device)
-                mask = mask.to(self._device) if mask is not None else None
-                input_pos = (
-                    input_pos.to(self._device) if input_pos is not None else None
-                )
-
-                logits = self._model(tokens, mask=mask, input_pos=input_pos)
+                logits = self._model(**batch)
 
                 # Shift labels to compute loss
                 # equivalent to doing labels[..., 1:] and logits[..., :-1, :]
@@ -692,22 +689,30 @@ def train(self) -> None:
                     logits = logits.reshape(-1, logits.size(-1))
 
                 # Compute loss
-                running_loss += self._loss_fn(logits, labels) * current_num_tokens
+                current_loss = self._loss_fn(logits, labels) * current_num_tokens
+
                 # free logits otherwise it peaks backward memory
                 del logits
 
+                running_loss += current_loss
+                current_loss.backward()
+
                 # Step with optimizer
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
-                    loss = running_loss / num_tokens
-                    loss.backward()
+                    # Get total number of tokens across all ranks to normalize gradients
+                    torch.distributed.all_reduce(num_tokens)
+                    # This will ensure that the logged loss matches what we're optimizing
+                    torch.distributed.all_reduce(running_loss)
+                    # Manually scale the gradients from unnormalized loss by total # of tokens
+                    training.scale_grads(self._model, 1 / num_tokens)
 
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
 
                     # Update the number of steps when the weights are updated
                     self.global_step += 1
 
-                    loss_to_log = loss.item()
+                    loss_to_log = running_loss.item() / num_tokens
                     pbar.update(1)
                     pbar.set_description(
                         f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}"
@@ -722,7 +727,9 @@ def train(self) -> None:
                         log_dict = {
                             "loss": loss_to_log,
                             "lr": self._optimizer.param_groups[0]["lr"],
-                            "tokens_per_second_per_gpu": num_tokens / time_per_step,
+                            "tokens_per_second_per_gpu": (
+                                num_tokens / (time_per_step * world_size)
+                            ),
                         }
                         if self._log_peak_memory_stats:
                             log_dict.update(

diff --git a/tests/recipes/test_full_finetune_distributed.py b/tests/recipes/test_full_finetune_distributed.py
@@ -45,21 +45,20 @@ def _get_test_config_overrides(self):
 
     def _fetch_expected_loss_values(self, model_type):
         loss_values_map = {
-            "llama2": [10.5136, 10.4813, 10.5088, 10.5250],
-            "llama3": [12.0673, 11.9072, 11.9302, 11.9355],
+            "llama2": [10.5209, 10.5217, 10.4945, 10.5136],
+            "llama3": [11.9839, 11.9684, 11.9596, 11.93656],
         }
         return loss_values_map[model_type]
 
     @pytest.mark.integration_test
     @pytest.mark.parametrize(
-        "config, model_type, ckpt_type, micro_batch_size, gradient_accumulation_steps",
+        "config, model_type, ckpt_type, micro_batch_size, gradient_accumulation_steps, optim_in_bwd",
         [
-            ("llama2/7B_full", "llama2", "hf", 1, 4),
-            ("llama3/8B_full", "llama3", "tune", 1, 4),
-            ("llama3/8B_full", "llama3", "tune", 4, 1),
+            ("llama2/7B_full", "llama2", "hf", 1, 4, False),
+            ("llama3/8B_full", "llama3", "tune", 1, 4, False),
+            ("llama3/8B_full", "llama3", "tune", 4, 1, True),
         ],
     )
-    @pytest.mark.parametrize("optim_in_bwd", [True, False])
     @gpu_test(gpu_count=2)
     def test_loss(
         self,