
Commit 206964c

kashif and qgallouedec authored
🎢 [Callbacks] BEMA (#3855)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
1 parent 39efa8a commit 206964c


6 files changed, +346 -7 lines changed


docs/source/callbacks.md

Lines changed: 5 additions & 1 deletion
@@ -18,4 +18,8 @@
 
 ## MergeModelCallback
 
-[[autodoc]] MergeModelCallback
+[[autodoc]] MergeModelCallback
+
+## BEMACallback
+
+[[autodoc]] BEMACallback

docs/source/paper_index.md

Lines changed: 16 additions & 1 deletion
@@ -24,4 +24,19 @@ training_args = GRPOConfig(
     gradient_accumulation_steps=1,
     steps_per_generation=4,  # partition rollout batch into 4 mini-batches. GSPO paper (v2), section 5.1. Must be 4 times gradient_accumulation_steps
 )
-```
+```
+
+## EMA Without the Lag: Bias-Corrected Iterate Averaging Schemes
+
+**📜 Paper**: https://huggingface.co/papers/2508.00180
+
+Bias-Corrected Exponential Moving Average (BEMA) improves the stability and efficiency of language model fine-tuning by reducing stochasticity and eliminating bias. To use BEMA with SFT as described in the paper, you can use the [`BEMACallback`]:
+
+```python
+from trl import BEMACallback, SFTTrainer
+
+trainer = SFTTrainer(
+    ...,
+    callbacks=[BEMACallback()],
+)
+```
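As a side note, the callback also exposes scheduling parameters exercised by the tests added in this commit, notably `update_freq` and `update_after`. A minimal sketch of how they might be set, assuming the same `SFTTrainer` setup as above (the specific values are illustrative assumptions, not recommendations from the paper or this commit):

```python
from trl import BEMACallback, SFTTrainer

# Hypothetical schedule: skip the first 100 optimizer steps, then refresh the
# BEMA weights every 5 steps. Parameter names come from this commit's tests;
# the values here are assumptions chosen only for illustration.
bema_callback = BEMACallback(update_after=100, update_freq=5)

trainer = SFTTrainer(
    ...,
    callbacks=[bema_callback],
)
```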

tests/test_callbacks.py

Lines changed: 132 additions & 1 deletion
@@ -14,6 +14,7 @@
 
 import json
 import os
+from unittest.mock import call, patch
 
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, Trainer, TrainingArguments
@@ -22,7 +23,15 @@
 from transformers.utils import is_peft_available
 
 from tests.testing_utils import require_comet, require_mergekit
-from trl import BasePairwiseJudge, DPOConfig, DPOTrainer, LogCompletionsCallback, MergeModelCallback, WinRateCallback
+from trl import (
+    BasePairwiseJudge,
+    BEMACallback,
+    DPOConfig,
+    DPOTrainer,
+    LogCompletionsCallback,
+    MergeModelCallback,
+    WinRateCallback,
+)
 from trl.mergekit_utils import MergeConfig
 
 from .testing_utils import TrlTestCase
@@ -362,3 +371,125 @@ def test_every_checkpoint(self):
         for checkpoint in checkpoints:
             merged_path = os.path.join(checkpoint, "merged")
             self.assertTrue(os.path.isdir(merged_path), f"Merged folder does not exist in checkpoint {checkpoint}.")
+
+
+class BEMACallbackTester(TrlTestCase):
+    def setUp(self):
+        super().setUp()
+        self.model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
+        self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        dataset = load_dataset("trl-internal-testing/zen", "standard_language_modeling")
+
+        def tokenize_function(examples, tokenizer):
+            out = tokenizer(examples["text"], padding="max_length", max_length=17)
+            out["labels"] = out["input_ids"].copy()
+            return out
+
+        self.dataset = dataset.map(
+            tokenize_function, fn_kwargs={"tokenizer": self.tokenizer}, remove_columns=["text"], batched=True
+        )
+
+    def test_model_saved(self):
+        """Test that BEMACallback saves the BEMA model."""
+        training_args = TrainingArguments(output_dir=self.tmp_dir, report_to="none")
+        bema_callback = BEMACallback(update_freq=2)
+        trainer = Trainer(
+            model=self.model,
+            args=training_args,
+            train_dataset=self.dataset["train"],
+            processing_class=self.tokenizer,
+            callbacks=[bema_callback],
+        )
+        trainer.train()
+
+        # Check that the BEMA model was saved and can be loaded
+        bema_path = os.path.join(self.tmp_dir, "bema")
+        self.assertTrue(os.path.isdir(bema_path), "BEMA directory was not created")
+        AutoModelForCausalLM.from_pretrained(bema_path)
+
+    def test_update_frequency_0(self):
+        """Test that BEMA callback respects the update frequency."""
+        training_args = TrainingArguments(output_dir=self.tmp_dir, report_to="none")
+        bema_callback = BEMACallback(update_freq=2)
+
+        with patch.object(bema_callback, "_update_bema_weights") as mock_update:
+            trainer = Trainer(
+                model=self.model,
+                args=training_args,
+                train_dataset=self.dataset["train"],
+                processing_class=self.tokenizer,
+                callbacks=[bema_callback],
+            )
+
+            trainer.train()
+
+        # Total 9 steps (17 samples, batch size 8, 3 epochs).
+        # BEMA starts after step 0 and updates every 2 steps → updates at 2, 4, 6, 8
+        self.assertEqual(mock_update.call_args_list, [call(2), call(4), call(6), call(8)])
+
+    def test_update_frequency_1(self):
+        """Test that BEMA callback respects the update frequency."""
+        training_args = TrainingArguments(output_dir=self.tmp_dir, report_to="none")
+        bema_callback = BEMACallback(update_freq=3)
+
+        with patch.object(bema_callback, "_update_bema_weights") as mock_update:
+            trainer = Trainer(
+                model=self.model,
+                args=training_args,
+                train_dataset=self.dataset["train"],
+                processing_class=self.tokenizer,
+                callbacks=[bema_callback],
+            )
+
+            trainer.train()
+
+        # Total 9 steps (17 samples, batch size 8, 3 epochs).
+        # BEMA starts after step 0 and updates every 3 steps → updates at 3, 6, 9
+        self.assertEqual(mock_update.call_args_list, [call(3), call(6), call(9)])
+
+    def test_update_frequency_2(self):
+        """Test that BEMA callback respects the update frequency."""
+        training_args = TrainingArguments(output_dir=self.tmp_dir, report_to="none")
+        bema_callback = BEMACallback(update_freq=2, update_after=3)
+
+        with patch.object(bema_callback, "_update_bema_weights") as mock_update:
+            trainer = Trainer(
+                model=self.model,
+                args=training_args,
+                train_dataset=self.dataset["train"],
+                processing_class=self.tokenizer,
+                callbacks=[bema_callback],
+            )
+
+            trainer.train()
+
+        # Total 9 steps (17 samples, batch size 8, 3 epochs).
+        # BEMA starts after step 3 and updates every 2 steps → updates at 5, 7, 9
+        self.assertEqual(mock_update.call_args_list, [call(5), call(7), call(9)])
+
+    def test_no_bema(self):
+        """Test that BEMACallback works without BEMA updates."""
+        training_args = TrainingArguments(output_dir=self.tmp_dir, report_to="none")
+        bema_callback = BEMACallback(update_freq=2, bias_power=0.0)
+        trainer = Trainer(
+            model=self.model,
+            args=training_args,
+            train_dataset=self.dataset["train"],
+            processing_class=self.tokenizer,
+            callbacks=[bema_callback],
+        )
+        trainer.train()
+
+    def test_no_ema(self):
+        """Test that BEMACallback works without EMA updates."""
+        training_args = TrainingArguments(output_dir=self.tmp_dir, report_to="none")
+        bema_callback = BEMACallback(update_freq=2, ema_power=0.0)
+        trainer = Trainer(
+            model=self.model,
+            args=training_args,
+            train_dataset=self.dataset["train"],
+            processing_class=self.tokenizer,
+            callbacks=[bema_callback],
+        )
+        trainer.train()
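A note on the expected call sequences above: the 9 total steps follow from ceil(17 / 8) = 3 optimizer steps per epoch times 3 epochs, and the three assertions are consistent with a schedule of the form "step > update_after and (step − update_after) % update_freq == 0". A small sketch reproducing the expectations under that assumption (the rule is inferred from the assertions, not taken from the BEMACallback implementation):

```python
import math

# 17 samples, per-device batch size 8, 3 epochs → 3 * 3 = 9 optimizer steps.
total_steps = math.ceil(17 / 8) * 3

def expected_update_steps(update_freq, update_after=0):
    # Assumed schedule rule, inferred from the three test assertions above.
    return [
        step
        for step in range(1, total_steps + 1)
        if step > update_after and (step - update_after) % update_freq == 0
    ]

assert expected_update_steps(update_freq=2) == [2, 4, 6, 8]
assert expected_update_steps(update_freq=3) == [3, 6, 9]
assert expected_update_steps(update_freq=2, update_after=3) == [5, 7, 9]
```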

trl/__init__.py

Lines changed: 2 additions & 4 deletions
@@ -69,7 +69,6 @@
         "KTOConfig",
         "KTOTrainer",
         "LogCompletionsCallback",
-        "MergeModelCallback",
         "ModelConfig",
         "NashMDConfig",
         "NashMDTrainer",
@@ -93,7 +92,7 @@
         "XPOConfig",
         "XPOTrainer",
     ],
-    "trainer.callbacks": ["MergeModelCallback", "RichProgressCallback", "SyncRefModelCallback"],
+    "trainer.callbacks": ["BEMACallback", "MergeModelCallback", "RichProgressCallback", "SyncRefModelCallback"],
     "trainer.utils": ["get_kbit_device_map", "get_peft_config", "get_quantization_config"],
 }
 
@@ -163,7 +162,6 @@
         KTOConfig,
         KTOTrainer,
         LogCompletionsCallback,
-        MergeModelCallback,
         ModelConfig,
         NashMDConfig,
         NashMDTrainer,
@@ -187,7 +185,7 @@
         XPOConfig,
         XPOTrainer,
     )
-    from .trainer.callbacks import RichProgressCallback, SyncRefModelCallback
+    from .trainer.callbacks import BEMACallback, MergeModelCallback, RichProgressCallback, SyncRefModelCallback
     from .trainer.utils import get_kbit_device_map, get_peft_config, get_quantization_config
 
     try:

trl/trainer/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,7 @@
     "bco_config": ["BCOConfig"],
     "bco_trainer": ["BCOTrainer"],
     "callbacks": [
+        "BEMACallback",
         "LogCompletionsCallback",
         "MergeModelCallback",
         "RichProgressCallback",
@@ -93,6 +94,7 @@
     from .bco_config import BCOConfig
     from .bco_trainer import BCOTrainer
     from .callbacks import (
+        BEMACallback,
         LogCompletionsCallback,
        MergeModelCallback,
        RichProgressCallback,
