6 changes: 4 additions & 2 deletions .github/workflows/docker-build.yml
@@ -13,7 +13,8 @@ concurrency:
jobs:
trl:
name: "Build and push TRL Docker image"
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -52,7 +53,8 @@ jobs:

trl-dev:
name: "Build and push TRL Dev Docker image"
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
steps:
- name: Checkout code
uses: actions/checkout@v4
16 changes: 8 additions & 8 deletions README.md
@@ -21,11 +21,11 @@

**OpenEnv Integration:** TRL now supports **[OpenEnv](https://huggingface.co/blog/openenv)**, the open-source framework from Meta for defining, deploying, and interacting with environments in reinforcement learning and agentic workflows.

Explore how to seamlessly integrate TRL with OpenEnv in our [dedicated documentation](openenv).
Explore how to seamlessly integrate TRL with OpenEnv in our [dedicated documentation](https://huggingface.co/docs/trl/openenv).

## Overview

TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT), Proximal Policy Optimization (PPO), and Direct Preference Optimization (DPO). Built on top of the [🤗 Transformers](https://github.com/huggingface/transformers) ecosystem, TRL supports a variety of model architectures and modalities, and can be scaled-up across various hardware setups.
TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), and Direct Preference Optimization (DPO). Built on top of the [🤗 Transformers](https://github.com/huggingface/transformers) ecosystem, TRL supports a variety of model architectures and modalities, and can be scaled-up across various hardware setups.

## Highlights

@@ -92,21 +92,21 @@ trainer.train()
```python
from datasets import load_dataset
from trl import GRPOTrainer
from trl.rewards import accuracy_reward

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
return [len(set(c)) for c in completions]
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

trainer = GRPOTrainer(
model="Qwen/Qwen2-0.5B-Instruct",
reward_funcs=reward_num_unique_chars,
reward_funcs=accuracy_reward,
train_dataset=dataset,
)
trainer.train()
```

> [!NOTE]
> For reasoning models, use the `reasoning_accuracy_reward()` function for better results.
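
Beyond the built-in rewards in `trl.rewards`, `reward_funcs` also accepts a plain Python callable that receives the generated `completions` and returns one score per completion, mirroring the dummy reward this PR removes from the example above. A minimal sketch, where the `trl-lib/tldr` dataset and the length-based scoring rule are illustrative rather than part of this PR:

```python
from datasets import load_dataset
from trl import GRPOTrainer

dataset = load_dataset("trl-lib/tldr", split="train")

# Illustrative custom reward: favor completions close to a target length.
# The signature follows the callable-reward convention: completions plus **kwargs.
def reward_target_length(completions, **kwargs):
    target = 200  # characters; arbitrary value for this sketch
    return [-abs(len(c) - target) / target for c in completions]

trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_target_length,
    train_dataset=dataset,
)
trainer.train()
```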

### `DPOTrainer`

[`DPOTrainer`](https://huggingface.co/docs/trl/dpo_trainer) implements the popular [Direct Preference Optimization (DPO) algorithm](https://huggingface.co/papers/2305.18290) that was used to post-train [Llama 3](https://huggingface.co/papers/2407.21783) and many other models. Here is a basic example of how to use the `DPOTrainer`:
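The README's own `DPOTrainer` snippet is collapsed in this diff view. For orientation only, a hedged sketch of typical usage, where the model, preference dataset, and output directory are assumptions rather than the README's actual example:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

# Assumed model and preference dataset; any dataset with chosen/rejected pairs works.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

trainer = DPOTrainer(
    model=model,
    args=DPOConfig(output_dir="Qwen2-0.5B-DPO"),
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()
```
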
5 changes: 2 additions & 3 deletions docker/trl-dev/Dockerfile
@@ -1,6 +1,5 @@
FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip uv
RUN uv pip install --system --no-cache "git+https://github.com/huggingface/trl.git#egg=trl[liger,peft,vlm]"
RUN uv pip install --system hf_transfer liger_kernel trackio peft
RUN uv pip install --system https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
RUN uv pip install --system kernels liger_kernel peft trackio
6 changes: 3 additions & 3 deletions docker/trl/Dockerfile
@@ -1,4 +1,4 @@
FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip uv
RUN uv pip install --system trl[liger,peft,vlm] hf_transfer trackio
RUN uv pip install --system https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
RUN uv pip install --system trl[liger,peft,vlm] kernels trackio
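Context for the two Dockerfile changes above: the pinned flash-attn wheel is dropped in favor of the `kernels` package, which fetches pre-built kernels from the Hugging Face Hub at runtime. A hedged sketch of how that path is typically exercised with a recent transformers release; the kernel repo id `kernels-community/flash-attn` and the version requirement are assumptions, not something this PR pins:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumes a transformers version with `kernels` integration, which can pull a
# flash-attention implementation from the Hub instead of a locally built wheel.
model_id = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="kernels-community/flash-attn",  # Hub-hosted kernel (assumed id)
)
```
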
78 changes: 44 additions & 34 deletions docs/source/_toctree.yml
@@ -12,6 +12,20 @@
- local: paper_index
title: Paper Index
title: Conceptual Guides
- sections: # Sorted alphabetically
- local: dpo_trainer
title: DPO
- local: grpo_trainer
title: GRPO
- local: kto_trainer
title: KTO
- local: reward_trainer
title: Reward
- local: rloo_trainer
title: RLOO
- local: sft_trainer
title: SFT
title: Trainers
- sections:
- local: clis
title: Command Line Interface (CLI)
@@ -55,42 +69,10 @@
title: LoRA Without Regret
title: Examples
- sections:
- sections: # Sorted alphabetically
- local: cpo_trainer
title: CPO
- local: dpo_trainer
title: DPO
- local: online_dpo_trainer
title: Online DPO
- local: gkd_trainer
title: GKD
- local: grpo_trainer
title: GRPO
- local: kto_trainer
title: KTO
- local: nash_md_trainer
title: Nash-MD
- local: orpo_trainer
title: ORPO
- local: ppo_trainer
title: PPO
- local: prm_trainer
title: PRM
- local: reward_trainer
title: Reward
- local: rloo_trainer
title: RLOO
- local: sft_trainer
title: SFT
- local: xpo_trainer
title: XPO
title: Trainers
- local: models
title: Model Classes
- local: model_utils
title: Model Utilities
- local: judges
title: Judges
- local: callbacks
title: Callbacks
- local: data_utils
@@ -105,20 +87,48 @@
- sections:
- local: experimental_overview
title: Experimental Overview
- local: openenv
title: OpenEnv Integration
- local: bema_for_reference_model # Sorted alphabetically
title: BEMA for Reference Model
- local: bco_trainer
title: BCO
- local: cpo_trainer
title: CPO
- local: gfpo
title: GFPO
- local: gkd_trainer
title: GKD
- local: gold_trainer
title: GOLD
- local: grpo_with_replay_buffer
title: GRPO With Replay Buffer
- local: gspo_token
title: GSPO-token
- local: papo_trainer
title: PAPO
- local: merge_model_callback
title: Merge Model Callback
- local: openenv
title: OpenEnv Integration
- local: papo_trainer
title: PAPO
- local: judges
title: Judges
- local: minillm_trainer
title: MiniLLM
- local: nash_md_trainer
title: Nash-MD
- local: online_dpo_trainer
title: Online DPO
- local: orpo_trainer
title: ORPO
- local: papo_trainer
title: PAPO
- local: ppo_trainer
title: PPO
- local: prm_trainer
title: PRM
- local: winrate_callback
title: WinRateCallback
- local: xpo_trainer
title: XPO
title: Experimental
8 changes: 0 additions & 8 deletions docs/source/callbacks.md
@@ -8,18 +8,10 @@

[[autodoc]] RichProgressCallback

## WinRateCallback

[[autodoc]] WinRateCallback

## LogCompletionsCallback

[[autodoc]] LogCompletionsCallback

## MergeModelCallback

[[autodoc]] MergeModelCallback

## BEMACallback

[[autodoc]] BEMACallback
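
The callbacks that remain documented here attach to any TRL trainer through the standard `callbacks` argument inherited from transformers' `Trainer`. A minimal sketch, assuming the SFT quickstart model and dataset as placeholders:

```python
from datasets import load_dataset
from trl import RichProgressCallback, SFTTrainer

# RichProgressCallback takes no arguments; it only swaps in a rich-based progress display.
dataset = load_dataset("trl-lib/Capybara", split="train")
trainer = SFTTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    train_dataset=dataset,
    callbacks=[RichProgressCallback()],
)
trainer.train()
```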