diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index fae9c87542..67b4a0705a 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -53,7 +53,7 @@ jobs: - name: Install remaining dependencies run: | python -m pip install -e ".[dev]" - python -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@fb963f0f0a5b28b69763590bb59676072cf43a01 + python -m pip install lm-eval==0.4.5 - name: Run recipe and unit tests with coverage run: pytest tests --with-integration --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov diff --git a/.github/workflows/recipe_test.yaml b/.github/workflows/recipe_test.yaml index 2b335ec0c1..d5a2dbe790 100644 --- a/.github/workflows/recipe_test.yaml +++ b/.github/workflows/recipe_test.yaml @@ -42,7 +42,7 @@ jobs: run: | python -m pip install torch torchvision torchao python -m pip install -e ".[dev]" - python -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@fb963f0f0a5b28b69763590bb59676072cf43a01 + python -m pip install lm-eval==0.4.5 - name: Run recipe tests with coverage run: pytest tests -m integration_test --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov diff --git a/.github/workflows/regression_test.yaml b/.github/workflows/regression_test.yaml index 229371812c..80ee645f47 100644 --- a/.github/workflows/regression_test.yaml +++ b/.github/workflows/regression_test.yaml @@ -56,7 +56,7 @@ jobs: - name: Install remaining dependencies run: | python -m pip install -e ".[dev]" - python -m pip install lm-eval==0.4.* + python -m pip install lm-eval==0.4.5 - name: Run regression tests with coverage run: pytest tests -m slow_integration_test --silence-s3-logs --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000..dd5bf558c3 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,9 @@ +cff-version: 1.2.0 +title: "torchtune: PyTorch's finetuning library" +message: "If you use this software, please cite it as below." +type: software +authors: + - given-names: "torchtune maintainers and contributors" +url: "https://github.com/pytorch/torchtune" +license: "BSD-3-Clause" +date-released: "2024-04-14" diff --git a/README.md b/README.md index a66d3ded4c..2b702dc529 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ ![Recipe Integration Test](https://github.com/pytorch/torchtune/actions/workflows/recipe_test.yaml/badge.svg) [![](https://dcbadge.vercel.app/api/server/4Xsdn8Rr9Q?style=flat)](https://discord.gg/4Xsdn8Rr9Q) -[**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Documentation**](https://pytorch.org/torchtune/main/index.html) | [**Community**](#community) | [**License**](#license) +[**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Documentation**](https://pytorch.org/torchtune/main/index.html) | [**Community**](#community) | [**License**](#license) | [**Citing torchtune**](#citing-torchtune) > [!IMPORTANT] > Update September 25, 2024: torchtune has support for **Llama 3.2 11B Vision**, **Llama 3.2 3B**, and **Llama 3.2 1B** models! Try them out by following our installation instructions [here](#Installation), then run any of the text configs [here](recipes/configs/llama3_2) or vision configs [here](recipes/configs/llama3_2_vision).
@@ -282,3 +282,19 @@ We also want to acknowledge some awesome libraries and tools from the ecosystem: ## License torchtune is released under the [BSD 3 license](./LICENSE). However you may have other legal obligations that govern your use of other content, such as the terms of service for third-party models. + + +## Citing torchtune + +If you find the torchtune library useful, please cite it in your work as below. + +```bibtex +@software{torchtune, + title = {torchtune: PyTorch's finetuning library}, + author = {torchtune maintainers and contributors}, + url = {https://github.com/pytorch/torchtune}, + license = {BSD-3-Clause}, + month = apr, + year = {2024} +} +``` diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst index a31082e174..cc9a493147 100644 --- a/docs/source/api_ref_modules.rst +++ b/docs/source/api_ref_modules.rst @@ -14,7 +14,6 @@ Modeling Components and Building Blocks MultiHeadAttention FeedForward KVCache - get_cosine_schedule_with_warmup RotaryPositionalEmbeddings RMSNorm Fp32LayerNorm diff --git a/docs/source/api_ref_training.rst b/docs/source/api_ref_training.rst index 9d402b1e34..980e1d40db 100644 --- a/docs/source/api_ref_training.rst +++ b/docs/source/api_ref_training.rst @@ -74,6 +74,19 @@ Utilities to reduce memory consumption during training. create_optim_in_bwd_wrapper register_optim_in_bwd_hooks +.. _lr_scheduler_label: + +Schedulers +---------- + +Utilities to control the learning rate during the training process. + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + get_cosine_schedule_with_warmup + .. _metric_logging_label: Metric Logging diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index be40d89134..04644093a9 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -128,7 +128,7 @@ For example: with ``batch_size=1`` and ``gradient_accumulation_steps=32`` we get .. note:: For other components in torchtune which use "steps", such as :ref:`metric logging `, or - :func:`learning rate schedulers `, a "step" is counted as a + :func:`learning rate schedulers `, a "step" is counted as a single update to model parameters, rather than a single model forward pass with the data. Suppose ``gradient_accumulation_steps = 4`` and ``log_every_n_steps = 10``. Metrics would be logged every 10 global steps, which translates to every 40 model forward passes.
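To make that definition of a "step" concrete alongside the scheduler's new import path (which the config changes below switch to), here is a minimal, hedged sketch of a training loop; the model, learning rate, and step counts are illustrative assumptions rather than values taken from this diff:

```python
import torch
from torchtune.training.lr_schedulers import get_cosine_schedule_with_warmup

# Stand-in model and optimizer; real recipes build these from the YAML configs.
model = torch.nn.Linear(16, 16)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)

# A "step" is one optimizer update: with gradient_accumulation_steps = 4,
# four forward/backward passes advance the schedule by a single step.
gradient_accumulation_steps = 4
num_training_steps = 250   # e.g. 1000 forward passes / 4 accumulation steps
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,  # matches the warmup used in the configs below
    num_training_steps=num_training_steps,
)

for _ in range(num_training_steps):
    for _ in range(gradient_accumulation_steps):  # micro-batches
        loss = model(torch.randn(2, 16)).sum() / gradient_accumulation_steps
        loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    scheduler.step()  # advance the cosine schedule once per optimizer update
```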
diff --git a/recipes/configs/code_llama2/7B_lora_single_device.yaml b/recipes/configs/code_llama2/7B_lora_single_device.yaml index 75daa2b454..263e3c12e1 100644 --- a/recipes/configs/code_llama2/7B_lora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_lora_single_device.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/code_llama2/7B_qlora_single_device.yaml b/recipes/configs/code_llama2/7B_qlora_single_device.yaml index ab6b4e2b55..4f6fd9be61 100644 --- a/recipes/configs/code_llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_qlora_single_device.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/gemma/2B_lora.yaml b/recipes/configs/gemma/2B_lora.yaml index 5364ec2bce..b82faa39e2 100644 --- a/recipes/configs/gemma/2B_lora.yaml +++ b/recipes/configs/gemma/2B_lora.yaml @@ -55,7 +55,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/2B_lora_single_device.yaml b/recipes/configs/gemma/2B_lora_single_device.yaml index 786b0c7f2f..d6e1664b71 100644 --- a/recipes/configs/gemma/2B_lora_single_device.yaml +++ b/recipes/configs/gemma/2B_lora_single_device.yaml @@ -54,7 +54,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/2B_qlora_single_device.yaml b/recipes/configs/gemma/2B_qlora_single_device.yaml index 39ebc088e7..9b24d6c0ee 100644 --- a/recipes/configs/gemma/2B_qlora_single_device.yaml +++ b/recipes/configs/gemma/2B_qlora_single_device.yaml @@ -54,7 +54,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/7B_lora.yaml b/recipes/configs/gemma/7B_lora.yaml index a4ee960c17..6db9b0ab82 100644 --- a/recipes/configs/gemma/7B_lora.yaml +++ b/recipes/configs/gemma/7B_lora.yaml @@ -57,7 +57,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/7B_lora_single_device.yaml b/recipes/configs/gemma/7B_lora_single_device.yaml index 2edeab2047..c82f0b76ba 100644 --- a/recipes/configs/gemma/7B_lora_single_device.yaml +++ b/recipes/configs/gemma/7B_lora_single_device.yaml @@ -56,7 +56,7 @@ optimizer: lr: 5e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/7B_qlora_single_device.yaml 
b/recipes/configs/gemma/7B_qlora_single_device.yaml index 23d7465770..fcbccb786b 100644 --- a/recipes/configs/gemma/7B_qlora_single_device.yaml +++ b/recipes/configs/gemma/7B_qlora_single_device.yaml @@ -56,7 +56,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/evaluation.yaml b/recipes/configs/gemma/evaluation.yaml new file mode 100644 index 0000000000..2ff8f78546 --- /dev/null +++ b/recipes/configs/gemma/evaluation.yaml @@ -0,0 +1,39 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command: +# tune run eleuther_eval --config gemma/evaluation + +# Model Arguments +model: + _component_: torchtune.models.gemma.gemma_2b + +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/gemma-2b + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + output_dir: ./ # Not needed + model_type: GEMMA + +# Tokenizer +tokenizer: + _component_: torchtune.models.gemma.gemma_tokenizer + path: /tmp/gemma-2b/tokenizer.model + +# Environment +device: cuda +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 +enable_kv_cache: True + +# Quantization specific args +quantizer: null diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml index 267725ab92..d657754139 100644 --- a/recipes/configs/llama2/13B_lora.yaml +++ b/recipes/configs/llama2/13B_lora.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 2e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/13B_qlora_single_device.yaml b/recipes/configs/llama2/13B_qlora_single_device.yaml index 539d692382..56431fdff5 100644 --- a/recipes/configs/llama2/13B_qlora_single_device.yaml +++ b/recipes/configs/llama2/13B_qlora_single_device.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/70B_lora.yaml b/recipes/configs/llama2/70B_lora.yaml index ff4f56493b..b4d0d9c9a9 100644 --- a/recipes/configs/llama2/70B_lora.yaml +++ b/recipes/configs/llama2/70B_lora.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/70B_qlora.yaml b/recipes/configs/llama2/70B_qlora.yaml index b8ff55c01b..c1de2c2358 100644 --- a/recipes/configs/llama2/70B_qlora.yaml +++ b/recipes/configs/llama2/70B_qlora.yaml @@ -70,7 +70,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_full_low_memory.yaml 
b/recipes/configs/llama2/7B_full_low_memory.yaml index beb2248b23..06558009ed 100644 --- a/recipes/configs/llama2/7B_full_low_memory.yaml +++ b/recipes/configs/llama2/7B_full_low_memory.yaml @@ -55,7 +55,7 @@ optimizer: _component_: bitsandbytes.optim.PagedAdamW lr: 1e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 optimizer_in_bwd: True loss: diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml index 68e1d302df..2c9a694d7b 100644 --- a/recipes/configs/llama2/7B_lora.yaml +++ b/recipes/configs/llama2/7B_lora.yaml @@ -61,7 +61,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml index f6acfcb76e..26f824814f 100644 --- a/recipes/configs/llama2/7B_lora_dpo.yaml +++ b/recipes/configs/llama2/7B_lora_dpo.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.05 lr: 5e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml index 458a023c36..2ad3988867 100644 --- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 0.05 lr: 5e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml index 6608bdc48d..ebaee584c2 100644 --- a/recipes/configs/llama2/7B_lora_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_single_device.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml index 630d1f6357..052cdb9296 100644 --- a/recipes/configs/llama2/7B_qlora.yaml +++ b/recipes/configs/llama2/7B_qlora.yaml @@ -61,7 +61,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml index 062e66d833..0893f48579 100644 --- a/recipes/configs/llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/llama2/7B_qlora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml index 84bed19a02..f3a921f289 100644 --- 
a/recipes/configs/llama3/70B_lora.yaml +++ b/recipes/configs/llama3/70B_lora.yaml @@ -79,7 +79,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_dora.yaml b/recipes/configs/llama3/8B_dora.yaml index 3911e856c2..1265c82c72 100644 --- a/recipes/configs/llama3/8B_dora.yaml +++ b/recipes/configs/llama3/8B_dora.yaml @@ -54,7 +54,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_dora_single_device.yaml b/recipes/configs/llama3/8B_dora_single_device.yaml index 1f91dadda8..0fc0a484dc 100644 --- a/recipes/configs/llama3/8B_dora_single_device.yaml +++ b/recipes/configs/llama3/8B_dora_single_device.yaml @@ -56,7 +56,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_full_single_device.yaml b/recipes/configs/llama3/8B_full_single_device.yaml index 1d5479ccbc..cd3e3586ce 100644 --- a/recipes/configs/llama3/8B_full_single_device.yaml +++ b/recipes/configs/llama3/8B_full_single_device.yaml @@ -54,7 +54,7 @@ optimizer: _component_: bitsandbytes.optim.PagedAdamW8bit lr: 1e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml index 5c3510f466..d65138f348 100644 --- a/recipes/configs/llama3/8B_lora.yaml +++ b/recipes/configs/llama3/8B_lora.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml index 0d9cb71a16..e49afacbb1 100644 --- a/recipes/configs/llama3/8B_lora_single_device.yaml +++ b/recipes/configs/llama3/8B_lora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_qdora_single_device.yaml b/recipes/configs/llama3/8B_qdora_single_device.yaml index 29a2a2d84f..7180c5a72c 100644 --- a/recipes/configs/llama3/8B_qdora_single_device.yaml +++ b/recipes/configs/llama3/8B_qdora_single_device.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml index 0d831a8b77..1eef476d17 100644 --- 
a/recipes/configs/llama3/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3/8B_qlora_single_device.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/405B_qlora.yaml b/recipes/configs/llama3_1/405B_qlora.yaml index 69583dd9d4..6398a840ec 100644 --- a/recipes/configs/llama3_1/405B_qlora.yaml +++ b/recipes/configs/llama3_1/405B_qlora.yaml @@ -58,7 +58,7 @@ optimizer: lr: 3e-4 fused: True lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml index c4fa8d589c..861279127a 100644 --- a/recipes/configs/llama3_1/70B_lora.yaml +++ b/recipes/configs/llama3_1/70B_lora.yaml @@ -78,7 +78,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml index c6e94e0aab..5f101b170f 100644 --- a/recipes/configs/llama3_1/8B_lora.yaml +++ b/recipes/configs/llama3_1/8B_lora.yaml @@ -62,7 +62,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml index c951abc3a5..3991f728ce 100644 --- a/recipes/configs/llama3_1/8B_lora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml @@ -61,7 +61,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml index 0b3e615bc9..a9b0662105 100644 --- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml @@ -60,7 +60,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/1B_lora.yaml b/recipes/configs/llama3_2/1B_lora.yaml index 1fb0f483b3..228e4989d5 100644 --- a/recipes/configs/llama3_2/1B_lora.yaml +++ b/recipes/configs/llama3_2/1B_lora.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/1B_lora_single_device.yaml b/recipes/configs/llama3_2/1B_lora_single_device.yaml index c69728ac0d..c9ebed6dc7 100644 --- a/recipes/configs/llama3_2/1B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_lora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.01 
lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/1B_qlora_single_device.yaml b/recipes/configs/llama3_2/1B_qlora_single_device.yaml index ca60a687eb..da552b2a0f 100644 --- a/recipes/configs/llama3_2/1B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_qlora_single_device.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/3B_lora.yaml b/recipes/configs/llama3_2/3B_lora.yaml index 9a628f2c29..d13a303814 100644 --- a/recipes/configs/llama3_2/3B_lora.yaml +++ b/recipes/configs/llama3_2/3B_lora.yaml @@ -60,7 +60,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/3B_lora_single_device.yaml b/recipes/configs/llama3_2/3B_lora_single_device.yaml index 8fd65dd913..255c75e227 100644 --- a/recipes/configs/llama3_2/3B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_lora_single_device.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/3B_qlora_single_device.yaml b/recipes/configs/llama3_2/3B_qlora_single_device.yaml index 4547459282..360443b9e1 100644 --- a/recipes/configs/llama3_2/3B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_qlora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml index c621467582..9cb029666f 100644 --- a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml +++ b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml @@ -74,7 +74,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2_vision/11B_full.yaml b/recipes/configs/llama3_2_vision/11B_full.yaml index 2c8f1f58fd..ee9180dbcf 100644 --- a/recipes/configs/llama3_2_vision/11B_full.yaml +++ b/recipes/configs/llama3_2_vision/11B_full.yaml @@ -28,6 +28,7 @@ tokenizer: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model image_size: 560 + max_seq_len: 8192 # Checkpointer checkpointer: diff --git a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml index d42fb971e6..3372c1a540 100644 --- a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml +++ 
b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml @@ -30,6 +30,7 @@ tokenizer: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model image_size: 560 + max_seq_len: 8192 # Checkpointer checkpointer: diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml index e39ff367ba..357af64496 100644 --- a/recipes/configs/llama3_2_vision/11B_lora.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora.yaml @@ -34,6 +34,7 @@ tokenizer: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model image_size: 560 + max_seq_len: 8192 # Checkpointer checkpointer: @@ -64,7 +65,7 @@ optimizer: weight_decay: 0.01 lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml index 827e04a815..f56828c301 100644 --- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml @@ -32,6 +32,7 @@ tokenizer: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model image_size: 560 + max_seq_len: 8192 # Checkpointer checkpointer: @@ -63,7 +64,7 @@ optimizer: lr: 2e-5 optimizer_in_bwd: False lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/llama3_2_vision/evaluation.yaml b/recipes/configs/llama3_2_vision/evaluation.yaml index 81c0ed3c94..69123d8045 100644 --- a/recipes/configs/llama3_2_vision/evaluation.yaml +++ b/recipes/configs/llama3_2_vision/evaluation.yaml @@ -3,8 +3,8 @@ # This config assumes that you've run the following command before launching: # tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct # -# It also assumes that you've downloaded the EleutherAI Eval Harness: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@fb963f0f0a5b28b69763590bb59676072cf43a01 +# It also assumes that you've downloaded the EleutherAI Eval Harness (v0.4.5): +# pip install lm_eval==0.4.5 # # To launch, run the following command from root torchtune directory: # tune run eleuther_eval --config llama3_2_vision/evaluation diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml index fd2c637df7..08196660fc 100644 --- a/recipes/configs/mistral/7B_lora.yaml +++ b/recipes/configs/mistral/7B_lora.yaml @@ -63,7 +63,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/mistral/7B_lora_single_device.yaml b/recipes/configs/mistral/7B_lora_single_device.yaml index ccfb0c2cd4..2ebc9f798e 100644 --- a/recipes/configs/mistral/7B_lora_single_device.yaml +++ b/recipes/configs/mistral/7B_lora_single_device.yaml @@ -60,7 +60,7 @@ optimizer: lr: 2e-5 
lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/mistral/7B_qlora_single_device.yaml b/recipes/configs/mistral/7B_qlora_single_device.yaml index 0e2fa20d94..3bbfebe3ba 100644 --- a/recipes/configs/mistral/7B_qlora_single_device.yaml +++ b/recipes/configs/mistral/7B_qlora_single_device.yaml @@ -61,7 +61,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/mistral/evaluation.yaml b/recipes/configs/mistral/evaluation.yaml new file mode 100644 index 0000000000..61d69dcb40 --- /dev/null +++ b/recipes/configs/mistral/evaluation.yaml @@ -0,0 +1,41 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command: +# tune run eleuther_eval --config mistral/evaluation + +# Model Arguments +model: + _component_: torchtune.models.mistral.mistral_7b + +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Mistral-7B-v0.1/ + checkpoint_files: [ + pytorch_model-00001-of-00002.bin, + pytorch_model-00002-of-00002.bin + ] + output_dir: /tmp/Mistral-7B-v0.1/ + model_type: MISTRAL +resume_from_checkpoint: False + +# Tokenizer +tokenizer: + _component_: torchtune.models.mistral.mistral_tokenizer + path: /tmp/Mistral-7B-v0.1/tokenizer.model + max_seq_len: null + +# Environment +device: cuda +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 +enable_kv_cache: True + +# Quantization specific args +quantizer: null diff --git a/recipes/configs/phi3/evaluation.yaml b/recipes/configs/phi3/evaluation.yaml new file mode 100644 index 0000000000..ca2f1c9759 --- /dev/null +++ b/recipes/configs/phi3/evaluation.yaml @@ -0,0 +1,42 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command: +# tune run eleuther_eval --config phi3/evaluation + +# Model Arguments +model: + _component_: torchtune.models.phi3.phi3_mini + +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Phi-3-mini-4k-instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Phi-3-mini-4k-instruct + model_type: PHI3_MINI +resume_from_checkpoint: False + +# Tokenizer +tokenizer: + _component_: torchtune.models.phi3.phi3_mini_tokenizer + path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model + max_seq_len: null + +# Environment +device: cuda +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 +enable_kv_cache: True + +# Quantization specific args +quantizer: null diff --git a/recipes/configs/phi3/mini_lora.yaml b/recipes/configs/phi3/mini_lora.yaml index 721a61790b..fff05885ef 100644 --- a/recipes/configs/phi3/mini_lora.yaml +++ b/recipes/configs/phi3/mini_lora.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + 
_component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/phi3/mini_lora_single_device.yaml b/recipes/configs/phi3/mini_lora_single_device.yaml index 7de8a30c94..b5c14b19ca 100644 --- a/recipes/configs/phi3/mini_lora_single_device.yaml +++ b/recipes/configs/phi3/mini_lora_single_device.yaml @@ -62,7 +62,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/phi3/mini_qlora_single_device.yaml b/recipes/configs/phi3/mini_qlora_single_device.yaml index 1d2d5c5cbc..10114bc67a 100644 --- a/recipes/configs/phi3/mini_qlora_single_device.yaml +++ b/recipes/configs/phi3/mini_qlora_single_device.yaml @@ -62,7 +62,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/qwen2/0.5B_lora.yaml b/recipes/configs/qwen2/0.5B_lora.yaml index 9ccd400897..e0608eba5c 100644 --- a/recipes/configs/qwen2/0.5B_lora.yaml +++ b/recipes/configs/qwen2/0.5B_lora.yaml @@ -60,7 +60,7 @@ optimizer: lr: 2e-3 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/0.5B_lora_single_device.yaml b/recipes/configs/qwen2/0.5B_lora_single_device.yaml index 343eb8ea14..602c63853a 100644 --- a/recipes/configs/qwen2/0.5B_lora_single_device.yaml +++ b/recipes/configs/qwen2/0.5B_lora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: lr: 2e-3 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/1.5B_lora.yaml b/recipes/configs/qwen2/1.5B_lora.yaml index 84fd73696b..a496dade08 100644 --- a/recipes/configs/qwen2/1.5B_lora.yaml +++ b/recipes/configs/qwen2/1.5B_lora.yaml @@ -56,7 +56,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/1.5B_lora_single_device.yaml b/recipes/configs/qwen2/1.5B_lora_single_device.yaml index 3e8377b6a1..b41269de1a 100644 --- a/recipes/configs/qwen2/1.5B_lora_single_device.yaml +++ b/recipes/configs/qwen2/1.5B_lora_single_device.yaml @@ -56,7 +56,7 @@ optimizer: lr: 2e-3 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml index f6a4cc2ac6..d3b63fd1df 100644 --- a/recipes/configs/qwen2/7B_lora.yaml +++ b/recipes/configs/qwen2/7B_lora.yaml @@ -62,7 +62,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: 
torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml index 8b8d470f6d..6f9fb35b15 100644 --- a/recipes/configs/qwen2/7B_lora_single_device.yaml +++ b/recipes/configs/qwen2/7B_lora_single_device.yaml @@ -60,7 +60,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml index 1254b6a33b..9cc894a7e5 100644 --- a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml +++ b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml @@ -67,7 +67,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index b07a3ad3ae..590e4f902a 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import importlib.metadata import sys import time @@ -13,6 +12,12 @@ import PIL import torch + +from lm_eval.evaluator import evaluate +from lm_eval.models.hf_vlms import HFMultimodalLM +from lm_eval.models.huggingface import HFLM +from lm_eval.tasks import get_task_dict, TaskManager +from lm_eval.utils import make_table from omegaconf import DictConfig from torchtune import config, training, utils @@ -31,40 +36,6 @@ from torchtune.recipe_interfaces import EvalRecipeInterface from torchtune.training import FullModelTorchTuneCheckpointer -try: - import lm_eval -except ImportError: - print( - "You must install the EleutherAI Eval Harness to run this recipe. " - "Please install with `pip install lm_eval>=0.4.2`" - ) - sys.exit(1) - -lm_eval_version = importlib.metadata.version("lm_eval") -if not lm_eval_version >= "0.4.2": - print( - "You must install the EleutherAI Eval Harness >= v0.4.2 to run this recipe. " - "Please install with `pip install lm_eval>=0.4.2`" - ) - sys.exit(1) - -from lm_eval.evaluator import evaluate - -# User doesn't have to have nightlies installed, they just won't be able -# to use the multimodal model -try: - from lm_eval.models.hf_vlms import HFMultimodalLM -except ImportError as e: - # Create a dummy class to avoid having to import the HF models - # TODO (@joecummings): Remove this once v0.4.5 patch is released - class HFMultimodalLM: - def __init__(self, *args, **kwargs): - pass - - -from lm_eval.models.huggingface import HFLM -from lm_eval.tasks import get_task_dict, TaskManager - class _VLMEvalWrapper(HFMultimodalLM): """An EvalWrapper for EleutherAI's eval harness based on gpt-fast's @@ -466,6 +437,16 @@ class EleutherEvalRecipe(EvalRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: + # Double check we have the right Eval Harness version + from importlib.metadata import version + + if version("lm-eval") != "0.4.5": + raise RuntimeError( + "This recipe requires EleutherAI Eval Harness v0.4.5. 
" + "Please install with `pip install lm-eval==0.4.5`" + ) + + # General variable initialization self.device = utils.get_device(device=cfg.device) self.dtype = training.get_dtype(dtype=cfg.dtype, device=self.device) self.logger = utils.get_logger(cfg.get("log_level", "info")) @@ -568,7 +549,7 @@ def evaluate(self) -> None: self.logger.info( f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" ) - formatted_output = lm_eval.utils.make_table(output) + formatted_output = make_table(output) self.logger.info(f"\n\n{formatted_output}\n") diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index fc4cd2fae2..1c3a7bb65f 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import builtins import math import re import runpy @@ -64,17 +63,10 @@ def test_torchtune_checkpoint_eval_results( out = caplog.text - # v0.4.2 format - # | Tasks |Version|Filter|n-shot|Metric|Value | |Stderr| - # |--------------|------:|------|-----:|------|-----:|---|-----:| - # |truthfulqa_mc2| 2|none | 0|acc |0.4497|± |0.1067| - - # v0.4.3 format + # Format of output is: # | Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| # |--------------|------:|------|-----:|------|---|-----:|---|-----:| # |truthfulqa_mc2| 2|none | 0|acc |↑ |0.4497|± |0.1067| - - # The below RegEx command will pick up both formats search_results = re.search( r"acc(?:_norm)?\s*\|?\s*(?:\↑\s*\|?)?([\d.]+)", out.strip() ) @@ -83,18 +75,20 @@ def test_torchtune_checkpoint_eval_results( assert math.isclose(acc_result, expected_acc, abs_tol=0.05) @pytest.fixture - def hide_available_pkg(self, monkeypatch): - import_orig = builtins.__import__ + def hide_correct_version_number(self, monkeypatch): + import importlib.metadata + + import_orig = importlib.metadata.version def mocked_import(name, *args, **kwargs): - if name == "lm_eval": - raise ImportError() + if name == "lm-eval": + return "0.4.4" # Hardcode wrong version number return import_orig(name, *args, **kwargs) - monkeypatch.setattr(builtins, "__import__", mocked_import) + monkeypatch.setattr(importlib.metadata, "version", mocked_import) @pytest.mark.integration_test - @pytest.mark.usefixtures("hide_available_pkg") + @pytest.mark.usefixtures("hide_correct_version_number") def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir): ckpt = "llama2_tune" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -116,16 +110,17 @@ def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir): device=cpu \ """.split() + model_config = llama2_test_config() + cmd = cmd + model_config + monkeypatch.setattr(sys, "argv", cmd) - with pytest.raises(SystemExit, match="1"): + with pytest.raises( + RuntimeError, + match="This recipe requires EleutherAI Eval Harness v0.4.5. 
" + "Please install with `pip install lm-eval==0.4.5`", + ): runpy.run_path(TUNE_PATH, run_name="__main__") - printed_err = capsys.readouterr().out - assert ( - "You must install the EleutherAI Eval Harness to run this recipe" - in printed_err - ) - @pytest.mark.integration_test def test_eval_recipe_errors_with_quantization_hf_checkpointer( self, capsys, monkeypatch, tmpdir diff --git a/tests/torchtune/_cli/test_download.py b/tests/torchtune/_cli/test_download.py index 5dbd695226..8a6d6ba0ab 100644 --- a/tests/torchtune/_cli/test_download.py +++ b/tests/torchtune/_cli/test_download.py @@ -65,3 +65,44 @@ def test_download_calls_snapshot(self, capsys, monkeypatch, snapshot_download): # Make sure it was called twice assert snapshot_download.call_count == 3 + + # GatedRepoError without --hf-token (expect prompt for token) + def test_gated_repo_error_no_token(self, capsys, monkeypatch, snapshot_download): + model = "meta-llama/Llama-2-7b" + testargs = f"tune download {model}".split() + monkeypatch.setattr(sys, "argv", testargs) + + # Expect GatedRepoError without --hf-token provided + with pytest.raises(SystemExit, match="2"): + runpy.run_path(TUNE_PATH, run_name="__main__") + + out_err = capsys.readouterr() + # Check that error message prompts for --hf-token + assert ( + "It looks like you are trying to access a gated repository." in out_err.err + ) + assert ( + "Please ensure you have access to the repository and have provided the proper Hugging Face API token" + in out_err.err + ) + + # GatedRepoError with --hf-token (should not ask for token) + def test_gated_repo_error_with_token(self, capsys, monkeypatch, snapshot_download): + model = "meta-llama/Llama-2-7b" + testargs = f"tune download {model} --hf-token valid_token".split() + monkeypatch.setattr(sys, "argv", testargs) + + # Expect GatedRepoError with --hf-token provided + with pytest.raises(SystemExit, match="2"): + runpy.run_path(TUNE_PATH, run_name="__main__") + + out_err = capsys.readouterr() + # Check that error message does not prompt for --hf-token again + assert ( + "It looks like you are trying to access a gated repository." in out_err.err + ) + assert "Please ensure you have access to the repository." 
in out_err.err + assert ( + "Please ensure you have access to the repository and have provided the proper Hugging Face API token" + not in out_err.err + ) diff --git a/tests/torchtune/models/clip/test_clip_image_transform.py b/tests/torchtune/models/clip/test_clip_image_transform.py index dd54fbaddd..a29ef83cef 100644 --- a/tests/torchtune/models/clip/test_clip_image_transform.py +++ b/tests/torchtune/models/clip/test_clip_image_transform.py @@ -37,17 +37,6 @@ class TestCLIPImageTransform: "expected_tile_max": [1.0, 1.0], "expected_tile_min": [0.0, 0.0], "expected_aspect_ratio": [1, 2], - "pad_max_tiles": False, - }, - { - "image_size": (100, 400, 3), - "expected_shape": torch.Size([4, 3, 224, 224]), - "resize_to_max_canvas": False, - "expected_tile_means": [0.2230, 0.1763, 0.0, 0.0], - "expected_tile_max": [1.0, 1.0, 0.0, 0.0], - "expected_tile_min": [0.0, 0.0, 0.0, 0.0], - "expected_aspect_ratio": [1, 2], - "pad_max_tiles": True, }, { "image_size": (1000, 300, 3), @@ -57,7 +46,6 @@ class TestCLIPImageTransform: "expected_tile_max": [0.9705, 0.9694, 0.9521, 0.9314], "expected_tile_min": [0.0353, 0.0435, 0.0528, 0.0], "expected_aspect_ratio": [4, 1], - "pad_max_tiles": False, }, { "image_size": (200, 200, 3), @@ -67,7 +55,6 @@ class TestCLIPImageTransform: "expected_tile_max": [0.9922, 0.9926, 0.9970, 0.9908], "expected_tile_min": [0.0056, 0.0069, 0.0059, 0.0033], "expected_aspect_ratio": [2, 2], - "pad_max_tiles": False, "pad_tiles": 1, }, { @@ -78,17 +65,6 @@ class TestCLIPImageTransform: "expected_tile_max": [1.0, 1.0, 1.0], "expected_tile_min": [0.0, 0.0, 0.0], "expected_aspect_ratio": [3, 1], - "pad_max_tiles": False, - }, - { - "image_size": (600, 200, 3), - "expected_shape": torch.Size([4, 3, 224, 224]), - "resize_to_max_canvas": False, - "expected_tile_means": [0.4473, 0.4469, 0.3032, 0.0], - "expected_tile_max": [1.0, 1.0, 1.0, 0.0], - "expected_tile_min": [0.0, 0.0, 0.0, 0.0], - "expected_aspect_ratio": [3, 1], - "pad_max_tiles": True, }, ], ) @@ -103,7 +79,6 @@ def test_clip_image_transform(self, params): resample="bilinear", dtype=torch.float32, resize_to_max_canvas=params["resize_to_max_canvas"], - pad_max_tiles=params["pad_max_tiles"], ) image_transform_inference = CLIPImageTransformInference( @@ -115,7 +90,6 @@ def test_clip_image_transform(self, params): resample="bilinear", resize_to_max_canvas=params["resize_to_max_canvas"], antialias=True, - pad_max_tiles=params["pad_max_tiles"], ) # Generate a deterministic image using np.arange for reproducibility @@ -169,13 +143,7 @@ def test_clip_image_transform(self, params): ), f"Expected aspect ratio {params['expected_aspect_ratio']} but got {tuple(output_ar.numpy())}" # number of tiles matches the product of the aspect ratio - if params["pad_max_tiles"]: - # max_num_tiles=4. 
- assert ( - 4 == output_image.shape[0] - ), f"Expected 4 tiles but got {output_image.shape[0]}" - else: - expected_num_tiles = output_ar[0] * output_ar[1] - assert ( - expected_num_tiles == output_image.shape[0] - ), f"Expected {expected_num_tiles} tiles but got {output_image.shape[0]}" + expected_num_tiles = output_ar[0] * output_ar[1] + assert ( + expected_num_tiles == output_image.shape[0] + ), f"Expected {expected_num_tiles} tiles but got {output_image.shape[0]}" diff --git a/tests/torchtune/modules/test_cosine_with_warmup.py b/tests/torchtune/modules/test_cosine_with_warmup.py index 274170e971..1cc4631708 100644 --- a/tests/torchtune/modules/test_cosine_with_warmup.py +++ b/tests/torchtune/modules/test_cosine_with_warmup.py @@ -13,7 +13,7 @@ from tests.test_utils import assert_expected -from torchtune.modules import get_cosine_schedule_with_warmup +from torchtune.training.lr_schedulers import get_cosine_schedule_with_warmup class TestCosineLR: diff --git a/torchtune/_cli/download.py b/torchtune/_cli/download.py index b35b81cca0..82b4935c01 100644 --- a/torchtune/_cli/download.py +++ b/torchtune/_cli/download.py @@ -131,12 +131,18 @@ def _download_cmd(self, args: argparse.Namespace) -> None: token=args.hf_token, ) except GatedRepoError: - self._parser.error( - "It looks like you are trying to access a gated repository. Please ensure you " - "have access to the repository and have provided the proper Hugging Face API token " - "using the option `--hf-token` or by running `huggingface-cli login`." - "You can find your token by visiting https://huggingface.co/settings/tokens" - ) + if args.hf_token: + self._parser.error( + "It looks like you are trying to access a gated repository. Please ensure you " + "have access to the repository." + ) + else: + self._parser.error( + "It looks like you are trying to access a gated repository. Please ensure you " + "have access to the repository and have provided the proper Hugging Face API token " + "using the option `--hf-token` or by running `huggingface-cli login`." + "You can find your token by visiting https://huggingface.co/settings/tokens" + ) except RepositoryNotFoundError: self._parser.error( f"Repository '{args.repo_id}' not found on the Hugging Face Hub." diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index ca1ce6150e..3f6697f593 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -328,6 +328,18 @@ class Recipe: name="qwen2/evaluation", file_path="qwen2/evaluation.yaml", ), + Config( + name="gemma/evaluation", + file_path="gemma/evaluation.yaml", + ), + Config( + name="phi3/evaluation", + file_path="phi3/evaluation.yaml", + ), + Config( + name="mistral/evaluation", + file_path="mistral/evaluation.yaml", + ), ], supports_distributed=False, ), diff --git a/torchtune/data/_collate.py b/torchtune/data/_collate.py index 562a81ff8f..055ab77350 100644 --- a/torchtune/data/_collate.py +++ b/torchtune/data/_collate.py @@ -222,6 +222,7 @@ def padded_collate_tiled_images_and_mask( padding_idx: int = 0, ignore_idx: int = CROSS_ENTROPY_IGNORE_IDX, pad_direction: str = "right", + pad_max_tiles: Optional[int] = None, pad_max_images: Optional[int] = None, ) -> Dict[str, torch.Tensor]: """Pad a batch of text sequences, tiled image tensors, aspect ratios, @@ -259,6 +260,8 @@ def padded_collate_tiled_images_and_mask( :func:`torch.nn.utils.rnn.pad_sequence`, otherwise if ``pad_direction="left"``, we use :func:`torchtune.data.left_pad_sequence`. For training, we typically want to pad from the right. 
For inference, we typically want to pad from the left. Defaults to "right". + pad_max_tiles (Optional[int]): Maximum number of tiles to pad to. If None, will pad to the largest number of tiles + in the batch. Defaults to None. pad_max_images (Optional[int]): Maximum number of images to pad to. If None, will pad to the largest number of images in the batch. Defaults to None. @@ -272,6 +275,7 @@ def padded_collate_tiled_images_and_mask( Raises: ValueError: if ``pad_direction`` is not one of "left" or "right". + ValueError: if pad_max_tiles is set to a value less than the largest number of tiles in an image. Example: >>> image_id = 1 @@ -355,6 +359,13 @@ def padded_collate_tiled_images_and_mask( for sample in batch for image in sample["encoder_input"]["images"] ) + if pad_max_tiles is not None: + if pad_max_tiles < max_num_tiles: + raise ValueError( + f"More tiles in image {max_num_tiles}, than pad_max_tiles {pad_max_tiles}" + ) + max_num_tiles = pad_max_tiles + # Second loop: pad images and masks to max number of tiles, max text seq len in batch batch_images = [] batch_masks = [] diff --git a/torchtune/data/_prompt_templates.py b/torchtune/data/_prompt_templates.py index 67167b3ed9..a7fa070a2e 100644 --- a/torchtune/data/_prompt_templates.py +++ b/torchtune/data/_prompt_templates.py @@ -107,16 +107,17 @@ def __call__( """ formatted_dialogue = [] for message in messages: + content = message.content if message.role in self.template: prepend_tag = self.template[message.role][0] append_tag = self.template[message.role][1] - content = ( - [{"type": "text", "content": prepend_tag}] - + message.content - + [{"type": "text", "content": append_tag}] - ) - else: content = message.content + + if isinstance(prepend_tag, str) and len(prepend_tag) > 0: + content = [{"type": "text", "content": prepend_tag}] + content + + if isinstance(append_tag, str) and len(append_tag) > 0: + content = content + [{"type": "text", "content": append_tag}] formatted_dialogue.append( Message( role=message.role, @@ -183,13 +184,20 @@ def __call__( and index == len(messages) - 1 and len(message.text_content) == 0 ): - content = [{"type": "text", "content": prepend_tag}] + message.content + content = message.content + if isinstance(prepend_tag, str) and len(prepend_tag) > 0: + content = [ + {"type": "text", "content": prepend_tag} + ] + message.content else: - content = ( - [{"type": "text", "content": prepend_tag}] - + message.content - + [{"type": "text", "content": append_tag}] - ) + content = message.content + + if isinstance(prepend_tag, str) and len(prepend_tag) > 0: + content = [{"type": "text", "content": prepend_tag}] + content + + if isinstance(append_tag, str) and len(append_tag) > 0: + content = content + [{"type": "text", "content": append_tag}] + formatted_dialogue.append( Message( role=message.role, diff --git a/torchtune/models/clip/_transform.py b/torchtune/models/clip/_transform.py index 533d27c865..a9b60624ff 100644 --- a/torchtune/models/clip/_transform.py +++ b/torchtune/models/clip/_transform.py @@ -15,7 +15,6 @@ find_supported_resolutions, get_canvas_best_fit, ) -from torchtune.modules.transforms.vision_utils.pad_dim_to_size import pad_dim_to_size from torchtune.modules.transforms.vision_utils.resize_with_pad import resize_with_pad from torchtune.modules.transforms.vision_utils.tile_crop import tile_crop @@ -63,7 +62,6 @@ class CLIPImageTransform: This will be used to generate possible_resolutions, e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224. Default 4. 
- pad_max_tiles (bool): If True, the image will be padded to have tiles == max_num_tiles. Default False. dtype (torch.dtype): Data type of the output image. Default torch.bfloat16. resample (str): Resampling method used when resizing images. Supports any enum of ``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic". @@ -101,7 +99,6 @@ def __init__( possible_resolutions: Optional[List[Tuple[int, int]]] = None, tile_size: int = 224, max_num_tiles: Optional[int] = 4, - pad_max_tiles: bool = False, dtype: torch.dtype = torch.bfloat16, resample: str = "bilinear", resize_to_max_canvas: bool = False, @@ -142,7 +139,6 @@ def __init__( # tile_crop self.tile_size = tile_size self.tile_crop = tile_crop - self.pad_tile_size = max_num_tiles if pad_max_tiles else None def __call__( self, sample: Mapping[str, Any], inference: bool = False @@ -190,8 +186,6 @@ def __call__( # Divide the image into equally sized tiles image = self.tile_crop(image=image, tile_size=self.tile_size) - if self.pad_tile_size: - image = pad_dim_to_size(image, size=self.pad_tile_size, dim=0) aspect_ratio = torch.tensor(best_resolution).reshape(-1) // self.tile_size diff --git a/torchtune/models/llama3_2_vision/_transform.py b/torchtune/models/llama3_2_vision/_transform.py index 4272f5a1e2..4dc4f781e9 100644 --- a/torchtune/models/llama3_2_vision/_transform.py +++ b/torchtune/models/llama3_2_vision/_transform.py @@ -86,7 +86,6 @@ def __init__( tile_size=tile_size, possible_resolutions=None, max_num_tiles=max_num_tiles, - pad_max_tiles=True, resample="bilinear", resize_to_max_canvas=False, ) diff --git a/torchtune/modules/__init__.py b/torchtune/modules/__init__.py index 23d53d9fa1..32af70f8e5 100644 --- a/torchtune/modules/__init__.py +++ b/torchtune/modules/__init__.py @@ -34,7 +34,6 @@ "TanhGate", "FeedForward", "FrozenNF4Linear", - "get_cosine_schedule_with_warmup", "KVCache", "RotaryPositionalEmbeddings", "RMSNorm", @@ -51,4 +50,5 @@ "local_kv_cache", "delete_kv_caches", "disable_kv_cache", + "get_cosine_schedule_with_warmup", ] diff --git a/torchtune/modules/attention.py b/torchtune/modules/attention.py index e3f7451ed1..879f0679cf 100644 --- a/torchtune/modules/attention.py +++ b/torchtune/modules/attention.py @@ -304,7 +304,7 @@ def forward( k, v, mask=mask, - dropout_p=self.attn_dropout, + dropout_p=self.attn_dropout if self.training else 0.0, is_causal=self.kv_cache is None and mask is None and self.is_causal, ) diff --git a/torchtune/modules/low_precision/_utils.py b/torchtune/modules/low_precision/_utils.py deleted file mode 100644 index 30f02911e6..0000000000 --- a/torchtune/modules/low_precision/_utils.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from datetime import datetime -from importlib.metadata import PackageNotFoundError, version -from typing import Optional, Tuple - -import torch - -import torchao - - -def _is_fbcode(): - return not hasattr(torch.version, "git_version") - - -def _nightly_version_ge(ao_version_str: str, date: str) -> bool: - """ - Compare a torchao nightly version to a date of the form - %Y-%m-%d. 
- - Returns True if the nightly version is greater than or equal to - the date, False otherwise - """ - ao_datetime = datetime.strptime(ao_version_str.split("+")[0], "%Y.%m.%d") - return ao_datetime >= datetime.strptime(date, "%Y-%m-%d") - - -def _get_torchao_version() -> Tuple[Optional[str], Optional[bool]]: - """ - Get torchao version. Returns a tuple of two elements, the first element - is the version string, the second element is whether it's a nightly version. - For fbcode usage, return None, None. - - Checks: - 1) is_fbcode, then - 3) torchao.__version__ (only defined for torchao >= 0.3.0), then - 4) importlib's version(torchao) - - - If none of these work, raise an error. - - """ - if _is_fbcode(): - return None, None - try: - ao_version = torchao.__version__ - except AttributeError: - try: - ao_version = version("torchao") - except Exception as e: - raise PackageNotFoundError("Could not find torchao version") from e - is_nightly = "dev" in ao_version - return ao_version, is_nightly diff --git a/torchtune/modules/lr_schedulers.py b/torchtune/modules/lr_schedulers.py index ee5f2cd91d..30d73e5dd5 100644 --- a/torchtune/modules/lr_schedulers.py +++ b/torchtune/modules/lr_schedulers.py @@ -8,8 +8,13 @@ import torch from torch.optim.lr_scheduler import LambdaLR +from torchtune.utils._logging import deprecated +@deprecated( + msg="Please use get_cosine_schedule_with_warmup from torchtune.training.lr_schedulers instead. \ + " +) def get_cosine_schedule_with_warmup( optimizer: torch.optim.Optimizer, num_warmup_steps: int, diff --git a/torchtune/modules/model_fusion/_fusion.py b/torchtune/modules/model_fusion/_fusion.py index 46507deb17..cf488d3a02 100644 --- a/torchtune/modules/model_fusion/_fusion.py +++ b/torchtune/modules/model_fusion/_fusion.py @@ -391,18 +391,26 @@ def setup_caches( ) def caches_are_setup(self) -> bool: - """Check if the key value caches are setup.""" + """ + Check if the key value caches are setup. This means `setup_caches` has been called, and + the relevant attention modules in the model have created `KVCache`s. + """ return self.decoder.caches_are_setup() def caches_are_enabled(self) -> bool: """ - Checks if the key value caches are enabled. KV-caches must also have been setup - for them to be enabled. + Checks if the key value caches are enabled. Once KV-caches have been setup, the relevant + attention modules will be "enabled" and all forward passes will update the caches. This behaviour + can be disabled without altering the state of the KV-caches by "disabling" the KV-caches + using ``torchtune.modules.disable_kv_cache``, upon which ``caches_are_enabled`` would return False. """ return self.decoder.caches_are_enabled() def reset_caches(self): - """Reset the key value caches.""" + """ + Resets KV-cache buffers on relevant attention modules to zero, and reset cache positions to zero, + without deleting or reallocating cache tensors. + """ self.decoder.reset_caches() def forward( diff --git a/torchtune/modules/transformer.py b/torchtune/modules/transformer.py index eb6cd6e570..3b2d356c29 100644 --- a/torchtune/modules/transformer.py +++ b/torchtune/modules/transformer.py @@ -187,15 +187,26 @@ def setup_caches( self.attn.setup_cache(batch_size, dtype, encoder_max_seq_len) def caches_are_setup(self) -> bool: - """Check if the key value caches are setup.""" + """ + Check if the key value caches are setup. This means `setup_caches` has been called, and + the relevant attention modules in the model have created `KVCache`s. 
+ """ return self.attn.kv_cache is not None def caches_are_enabled(self) -> bool: - """Check if key value caches are enabled.""" + """ + Checks if the key value caches are enabled. Once KV-caches have been setup, the relevant + attention modules will be "enabled" and all forward passes will update the caches. This behaviour + can be disabled without altering the state of the KV-caches by "disabling" the KV-caches + using ``torchtune.modules.disable_kv_cache``, upon which ``caches_are_enabled`` would return False. + """ return self.attn.cache_enabled def reset_cache(self): - """Reset the key value caches.""" + """ + Resets KV-cache buffers on relevant attention modules to zero, and reset cache positions to zero, + without deleting or reallocating cache tensors. + """ self.attn.reset_cache() def _skip_mask(self, mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]: diff --git a/torchtune/training/__init__.py b/torchtune/training/__init__.py index 9e33f40067..bd111d05df 100644 --- a/torchtune/training/__init__.py +++ b/torchtune/training/__init__.py @@ -51,6 +51,7 @@ TOTAL_EPOCHS_KEY, update_state_dict_for_classifier, ) +from torchtune.training.lr_schedulers import get_cosine_schedule_with_warmup from torchtune.training.memory import ( cleanup_before_training, create_optim_in_bwd_wrapper, @@ -91,6 +92,7 @@ "STEPS_KEY", "TOTAL_EPOCHS_KEY", "get_quantizer_mode", + "get_cosine_schedule_with_warmup", "cleanup_before_training", "create_optim_in_bwd_wrapper", "get_memory_stats", diff --git a/torchtune/training/_profiler.py b/torchtune/training/_profiler.py index f8004a356e..d296006b5d 100644 --- a/torchtune/training/_profiler.py +++ b/torchtune/training/_profiler.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. +import datetime import os import time from functools import partial @@ -97,8 +98,13 @@ def trace_handler( # Use tensorboard trace handler rather than directly exporting chrome traces since # tensorboard doesn't seem to be able to parse traces with prof.export_chrome_trace + + now = datetime.datetime.now() + exporter = tensorboard_trace_handler( - curr_trace_dir, worker_name=f"rank{rank}", use_gzip=True + curr_trace_dir, + worker_name=f"r0-{now.year}-{now.month}-{now.day}-{now.hour}-{now.minute}", + use_gzip=True, ) exporter(prof) diff --git a/torchtune/training/lr_schedulers.py b/torchtune/training/lr_schedulers.py new file mode 100644 index 0000000000..ee5f2cd91d --- /dev/null +++ b/torchtune/training/lr_schedulers.py @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +from torch.optim.lr_scheduler import LambdaLR + + +def get_cosine_schedule_with_warmup( + optimizer: torch.optim.Optimizer, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float = 0.5, + last_epoch: int = -1, +) -> LambdaLR: + """ + Create a learning rate schedule that linearly increases the learning rate from + 0.0 to lr over ``num_warmup_steps``, then decreases to 0.0 on a cosine schedule over + the remaining ``num_training_steps-num_warmup_steps`` (assuming ``num_cycles`` = 0.5). + + This is based on the Hugging Face implementation + https://github.com/huggingface/transformers/blob/v4.23.1/src/transformers/optimization.py#L104. + + Args: + optimizer (torch.optim.Optimizer): The optimizer for which to + schedule the learning rate. 
+ num_warmup_steps (int): The number of steps for the warmup phase. + num_training_steps (int): The total number of training steps. + num_cycles (float): The number of waves in the cosine schedule. Defaults to 0.5 + (decrease from the max value to 0 following a half-cosine). + last_epoch (int): The index of the last epoch when resuming training. Defaults to -1 + + Returns: + torch.optim.lr_scheduler.LambdaLR with the appropriate schedule. + """ + + def lr_lambda(current_step: int) -> float: + # linear warmup phase + if current_step < num_warmup_steps: + return current_step / max(1, num_warmup_steps) + + # cosine + progress = (current_step - num_warmup_steps) / max( + 1, num_training_steps - num_warmup_steps + ) + + cosine_lr_multiple = 0.5 * ( + 1.0 + math.cos(math.pi * num_cycles * 2.0 * progress) + ) + return max(0.0, cosine_lr_multiple) + + return LambdaLR(optimizer, lr_lambda, last_epoch) diff --git a/torchtune/training/quantization.py b/torchtune/training/quantization.py index debe49ab15..465e987981 100644 --- a/torchtune/training/quantization.py +++ b/torchtune/training/quantization.py @@ -6,7 +6,13 @@ from typing import Callable, Optional -from torchao.dtypes import TensorCoreTiledLayoutType +from torchtune.utils._import_guard import _USE_NEW_TENSOR_CORE_TILED_LAYOUT_API + +if _USE_NEW_TENSOR_CORE_TILED_LAYOUT_API: + from torchao.dtypes import TensorCoreTiledLayout +else: + from torchao.dtypes import TensorCoreTiledLayoutType as TensorCoreTiledLayout + from torchao.quantization import ( int4_weight_only, int8_dynamic_activation_int4_weight, @@ -88,7 +94,7 @@ def __init__(self, groupsize: int = 128, inner_k_tiles: int = 8): self.inner_k_tiles = inner_k_tiles def quantize(self, model): - layout_type = TensorCoreTiledLayoutType(self.inner_k_tiles) + layout_type = TensorCoreTiledLayout(self.inner_k_tiles) quantize_fn = int4_weight_only(self.groupsize, layout_type) quantize_(model, quantize_fn) return model diff --git a/torchtune/utils/_import_guard.py b/torchtune/utils/_import_guard.py index c0779271fb..93e7941fbc 100644 --- a/torchtune/utils/_import_guard.py +++ b/torchtune/utils/_import_guard.py @@ -5,7 +5,8 @@ # LICENSE file in the root directory of this source tree. import torch -from torchtune.utils._version import torch_version_ge +import torchao +from torchtune.utils._version import _is_fbcode, _nightly_version_ge, torch_version_ge # We can only use flex attention / BlockMask if torch version >= 2.5.0 and GPU is Turing / SM75 and above _SUPPORTS_FLEX_ATTENTION = ( @@ -13,3 +14,16 @@ and torch.cuda.is_available() and torch.cuda.get_device_capability() >= (7, 5) ) + +torchao_version = torchao.__version__ + +_USE_NEW_TENSOR_CORE_TILED_LAYOUT_API = _is_fbcode() or ( + not _is_fbcode() + and ( + ("dev" not in torchao_version and torchao_version >= "0.6.0") + or ( + "dev" in torchao_version + and _nightly_version_ge(torchao_version, "2024-10-10") + ) + ) +) diff --git a/torchtune/utils/_version.py b/torchtune/utils/_version.py index 830a8ba079..9dcbd8e450 100644 --- a/torchtune/utils/_version.py +++ b/torchtune/utils/_version.py @@ -3,6 +3,9 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ +from datetime import datetime + import torch @@ -23,3 +26,21 @@ def torch_version_ge(version: str) -> bool: True """ return version in torch.__version__ or torch.__version__ >= version + + +def _is_fbcode(): + return not hasattr(torch.version, "git_version") + + +def _nightly_version_ge(ao_version_str: str, date: str) -> bool: + """ + Compare a torchao nightly version to a date of the form + %Y-%m-%d. + + Returns True if the nightly version is greater than or equal to + the date, False otherwise + """ + ao_datetime = datetime.strptime( + ao_version_str.split("+")[0].split("dev")[1], "%Y%m%d" + ) + return ao_datetime >= datetime.strptime(date, "%Y-%m-%d")
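
Note on the relocated LR scheduler: with this change ``get_cosine_schedule_with_warmup`` is importable from ``torchtune.training`` (the old ``torchtune.modules`` path still resolves, but now emits a deprecation warning via the ``@deprecated`` decorator added above). A minimal usage sketch follows; the model, learning rate, and step counts are hypothetical and only illustrate the call signature.

```python
import torch
from torchtune.training import get_cosine_schedule_with_warmup

# Hypothetical model and optimizer, just to exercise the schedule.
model = torch.nn.Linear(16, 16)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Linear warmup over the first 100 optimizer steps, then cosine decay
# towards 0 over the remaining 900 steps (num_cycles defaults to 0.5).
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000
)

for _ in range(1000):
    loss = model(torch.randn(4, 16)).sum()
    loss.backward()
    optimizer.step()
    scheduler.step()  # one scheduler step per optimizer update
    optimizer.zero_grad()
```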
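Note on the torchao version gate: ``_nightly_version_ge`` assumes nightly version strings embed a build date after a ``dev`` marker, and ``_USE_NEW_TENSOR_CORE_TILED_LAYOUT_API`` in ``_import_guard.py`` then chooses between the new ``TensorCoreTiledLayout`` and the older ``TensorCoreTiledLayoutType`` import. The sketch below mirrors that parsing with made-up version strings (illustrative only, not pinned torchao releases).

```python
from datetime import datetime


def nightly_version_ge(ao_version_str: str, date: str) -> bool:
    # e.g. "0.7.0.dev20241012+cu121" -> drop the local "+cu121" tag,
    # keep the digits after "dev", and parse them as YYYYMMDD.
    ao_datetime = datetime.strptime(
        ao_version_str.split("+")[0].split("dev")[1], "%Y%m%d"
    )
    return ao_datetime >= datetime.strptime(date, "%Y-%m-%d")


print(nightly_version_ge("0.7.0.dev20241012+cu121", "2024-10-10"))  # True
print(nightly_version_ge("0.6.0.dev20240901", "2024-10-10"))        # False

# Stable releases never reach this helper: for them the guard falls back to a
# plain string comparison ("dev" not in version and version >= "0.6.0"), which
# picks TensorCoreTiledLayout for torchao >= 0.6.0 and TensorCoreTiledLayoutType otherwise.
```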