From dd4cf33d3ae81629578637cab8be4a120f34dfe7 Mon Sep 17 00:00:00 2001
From: Optimox
Date: Sun, 27 Oct 2024 10:21:32 +0100
Subject: [PATCH] update configs to match parallel PR

---
 recipes/configs/gemma2/27B_full.yaml                | 4 +++-
 recipes/configs/gemma2/27B_lora.yaml                | 4 +++-
 recipes/configs/gemma2/27B_lora_single_device.yaml  | 5 +++--
 recipes/configs/gemma2/27B_qlora_single_device.yaml | 5 +++--
 recipes/configs/gemma2/2B_full.yaml                 | 4 +++-
 recipes/configs/gemma2/2B_lora.yaml                 | 4 +++-
 recipes/configs/gemma2/2B_lora_single_device.yaml   | 5 +++--
 recipes/configs/gemma2/2B_qlora_single_device.yaml  | 5 +++--
 recipes/configs/gemma2/9B_full.yaml                 | 4 +++-
 recipes/configs/gemma2/9B_lora.yaml                 | 4 +++-
 recipes/configs/gemma2/9B_lora_single_device.yaml   | 5 +++--
 recipes/configs/gemma2/9B_qlora_single_device.yaml  | 5 +++--
 12 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/recipes/configs/gemma2/27B_full.yaml b/recipes/configs/gemma2/27B_full.yaml
index eebeefbd4..dee049024 100644
--- a/recipes/configs/gemma2/27B_full.yaml
+++ b/recipes/configs/gemma2/27B_full.yaml
@@ -23,6 +23,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -53,6 +54,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -69,4 +71,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-27b-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma2/27B_lora.yaml b/recipes/configs/gemma2/27B_lora.yaml
index e78b40633..265895090 100644
--- a/recipes/configs/gemma2/27B_lora.yaml
+++ b/recipes/configs/gemma2/27B_lora.yaml
@@ -23,6 +23,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -65,6 +66,7 @@ batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -81,4 +83,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-27b-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma2/27B_lora_single_device.yaml b/recipes/configs/gemma2/27B_lora_single_device.yaml
index 56727e529..e245aafa9 100644
--- a/recipes/configs/gemma2/27B_lora_single_device.yaml
+++ b/recipes/configs/gemma2/27B_lora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -64,7 +65,7 @@ batch_size: 2
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8
-compile: False
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -82,7 +83,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-27b-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/gemma2/27B_qlora_single_device.yaml b/recipes/configs/gemma2/27B_qlora_single_device.yaml
index a1b7fcd37..2f0e7d6ca 100644
--- a/recipes/configs/gemma2/27B_qlora_single_device.yaml
+++ b/recipes/configs/gemma2/27B_qlora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -64,7 +65,7 @@ batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile: False
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -82,7 +83,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-27b-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/gemma2/2B_full.yaml b/recipes/configs/gemma2/2B_full.yaml
index 9386fae4b..e302dd759 100644
--- a/recipes/configs/gemma2/2B_full.yaml
+++ b/recipes/configs/gemma2/2B_full.yaml
@@ -23,6 +23,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -55,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -71,4 +73,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma2/2B_lora.yaml b/recipes/configs/gemma2/2B_lora.yaml
index e6ef6e6e9..9a439ee0a 100644
--- a/recipes/configs/gemma2/2B_lora.yaml
+++ b/recipes/configs/gemma2/2B_lora.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -67,6 +68,7 @@ batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -83,4 +85,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma2/2B_lora_single_device.yaml b/recipes/configs/gemma2/2B_lora_single_device.yaml
index 484f133b4..1a2703fb4 100644
--- a/recipes/configs/gemma2/2B_lora_single_device.yaml
+++ b/recipes/configs/gemma2/2B_lora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -66,7 +67,7 @@ batch_size: 8
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 2
-compile: False
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -84,7 +85,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/gemma2/2B_qlora_single_device.yaml b/recipes/configs/gemma2/2B_qlora_single_device.yaml
index b5d7c9147..c2525460f 100644
--- a/recipes/configs/gemma2/2B_qlora_single_device.yaml
+++ b/recipes/configs/gemma2/2B_qlora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -66,7 +67,7 @@ batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile: False
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -84,7 +85,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/gemma2/9B_full.yaml b/recipes/configs/gemma2/9B_full.yaml
index d599970a2..0002b1c3b 100644
--- a/recipes/configs/gemma2/9B_full.yaml
+++ b/recipes/configs/gemma2/9B_full.yaml
@@ -23,6 +23,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -53,6 +54,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -69,4 +71,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-9b-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma2/9B_lora.yaml b/recipes/configs/gemma2/9B_lora.yaml
index 1cf209a24..5b0141e9e 100644
--- a/recipes/configs/gemma2/9B_lora.yaml
+++ b/recipes/configs/gemma2/9B_lora.yaml
@@ -23,6 +23,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -65,6 +66,7 @@ batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -81,4 +83,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-9b-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma2/9B_lora_single_device.yaml b/recipes/configs/gemma2/9B_lora_single_device.yaml
index 57d066bb0..197ee121a 100644
--- a/recipes/configs/gemma2/9B_lora_single_device.yaml
+++ b/recipes/configs/gemma2/9B_lora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -64,7 +65,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 2
-compile: False
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -82,7 +83,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-9b-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/gemma2/9B_qlora_single_device.yaml b/recipes/configs/gemma2/9B_qlora_single_device.yaml
index 3c198bead..80a330310 100644
--- a/recipes/configs/gemma2/9B_qlora_single_device.yaml
+++ b/recipes/configs/gemma2/9B_qlora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -64,7 +65,7 @@ batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile: False
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
@@ -82,7 +83,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma2-9b-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
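
Usage note (illustrative, not part of the patch): the newly exposed knobs are opt-in and keep the previous behaviour at the defaults added above. A minimal YAML sketch of a config fragment with the optional speed-ups switched on is shown below; it assumes only the keys and the torchtune.datasets.alpaca_dataset builder that appear in the hunks above.

# Hypothetical opt-in example, values flipped from the defaults added by this patch
dataset:
  _component_: torchtune.datasets.alpaca_dataset
  packed: True                # pack multiple samples per sequence for higher throughput
compile: True                 # compile model/loss with PyTorch for speed and memory gains
log_peak_memory_stats: True   # already enabled by default in this patch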