Skip to content

Commit

Permalink
WIP: non-working flex attention
Browse files Browse the repository at this point in the history
  • Loading branch information
Optimox committed Oct 22, 2024
1 parent e87f878 commit 6f89920
Show file tree
Hide file tree
Showing 8 changed files with 438 additions and 20 deletions.
6 changes: 3 additions & 3 deletions recipes/configs/gemma2/2B_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
path: /tmp/gemma2-2b/tokenizer.model
path: /tmp/gemma-2-2b/tokenizer.model

# Dataset
dataset:
Expand All @@ -33,14 +33,14 @@ model:

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma2-2b/
checkpoint_dir: /tmp/gemma-2-2b/
checkpoint_files: [
model-00001-of-00003.safetensors,
model-00002-of-00003.safetensors,
model-00003-of-00003.safetensors,
]
recipe_checkpoint: null
output_dir: /tmp/gemma2-2b
output_dir: /tmp/gemma-2-2b
model_type: GEMMA2
resume_from_checkpoint: False

Expand Down
6 changes: 3 additions & 3 deletions recipes/configs/gemma2/2B_lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
path: /tmp/gemma2-2b/tokenizer.model
path: /tmp/gemma-2-2b/tokenizer.model

# Dataset
dataset:
Expand All @@ -37,14 +37,14 @@ model:

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma2-2b/
checkpoint_dir: /tmp/gemma-2-2b/
checkpoint_files: [
model-00001-of-00003.safetensors,
model-00002-of-00003.safetensors,
model-00003-of-00003.safetensors,
]
recipe_checkpoint: null
output_dir: /tmp/gemma2-2b
output_dir: /tmp/gemma-2-2b
model_type: GEMMA2
resume_from_checkpoint: False

Expand Down
8 changes: 4 additions & 4 deletions recipes/configs/gemma2/2B_lora_single_device.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
path: /tmp/gemma2-2b/tokenizer.model
path: /tmp/gemma-2-2b/tokenizer.model

# Dataset
dataset:
Expand All @@ -44,7 +44,7 @@ checkpointer:
model-00003-of-00003.safetensors,
]
recipe_checkpoint: null
output_dir: /tmp/gemma2-2b
output_dir: /tmp/gemma-2-2b
model_type: GEMMA2
resume_from_checkpoint: False
save_adapter_weights_only: False
Expand All @@ -62,10 +62,10 @@ loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss

# Fine-tuning arguments
batch_size: 4
batch_size: 8
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 4
gradient_accumulation_steps: 2
compile: False

# Training env
Expand Down
6 changes: 3 additions & 3 deletions recipes/configs/gemma2/2B_qlora_single_device.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
path: /tmp/gemma2-2b/tokenizer.model
path: /tmp/gemma-2-2b/tokenizer.model

# Dataset
dataset:
Expand All @@ -37,14 +37,14 @@ model:

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma2-2b/
checkpoint_dir: /tmp/gemma-2-2b/
checkpoint_files: [
model-00001-of-00003.safetensors,
model-00002-of-00003.safetensors,
model-00003-of-00003.safetensors,
]
recipe_checkpoint: null
output_dir: /tmp/gemma2-2b
output_dir: /tmp/gemma-2-2b
model_type: GEMMA2
resume_from_checkpoint: False
save_adapter_weights_only: False
Expand Down
1 change: 0 additions & 1 deletion recipes/lora_finetune_single_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,6 @@ def save_checkpoint(self, epoch: int) -> None:
def _loss_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
# Shape [b, s], needed for the loss not the model
labels = batch.pop("labels")

# run model
with self.activations_handling_ctx:
logits = self._model(**batch)
Expand Down
Loading

0 comments on commit 6f89920

Please sign in to comment.