Feat/support config file for training args (FLock-io#7)
* feat: add support for training args

* fix: fix wrong dataclass name

* chore: update README

* feat: comment out model in training_args to bypass in training

* fix: filter by model size
nickcom007 authored Jun 3, 2024
1 parent 23a00a4 commit 534d606
Showing 5 changed files with 93 additions and 26 deletions.
28 changes: 17 additions & 11 deletions README.md
@@ -21,6 +21,23 @@ pip install -r requirements.txt
- [`merge.py`](merge.py) - Contains the utility function for merging LoRA weights. If you are training with LoRA, please ensure you merge the adapter before uploading to your Hugging Face repository.
- [`demo.py`](demo.py) - A training script that implements LoRA fine-tuning for a Gemma-2B model.
- [`full_automation.py`](full_automation.py) - A script that automates everything: getting a task, downloading the training data, fine-tuning Gemma-2B on the training data, merging the weights, uploading to your Hugging Face model repo, and submitting the task to fed-ledger.
- [`training_args.yaml`](training_args.yaml) - A YAML file that defines the per-model training hyperparameters for fine-tuning; see the example below. A detailed explanation of the LoRA config can be found here: [LoRA Fine-tuning & Hyperparameters Explained](https://www.entrypointai.com/blog/lora-fine-tuning/)
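
Each top-level key in `training_args.yaml` is a model ID mapped to its LoRA hyperparameters. For example, the entry for `Qwen/Qwen1.5-0.5B` added in this commit looks like this:

```yaml
Qwen/Qwen1.5-0.5B:
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 8
  num_train_epochs: 1
  lora_rank: 8
  lora_alpha: 16
  lora_dropout: 0.1
```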

### Full Automation

Simply run

```bash
TASK_ID=<task-id> FLOCK_API_KEY="<your-flock-api-key-stakes-as-node-for-the-task>" HF_TOKEN="<your-hf-token>" CUDA_VISIBLE_DEVICES=0 HF_USERNAME="your-hf-user-name" python full_automation.py
```

The above command will automatically train and submit multiple LLMs that fall under the maximum parameter limit for the given task.

#### Bypass certain models

If you want to bypass certain models, simply comment out their config blocks in [`training_args.yaml`](training_args.yaml), as shown in the example below.
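
For example, to skip `google/gemma-7b`, comment out its whole block (the values shown are the ones from this commit's `training_args.yaml`):

```yaml
# google/gemma-7b:
#   per_device_train_batch_size: 1
#   gradient_accumulation_steps: 8
#   num_train_epochs: 1
#   lora_rank: 4
#   lora_alpha: 8
#   lora_dropout: 0.1
```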

---

### Play with demo.py

@@ -40,7 +57,6 @@ This command initiates fine-tuning on the demo dataset, saves the fine-tuned mod

[HuggingFace Models Uploading](https://huggingface.co/docs/hub/en/models-uploading)


#### Getting the task id

Before you submit the model script for a task, you will first need to stake on the task as a node.
@@ -64,13 +80,3 @@ curl --location 'https://fed-ledger-prod.flock.io/api/v1/tasks/submit-result' \
}
}'
```

### Full Automation

Simply run

```bash
TASK_ID=<task-id> FLOCK_API_KEY="<your-flock-api-key-stakes-as-node-for-the-task>" HF_TOKEN="<your-hf-token>" CUDA_VISIBLE_DEVICES=0 HF_USERNAME="your-hf-user-name" python full_automation.py
```

The above command will automatically train and submit multiple LLMs that fall under the maximum parameter limit for the given task.
27 changes: 18 additions & 9 deletions demo.py
@@ -1,4 +1,5 @@
import os
from dataclasses import dataclass

import torch
from peft import LoraConfig
@@ -11,20 +12,28 @@
from utils.constants import model2template


@dataclass
class LoraTrainingArguments:
per_device_train_batch_size: int
gradient_accumulation_steps: int
num_train_epochs: int
lora_rank: int
lora_alpha: int
lora_dropout: float


def train_and_merge(
model_id: str = "google/gemma-2b",
num_train_epochs: int = 1,
per_device_train_batch_size: int = 1,
gradient_accumulation_steps: int = 8,
context_length: int = 512,
model_id: str, context_length: int, training_args: LoraTrainingArguments
):
assert model_id in model2template, f"model_id {model_id} not supported"
lora_config = LoraConfig(
r=8,
r=training_args.lora_rank,
target_modules=[
"q_proj",
"v_proj",
],
lora_alpha=training_args.lora_alpha,
lora_dropout=training_args.lora_dropout,
task_type="CAUSAL_LM",
)

@@ -36,16 +45,16 @@ def train_and_merge(
)

training_args = TrainingArguments(
per_device_train_batch_size=per_device_train_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
per_device_train_batch_size=training_args.per_device_train_batch_size,
gradient_accumulation_steps=training_args.gradient_accumulation_steps,
warmup_steps=100,
learning_rate=2e-4,
bf16=True,
logging_steps=20,
output_dir="outputs",
optim="paged_adamw_8bit",
remove_unused_columns=False,
num_train_epochs=num_train_epochs,
num_train_epochs=training_args.num_train_epochs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_id,
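
For reference, a minimal sketch of calling the refactored `train_and_merge` directly with the new dataclass. The hyperparameter values mirror the `google/gemma-2b` entry in `training_args.yaml`, and `context_length=512` simply reuses the old default rather than a task-specific value:

```python
from demo import LoraTrainingArguments, train_and_merge

# Hyperparameters mirror the google/gemma-2b entry in training_args.yaml.
args = LoraTrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    lora_rank=4,
    lora_alpha=8,
    lora_dropout=0.1,
)

# context_length=512 reuses the old default; full_automation.py reads it from the task instead.
train_and_merge(
    model_id="google/gemma-2b",
    context_length=512,
    training_args=args,
)
```
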
22 changes: 17 additions & 5 deletions full_automation.py
@@ -4,40 +4,52 @@

import requests
import torch
import yaml
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from demo import train_and_merge
from demo import LoraTrainingArguments, train_and_merge
from utils.constants import model2base_model, model2size
from utils.flock_api import get_task, submit_task

HF_USERNAME = os.environ["HF_USERNAME"]

if __name__ == "__main__":
task_id = os.environ["TASK_ID"]
# load training args
# define the path of the current file
current_folder = os.path.dirname(os.path.realpath(__file__))
with open(f"{current_folder}/training_args.yaml", "r") as f:
all_training_args = yaml.safe_load(f)

task = get_task(task_id)
# log the task info
print(json.dumps(task, indent=4))
logger.info(json.dumps(task, indent=4))
# download data from a presigned url
data_url = task["data"]["training_set_url"]
context_length = task["data"]["context_length"]
max_params = task["data"]["max_params"]

# filter out the model within the max_params
model2size = {k: v for k, v in model2size.items() if v <= max_params}
logger.info(f"Models within the max_params: {model2size.keys()}")
all_training_args = {k: v for k, v in all_training_args.items() if k in model2size}
logger.info(f"Models within the max_params: {all_training_args.keys()}")
# download in chunks
response = requests.get(data_url, stream=True)
with open("demo_data.jsonl", "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)

# train all feasible models and merge
for model_id in model2size.keys():
for model_id in all_training_args.keys():
logger.info(f"Start to train the model {model_id}...")
# if OOM, proceed to the next model
try:
train_and_merge(model_id=model_id, context_length=context_length)
train_and_merge(
model_id=model_id,
context_length=context_length,
training_args=LoraTrainingArguments(**all_training_args[model_id]),
)
except RuntimeError as e:
logger.error(f"Error: {e}")
logger.info("Proceed to the next model...")
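
To make the new filtering step concrete, here is a self-contained sketch of it; the parameter counts and configs below are illustrative placeholders, not the real values from `utils/constants.py` or `training_args.yaml`:

```python
# Illustrative stand-ins; the real dicts come from utils.constants.model2size
# and from parsing training_args.yaml.
model2size = {
    "Qwen/Qwen1.5-0.5B": 620_000_000,    # placeholder parameter count
    "Qwen/Qwen1.5-7B": 7_720_000_000,    # placeholder parameter count
}
all_training_args = {
    "Qwen/Qwen1.5-0.5B": {"lora_rank": 8},
    "Qwen/Qwen1.5-7B": {"lora_rank": 4},
}
max_params = 3_000_000_000  # e.g. a task capped at 3B parameters

# Keep only models under the cap, then keep only their training configs,
# so the training loop iterates over all_training_args.keys().
model2size = {k: v for k, v in model2size.items() if v <= max_params}
all_training_args = {k: v for k, v in all_training_args.items() if k in model2size}

print(list(all_training_args))  # ['Qwen/Qwen1.5-0.5B']
```
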
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ transformers>=4.37.2
peft>=0.10.0
loguru
trl>=0.8.1
bitsandbytes
bitsandbytes
pyyaml
39 changes: 39 additions & 0 deletions training_args.yaml
@@ -0,0 +1,39 @@
Qwen/Qwen1.5-0.5B:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.1

Qwen/Qwen1.5-1.8B:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 4
lora_alpha: 8
lora_dropout: 0.1

Qwen/Qwen1.5-7B:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 4
lora_alpha: 8
lora_dropout: 0.1

google/gemma-2b:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 4
lora_alpha: 8
lora_dropout: 0.1

google/gemma-7b:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 4
lora_alpha: 8
lora_dropout: 0.1
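
As a quick check of the YAML above, a minimal sketch of how it is consumed (mirroring the loading code in `full_automation.py`): each top-level entry unpacks directly into the new `LoraTrainingArguments` dataclass.

```python
import yaml

from demo import LoraTrainingArguments

# Assumes this runs from the repository root, next to training_args.yaml.
with open("training_args.yaml", "r") as f:
    all_training_args = yaml.safe_load(f)

# Each top-level key maps straight onto the dataclass fields.
args = LoraTrainingArguments(**all_training_args["Qwen/Qwen1.5-0.5B"])
print(args.lora_rank)  # 8
```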
