Feat/support config file for training args (FLock-io#7)
* feat: add support for training args

* fix: fix wrong dataclass name

* chore: update README

* feat: comment out model in training_args to bypass in training

* fix: filter by model size
nickcom007 authored Jun 3, 2024
1 parent 23a00a4 commit 534d606
Showing 5 changed files with 93 additions and 26 deletions.
28 changes: 17 additions & 11 deletions README.md
@@ -21,6 +21,23 @@ pip install -r requirements.txt
- [`merge.py`](merge.py) - Contains the utility function for merging LoRA weights. If you are training with LoRA, please ensure you merge the adapter before uploading to your Hugging Face repository.
- [`demo.py`](demo.py) - A training script that implements LoRA fine-tuning for a Gemma-2B model.
- [`full_automation.py`](full_automation.py) - A script that automates everything: getting a task, downloading the training data, fine-tuning Gemma-2B on the training data, merging the weights, uploading to your Hugging Face model repo, and submitting the task to fed-ledger.
- [`training_args.yaml`](training_args.yaml) - A YAML file that defines the per-model training hyperparameters for fine-tuning; see the example below. A detailed explanation of the LoRA config can be found here: [LoRA Fine-tuning & Hyperparameters Explained](https://www.entrypointai.com/blog/lora-fine-tuning/)
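
Each top-level key in `training_args.yaml` is a model ID mapped to its LoRA hyperparameters. For example, the entry for `Qwen/Qwen1.5-0.5B` added in this commit looks like this:

```yaml
Qwen/Qwen1.5-0.5B:
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 8
  num_train_epochs: 1
  lora_rank: 8
  lora_alpha: 16
  lora_dropout: 0.1
```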

### Full Automation

Simply run

```bash
TASK_ID=<task-id> FLOCK_API_KEY="<your-flock-api-key-stakes-as-node-for-the-task>" HF_TOKEN="<your-hf-token>" CUDA_VISIBLE_DEVICES=0 HF_USERNAME="your-hf-user-name" python full_automation.py
```

The above command will automatically train and submit multiple LLMs that fall under the maximum parameter limit for the given task.

#### Bypass certain models

If you want to bypass certain models, simply comment out their config blocks in [`training_args.yaml`](training_args.yaml), as shown in the example below.
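
For example, to skip `google/gemma-7b`, comment out its whole block (the values shown are the ones from this commit's `training_args.yaml`):

```yaml
# google/gemma-7b:
#   per_device_train_batch_size: 1
#   gradient_accumulation_steps: 8
#   num_train_epochs: 1
#   lora_rank: 4
#   lora_alpha: 8
#   lora_dropout: 0.1
```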

---

### Play with demo.py

@@ -40,7 +57,6 @@ This command initiates fine-tuning on the demo dataset, saves the fine-tuned mod

[HuggingFace Models Uploading](https://huggingface.co/docs/hub/en/models-uploading)


#### Getting the task id

Before you submit the model script for a task, you will first need to stake on the task as a node.
@@ -64,13 +80,3 @@ curl --location 'https://fed-ledger-prod.flock.io/api/v1/tasks/submit-result' \
}
}'
```

### Full Automation

Simply run

```bash
TASK_ID=<task-id> FLOCK_API_KEY="<your-flock-api-key-stakes-as-node-for-the-task>" HF_TOKEN="<your-hf-token>" CUDA_VISIBLE_DEVICES=0 HF_USERNAME="your-hf-user-name" python full_automation.py
```

The above command will automatically train and submit multiple LLMs that fall under the maximum parameter limit for the given task.
27 changes: 18 additions & 9 deletions demo.py
@@ -1,4 +1,5 @@
import os
from dataclasses import dataclass

import torch
from peft import LoraConfig
@@ -11,20 +12,28 @@
from utils.constants import model2template


@dataclass
class LoraTrainingArguments:
per_device_train_batch_size: int
gradient_accumulation_steps: int
num_train_epochs: int
lora_rank: int
lora_alpha: int
lora_dropout: float


def train_and_merge(
model_id: str = "google/gemma-2b",
num_train_epochs: int = 1,
per_device_train_batch_size: int = 1,
gradient_accumulation_steps: int = 8,
context_length: int = 512,
model_id: str, context_length: int, training_args: LoraTrainingArguments
):
assert model_id in model2template, f"model_id {model_id} not supported"
lora_config = LoraConfig(
r=8,
r=training_args.lora_rank,
target_modules=[
"q_proj",
"v_proj",
],
lora_alpha=training_args.lora_alpha,
lora_dropout=training_args.lora_dropout,
task_type="CAUSAL_LM",
)

@@ -36,16 +45,16 @@ def train_and_merge(
)

training_args = TrainingArguments(
per_device_train_batch_size=per_device_train_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
per_device_train_batch_size=training_args.per_device_train_batch_size,
gradient_accumulation_steps=training_args.gradient_accumulation_steps,
warmup_steps=100,
learning_rate=2e-4,
bf16=True,
logging_steps=20,
output_dir="outputs",
optim="paged_adamw_8bit",
remove_unused_columns=False,
num_train_epochs=num_train_epochs,
num_train_epochs=training_args.num_train_epochs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_id,
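
For reference, a minimal sketch of calling the refactored `train_and_merge` directly with the new dataclass. The hyperparameter values mirror the `google/gemma-2b` entry in `training_args.yaml`, and `context_length=512` simply reuses the old default rather than a task-specific value:

```python
from demo import LoraTrainingArguments, train_and_merge

# Hyperparameters mirror the google/gemma-2b entry in training_args.yaml.
args = LoraTrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    lora_rank=4,
    lora_alpha=8,
    lora_dropout=0.1,
)

# context_length=512 reuses the old default; full_automation.py reads it from the task instead.
train_and_merge(
    model_id="google/gemma-2b",
    context_length=512,
    training_args=args,
)
```
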
22 changes: 17 additions & 5 deletions full_automation.py
@@ -4,40 +4,52 @@

import requests
import torch
import yaml
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from demo import train_and_merge
from demo import LoraTrainingArguments, train_and_merge
from utils.constants import model2base_model, model2size
from utils.flock_api import get_task, submit_task

HF_USERNAME = os.environ["HF_USERNAME"]

if __name__ == "__main__":
task_id = os.environ["TASK_ID"]
# load training args
# define the path of the current file
current_folder = os.path.dirname(os.path.realpath(__file__))
with open(f"{current_folder}/training_args.yaml", "r") as f:
all_training_args = yaml.safe_load(f)

task = get_task(task_id)
# log the task info
print(json.dumps(task, indent=4))
logger.info(json.dumps(task, indent=4))
# download data from a presigned url
data_url = task["data"]["training_set_url"]
context_length = task["data"]["context_length"]
max_params = task["data"]["max_params"]

# filter out the model within the max_params
model2size = {k: v for k, v in model2size.items() if v <= max_params}
logger.info(f"Models within the max_params: {model2size.keys()}")
all_training_args = {k: v for k, v in all_training_args.items() if k in model2size}
logger.info(f"Models within the max_params: {all_training_args.keys()}")
# download in chunks
response = requests.get(data_url, stream=True)
with open("demo_data.jsonl", "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)

# train all feasible models and merge
for model_id in model2size.keys():
for model_id in all_training_args.keys():
logger.info(f"Start to train the model {model_id}...")
# if OOM, proceed to the next model
try:
train_and_merge(model_id=model_id, context_length=context_length)
train_and_merge(
model_id=model_id,
context_length=context_length,
training_args=LoraTrainingArguments(**all_training_args[model_id]),
)
except RuntimeError as e:
logger.error(f"Error: {e}")
logger.info("Proceed to the next model...")
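
To make the new filtering step concrete, here is a self-contained sketch of it; the parameter counts and configs below are illustrative placeholders, not the real values from `utils/constants.py` or `training_args.yaml`:

```python
# Illustrative stand-ins; the real dicts come from utils.constants.model2size
# and from parsing training_args.yaml.
model2size = {
    "Qwen/Qwen1.5-0.5B": 620_000_000,    # placeholder parameter count
    "Qwen/Qwen1.5-7B": 7_720_000_000,    # placeholder parameter count
}
all_training_args = {
    "Qwen/Qwen1.5-0.5B": {"lora_rank": 8},
    "Qwen/Qwen1.5-7B": {"lora_rank": 4},
}
max_params = 3_000_000_000  # e.g. a task capped at 3B parameters

# Keep only models under the cap, then keep only their training configs,
# so the training loop iterates over all_training_args.keys().
model2size = {k: v for k, v in model2size.items() if v <= max_params}
all_training_args = {k: v for k, v in all_training_args.items() if k in model2size}

print(list(all_training_args))  # ['Qwen/Qwen1.5-0.5B']
```
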
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ transformers>=4.37.2
peft>=0.10.0
loguru
trl>=0.8.1
bitsandbytes
bitsandbytes
pyyaml
39 changes: 39 additions & 0 deletions training_args.yaml
@@ -0,0 +1,39 @@
Qwen/Qwen1.5-0.5B:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.1

Qwen/Qwen1.5-1.8B:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 4
lora_alpha: 8
lora_dropout: 0.1

Qwen/Qwen1.5-7B:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 4
lora_alpha: 8
lora_dropout: 0.1

google/gemma-2b:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 4
lora_alpha: 8
lora_dropout: 0.1

google/gemma-7b:
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 1
lora_rank: 4
lora_alpha: 8
lora_dropout: 0.1
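
As a quick check of the YAML above, a minimal sketch of how it is consumed (mirroring the loading code in `full_automation.py`): each top-level entry unpacks directly into the new `LoraTrainingArguments` dataclass.

```python
import yaml

from demo import LoraTrainingArguments

# Assumes this runs from the repository root, next to training_args.yaml.
with open("training_args.yaml", "r") as f:
    all_training_args = yaml.safe_load(f)

# Each top-level key maps straight onto the dataclass fields.
args = LoraTrainingArguments(**all_training_args["Qwen/Qwen1.5-0.5B"])
print(args.lora_rank)  # 8
```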
