Commit

Merge pull request FLock-io#1 from FLock-io/feat/make-a-fully-automate-script

feat: add a fully automated training script
nickcom007 authored May 9, 2024
2 parents 92e27d5 + a71db29 commit b5e13e2
Showing 3 changed files with 174 additions and 70 deletions.
17 changes: 14 additions & 3 deletions README.md
@@ -20,8 +20,11 @@ pip install -r requirements.txt
- [`demo_data.jsonl`](demo_data.jsonl) - Follows the shareGPT format. The training data you receive from the `fed-ledger` is in exactly the same format.
- [`merge.py`](merge.py) - Contains the utility function for merging LoRA weights. If you are training with LoRA, please ensure you merge the adapter before uploading to your Hugging Face repository.
- [`demo.py`](demo.py) - A training script that implements LoRA fine-tuning for a Gemma-2B model.
- [`full_automation.py`](full_automation.py) - A script that automates everything: getting a task, downloading the training data, fine-tuning Gemma-2B on that data, merging the LoRA weights, uploading the model to your Hugging Face repo, and submitting the result to `fed-ledger`.

### Start the Training
### Play with demo.py

#### Start the Training

Execute the following command to start the training:

@@ -33,11 +36,11 @@ The HF token is required due to the Gemma License.

This command initiates fine-tuning on the demo dataset, saves the fine-tuned model, merges the adapter to the base model, and saves the final model.
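Equivalently, the same steps can be driven from Python. The sketch below is not part of this README; it simply calls `train_and_merge()` from `demo.py` with the defaults defined in this commit (3 epochs, batch size 1, context length 512) and assumes `HF_TOKEN` is set, since `demo.py` reads it from the environment.

```python
# Sketch: run the demo pipeline from Python instead of the command line.
# "<your-hf-token>" is a placeholder; demo.py reads HF_TOKEN from the environment.
import os

os.environ.setdefault("HF_TOKEN", "<your-hf-token>")

from demo import train_and_merge

# Uses this commit's defaults: num_train_epochs=3, per_device_train_batch_size=1,
# gradient_accumulation_steps=8, context_length=512.
train_and_merge()
```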

### Upload the model folder to your HuggingFace repo
#### Upload the model folder to your HuggingFace repo

[HuggingFace Models Uploading](https://huggingface.co/docs/hub/en/models-uploading)
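The linked docs cover git-, web-, and API-based uploads. As one hedged example (not part of this README), the `merged_model` folder produced above can be pushed with `huggingface_hub`; the repo id below is a placeholder.

```python
# Sketch: upload the merged model folder via the huggingface_hub API.
# "your-username/gemma-2b-flock-demo" is a hypothetical repo id.
from huggingface_hub import HfApi

api = HfApi(token="<your-hf-token>")
api.create_repo(repo_id="your-username/gemma-2b-flock-demo", exist_ok=True)
api.upload_folder(
    folder_path="merged_model",
    repo_id="your-username/gemma-2b-flock-demo",
    repo_type="model",
)
```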

### Submit the model
#### Submit the model

```bash

@@ -52,3 +55,11 @@ curl --location 'https://fed-ledger-prod.flock.io/api/v1/tasks/submit-result' \
}
}'
```
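The header and payload fields omitted by the truncated hunk above mirror `submit_task()` in `full_automtion.py` below. A Python sketch of the same call (the task id and repo id are placeholders):

```python
# Sketch of the submission request, mirroring submit_task() in full_automtion.py.
import json
import os

import requests

response = requests.post(
    "https://fed-ledger-prod.flock.io/api/v1/tasks/submit-result",
    headers={
        "flock-api-key": os.environ["FLOCK_API_KEY"],
        "Content-Type": "application/json",
    },
    data=json.dumps(
        {"task_id": 1, "data": {"hg_repo_id": "your-username/your-model", "base_model": "gemma"}}
    ),
)
print(response.json())
```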

### Full Automation

Simply run

```bash
TASK_ID=<task-id> FLOCK_API_KEY="<your-flock-api-key-stakes-as-node-for-the-task>" HF_TOKEN="<your-hf-token>" CUDA_VISIBLE_DEVICES=0 python full_automtion.py
```
147 changes: 80 additions & 67 deletions demo.py
@@ -6,80 +6,93 @@
BitsAndBytesConfig, TrainingArguments)
from trl import SFTTrainer

from dataset import SFTDataCollator, GemmaSFTDataset
from dataset import GemmaSFTDataset, SFTDataCollator
from merge import merge_lora_to_base_model

lora_config = LoraConfig(
r=8,
target_modules=[
"q_proj",
"o_proj",
"k_proj",
"v_proj",
"gate_proj",
"up_proj",
"down_proj",
],
task_type="CAUSAL_LM",
)

def train_and_merge(
num_train_epochs: int = 3,
per_device_train_batch_size: int = 1,
gradient_accumulation_steps: int = 8,
context_length: int = 512,
):
lora_config = LoraConfig(
r=8,
target_modules=[
"q_proj",
"o_proj",
"k_proj",
"v_proj",
"gate_proj",
"up_proj",
"down_proj",
],
task_type="CAUSAL_LM",
)

model_id = "google/gemma-2b"
# Load model in 4-bit to do qLoRA
bnb_config = BitsAndBytesConfig(
load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)
model_id = "google/gemma-2b"
# Load model in 4-bit to do qLoRA
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)

training_args = TrainingArguments(
per_device_train_batch_size=1,
gradient_accumulation_steps=8,
warmup_steps=2,
max_steps=10,
learning_rate=2e-4,
bf16=True,
logging_steps=1,
output_dir="outputs",
optim="paged_adamw_8bit",
remove_unused_columns=False,
)
tokenizer = AutoTokenizer.from_pretrained(
model_id,
use_fast=True,
)
model = AutoModelForCausalLM.from_pretrained(
model_id,
quantization_config=bnb_config,
device_map={"": 0},
token=os.environ["HF_TOKEN"],
)
training_args = TrainingArguments(
per_device_train_batch_size=per_device_train_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
warmup_steps=100,
learning_rate=2e-4,
bf16=True,
logging_steps=20,
output_dir="outputs",
optim="paged_adamw_8bit",
remove_unused_columns=False,
num_train_epochs=num_train_epochs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_id,
use_fast=True,
)
model = AutoModelForCausalLM.from_pretrained(
model_id,
quantization_config=bnb_config,
device_map={"": 0},
token=os.environ["HF_TOKEN"],
)

# Load dataset
dataset = GemmaSFTDataset(
file="demo_data.jsonl",
tokenizer=tokenizer,
max_seq_length=512,
)
# Load dataset
dataset = GemmaSFTDataset(
file="demo_data.jsonl",
tokenizer=tokenizer,
max_seq_length=context_length,
)

# Define trainer
trainer = SFTTrainer(
model=model,
train_dataset=dataset,
args=training_args,
peft_config=lora_config,
packing=True,
data_collator=SFTDataCollator(tokenizer, max_seq_length=512),
max_seq_length=512,
)
# Define trainer
trainer = SFTTrainer(
model=model,
train_dataset=dataset,
args=training_args,
peft_config=lora_config,
packing=True,
data_collator=SFTDataCollator(tokenizer, max_seq_length=context_length),
max_seq_length=context_length,
)

# Train model
trainer.train()
# Train model
trainer.train()

# save model
trainer.save_model("outputs")
# save model
trainer.save_model("outputs")

# merge lora to base model
merge_lora_to_base_model(
model_name_or_path="google/gemma-2b",
adapter_name_or_path="outputs",
save_path="merged_model",
)
# merge lora to base model
print("Training Completed. Start to merge the weights....")
merge_lora_to_base_model(
model_name_or_path="google/gemma-2b",
adapter_name_or_path="outputs",
save_path="merged_model",
)


if __name__ == "__main__":
train_and_merge()
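Before uploading, it can be worth sanity-checking the merged output. The snippet below is a suggestion, not part of the commit: it loads `merged_model` (as `full_automtion.py` does) and generates a short completion on the GPU.

```python
# Optional smoke test (not in this commit): load the merged model and generate once.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("merged_model")
model = AutoModelForCausalLM.from_pretrained(
    "merged_model",
    torch_dtype=torch.float16,
    device_map={"": 0},  # single-GPU placement, matching the training script
)

inputs = tokenizer("Hello, how are you today?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```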
80 changes: 80 additions & 0 deletions full_automtion.py
@@ -0,0 +1,80 @@
import json
import os
import time

import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from demo import train_and_merge

FLOCK_API_KEY = os.environ["FLOCK_API_KEY"]
FED_LEDGER_BASE_URL = "https://fed-ledger-prod.flock.io/api/v1"


def get_task(task_id: int):
response = requests.request(
"GET", f"{FED_LEDGER_BASE_URL}/tasks/get?task_id={task_id}"
)
return response.json()


def submit_task(task_id: int, hg_repo_id: str):
payload = json.dumps(
{"task_id": task_id, "data": {"hg_repo_id": hg_repo_id, "base_model": "gemma"}}
)
headers = {
"flock-api-key": FLOCK_API_KEY,
"Content-Type": "application/json",
}
response = requests.request(
"POST",
f"{FED_LEDGER_BASE_URL}/tasks/submit-result",
headers=headers,
data=payload,
)
return response.json()


if __name__ == "__main__":
task_id = os.environ["TASK_ID"]
task = get_task(task_id)
# download data from a presigned url
data_url = task["data"]["training_set_url"]
context_length = task["data"]["context_length"]
# download in chunks
response = requests.get(data_url, stream=True)
with open("demo_data.jsonl", "wb") as f:
for chunk in response.iter_content(chunk_size=128):
f.write(chunk)
# train and merge
print("Start to train the model...")
train_and_merge(num_train_epochs=1, context_length=256)

# generate a random repo id based on timestamp
hg_repo_id = "gemma-2b-flock-" + str(int(time.time()))

# load the merged model
model = AutoModelForCausalLM.from_pretrained(
"merged_model",
trust_remote_code=True,
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
device_map={"": "cpu"},
)

# upload
print("Start to push the model to the hub...")
model.push_to_hub(
repo_id=hg_repo_id, use_temp_dir=True, token=os.environ["HF_TOKEN"]
)
# upload tokenizer as well
tokenizer = AutoTokenizer.from_pretrained(
"merged_model",
)
tokenizer.push_to_hub(
repo_id=hg_repo_id, use_temp_dir=True, token=os.environ["HF_TOKEN"]
)
# submit
submit_task(task_id, hg_repo_id)
print("Task submitted successfully")
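A small hardening sketch (hypothetical, not part of this commit): the script assumes every HTTP call succeeds, so raising on error statuses in the data download makes an expired or invalid presigned URL fail loudly; the same `raise_for_status()` check applies to the request made in `submit_task()`.

```python
# Hardening sketch (not in this commit): fail fast on HTTP errors.
import requests


def download_training_data(data_url: str, out_path: str = "demo_data.jsonl") -> None:
    response = requests.get(data_url, stream=True, timeout=60)
    response.raise_for_status()  # abort if the presigned URL is expired or invalid
    with open(out_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
```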
