# full_automation.py
# Forked from FLock-io/testnet-training-node-quickstart
import json
import os
import time
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from demo import train_and_merge
# API key for authenticating against the FLock federation ledger.
# Read eagerly so the script fails fast (KeyError) if it is not set.
FLOCK_API_KEY = os.environ["FLOCK_API_KEY"]
# Base URL of the production FED ledger REST API.
FED_LEDGER_BASE_URL = "https://fed-ledger-prod.flock.io/api/v1"
# Hugging Face account name; combined with the generated repo id when
# submitting results (see the __main__ block below).
HF_USERNAME = os.environ["HF_USERNAME"]
def get_task(task_id: int) -> dict:
    """Fetch a task's metadata from the FED ledger.

    Args:
        task_id: Numeric id of the task to look up.

    Returns:
        The decoded JSON payload describing the task (training set URL,
        context length, etc. — shape defined by the ledger API).

    Raises:
        Exception: If the ledger responds with a non-200 status.
        requests.RequestException: On connection failure or timeout.
    """
    # Use params= instead of string interpolation, and set a timeout so a
    # dead server cannot hang the automation forever.
    response = requests.get(
        f"{FED_LEDGER_BASE_URL}/tasks/get",
        params={"task_id": task_id},
        timeout=30,
    )
    if response.status_code != 200:
        # Fail fast (mirrors submit_task) instead of handing the caller an
        # error body that would be misread as task metadata.
        raise Exception(f"Failed to get task: {response.text}")
    return response.json()
def submit_task(task_id: int, hg_repo_id: str) -> dict:
    """Submit a finished training result to the FED ledger.

    Args:
        task_id: Id of the task being answered.
        hg_repo_id: Hugging Face repo id (``user/repo``) holding the
            merged model that was pushed to the Hub.

    Returns:
        The decoded JSON response from the ledger.

    Raises:
        Exception: If the ledger responds with a non-200 status.
        requests.RequestException: On connection failure or timeout.
    """
    headers = {"flock-api-key": FLOCK_API_KEY}
    # json= serializes the payload and sets Content-Type: application/json
    # for us; timeout prevents an unresponsive server from blocking forever.
    response = requests.post(
        f"{FED_LEDGER_BASE_URL}/tasks/submit-result",
        headers=headers,
        json={
            "task_id": task_id,
            "data": {"hg_repo_id": hg_repo_id, "base_model": "gemma"},
        },
        timeout=30,
    )
    if response.status_code != 200:
        raise Exception(f"Failed to submit task: {response.text}")
    return response.json()
if __name__ == "__main__":
    # End-to-end automation: fetch task -> download data -> train/merge ->
    # push merged model + tokenizer to the HF Hub -> submit the repo id.
    task_id = os.environ["TASK_ID"]
    task = get_task(task_id)
    # Log the task info so failures further down are diagnosable.
    print(json.dumps(task, indent=4))

    # Download the training data from the presigned URL in chunks.
    data_url = task["data"]["training_set_url"]
    context_length = task["data"]["context_length"]
    response = requests.get(data_url, stream=True, timeout=60)
    # Without this check, an expired presigned URL would silently write an
    # XML error page into demo_data.jsonl and poison training.
    response.raise_for_status()
    with open("demo_data.jsonl", "wb") as f:
        # 64 KiB chunks: the original 128-byte chunks caused one Python-level
        # write per 128 bytes, which is needlessly slow for large files.
        for chunk in response.iter_content(chunk_size=65536):
            f.write(chunk)

    # Train and merge (writes the result to ./merged_model).
    print("Start to train the model...")
    train_and_merge(context_length=context_length)

    # Generate a unique repo id from the current timestamp.
    hf_repo_id = "gemma-2b-flock-" + str(int(time.time()))

    # Load the merged model on CPU in fp16 to keep memory usage low.
    model = AutoModelForCausalLM.from_pretrained(
        "merged_model",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map={"": "cpu"},
    )

    # Push model weights to the Hub.
    print("Start to push the model to the hub...")
    model.push_to_hub(
        repo_id=hf_repo_id, use_temp_dir=True, token=os.environ["HF_TOKEN"]
    )

    # Push the tokenizer as well so the repo is self-contained.
    tokenizer = AutoTokenizer.from_pretrained(
        "merged_model",
    )
    tokenizer.push_to_hub(
        repo_id=hf_repo_id, use_temp_dir=True, token=os.environ["HF_TOKEN"]
    )

    # Report the finished repo back to the ledger.
    submit_task(task_id, f"{HF_USERNAME}/{hf_repo_id}")
    print("Task submitted successfully")