# Makefile — 262 lines (215 loc) · 8.59 KB
# --- Toolchain & project configuration --------------------------------------
# bash is required: several recipes rely on bash-only expansions
# ($${k// /}, $${!k} in the secrets-* targets below).
SHELL := /bin/bash
# Tools are addressed by absolute venv path so recipes work without
# activating the virtualenv first.
VENV = .venv
PYTHON = $(VENV)/bin/python
UV = $(VENV)/bin/uv
S5CMD = $(VENV)/bin/s5cmd
# User-tunable knobs; override on the command line or via .env.
BUCKET_NAME ?= your-new-bucket-name
AWS_REGION ?= us-east-1
HF_DATASET_ID ?=
CHECKPOINT_DIR ?= checkpoints/
IMAGE ?= unsloth-fsdp-training:latest
AWS_SECRET_ID ?= dev/ProjectName/ProjectSecrets
# Load environment from .env if present.
# NOTE(review): .env is included AFTER the ?= defaults above, so plain `=`
# assignments in .env override them; command-line assignments still win.
# The bare `export` makes every make variable visible to recipe shells.
ifneq (,$(wildcard .env))
include .env
export
endif
# Single-GPU training entry points (venv python directly, no torchrun).
# Declared .PHONY: these are commands, not files.
.PHONY: train-unsloth train-fsdp
train-unsloth: ## Train with the Unsloth backend (finance-alpaca config)
	$(PYTHON) scripts/train.py --config scripts/configs/unsloth/finance-alpaca.yaml
train-fsdp: ## Train with the FSDP backend (llama-7b config)
	$(PYTHON) scripts/train.py --config scripts/configs/fsdp/llama-7b.yaml
#JOB_NAME ?=
# Multi-GPU training with torchrun
# Number of GPUs per node for the *-mgpu targets (override: make ... NGPU=4).
NGPU ?= 8
## Optional NCCL environment hints (uncomment and adjust as needed)
# export NCCL_DEBUG=INFO
# export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_IB_DISABLE=0
# export NCCL_NET_GDR_LEVEL=2
# export NCCL_SOCKET_IFNAME=eth0 # or bond0/eniX inside cloud envs
# export GLOO_SOCKET_IFNAME=eth0
# export CUDA_LAUNCH_BLOCKING=0
# Multi-GPU training via torchrun; NGPU (above) sets --nproc_per_node.
.PHONY: train-fsdp-mgpu train-unsloth-mgpu train-fsdp-smoke-mgpu
train-fsdp-mgpu: ## Multi-GPU FSDP training
	torchrun --nproc_per_node=$(NGPU) \
		scripts/train.py --config scripts/configs/fsdp/llama-7b.yaml
train-unsloth-mgpu: ## Multi-GPU Unsloth training
	torchrun --nproc_per_node=$(NGPU) \
		scripts/train.py --config scripts/configs/unsloth/finance-alpaca.yaml
train-fsdp-smoke-mgpu: ## Quick multi-GPU smoke test
	torchrun --nproc_per_node=$(NGPU) \
		scripts/train.py --config scripts/configs/fsdp/smoke.yaml
# Bootstrap the development environment.
# Fixes vs. the previous version:
#  - `$(UV) venv` used $(VENV)/bin/uv, which cannot exist before the venv it
#    lives in is created — the globally installed uv must create the venv;
#  - `source .venv/bin/activate` was a no-op (each recipe line runs in its
#    own shell); tools are invoked by explicit path instead;
#  - `$(UV)$(PYTHON)` concatenated two executable paths into a non-command.
.PHONY: init
init: ## Create the venv and install the project + dev tools
	@echo "Setting up environment with uv..."
	pip install uv
	uv venv $(VENV)
	uv pip install --python $(PYTHON) -e ".[dev]"
	uv sync
	$(PYTHON) -m pre_commit install
# Data processing with s5cmd and bucket creation
# Idempotently create the S3 bucket, then block all public access.
# Fixes vs. the previous version:
#  - the second `@aws` followed a `|| \` continuation, so the literal `@`
#    was handed to the shell and broke the command;
#  - us-east-1 rejects an explicit LocationConstraint, so the
#    --create-bucket-configuration flag is added only for other regions.
.PHONY: create-bucket
create-bucket:
	@echo "Creating bucket $(BUCKET_NAME) in $(AWS_REGION)..."
	@aws s3api head-bucket --bucket $(BUCKET_NAME) 2>/dev/null || \
	aws s3api create-bucket \
		--bucket $(BUCKET_NAME) \
		--region $(AWS_REGION) \
		$(if $(filter us-east-1,$(AWS_REGION)),,--create-bucket-configuration LocationConstraint=$(AWS_REGION))
	@aws s3api put-public-access-block \
		--bucket $(BUCKET_NAME) \
		--public-access-block-configuration "BlockPublicAcls=true,IgnorePublicAcls=true,BlockPublicPolicy=true,RestrictPublicBuckets=true"
# Data processing
# NOTE: tools are invoked directly; the previous `$(UV) $(S5CMD) ...` form
# passed the s5cmd path to uv as an invalid subcommand.
.PHONY: download-data download-hf-dataset upload-hf-to-s3
download-data: create-bucket ## Pull datasets and configs from S3
	@echo "Syncing data with s5cmd..."
	$(S5CMD) sync "s3://$(BUCKET_NAME)/llm-data/*" data/
	$(S5CMD) sync "s3://$(BUCKET_NAME)/configs/*" configs/
download-hf-dataset: ## Download HF_DATASET_ID to data/ via the datasets library
	@test -n "$(HF_DATASET_ID)" || { echo "HF_DATASET_ID is required"; exit 1; }
	$(PYTHON) -c "from datasets import load_dataset; load_dataset('$(HF_DATASET_ID)').save_to_disk('data/$(HF_DATASET_ID)')"
upload-hf-to-s3: ## Push the downloaded dataset to S3 (was: undefined $(DATASET_NAME))
	@test -n "$(HF_DATASET_ID)" || { echo "HF_DATASET_ID is required"; exit 1; }
	$(S5CMD) sync data/$(HF_DATASET_ID)/ s3://$(BUCKET_NAME)/llm-data/$(HF_DATASET_ID)/
# Training commands
# (single-GPU; use the torchrun *-mgpu targets for multi-GPU)
# The previous `$(UV) $(PYTHON) ...` form ran `uv <path-to-python>`, which is
# not a valid uv invocation; the venv python is called directly.
.PHONY: train-unsloth-single train-fsdp-single train-unsloth-smoke train-fsdp-smoke
train-unsloth-single:
	$(PYTHON) scripts/train.py --config scripts/configs/unsloth/finance-alpaca.yaml
train-fsdp-single:
	$(PYTHON) scripts/train.py --config scripts/configs/fsdp/llama-7b.yaml
train-unsloth-smoke:
	$(PYTHON) scripts/train.py --config scripts/configs/unsloth/smoke.yaml
train-fsdp-smoke:
	$(PYTHON) scripts/train.py --config scripts/configs/fsdp/smoke.yaml
# Convert an Unsloth checkpoint into FSDP format.
# NOTE(review): source/target paths are hard-coded — consider promoting them
# to overridable variables.
.PHONY: unsloth-to-fsdp-checkpoint-conversion unsloth-to-fsdp-checkpoiny-conversion
unsloth-to-fsdp-checkpoint-conversion:
	$(PYTHON) scripts/tools/convert_checkpoint.py \
		--source old_checkpoint.pt \
		--target new_checkpoint.pt \
		--strategy unsloth \
		--target_strategy fsdp
# Backward-compatible alias for the old, typo'd target name.
unsloth-to-fsdp-checkpoiny-conversion: unsloth-to-fsdp-checkpoint-conversion
# Upload training outputs to S3, namespaced by JOB_NAME.
# Guard added: without it an empty JOB_NAME silently wrote to
# s3://.../outputs// (the old "#Should take JOB_NAME arg" note).
.PHONY: upload-outputs
upload-outputs: ## Usage: make upload-outputs JOB_NAME=run-2024-01-01
	@test -n "$(JOB_NAME)" || { echo "JOB_NAME is required (e.g. make upload-outputs JOB_NAME=run1)"; exit 1; }
	@echo "Uploading outputs to S3..."
	$(S5CMD) sync outputs/ "s3://$(BUCKET_NAME)/outputs/$(JOB_NAME)/"
# Testing
# All targets run pytest through the venv python (previous `$(UV) $(PYTHON)`
# prefix was an invalid uv invocation).
.PHONY: test test-unit test-integration test-data test-data-unit test-data-integration
test: ## Full suite with HTML coverage report
	$(PYTHON) -m pytest tests/ -v --cov=common --cov=scripts --cov-report=html
test-unit:
	$(PYTHON) -m pytest tests/unit/ -v
test-integration:
	$(PYTHON) -m pytest tests/integration/ -v
test-data: ## Data-loader tests with coverage
	$(PYTHON) -m pytest tests/unit/test_data.py -v --cov=scripts.core.data_loader
test-data-unit:
	$(PYTHON) -m pytest tests/unit/test_data.py::TestDataLoaderUnit -v
test-data-integration:
	$(PYTHON) -m pytest tests/integration/test_data.py::TestDataLoaderIntegration -v \
		--cov=scripts.core.data_loader --cov-report=term-missing
# Backups
# Snapshot checkpoints under a YYYYMMDD-dated S3 prefix. `$$(...)` defers the
# date command to the shell; /bin/date avoids PATH surprises.
.PHONY: backup-checkpoints
backup-checkpoints:
	$(S5CMD) sync $(CHECKPOINT_DIR) s3://$(BUCKET_NAME)/backups/checkpoints/$$(/bin/date +%Y%m%d)
# Formatting and linting
.PHONY: format lint
format: ## Auto-format with black + isort
	$(PYTHON) -m black .
	$(PYTHON) -m isort .
lint: ## Static checks: flake8 + mypy
	$(PYTHON) -m flake8 common scripts
	$(PYTHON) -m mypy --ignore-missing-imports .
# Documentation
# s5cmd expects command flags BEFORE the positional source/destination
# arguments, so --delete is moved ahead of the paths.
.PHONY: docs
docs: ## Build mkdocs site and mirror it to S3
	$(PYTHON) -m mkdocs build
	$(S5CMD) sync --delete site/ s3://$(BUCKET_NAME)/docs/
# Tooling and Security
.PHONY: check-tools
check-tools: ## Verify required CLIs and the venv are available
	@command -v aws >/dev/null 2>&1 || { echo "aws CLI missing. Install AWS CLI v2."; exit 1; }
	@[ -x "$(S5CMD)" ] || { echo "s5cmd missing in venv. Run 'make init'."; exit 1; }
	@[ -x "$(PYTHON)" ] || { echo "Python venv not initialized. Run 'make init'."; exit 1; }
	@echo "All required tools present."
# Security
.PHONY: security-scan encrypted-checkpoint api-audit
security-scan: ## Best-effort scans; `|| true` keeps failures non-fatal by design
	$(PYTHON) -m pip_audit || pip-audit || true
	bandit -r common/ scripts/
	git secrets --scan || true
# s5cmd cp needs a wildcard to copy a directory's contents (a bare directory
# path is rejected); CHECKPOINT_DIR already ends with a slash.
encrypted-checkpoint: ## Copy checkpoints to S3 with SSE-KMS encryption
	$(S5CMD) cp --sse=aws:kms "$(CHECKPOINT_DIR)*" s3://$(BUCKET_NAME)/encrypted-checkpoints/
api-audit:
	@echo "Monitoring recent GetObject events via CloudTrail..."
	@aws cloudtrail lookup-events --lookup-attributes AttributeKey=EventName,AttributeValue=GetObject
# Cleanup
# Fixes: `.DS_STORE` was the wrong case (macOS writes `.DS_Store`), and
# `rm -rf __pycache__` only removed the top-level directory; both are now
# cleaned recursively.
.PHONY: clean
clean:
	rm -rf .pytest_cache htmlcov .coverage
	find . -name '__pycache__' -type d -prune -exec rm -rf {} +
	find . -name '.DS_Store' -delete
	find . -name '*.pyc' -delete
# -----------------------------
# Docker targets
# -----------------------------
.PHONY: docker-build docker-shell-local docker-shell-secrets docker-bootstrap docker-train
# Common docker flags
DOCKER_RUN = docker run --rm -it
DOCKER_GPU = --gpus all
# $(CURDIR) is make's absolute cwd — no shell fork like $$(pwd), and robust
# under `make -C`.
MOUNT_RO = -v $(CURDIR):/workspace:ro
MOUNT_RW = -v $(CURDIR):/workspace
docker-build: ## Build the training image
	docker build -t $(IMAGE) .
# Interactive shell seeded from example.env; workspace mounted read-only.
docker-shell-local: docker-build
	$(DOCKER_RUN) $(DOCKER_GPU) \
		--env-file example.env \
		$(MOUNT_RO) $(IMAGE) bash
# Interactive shell that resolves secrets at runtime via AWS_SECRET_ID.
docker-shell-secrets: docker-build
	$(DOCKER_RUN) $(DOCKER_GPU) \
		-e AWS_REGION=$(AWS_REGION) \
		-e AWS_SECRET_ID=$(AWS_SECRET_ID) \
		$(MOUNT_RO) $(IMAGE) bash
# Bootstrap S3 bucket and Secrets Manager from current env (SECRET_KEYS or AWS_SECRETS_JSON)
# Usage: make docker-bootstrap SECRET_KEYS="HF_TOKEN,WANDB_API_KEY" HF_TOKEN=... WANDB_API_KEY=...
# The $(foreach ...) expands to one `-e VAR="value"` per name listed in
# SECRET_KEYS, injecting each secret into the container; SECRET_KEYS itself is
# also passed so the container knows which variables to collect.
# NOTE(review): AWS_BOOTSTRAP=1 presumably triggers bootstrap logic in the
# image entrypoint — confirm against the Dockerfile.
docker-bootstrap: docker-build
	$(DOCKER_RUN) $(DOCKER_GPU) \
	-e AWS_REGION=$(AWS_REGION) \
	-e AWS_BOOTSTRAP=1 \
	-e S3_BUCKET_NAME=$(BUCKET_NAME) \
	-e AWS_SECRET_ID=$(AWS_SECRET_ID) \
	$(foreach v,$(SECRET_KEYS),-e $(v)="$($(v))") \
	-e SECRET_KEYS="$(SECRET_KEYS)" \
	$(MOUNT_RO) \
	$(IMAGE) bash
# Run training inside the container using the installed package module
# Examples:
#   make docker-train CONFIG=scripts/configs/fsdp/smoke.yaml SMOKE=1
#   make docker-train CONFIG=scripts/configs/unsloth/smoke.yaml BACKEND=unsloth SMOKE=1
# CONFIG  — training YAML path inside /workspace (mounted read-write).
# BACKEND — optional; non-empty adds `--backend $(BACKEND)` to the CLI.
# SMOKE   — optional; non-empty adds `--smoke`.
CONFIG ?= scripts/configs/fsdp/smoke.yaml
BACKEND ?=
SMOKE ?=
docker-train: docker-build
	$(DOCKER_RUN) $(DOCKER_GPU) $(MOUNT_RW) -e AWS_REGION=$(AWS_REGION) $(IMAGE) \
		bash -lc "python -m fsdp_unsloth.cli --config $(CONFIG) $(if $(BACKEND),--backend $(BACKEND),) $(if $(SMOKE),--smoke,)"
# -----------------------------
# Secrets Manager targets
# -----------------------------
.PHONY: secrets-get secrets-create-from-env secrets-update-from-env secrets-update-from-file
# Fetch the secret's SecretString and pretty-print it as JSON via jq.
secrets-get:
	aws secretsmanager get-secret-value \
		--secret-id $(AWS_SECRET_ID) \
		--region $(AWS_REGION) \
		--query 'SecretString' \
		--output text | jq .
# Create secret from env values defined by SECRET_KEYS (comma-separated).
# BUGFIX: `IFS=,` only affects field splitting of shell *expansions*; the
# literal word make substituted for $(SECRET_KEYS) was never split, so the
# loop ran once over the whole comma-joined string. Routing the list through
# a shell variable makes the unquoted $$keys expansion subject to IFS.
# bash-only: $${k// /} strips spaces, $${!k} is indirect expansion (SHELL=bash).
secrets-create-from-env:
	@test -n "$(SECRET_KEYS)" || (echo "SECRET_KEYS is required (e.g. HF_TOKEN,WANDB_API_KEY,WANDB_PROJECT)"; exit 1)
	@json='{}'; keys='$(SECRET_KEYS)'; \
	IFS=, ; for k in $$keys; do \
		k=$${k// /}; v=$${!k}; \
		json=$$(jq --arg key "$$k" --arg val "$$v" '. + {($$key): $$val}' <<<"$$json"); \
	done; \
	aws secretsmanager create-secret \
		--region $(AWS_REGION) \
		--name $(AWS_SECRET_ID) \
		--secret-string "$$json"
# Update existing secret from env values defined by SECRET_KEYS (comma-separated).
# BUGFIX (same as secrets-create-from-env): IFS splitting applies only to
# expansions, so $(SECRET_KEYS) must pass through a shell variable for the
# comma-split to happen; previously the loop saw one unsplit word.
secrets-update-from-env:
	@test -n "$(SECRET_KEYS)" || (echo "SECRET_KEYS is required (e.g. HF_TOKEN,WANDB_API_KEY,WANDB_PROJECT)"; exit 1)
	@json='{}'; keys='$(SECRET_KEYS)'; \
	IFS=, ; for k in $$keys; do \
		k=$${k// /}; v=$${!k}; \
		json=$$(jq --arg key "$$k" --arg val "$$v" '. + {($$key): $$val}' <<<"$$json"); \
	done; \
	aws secretsmanager update-secret \
		--region $(AWS_REGION) \
		--secret-id $(AWS_SECRET_ID) \
		--secret-string "$$json"
# Update existing secret from a JSON file: make secrets-update-from-file SECRET_FILE=path/to/secret.json
# The file:// argument is now quoted so paths containing spaces survive
# shell word splitting.
secrets-update-from-file:
	@test -n "$(SECRET_FILE)" || (echo "SECRET_FILE is required (path to JSON)"; exit 1)
	aws secretsmanager update-secret \
		--region $(AWS_REGION) \
		--secret-id $(AWS_SECRET_ID) \
		--secret-string "file://$(SECRET_FILE)"