
Commit b218ea2

Author: Tibor Mach (committed)
Sagemaker deployment steps
1 parent f5e29bf · commit b218ea2

File tree: 5 files changed (+335, −8 lines)

example-get-started-experiments/code/.github/workflows/deploy-model.yml

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ jobs:
       - run: dvc remote add -d --local storage s3://dvc-public/remote/get-started-pools
 
       - run: |
-          MODEL_DATA=$(dvc get --show-url . model.tar.gz)
+          MODEL_DATA=$(dvc get --show-url . sagemaker/model.tar.gz)
           python sagemaker/deploy_model.py \
             --name ${{ needs.parse.outputs.name }} \
             --stage ${{ needs.parse.outputs.stage }} \

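This change points the deployment workflow at the new sagemaker/model.tar.gz artifact instead of the bare model file. For reference, the URL lookup that `dvc get --show-url` performs can also be done from Python with `dvc.api.get_url`; the lines below are a minimal illustrative sketch (not part of the diff), assuming they run from the repository root with the DVC remote configured:

import dvc.api

# Resolve the remote URL of the SageMaker model archive, equivalent to
# `dvc get --show-url . sagemaker/model.tar.gz` in the workflow step above.
model_data = dvc.api.get_url("sagemaker/model.tar.gz")
print(model_data)  # e.g. an s3:// URL, which deploy_model.py receives via MODEL_DATA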
example-get-started-experiments/code/notebooks/TrainSegModel-logging.ipynb

Lines changed: 3 additions & 1 deletion
@@ -242,9 +242,11 @@
     "        # add additional post-training summary metrics\n",
     "        live.summary[\"evaluate/dice_multi\"] = evaluate(learn)\n",
     "\n",
+    "        torch.save(learn.model, (models_dir / \"model.pth\").absolute())\n",
+    "\n",
     "        # save model artifact to dvc\n",
     "        live.log_artifact(\n",
-    "            str(models_dir / \"model.pkl\"),\n",
+    "            str(models_dir / \"model.pth\"),\n",
     "            type=\"model\",\n",
     "            name=\"pool-segmentation\",\n",
     "            desc=\"This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.\",\n",
example-get-started-experiments/code/notebooks/TrainSegModel-sagemaker.ipynb

Lines changed: 296 additions & 0 deletions
@@ -0,0 +1,296 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "ROOT = Path(\"../\")\n",
    "DATA = ROOT / \"data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "from functools import partial\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "from box import ConfigBox\n",
    "from dvclive import Live\n",
    "from dvclive.fastai import DVCLiveCallback\n",
    "from fastai.data.all import Normalize, get_files\n",
    "from fastai.metrics import DiceMulti\n",
    "from fastai.vision.all import (Resize, SegmentationDataLoaders,\n",
    "                               imagenet_stats, models, unet_learner)\n",
    "from ruamel.yaml import YAML\n",
    "from PIL import Image"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data and split it into train/test\n",
    "\n",
    "We have some [data in DVC](https://dvc.org/doc/start/data-management/data-versioning) that we can pull. \n",
    "\n",
    "This data includes:\n",
    "* satellite images\n",
    "* masks of the swimming pools in each satellite image\n",
    "\n",
    "DVC can help connect your data to your repo, but it isn't necessary to have your data in DVC to start tracking experiments with DVC and DVCLive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!dvc pull"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_regions = [\"REGION_1-\"]\n",
    "\n",
    "img_fpaths = get_files(DATA / \"pool_data\" / \"images\", extensions=\".jpg\")\n",
    "\n",
    "train_data_dir = DATA / \"train_data\"\n",
    "train_data_dir.mkdir(exist_ok=True)\n",
    "test_data_dir = DATA / \"test_data\"\n",
    "test_data_dir.mkdir(exist_ok=True)\n",
    "for img_path in img_fpaths:\n",
    "    msk_path = DATA / \"pool_data\" / \"masks\" / f\"{img_path.stem}.png\"\n",
    "    if any(region in str(img_path) for region in test_regions):\n",
    "        shutil.copy(img_path, test_data_dir)\n",
    "        shutil.copy(msk_path, test_data_dir)\n",
    "    else:\n",
    "        shutil.copy(img_path, train_data_dir)\n",
    "        shutil.copy(msk_path, train_data_dir)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a data loader\n",
    "\n",
    "Load and prepare the images and masks by creating a data loader."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_mask_path(x, train_data_dir):\n",
    "    return Path(train_data_dir) / f\"{Path(x).stem}.png\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bs = 8\n",
    "valid_pct = 0.20\n",
    "img_size = 256\n",
    "\n",
    "data_loader = SegmentationDataLoaders.from_label_func(\n",
    "    path=train_data_dir,\n",
    "    fnames=get_files(train_data_dir, extensions=\".jpg\"),\n",
    "    label_func=partial(get_mask_path, train_data_dir=train_data_dir),\n",
    "    codes=[\"not-pool\", \"pool\"],\n",
    "    bs=bs,\n",
    "    valid_pct=valid_pct,\n",
    "    item_tfms=Resize(img_size),\n",
    "    batch_tfms=[\n",
    "        Normalize.from_stats(*imagenet_stats),\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Review a sample batch of data\n",
    "\n",
    "Below are some examples of the images overlaid with their masks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_loader.show_batch(alpha=0.7)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train multiple models with different learning rates using `DVCLiveCallback`\n",
    "\n",
    "Set up model training, using DVCLive to capture the results of each experiment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dice(mask_pred, mask_true, classes=[0, 1], eps=1e-6):\n",
    "    dice_list = []\n",
    "    for c in classes:\n",
    "        y_true = mask_true == c\n",
    "        y_pred = mask_pred == c\n",
    "        intersection = 2.0 * np.sum(y_true * y_pred)\n",
    "        dice = intersection / (np.sum(y_true) + np.sum(y_pred) + eps)\n",
    "        dice_list.append(dice)\n",
    "    return np.mean(dice_list)\n",
    "\n",
    "def evaluate(learn):\n",
    "    test_img_fpaths = sorted(get_files(DATA / \"test_data\", extensions=\".jpg\"))\n",
    "    test_dl = learn.dls.test_dl(test_img_fpaths)\n",
    "    preds, _ = learn.get_preds(dl=test_dl)\n",
    "    masks_pred = np.array(preds[:, 1, :] > 0.5, dtype=np.uint8)\n",
    "    test_mask_fpaths = [\n",
    "        get_mask_path(fpath, DATA / \"test_data\") for fpath in test_img_fpaths\n",
    "    ]\n",
    "    masks_true = [Image.open(mask_path) for mask_path in test_mask_fpaths]\n",
    "    dice_multi = 0.0\n",
    "    for ii in range(len(masks_true)):\n",
    "        mask_pred, mask_true = masks_pred[ii], masks_true[ii]\n",
    "        width, height = mask_true.shape[1], mask_true.shape[0]\n",
    "        mask_pred = np.array(\n",
    "            Image.fromarray(mask_pred).resize((width, height)),\n",
    "            dtype=int\n",
    "        )\n",
    "        mask_true = np.array(mask_true, dtype=int)\n",
    "        dice_multi += dice(mask_true, mask_pred) / len(masks_true)\n",
    "    return dice_multi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import tarfile\n",
    "\n",
    "train_arch = 'shufflenet_v2_x2_0'\n",
    "models_dir = ROOT / \"models\"\n",
    "models_dir.mkdir(exist_ok=True)\n",
    "results_dir = ROOT / \"results\" / \"train\"\n",
    "\n",
    "for base_lr in [0.001, 0.005]:\n",
    "    # initialize dvclive, optionally provide output path, and save results as a dvc experiment\n",
    "    with Live(str(results_dir), save_dvc_exp=True, report=\"notebook\") as live:\n",
    "        # log a parameter\n",
    "        live.log_param(\"train_arch\", train_arch)\n",
    "        fine_tune_args = {\n",
    "            'epochs': 2,\n",
    "            'base_lr': base_lr\n",
    "        }\n",
    "        # log a dict of parameters\n",
    "        live.log_params(fine_tune_args)\n",
    "\n",
    "        learn = unet_learner(data_loader, \n",
    "                             arch=getattr(models, train_arch), \n",
    "                             metrics=DiceMulti)\n",
    "        # train model and automatically capture metrics with DVCLiveCallback\n",
    "        learn.fine_tune(\n",
    "            **fine_tune_args,\n",
    "            cbs=[DVCLiveCallback(live=live)])\n",
    "\n",
    "        learn.export(fname=(models_dir / \"model.pkl\").absolute())\n",
    "\n",
    "        # add additional post-training summary metrics\n",
    "        live.summary[\"evaluate/dice_multi\"] = evaluate(learn)\n",
    "\n",
    "        torch.save(learn.model, (models_dir / \"model.pth\").absolute())\n",
    "\n",
    "        # save model artifact to dvc\n",
    "        live.log_artifact(\n",
    "            str(models_dir / \"model.pth\"),\n",
    "            type=\"model\",\n",
    "            name=\"pool-segmentation\",\n",
    "            desc=\"This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.\",\n",
    "            labels=[\"cv\", \"segmentation\", \"satellite-images\", \"unet\"],\n",
    "        )\n",
    "\n",
    "        # For deploying the model to Sagemaker will also create a sagemaker model archive\n",
    "        # and save it to dvc - this is because Sagemaker expects a particular archive structure\n",
    "        sagemaker_dir = ROOT / \"sagemaker\"\n",
    "\n",
    "        with tarfile.open(sagemaker_dir / \"model.tar.gz\", \"w:gz\") as tar:\n",
    "            tar.add(sagemaker_dir / \"code\", \"code\")\n",
    "            tar.add(models_dir / \"model.pth\", \"code/model.pth\")\n",
    "\n",
    "        live.log_artifact(sagemaker_dir / \"model.tar.gz\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "vscode": {
   "interpreter": {
    "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

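The final cell of the new notebook builds sagemaker/model.tar.gz containing a code/ directory and code/model.pth, since SageMaker expects the inference code and weights laid out in a particular structure inside the archive. A quick way to inspect what actually ended up in the tarball, sketched here with the same relative path the notebook uses:

import tarfile

# List the archive members; expect entries under code/, including code/model.pth.
with tarfile.open("../sagemaker/model.tar.gz", "r:gz") as tar:
    for member in tar.getnames():
        print(member)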
example-get-started-experiments/code/notebooks/TrainSegModel.ipynb renamed to example-get-started-experiments/code/notebooks/TrainSegModel-start.ipynb

Lines changed: 3 additions & 3 deletions
@@ -237,10 +237,10 @@
     "            **fine_tune_args,\n",
     "            cbs=[DVCLiveCallback(live=live)])\n",
     "\n",
-    "        learn.export(fname=(models_dir / \"model.pkl\").absolute())\n",
-    "\n",
     "        # add additional post-training summary metrics\n",
-    "        live.summary[\"evaluate/dice_multi\"] = evaluate(learn)\n"
+    "        live.summary[\"evaluate/dice_multi\"] = evaluate(learn)\n",
+    "\n",
+    "        torch.save(learn.model, (models_dir / \"model.pth\").absolute())\n"
    ]
   }
  ],

example-get-started-experiments/generate.sh

Lines changed: 32 additions & 3 deletions
@@ -74,7 +74,7 @@ git commit -m "Add data"
 dvc pull
 
 mkdir notebooks
-cp $HERE/code/notebooks/TrainSegModel.ipynb notebooks/TrainSegModel.ipynb
+cp $HERE/code/notebooks/TrainSegModel-start.ipynb notebooks/TrainSegModel.ipynb
 git add .
 git commit -m "Add training notebook with DVCLive metrics"
 git tag -a "0-notebook-dvclive" -m "Notebook training pipeline"
@@ -83,7 +83,6 @@ git tag -a "0-notebook-dvclive" -m "Notebook training pipeline"
 cp $HERE/code/notebooks/TrainSegModel-logging.ipynb notebooks/TrainSegModel.ipynb
 git add .
 tick
-
 git commit -m "Add artifact logging/saving using DVCLive"
 git tag -a "1-added-model-logging" -m "Added model logging/saving to pipeline"
 
@@ -104,9 +103,39 @@ git tag -a "2-model-added-to-registry" -m "Running pipeline and commiting best e
 gto register pool-segmentation --version v1.0.0
 gto assign pool-segmentation --version v1.0.0 --stage dev
 tick
-gto assign pool-segmentation --version v1.0.0 --stage prod
+gto assign pool-segmentation --version v1.0.0 --stage staging
 gto deprecate pool-segmentation v1.0.0 dev
 
+cp $HERE/code/notebooks/TrainSegModel-sagemaker.ipynb notebooks/TrainSegModel.ipynb
+cp -r $HERE/code/sagemaker .
+git add .
+tick
+git commit -m "Add code for sagemaker deployment"
+git tag -a "3-sagemaker-deployment-code" -m "Added code for sagemaker deployment"
+
+
+### 3 - new experiment with the sagemaker model archive logged
+
+jupyter nbconvert --execute 'notebooks/TrainSegModel.ipynb' --inplace
+# Apply best experiment
+BEST_EXP_ROW=$(dvc exp show --drop '.*' --keep 'Experiment|evaluate/dice_multi|base_lr' --csv --sort-by evaluate/dice_multi | tail -n 1)
+BEST_EXP_NAME=$(echo $BEST_EXP_ROW | cut -d, -f 1)
+BEST_EXP_BASE_LR=$(echo $BEST_EXP_ROW | cut -d, -f 3)
+dvc exp apply $BEST_EXP_NAME
+
+git add .
+tick
+git commit -m "Experiment with sagemaker model archive"
+git tag -a "4-sagemaker-model-registered" -m "commiting experiment with the sagemaker archive"
+gto register pool-segmentation --version v1.1.0
+gto assign pool-segmentation --version v1.1.0 --stage staging
+tick
+
+# Now we want to register the model to prod and trigger deployment
+
+# gto assign pool-segmentation --version v1.0.0 --stage prod
+# gto deprecate pool-segmentation v1.0.0 staging
+
 dvc push -A
 
 popd

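The new generate.sh steps pick the best experiment by asking `dvc exp show` for a CSV sorted by evaluate/dice_multi, taking the last row, and cutting out the experiment name and base_lr columns. The same selection is sketched below in Python, shelling out to the exact command used above (illustrative only, not part of the commit):

import csv
import subprocess

# Same command as in generate.sh: keep only the needed columns, sorted by the metric.
cmd = [
    "dvc", "exp", "show",
    "--drop", ".*",
    "--keep", "Experiment|evaluate/dice_multi|base_lr",
    "--csv", "--sort-by", "evaluate/dice_multi",
]
output = subprocess.run(cmd, check=True, capture_output=True, text=True).stdout
rows = list(csv.reader(output.splitlines()))
best_row = rows[-1]  # last row has the highest evaluate/dice_multi
best_exp_name, best_base_lr = best_row[0], best_row[2]
print(best_exp_name, best_base_lr)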