From b218ea280bab8177553db2b98a40ce07022ba2a9 Mon Sep 17 00:00:00 2001 From: Tibor Mach Date: Sun, 8 Oct 2023 21:26:10 +0200 Subject: [PATCH] Sagemaker deployment steps --- .../code/.github/workflows/deploy-model.yml | 2 +- .../notebooks/TrainSegModel-logging.ipynb | 4 +- .../notebooks/TrainSegModel-sagemaker.ipynb | 296 ++++++++++++++++++ ...gModel.ipynb => TrainSegModel-start.ipynb} | 6 +- example-get-started-experiments/generate.sh | 35 ++- 5 files changed, 335 insertions(+), 8 deletions(-) create mode 100644 example-get-started-experiments/code/notebooks/TrainSegModel-sagemaker.ipynb rename example-get-started-experiments/code/notebooks/{TrainSegModel.ipynb => TrainSegModel-start.ipynb} (98%) diff --git a/example-get-started-experiments/code/.github/workflows/deploy-model.yml b/example-get-started-experiments/code/.github/workflows/deploy-model.yml index 8132fb3..f4cf8bf 100644 --- a/example-get-started-experiments/code/.github/workflows/deploy-model.yml +++ b/example-get-started-experiments/code/.github/workflows/deploy-model.yml @@ -60,7 +60,7 @@ jobs: - run: dvc remote add -d --local storage s3://dvc-public/remote/get-started-pools - run: | - MODEL_DATA=$(dvc get --show-url . model.tar.gz) + MODEL_DATA=$(dvc get --show-url . sagemaker/model.tar.gz) python sagemaker/deploy_model.py \ --name ${{ needs.parse.outputs.name }} \ --stage ${{ needs.parse.outputs.stage }} \ diff --git a/example-get-started-experiments/code/notebooks/TrainSegModel-logging.ipynb b/example-get-started-experiments/code/notebooks/TrainSegModel-logging.ipynb index f33af78..a03c918 100644 --- a/example-get-started-experiments/code/notebooks/TrainSegModel-logging.ipynb +++ b/example-get-started-experiments/code/notebooks/TrainSegModel-logging.ipynb @@ -242,9 +242,11 @@ " # add additional post-training summary metrics\n", " live.summary[\"evaluate/dice_multi\"] = evaluate(learn)\n", "\n", + " torch.save(learn.model, (models_dir / \"model.pth\").absolute())\n", + "\n", " # save model artifact to dvc\n", " live.log_artifact(\n", - " str(models_dir / \"model.pkl\"),\n", + " str(models_dir / \"model.pth\"),\n", " type=\"model\",\n", " name=\"pool-segmentation\",\n", " desc=\"This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.\",\n", diff --git a/example-get-started-experiments/code/notebooks/TrainSegModel-sagemaker.ipynb b/example-get-started-experiments/code/notebooks/TrainSegModel-sagemaker.ipynb new file mode 100644 index 0000000..cc306e3 --- /dev/null +++ b/example-get-started-experiments/code/notebooks/TrainSegModel-sagemaker.ipynb @@ -0,0 +1,296 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "ROOT = Path(\"../\")\n", + "DATA = ROOT / \"data\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "from functools import partial\n", + "\n", + "import numpy as np\n", + "import torch\n", + "from box import ConfigBox\n", + "from dvclive import Live\n", + "from dvclive.fastai import DVCLiveCallback\n", + "from fastai.data.all import Normalize, get_files\n", + "from fastai.metrics import DiceMulti\n", + "from fastai.vision.all import (Resize, SegmentationDataLoaders,\n", + " imagenet_stats, models, unet_learner)\n", + "from ruamel.yaml import YAML\n", + "from PIL import Image" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load data and split it into train/test\n", + "\n", + "We have some [data in DVC](https://dvc.org/doc/start/data-management/data-versioning) that we can pull. \n", + "\n", + "This data includes:\n", + "* satellite images\n", + "* masks of the swimming pools in each satellite image\n", + "\n", + "DVC can help connect your data to your repo, but it isn't necessary to have your data in DVC to start tracking experiments with DVC and DVCLive." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!dvc pull" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_regions = [\"REGION_1-\"]\n", + "\n", + "img_fpaths = get_files(DATA / \"pool_data\" / \"images\", extensions=\".jpg\")\n", + "\n", + "train_data_dir = DATA / \"train_data\"\n", + "train_data_dir.mkdir(exist_ok=True)\n", + "test_data_dir = DATA / \"test_data\"\n", + "test_data_dir.mkdir(exist_ok=True)\n", + "for img_path in img_fpaths:\n", + " msk_path = DATA / \"pool_data\" / \"masks\" / f\"{img_path.stem}.png\"\n", + " if any(region in str(img_path) for region in test_regions):\n", + " shutil.copy(img_path, test_data_dir)\n", + " shutil.copy(msk_path, test_data_dir)\n", + " else:\n", + " shutil.copy(img_path, train_data_dir)\n", + " shutil.copy(msk_path, train_data_dir)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a data loader\n", + "\n", + "Load and prepare the images and masks by creating a data loader." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_mask_path(x, train_data_dir):\n", + " return Path(train_data_dir) / f\"{Path(x).stem}.png\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bs = 8\n", + "valid_pct = 0.20\n", + "img_size = 256\n", + "\n", + "data_loader = SegmentationDataLoaders.from_label_func(\n", + " path=train_data_dir,\n", + " fnames=get_files(train_data_dir, extensions=\".jpg\"),\n", + " label_func=partial(get_mask_path, train_data_dir=train_data_dir),\n", + " codes=[\"not-pool\", \"pool\"],\n", + " bs=bs,\n", + " valid_pct=valid_pct,\n", + " item_tfms=Resize(img_size),\n", + " batch_tfms=[\n", + " Normalize.from_stats(*imagenet_stats),\n", + " ],\n", + " )" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Review a sample batch of data\n", + "\n", + "Below are some examples of the images overlaid with their masks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_loader.show_batch(alpha=0.7)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train multiple models with different learning rates using `DVCLiveCallback`\n", + "\n", + "Set up model training, using DVCLive to capture the results of each experiment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def dice(mask_pred, mask_true, classes=[0, 1], eps=1e-6):\n", + " dice_list = []\n", + " for c in classes:\n", + " y_true = mask_true == c\n", + " y_pred = mask_pred == c\n", + " intersection = 2.0 * np.sum(y_true * y_pred)\n", + " dice = intersection / (np.sum(y_true) + np.sum(y_pred) + eps)\n", + " dice_list.append(dice)\n", + " return np.mean(dice_list)\n", + "\n", + "def evaluate(learn):\n", + " test_img_fpaths = sorted(get_files(DATA / \"test_data\", extensions=\".jpg\"))\n", + " test_dl = learn.dls.test_dl(test_img_fpaths)\n", + " preds, _ = learn.get_preds(dl=test_dl)\n", + " masks_pred = np.array(preds[:, 1, :] > 0.5, dtype=np.uint8)\n", + " test_mask_fpaths = [\n", + " get_mask_path(fpath, DATA / \"test_data\") for fpath in test_img_fpaths\n", + " ]\n", + " masks_true = [Image.open(mask_path) for mask_path in test_mask_fpaths]\n", + " dice_multi = 0.0\n", + " for ii in range(len(masks_true)):\n", + " mask_pred, mask_true = masks_pred[ii], masks_true[ii]\n", + " width, height = mask_true.shape[1], mask_true.shape[0]\n", + " mask_pred = np.array(\n", + " Image.fromarray(mask_pred).resize((width, height)),\n", + " dtype=int\n", + " )\n", + " mask_true = np.array(mask_true, dtype=int)\n", + " dice_multi += dice(mask_true, mask_pred) / len(masks_true)\n", + " return dice_multi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import tarfile\n", + "\n", + "train_arch = 'shufflenet_v2_x2_0'\n", + "models_dir = ROOT / \"models\"\n", + "models_dir.mkdir(exist_ok=True)\n", + "results_dir = ROOT / \"results\" / \"train\"\n", + "\n", + "for base_lr in [0.001, 0.005]:\n", + " # initialize dvclive, optionally provide output path, and save results as a dvc experiment\n", + " with Live(str(results_dir), save_dvc_exp=True, report=\"notebook\") as live:\n", + " # log a parameter\n", + " live.log_param(\"train_arch\", train_arch)\n", + " fine_tune_args = {\n", + " 'epochs': 2,\n", + " 'base_lr': base_lr\n", + " }\n", + " # log a dict of parameters\n", + " live.log_params(fine_tune_args)\n", + "\n", + " learn = unet_learner(data_loader, \n", + " arch=getattr(models, train_arch), \n", + " metrics=DiceMulti)\n", + " # train model and automatically capture metrics with DVCLiveCallback\n", + " learn.fine_tune(\n", + " **fine_tune_args,\n", + " cbs=[DVCLiveCallback(live=live)])\n", + "\n", + " learn.export(fname=(models_dir / \"model.pkl\").absolute())\n", + "\n", + " # add additional post-training summary metrics\n", + " live.summary[\"evaluate/dice_multi\"] = evaluate(learn)\n", + "\n", + " torch.save(learn.model, (models_dir / \"model.pth\").absolute())\n", + "\n", + " # save model artifact to dvc\n", + " live.log_artifact(\n", + " str(models_dir / \"model.pth\"),\n", + " type=\"model\",\n", + " name=\"pool-segmentation\",\n", + " desc=\"This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.\",\n", + " labels=[\"cv\", \"segmentation\", \"satellite-images\", \"unet\"],\n", + " )\n", + "\n", + " # For deploying the model to Sagemaker will also create a sagemaker model archive\n", + " # and save it to dvc - this is because Sagemaker expects a particular archive structure\n", + " sagemaker_dir = ROOT / \"sagemaker\"\n", + "\n", + " with tarfile.open(sagemaker_dir / \"model.tar.gz\", \"w:gz\") as tar:\n", + " tar.add(sagemaker_dir / \"code\", \"code\")\n", + " tar.add(models_dir / \"model.pth\", \"code/model.pth\")\n", + "\n", + " live.log_artifact(sagemaker_dir / \"model.tar.gz\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/example-get-started-experiments/code/notebooks/TrainSegModel.ipynb b/example-get-started-experiments/code/notebooks/TrainSegModel-start.ipynb similarity index 98% rename from example-get-started-experiments/code/notebooks/TrainSegModel.ipynb rename to example-get-started-experiments/code/notebooks/TrainSegModel-start.ipynb index 1582341..63305f7 100644 --- a/example-get-started-experiments/code/notebooks/TrainSegModel.ipynb +++ b/example-get-started-experiments/code/notebooks/TrainSegModel-start.ipynb @@ -237,10 +237,10 @@ " **fine_tune_args,\n", " cbs=[DVCLiveCallback(live=live)])\n", "\n", - " learn.export(fname=(models_dir / \"model.pkl\").absolute())\n", - "\n", " # add additional post-training summary metrics\n", - " live.summary[\"evaluate/dice_multi\"] = evaluate(learn)\n" + " live.summary[\"evaluate/dice_multi\"] = evaluate(learn)\n", + "\n", + " torch.save(learn.model, (models_dir / \"model.pth\").absolute())\n" ] } ], diff --git a/example-get-started-experiments/generate.sh b/example-get-started-experiments/generate.sh index b916f70..4c4895b 100755 --- a/example-get-started-experiments/generate.sh +++ b/example-get-started-experiments/generate.sh @@ -74,7 +74,7 @@ git commit -m "Add data" dvc pull mkdir notebooks -cp $HERE/code/notebooks/TrainSegModel.ipynb notebooks/TrainSegModel.ipynb +cp $HERE/code/notebooks/TrainSegModel-start.ipynb notebooks/TrainSegModel.ipynb git add . git commit -m "Add training notebook with DVCLive metrics" git tag -a "0-notebook-dvclive" -m "Notebook training pipeline" @@ -83,7 +83,6 @@ git tag -a "0-notebook-dvclive" -m "Notebook training pipeline" cp $HERE/code/notebooks/TrainSegModel-logging.ipynb notebooks/TrainSegModel.ipynb git add . tick - git commit -m "Add artifact logging/saving using DVCLive" git tag -a "1-added-model-logging" -m "Added model logging/saving to pipeline" @@ -104,9 +103,39 @@ git tag -a "2-model-added-to-registry" -m "Running pipeline and commiting best e gto register pool-segmentation --version v1.0.0 gto assign pool-segmentation --version v1.0.0 --stage dev tick -gto assign pool-segmentation --version v1.0.0 --stage prod +gto assign pool-segmentation --version v1.0.0 --stage staging gto deprecate pool-segmentation v1.0.0 dev +cp $HERE/code/notebooks/TrainSegModel-sagemaker.ipynb notebooks/TrainSegModel.ipynb +cp -r $HERE/code/sagemaker . +git add . +tick +git commit -m "Add code for sagemaker deployment" +git tag -a "3-sagemaker-deployment-code" -m "Added code for sagemaker deployment" + + +### 3 - new experiment with the sagemaker model archive logged + +jupyter nbconvert --execute 'notebooks/TrainSegModel.ipynb' --inplace +# Apply best experiment +BEST_EXP_ROW=$(dvc exp show --drop '.*' --keep 'Experiment|evaluate/dice_multi|base_lr' --csv --sort-by evaluate/dice_multi | tail -n 1) +BEST_EXP_NAME=$(echo $BEST_EXP_ROW | cut -d, -f 1) +BEST_EXP_BASE_LR=$(echo $BEST_EXP_ROW | cut -d, -f 3) +dvc exp apply $BEST_EXP_NAME + +git add . +tick +git commit -m "Experiment with sagemaker model archive" +git tag -a "4-sagemaker-model-registered" -m "commiting experiment with the sagemaker archive" +gto register pool-segmentation --version v1.1.0 +gto assign pool-segmentation --version v1.1.0 --stage staging +tick + +# Now we want to register the model to prod and trigger deployment + +# gto assign pool-segmentation --version v1.0.0 --stage prod +# gto deprecate pool-segmentation v1.0.0 staging + dvc push -A popd