From 3ebd075212e0a761b982880707ec497c36a99d80 Mon Sep 17 00:00:00 2001 From: Dustin Luong Date: Wed, 19 Aug 2020 15:41:22 -0700 Subject: [PATCH] feat(components): AWS SageMaker - Add optional parameter to allow training component to accept parameters related to Debugger (#4283) * Implemented debugger for training component with sample pipeline, unit tests, and integration test * Implemented changes from PR, refactored utils.py, made sample pipeline more succinct, removed hardcoding from integration tests * Added default parameter for sample pipeline and fixed grammar for sample README, refactored _utils.py for fstrings and fixed offset for errors * Removed aws secret lines * Terminate debug rules when terminating training job, Terminate debug rules if terminate is pressed after training job has completed, added integration tests for stop_debug_rules, updated READMEs for train and sample, renamed sample pipeline, removed tensorboard, updated sagemaker version to sagemaker 2.1.0. * Terminate debug rules when terminating training job, Terminate debug rules if terminate is pressed after training job has completed, added integration tests for stop_debug_rules, updated READMEs for train and sample, renamed sample pipeline, removed tensorboard, updated sagemaker version to sagemaker 2.1.0. 
* Removed extra files, cleaned integration test * Changed integration test to use sample debugger pipeline * Processing jobs created from debug rules will not terminate, fixing other small issues * Removed debug from pipeline definition, removed extra line, removed unused function * Changelog and image tag updates --- components/aws/sagemaker/Changelog.md | 8 +- components/aws/sagemaker/Dockerfile | 4 +- .../aws/sagemaker/THIRD-PARTY-LICENSES.txt | 8 +- .../sagemaker/batch_transform/component.yaml | 2 +- components/aws/sagemaker/common/_utils.py | 138 +++++++++++++++--- .../aws/sagemaker/common/train.template.yaml | 4 +- .../aws/sagemaker/deploy/component.yaml | 2 +- .../aws/sagemaker/ground_truth/component.yaml | 4 +- .../hyperparameter_tuning/component.yaml | 4 +- components/aws/sagemaker/model/component.yaml | 4 +- .../aws/sagemaker/process/component.yaml | 4 +- .../tests/integration_tests/README.md | 4 +- .../component_tests/test_train_component.py | 6 +- .../test_workteam_component.py | 7 +- .../tests/integration_tests/environment.yml | 2 +- .../config.yaml | 7 + .../resources/definition/training_pipeline.py | 2 +- .../tests/integration_tests/utils/__init__.py | 14 +- .../tests/unit_tests/requirements.txt | 4 +- .../tests/unit_tests/tests/test_hpo.py | 24 +-- .../tests/unit_tests/tests/test_train.py | 97 ++++++++++-- components/aws/sagemaker/train/README.md | 47 ++++++ components/aws/sagemaker/train/component.yaml | 14 +- components/aws/sagemaker/train/src/train.py | 8 +- .../aws/sagemaker/workteam/component.yaml | 4 +- .../sagemaker_debugger_demo/README.md | 56 +++++++ .../debugger-training-pipeline.py | 99 +++++++++++++ .../simple_train_pipeline/README.md | 4 +- 28 files changed, 493 insertions(+), 88 deletions(-) create mode 100644 components/aws/sagemaker/tests/integration_tests/resources/config/xgboost-mnist-trainingjob-debugger/config.yaml create mode 100644 samples/contrib/aws-samples/sagemaker_debugger_demo/README.md create mode 100644 
samples/contrib/aws-samples/sagemaker_debugger_demo/debugger-training-pipeline.py diff --git a/components/aws/sagemaker/Changelog.md b/components/aws/sagemaker/Changelog.md index 388a03f5eaf..59a8715519a 100644 --- a/components/aws/sagemaker/Changelog.md +++ b/components/aws/sagemaker/Changelog.md @@ -4,6 +4,12 @@ The version of the AWS SageMaker Components is determined by the docker image ta Repository: https://hub.docker.com/repository/docker/amazon/aws-sagemaker-kfp-components --------------------------------------------- +**Change log for version 0.8.0** +- Add functionality to configure SageMaker Debugger for Training component + +> Pull requests : [#4283](https://github.com/kubeflow/pipelines/pull/4283/) + + **Change log for version 0.7.0** - Add functionality to assume role when sending SageMaker requests @@ -29,7 +35,7 @@ Repository: https://hub.docker.com/repository/docker/amazon/aws-sagemaker-kfp-c **Change log for version 0.5.1** -- Update region support for GroudTruth component +- Update region support for GroundTruth component - Make `label_category_config` an optional parameter in Ground Truth component > Pull requests : [#3932](https://github.com/kubeflow/pipelines/pull/3932) diff --git a/components/aws/sagemaker/Dockerfile b/components/aws/sagemaker/Dockerfile index 481cb8db44f..3cb88434a23 100644 --- a/components/aws/sagemaker/Dockerfile +++ b/components/aws/sagemaker/Dockerfile @@ -23,8 +23,8 @@ RUN yum update -y \ unzip RUN pip3 install \ - boto3==1.13.19 \ - sagemaker==1.54.0 \ + boto3==1.14.12 \ + sagemaker==2.1.0 \ pathlib2==2.3.5 \ pyyaml==3.12 diff --git a/components/aws/sagemaker/THIRD-PARTY-LICENSES.txt b/components/aws/sagemaker/THIRD-PARTY-LICENSES.txt index cc3f1c8e16d..8b48ef1d497 100644 --- a/components/aws/sagemaker/THIRD-PARTY-LICENSES.txt +++ b/components/aws/sagemaker/THIRD-PARTY-LICENSES.txt @@ -1,7 +1,7 @@ -** Amazon SageMaker Components for Kubeflow Pipelines; version 0.7.0 -- +** Amazon SageMaker Components for Kubeflow 
Pipelines; version 0.8.0 -- https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -** boto3; version 1.12.33 -- https://github.com/boto/boto3/ +** boto3; version 1.14.12 -- https://github.com/boto/boto3/ Copyright 2013-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. ** botocore; version 1.15.33 -- https://github.com/boto/botocore Botocore @@ -12,7 +12,7 @@ https://importlib-metadata.readthedocs.io/en/latest/ ** s3transfer; version 0.3.3 -- https://github.com/boto/s3transfer/ s3transfer Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. -** sagemaker; version 1.54.0 -- https://aws.amazon.com/sagemaker/ +** sagemaker; version 2.1.0 -- https://aws.amazon.com/sagemaker/ Amazon SageMaker Python SDK Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. ** smdebug-rulesconfig; version 0.1.2 -- @@ -982,4 +982,4 @@ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-For more information, please refer to \ No newline at end of file +For more information, please refer to diff --git a/components/aws/sagemaker/batch_transform/component.yaml b/components/aws/sagemaker/batch_transform/component.yaml index 45035cad1e2..64ac8892b15 100644 --- a/components/aws/sagemaker/batch_transform/component.yaml +++ b/components/aws/sagemaker/batch_transform/component.yaml @@ -102,7 +102,7 @@ outputs: - {name: output_location, description: 'S3 URI of the transform job results.'} implementation: container: - image: amazon/aws-sagemaker-kfp-components:0.7.0 + image: amazon/aws-sagemaker-kfp-components:0.8.0 command: ['python3'] args: [ batch_transform.py, diff --git a/components/aws/sagemaker/common/_utils.py b/components/aws/sagemaker/common/_utils.py index 68c5afad83e..3198cb4d81f 100644 --- a/components/aws/sagemaker/common/_utils.py +++ b/components/aws/sagemaker/common/_utils.py @@ -22,6 +22,7 @@ import re import json from pathlib2 import Path +from enum import Enum, auto import boto3 from boto3.session import Session @@ -36,7 +37,7 @@ from botocore.exceptions import ClientError from botocore.session import Session as BotocoreSession -from sagemaker.amazon.amazon_estimator import get_image_uri +from sagemaker.image_uris import retrieve import logging logging.getLogger().setLevel(logging.INFO) @@ -99,6 +100,9 @@ def get_component_version(): return component_version +def print_log_header(header_len, title=""): + logging.info(f"{title:*^{header_len}}") + def print_logs_for_job(cw_client, log_grp, job_name): """Gets the CloudWatch logs for SageMaker jobs""" try: @@ -206,12 +210,12 @@ def create_training_job_request(args): # TODO: Adjust this implementation to account for custom algorithm resources names that are the same as built-in algorithm names algo_name = args['algorithm_name'].lower().strip() if algo_name in built_in_algos.keys(): - request['AlgorithmSpecification']['TrainingImage'] = get_image_uri(args['region'], built_in_algos[algo_name]) 
+ request['AlgorithmSpecification']['TrainingImage'] = retrieve(built_in_algos[algo_name], args['region']) request['AlgorithmSpecification'].pop('AlgorithmName') logging.warning('Algorithm name is found as an Amazon built-in algorithm. Using built-in algorithm.') # Just to give the user more leeway for built-in algorithm name inputs elif algo_name in built_in_algos.values(): - request['AlgorithmSpecification']['TrainingImage'] = get_image_uri(args['region'], algo_name) + request['AlgorithmSpecification']['TrainingImage'] = retrieve(algo_name, args['region']) request['AlgorithmSpecification'].pop('AlgorithmName') logging.warning('Algorithm name is found as an Amazon built-in algorithm. Using built-in algorithm.') else: @@ -258,6 +262,17 @@ def create_training_job_request(args): enable_spot_instance_support(request, args) + ### Update DebugHookConfig and DebugRuleConfigurations + if args['debug_hook_config']: + request['DebugHookConfig'] = args['debug_hook_config'] + else: + request.pop('DebugHookConfig') + + if args['debug_rule_config']: + request['DebugRuleConfigurations'] = args['debug_rule_config'] + else: + request.pop('DebugRuleConfigurations') + ### Update tags for key, val in args['tags'].items(): request['Tags'].append({'Key': key, 'Value': val}) @@ -282,18 +297,94 @@ def create_training_job(client, args): def wait_for_training_job(client, training_job_name, poll_interval=30): - while(True): - response = client.describe_training_job(TrainingJobName=training_job_name) - status = response['TrainingJobStatus'] - if status == 'Completed': - logging.info("Training job ended with status: " + status) - break - if status == 'Failed': - message = response['FailureReason'] - logging.info('Training failed with the following error: {}'.format(message)) - raise Exception('Training job failed') - logging.info("Training job is still in status: " + status) - time.sleep(poll_interval) + while(True): + response = client.describe_training_job(TrainingJobName=training_job_name) 
+ status = response['TrainingJobStatus'] + if status == 'Completed': + logging.info("Training job ended with status: " + status) + break + if status == 'Failed': + message = response['FailureReason'] + logging.info(f'Training failed with the following error: {message}') + raise Exception('Training job failed') + logging.info("Training job is still in status: " + status) + time.sleep(poll_interval) + + +def wait_for_debug_rules(client, training_job_name, poll_interval=30): + first_poll = True + while(True): + response = client.describe_training_job(TrainingJobName=training_job_name) + if 'DebugRuleEvaluationStatuses' not in response: + break + if first_poll: + logging.info("Polling for status of all debug rules:") + first_poll = False + if DebugRulesStatus.from_describe(response) != DebugRulesStatus.INPROGRESS: + logging.info("Rules have ended with status:\n") + print_debug_rule_status(response, True) + break + print_debug_rule_status(response) + time.sleep(poll_interval) + + +class DebugRulesStatus(Enum): + COMPLETED = auto() + ERRORED = auto() + INPROGRESS = auto() + + @classmethod + def from_describe(self, response): + has_error = False + for debug_rule in response['DebugRuleEvaluationStatuses']: + if debug_rule['RuleEvaluationStatus'] == "Error": + has_error = True + if debug_rule['RuleEvaluationStatus'] == "InProgress": + return DebugRulesStatus.INPROGRESS + if has_error: + return DebugRulesStatus.ERRORED + else: + return DebugRulesStatus.COMPLETED + + +def print_debug_rule_status(response, last_print=False): + """ + Example of DebugRuleEvaluationStatuses: + response['DebugRuleEvaluationStatuses'] = + [{ + "RuleConfigurationName": "VanishingGradient", + "RuleEvaluationStatus": "IssuesFound", + "StatusDetails": "There was an issue." 
+ }] + + If last_print is False: + INFO:root: - LossNotDecreasing: InProgress + INFO:root: - Overtraining: NoIssuesFound + ERROR:root:- CustomGradientRule: Error + + If last_print is True: + INFO:root: - LossNotDecreasing: IssuesFound + INFO:root: - RuleEvaluationConditionMet: Evaluation of the rule LossNotDecreasing at step 10 resulted in the condition being met + """ + for debug_rule in response['DebugRuleEvaluationStatuses']: + line_ending = "\n" if last_print else "" + if 'StatusDetails' in debug_rule: + status_details = f"- {debug_rule['StatusDetails'].rstrip()}{line_ending}" + line_ending = "" + else: + status_details = "" + rule_status = f"- {debug_rule['RuleConfigurationName']}: {debug_rule['RuleEvaluationStatus']}{line_ending}" + if debug_rule['RuleEvaluationStatus'] == "Error": + log = logging.error + status_padding = 1 + else: + log = logging.info + status_padding = 2 + + log(f"{status_padding * ' '}{rule_status}") + if last_print and status_details: + log(f"{(status_padding + 2) * ' '}{status_details}") + print_log_header(50) def get_model_artifacts_from_job(client, job_name): @@ -314,10 +405,13 @@ def get_image_from_job(client, job_name): def stop_training_job(client, job_name): - try: - client.stop_training_job(TrainingJobName=job_name) - except ClientError as e: - raise Exception(e.response['Error']['Message']) + response = client.describe_training_job(TrainingJobName=job_name) + if response["TrainingJobStatus"] == "InProgress": + try: + client.stop_training_job(TrainingJobName=job_name) + return job_name + except ClientError as e: + raise Exception(e.response['Error']['Message']) def create_model(client, args): @@ -611,12 +705,12 @@ def create_hyperparameter_tuning_job_request(args): # TODO: Adjust this implementation to account for custom algorithm resources names that are the same as built-in algorithm names algo_name = args['algorithm_name'].lower().strip() if algo_name in built_in_algos.keys(): - 
request['TrainingJobDefinition']['AlgorithmSpecification']['TrainingImage'] = get_image_uri(args['region'], built_in_algos[algo_name]) + request['TrainingJobDefinition']['AlgorithmSpecification']['TrainingImage'] = retrieve(built_in_algos[algo_name], args['region']) request['TrainingJobDefinition']['AlgorithmSpecification'].pop('AlgorithmName') logging.warning('Algorithm name is found as an Amazon built-in algorithm. Using built-in algorithm.') # To give the user more leeway for built-in algorithm name inputs elif algo_name in built_in_algos.values(): - request['TrainingJobDefinition']['AlgorithmSpecification']['TrainingImage'] = get_image_uri(args['region'], algo_name) + request['TrainingJobDefinition']['AlgorithmSpecification']['TrainingImage'] = retrieve(algo_name, args['region']) request['TrainingJobDefinition']['AlgorithmSpecification'].pop('AlgorithmName') logging.warning('Algorithm name is found as an Amazon built-in algorithm. Using built-in algorithm.') else: @@ -1135,4 +1229,4 @@ def write_output(output_path, output_value, json_encode=False): write_value = json.dumps(output_value) if json_encode else output_value Path(output_path).parent.mkdir(parents=True, exist_ok=True) - Path(output_path).write_text(write_value) \ No newline at end of file + Path(output_path).write_text(write_value) diff --git a/components/aws/sagemaker/common/train.template.yaml b/components/aws/sagemaker/common/train.template.yaml index 618531bb128..d3e6d828125 100644 --- a/components/aws/sagemaker/common/train.template.yaml +++ b/components/aws/sagemaker/common/train.template.yaml @@ -21,10 +21,12 @@ VpcConfig: StoppingCondition: MaxRuntimeInSeconds: 86400 MaxWaitTimeInSeconds: 86400 +DebugHookConfig: {} +DebugRuleConfigurations: [] CheckpointConfig: S3Uri: '' LocalPath: '' Tags: [] EnableNetworkIsolation: True EnableInterContainerTrafficEncryption: False -EnableManagedSpotTraining: False \ No newline at end of file +EnableManagedSpotTraining: False diff --git 
a/components/aws/sagemaker/deploy/component.yaml b/components/aws/sagemaker/deploy/component.yaml index e499bdad288..d2bc4b9a778 100644 --- a/components/aws/sagemaker/deploy/component.yaml +++ b/components/aws/sagemaker/deploy/component.yaml @@ -108,7 +108,7 @@ outputs: - {name: endpoint_name, description: 'Endpoint name'} implementation: container: - image: amazon/aws-sagemaker-kfp-components:0.7.0 + image: amazon/aws-sagemaker-kfp-components:0.8.0 command: ['python3'] args: [ deploy.py, diff --git a/components/aws/sagemaker/ground_truth/component.yaml b/components/aws/sagemaker/ground_truth/component.yaml index b1a8645fda8..4a875d00ee3 100644 --- a/components/aws/sagemaker/ground_truth/component.yaml +++ b/components/aws/sagemaker/ground_truth/component.yaml @@ -123,7 +123,7 @@ outputs: - {name: active_learning_model_arn, description: 'The ARN for the most recent Amazon SageMaker model trained as part of automated data labeling.'} implementation: container: - image: amazon/aws-sagemaker-kfp-components:0.7.0 + image: amazon/aws-sagemaker-kfp-components:0.8.0 command: ['python3'] args: [ ground_truth.py, @@ -161,4 +161,4 @@ implementation: --tags, {inputValue: tags}, --output_manifest_location_output_path, {outputPath: output_manifest_location}, --active_learning_model_arn_output_path, {outputPath: active_learning_model_arn} - ] \ No newline at end of file + ] diff --git a/components/aws/sagemaker/hyperparameter_tuning/component.yaml b/components/aws/sagemaker/hyperparameter_tuning/component.yaml index cd166a07636..de08506342a 100644 --- a/components/aws/sagemaker/hyperparameter_tuning/component.yaml +++ b/components/aws/sagemaker/hyperparameter_tuning/component.yaml @@ -154,7 +154,7 @@ outputs: description: 'The registry path of the Docker image that contains the training algorithm' implementation: container: - image: amazon/aws-sagemaker-kfp-components:0.7.0 + image: amazon/aws-sagemaker-kfp-components:0.8.0 command: ['python3'] args: [ hyperparameter_tuning.py, 
@@ -200,4 +200,4 @@ implementation: --best_job_name_output_path, {outputPath: best_job_name}, --best_hyperparameters_output_path, {outputPath: best_hyperparameters}, --training_image_output_path, {outputPath: training_image} - ] \ No newline at end of file + ] diff --git a/components/aws/sagemaker/model/component.yaml b/components/aws/sagemaker/model/component.yaml index d5ec9196ae6..0b89a8d05e2 100644 --- a/components/aws/sagemaker/model/component.yaml +++ b/components/aws/sagemaker/model/component.yaml @@ -63,7 +63,7 @@ outputs: - {name: model_name, description: 'The model name SageMaker created'} implementation: container: - image: amazon/aws-sagemaker-kfp-components:0.7.0 + image: amazon/aws-sagemaker-kfp-components:0.8.0 command: ['python3'] args: [ create_model.py, @@ -83,4 +83,4 @@ implementation: --network_isolation, {inputValue: network_isolation}, --tags, {inputValue: tags}, --model_name_output_path, {outputPath: model_name} - ] \ No newline at end of file + ] diff --git a/components/aws/sagemaker/process/component.yaml b/components/aws/sagemaker/process/component.yaml index bdc9d09260a..6ad6441117f 100644 --- a/components/aws/sagemaker/process/component.yaml +++ b/components/aws/sagemaker/process/component.yaml @@ -93,7 +93,7 @@ outputs: - {name: output_artifacts, description: 'A dictionary containing the output S3 artifacts'} implementation: container: - image: amazon/aws-sagemaker-kfp-components:0.7.0 + image: amazon/aws-sagemaker-kfp-components:0.8.0 command: ['python3'] args: [ process.py, @@ -121,4 +121,4 @@ implementation: --tags, {inputValue: tags}, --job_name_output_path, {outputPath: job_name}, --output_artifacts_output_path, {outputPath: output_artifacts} - ] \ No newline at end of file + ] diff --git a/components/aws/sagemaker/tests/integration_tests/README.md b/components/aws/sagemaker/tests/integration_tests/README.md index dec4b0bdae0..e22689bd8e4 100644 --- a/components/aws/sagemaker/tests/integration_tests/README.md +++ 
b/components/aws/sagemaker/tests/integration_tests/README.md @@ -9,7 +9,7 @@ 1. In the following Python script, change the bucket name and run the [`s3_sample_data_creator.py`](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset) to create an S3 bucket with the sample mnist dataset in the region where you want to run the tests. 2. To prepare the dataset for the SageMaker GroundTruth Component test, follow the steps in the `[GroundTruth Sample README](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/ground_truth_pipeline_demo#prep-the-dataset-label-categories-and-ui-template)`. -3. To prepare the processing script for the SageMaker Processing Component tests, upload the `scripts/kmeans_preprocessing.py` script to your bucket. This can be done by replacing ` with your bucket name and running `aws s3 cp scripts/kmeans_preprocessing.py s3:///mnist_kmeans_example/processing_code/kmeans_preprocessing.py` +3. To prepare the processing script for the SageMaker Processing Component tests, upload the `scripts/kmeans_preprocessing.py` script to your bucket. This can be done by replacing `` with your bucket name and running `aws s3 cp scripts/kmeans_preprocessing.py s3:///mnist_kmeans_example/processing_code/kmeans_preprocessing.py` ## Step to run integration tests @@ -22,4 +22,4 @@ 1. Navigate to the root of this github directory. 1. Run `docker build . -f components/aws/sagemaker/tests/integration_tests/Dockerfile -t amazon/integration_test` 1. Run the image, injecting your environment variable files: - 1. Run `docker run --env-file components/aws/sagemaker/tests/integration_tests/.env amazon/integration_test` \ No newline at end of file + 1. 
Run `docker run --env-file components/aws/sagemaker/tests/integration_tests/.env amazon/integration_test` diff --git a/components/aws/sagemaker/tests/integration_tests/component_tests/test_train_component.py b/components/aws/sagemaker/tests/integration_tests/component_tests/test_train_component.py index bc9a4f09746..e6e52af4ca4 100644 --- a/components/aws/sagemaker/tests/integration_tests/component_tests/test_train_component.py +++ b/components/aws/sagemaker/tests/integration_tests/component_tests/test_train_component.py @@ -13,9 +13,13 @@ pytest.param( "resources/config/simple-mnist-training", marks=pytest.mark.canary_test ), - pytest.param("resources/config/fsx-mnist-training", marks=pytest.mark.fsx_test), + pytest.param( + "resources/config/fsx-mnist-training", + marks=pytest.mark.fsx_test + ), "resources/config/spot-sample-pipeline-training", "resources/config/assume-role-training", + "resources/config/xgboost-mnist-trainingjob-debugger" ], ) def test_trainingjob( diff --git a/components/aws/sagemaker/tests/integration_tests/component_tests/test_workteam_component.py b/components/aws/sagemaker/tests/integration_tests/component_tests/test_workteam_component.py index 8bea2b2f83f..69965dd1d84 100644 --- a/components/aws/sagemaker/tests/integration_tests/component_tests/test_workteam_component.py +++ b/components/aws/sagemaker/tests/integration_tests/component_tests/test_workteam_component.py @@ -44,7 +44,12 @@ def create_workteamjob( @pytest.mark.parametrize( "test_file_dir", - [pytest.param("resources/config/create-workteam", marks=pytest.mark.canary_test)], + [ + pytest.param( + "resources/config/create-workteam", + marks=pytest.mark.canary_test + ) + ], ) def test_workteamjob( kfp_client, experiment_id, region, sagemaker_client, test_file_dir diff --git a/components/aws/sagemaker/tests/integration_tests/environment.yml b/components/aws/sagemaker/tests/integration_tests/environment.yml index 13704192363..2187ea9ebe6 100644 --- 
a/components/aws/sagemaker/tests/integration_tests/environment.yml +++ b/components/aws/sagemaker/tests/integration_tests/environment.yml @@ -17,7 +17,7 @@ dependencies: - kubernetes==11.0.* - kfp==0.5.* - minio==5.0.10 - - sagemaker==1.56.* + - sagemaker==2.1.* - ruamel.yaml==0.16.* diff --git a/components/aws/sagemaker/tests/integration_tests/resources/config/xgboost-mnist-trainingjob-debugger/config.yaml b/components/aws/sagemaker/tests/integration_tests/resources/config/xgboost-mnist-trainingjob-debugger/config.yaml new file mode 100644 index 00000000000..41ca55793c4 --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/resources/config/xgboost-mnist-trainingjob-debugger/config.yaml @@ -0,0 +1,7 @@ +PipelineDefinition: ../../../../../samples/contrib/aws-samples/sagemaker_debugger_demo/debugger-training-pipeline.py +TestName: xgboost-mnist-trainingjob-debugger +Timeout: 3600 +ExpectedTrainingImage: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3 +Arguments: + bucket_name: ((DATA_BUCKET)) + role_arn: ((ROLE_ARN)) diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py index c8518296eef..3ba49180f34 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py +++ b/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py @@ -24,7 +24,7 @@ def training_pipeline( traffic_encryption="", spot_instance="", max_wait_time="", - checkpoint_config="{}", + checkpoint_config="", vpc_security_group_ids="", vpc_subnets="", assume_role="", diff --git a/components/aws/sagemaker/tests/integration_tests/utils/__init__.py b/components/aws/sagemaker/tests/integration_tests/utils/__init__.py index 98731a0bc4f..e142fa20831 100644 --- a/components/aws/sagemaker/tests/integration_tests/utils/__init__.py +++ 
b/components/aws/sagemaker/tests/integration_tests/utils/__init__.py @@ -7,7 +7,7 @@ import string import shutil -from sagemaker.amazon.amazon_estimator import get_image_uri +from sagemaker.image_uris import retrieve def get_region(): @@ -42,12 +42,12 @@ def get_fsx_id(): return os.environ.get("FSX_ID") -def get_assume_role_arn(): - return os.environ.get("ASSUME_ROLE_ARN") +def get_algorithm_image_registry(framework, region, version=None): + return retrieve(framework, region, version).split(".")[0] -def get_algorithm_image_registry(region, algorithm): - return get_image_uri(region, algorithm).split(".")[0] +def get_assume_role_arn(): + return os.environ.get("ASSUME_ROLE_ARN") def run_command(cmd, *popenargs, **kwargs): @@ -84,7 +84,9 @@ def replace_placeholders(input_filename, output_filename): "((REGION))": region, "((ROLE_ARN))": get_role_arn(), "((DATA_BUCKET))": get_s3_data_bucket(), - "((KMEANS_REGISTRY))": get_algorithm_image_registry(region, "kmeans"), + "((KMEANS_REGISTRY))": get_algorithm_image_registry("kmeans", region, "1"), + "((XGBOOST_REGISTRY))": get_algorithm_image_registry("xgboost", region, "1.0-1"), + "((BUILTIN_RULE_IMAGE))": get_algorithm_image_registry("debugger", region), "((FSX_ID))": get_fsx_id(), "((FSX_SUBNET))": get_fsx_subnet(), "((FSX_SECURITY_GROUP))": get_fsx_security_group(), diff --git a/components/aws/sagemaker/tests/unit_tests/requirements.txt b/components/aws/sagemaker/tests/unit_tests/requirements.txt index cb309fa658f..3c54a000190 100644 --- a/components/aws/sagemaker/tests/unit_tests/requirements.txt +++ b/components/aws/sagemaker/tests/unit_tests/requirements.txt @@ -1,6 +1,6 @@ -boto3==1.12.33 +boto3==1.14.12 coverage==5.1 pathlib2==2.3.5 pytest==5.4.1 pyyaml==5.3.1 -sagemaker==1.56.1 \ No newline at end of file +sagemaker==2.1.0 diff --git a/components/aws/sagemaker/tests/unit_tests/tests/test_hpo.py b/components/aws/sagemaker/tests/unit_tests/tests/test_hpo.py index d0a6b4a6085..d2b1af7af1d 100644 --- 
a/components/aws/sagemaker/tests/unit_tests/tests/test_hpo.py +++ b/components/aws/sagemaker/tests/unit_tests/tests/test_hpo.py @@ -287,13 +287,13 @@ def test_known_algorithm_key(self): parsed_args = self.parser.parse_args(known_algorithm_args) - # Patch get_image_uri - _utils.get_image_uri = MagicMock() - _utils.get_image_uri.return_value = "seq2seq-url" + # Patch retrieve + _utils.retrieve = MagicMock() + _utils.retrieve.return_value = "seq2seq-url" response = _utils.create_hyperparameter_tuning_job_request(vars(parsed_args)) - _utils.get_image_uri.assert_called_with('us-west-2', 'seq2seq') + _utils.retrieve.assert_called_with('seq2seq', 'us-west-2') self.assertEqual(response['TrainingJobDefinition']['AlgorithmSpecification']['TrainingImage'], "seq2seq-url") @@ -306,13 +306,13 @@ def test_known_algorithm_value(self): parsed_args = self.parser.parse_args(known_algorithm_args) - # Patch get_image_uri - _utils.get_image_uri = MagicMock() - _utils.get_image_uri.return_value = "seq2seq-url" + # Patch retrieve + _utils.retrieve = MagicMock() + _utils.retrieve.return_value = "seq2seq-url" response = _utils.create_hyperparameter_tuning_job_request(vars(parsed_args)) - _utils.get_image_uri.assert_called_with('us-west-2', 'seq2seq') + _utils.retrieve.assert_called_with('seq2seq', 'us-west-2') self.assertEqual(response['TrainingJobDefinition']['AlgorithmSpecification']['TrainingImage'], "seq2seq-url") @@ -324,14 +324,14 @@ def test_unknown_algorithm(self): parsed_args = self.parser.parse_args(known_algorithm_args) - # Patch get_image_uri - _utils.get_image_uri = MagicMock() - _utils.get_image_uri.return_value = "unknown-url" + # Patch retrieve + _utils.retrieve = MagicMock() + _utils.retrieve.return_value = "unknown-url" response = _utils.create_hyperparameter_tuning_job_request(vars(parsed_args)) # Should just place the algorithm name in regardless - _utils.get_image_uri.assert_not_called() + _utils.retrieve.assert_not_called() 
self.assertEqual(response['TrainingJobDefinition']['AlgorithmSpecification']['AlgorithmName'], "unknown algorithm") def test_no_channels(self): diff --git a/components/aws/sagemaker/tests/unit_tests/tests/test_train.py b/components/aws/sagemaker/tests/unit_tests/tests/test_train.py index 0806a9c128f..a5b449fad2e 100644 --- a/components/aws/sagemaker/tests/unit_tests/tests/test_train.py +++ b/components/aws/sagemaker/tests/unit_tests/tests/test_train.py @@ -112,14 +112,18 @@ def test_main_stop_training_job(self): def test_utils_stop_training_job(self): mock_sm_client = MagicMock() - mock_sm_client.stop_training_job.return_value = None + mock_sm_client.stop_training_job.return_value = 'FakeJobName' + + mock_sm_client.describe_training_job.side_effect = [ + {"TrainingJobStatus": "InProgress"} + ] response = _utils.stop_training_job(mock_sm_client, 'FakeJobName') mock_sm_client.stop_training_job.assert_called_once_with( TrainingJobName='FakeJobName' ) - self.assertEqual(response, None) + self.assertEqual(response, 'FakeJobName') def test_sagemaker_exception_in_create_training_job(self): mock_client = MagicMock() @@ -157,6 +161,28 @@ def test_wait_for_failed_job(self): _utils.wait_for_training_job(mock_client, 'training-job', 0) self.assertEqual(mock_client.describe_training_job.call_count, 4) + + def test_wait_for_debug_rules(self): + mock_client = MagicMock() + mock_client.describe_training_job.side_effect = [ + {"DebugRuleEvaluationStatuses": [{"RuleConfigurationName": "rule1", "RuleEvaluationStatus": "InProgress"}, {"RuleConfigurationName": "rule2", "RuleEvaluationStatus": "InProgress"}]}, + {"DebugRuleEvaluationStatuses": [{"RuleConfigurationName": "rule1", "RuleEvaluationStatus": "NoIssuesFound"}, {"RuleConfigurationName": "rule2", "RuleEvaluationStatus": "InProgress"}]}, + {"DebugRuleEvaluationStatuses": [{"RuleConfigurationName": "rule1", "RuleEvaluationStatus": "NoIssuesFound"}, {"RuleConfigurationName": "rule2", "RuleEvaluationStatus": "IssuesFound"}]}, + 
{"DebugRuleEvaluationStatuses": [{"RuleConfigurationName": "rule1", "RuleEvaluationStatus": "Should not be called"}, {"RuleConfigurationName": "rule2", "RuleEvaluationStatus": "Should not be called"}]}, + ] + _utils.wait_for_debug_rules(mock_client, 'training-job', 0) + self.assertEqual(mock_client.describe_training_job.call_count, 3) + + def test_wait_for_errored_rule(self): + mock_client = MagicMock() + mock_client.describe_training_job.side_effect = [ + {"DebugRuleEvaluationStatuses": [{"RuleConfigurationName": "rule1", "RuleEvaluationStatus": "InProgress"}, {"RuleConfigurationName": "rule2", "RuleEvaluationStatus": "InProgress"}]}, + {"DebugRuleEvaluationStatuses": [{"RuleConfigurationName": "rule1", "RuleEvaluationStatus": "Error"}, {"RuleConfigurationName": "rule2", "RuleEvaluationStatus": "InProgress"}]}, + {"DebugRuleEvaluationStatuses": [{"RuleConfigurationName": "rule1", "RuleEvaluationStatus": "Error"}, {"RuleConfigurationName": "rule2", "RuleEvaluationStatus": "NoIssuesFound"}]}, + {"DebugRuleEvaluationStatuses": [{"RuleConfigurationName": "rule1", "RuleEvaluationStatus": "Should not be called"}, {"RuleConfigurationName": "rule2", "RuleEvaluationStatus": "Should not be called"}]}, + ] + _utils.wait_for_debug_rules(mock_client, 'training-job', 0) + self.assertEqual(mock_client.describe_training_job.call_count, 3) def test_get_model_artifacts_from_job(self): mock_client = MagicMock() @@ -233,13 +259,13 @@ def test_known_algorithm_key(self): parsed_args = self.parser.parse_args(known_algorithm_args) - # Patch get_image_uri - _utils.get_image_uri = MagicMock() - _utils.get_image_uri.return_value = "seq2seq-url" + # Patch retrieve + _utils.retrieve = MagicMock() + _utils.retrieve.return_value = "seq2seq-url" response = _utils.create_training_job_request(vars(parsed_args)) - _utils.get_image_uri.assert_called_with('us-west-2', 'seq2seq') + _utils.retrieve.assert_called_with('seq2seq', 'us-west-2') 
self.assertEqual(response['AlgorithmSpecification']['TrainingImage'], "seq2seq-url") def test_known_algorithm_value(self): @@ -251,13 +277,13 @@ def test_known_algorithm_value(self): parsed_args = self.parser.parse_args(known_algorithm_args) - # Patch get_image_uri - _utils.get_image_uri = MagicMock() - _utils.get_image_uri.return_value = "seq2seq-url" + # Patch retrieve + _utils.retrieve = MagicMock() + _utils.retrieve.return_value = "seq2seq-url" response = _utils.create_training_job_request(vars(parsed_args)) - _utils.get_image_uri.assert_called_with('us-west-2', 'seq2seq') + _utils.retrieve.assert_called_with('seq2seq', 'us-west-2') self.assertEqual(response['AlgorithmSpecification']['TrainingImage'], "seq2seq-url") def test_unknown_algorithm(self): @@ -268,14 +294,14 @@ def test_unknown_algorithm(self): parsed_args = self.parser.parse_args(known_algorithm_args) - # Patch get_image_uri - _utils.get_image_uri = MagicMock() - _utils.get_image_uri.return_value = "unknown-url" + # Patch retrieve + _utils.retrieve = MagicMock() + _utils.retrieve.return_value = "unknown-url" response = _utils.create_training_job_request(vars(parsed_args)) # Should just place the algorithm name in regardless - _utils.get_image_uri.assert_not_called() + _utils.retrieve.assert_not_called() self.assertEqual(response['AlgorithmSpecification']['AlgorithmName'], "unknown algorithm") def test_no_channels(self): @@ -342,6 +368,49 @@ def test_spot_bad_args(self): with self.assertRaises(Exception): _utils.create_training_job_request(vars(arg)) + def test_hook_min_args(self): + good_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{"S3OutputPath": "s3://fake-uri/"}']) + response = _utils.create_training_job_request(vars(good_args)) + self.assertEqual(response['DebugHookConfig']['S3OutputPath'], "s3://fake-uri/") + + def test_hook_max_args(self): + good_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{"S3OutputPath": "s3://fake-uri/", "LocalPath": 
"/local/path/", "HookParameters": {"key": "value"}, "CollectionConfigurations": [{"CollectionName": "collection1", "CollectionParameters": {"key1": "value1"}}, {"CollectionName": "collection2", "CollectionParameters": {"key2": "value2", "key3": "value3"}}]}']) + response = _utils.create_training_job_request(vars(good_args)) + self.assertEqual(response['DebugHookConfig']['S3OutputPath'], "s3://fake-uri/") + self.assertEqual(response['DebugHookConfig']['LocalPath'], "/local/path/") + self.assertEqual(response['DebugHookConfig']['HookParameters'], {"key": "value"}) + self.assertEqual(response['DebugHookConfig']['CollectionConfigurations'], [ + { + "CollectionName": "collection1", + "CollectionParameters": { + "key1": "value1" + } + }, { + "CollectionName": "collection2", + "CollectionParameters": { + "key2": "value2", + "key3": "value3" + } + } + ]) + + def test_rule_max_args(self): + good_args = self.parser.parse_args(required_args + ['--debug_rule_config', '[{"InstanceType": "ml.m4.xlarge", "LocalPath": "/local/path/", "RuleConfigurationName": "rule_name", "RuleEvaluatorImage": "test-image", "RuleParameters": {"key1": "value1"}, "S3OutputPath": "s3://fake-uri/", "VolumeSizeInGB": 1}]']) + response = _utils.create_training_job_request(vars(good_args)) + self.assertEqual(response['DebugRuleConfigurations'][0]['InstanceType'], 'ml.m4.xlarge') + self.assertEqual(response['DebugRuleConfigurations'][0]['LocalPath'], '/local/path/') + self.assertEqual(response['DebugRuleConfigurations'][0]['RuleConfigurationName'], 'rule_name') + self.assertEqual(response['DebugRuleConfigurations'][0]['RuleEvaluatorImage'], 'test-image') + self.assertEqual(response['DebugRuleConfigurations'][0]['RuleParameters'], {"key1": "value1"}) + self.assertEqual(response['DebugRuleConfigurations'][0]['S3OutputPath'], 's3://fake-uri/') + self.assertEqual(response['DebugRuleConfigurations'][0]['VolumeSizeInGB'], 1) + + def test_rule_min_good_args(self): + good_args = 
self.parser.parse_args(required_args + ['--debug_rule_config', '[{"RuleConfigurationName": "rule_name", "RuleEvaluatorImage": "test-image"}]']) + response = _utils.create_training_job_request(vars(good_args)) + self.assertEqual(response['DebugRuleConfigurations'][0]['RuleConfigurationName'], 'rule_name') + self.assertEqual(response['DebugRuleConfigurations'][0]['RuleEvaluatorImage'], 'test-image') + def test_spot_lesser_wait_time(self): args = self.parser.parse_args(required_args + ['--spot_instance', 'True', '--max_wait_time', '3599', '--checkpoint_config', '{"S3Uri": "s3://fake-uri/", "LocalPath": "local-path"}']) with self.assertRaises(Exception): diff --git a/components/aws/sagemaker/train/README.md b/components/aws/sagemaker/train/README.md index 37a1c502dad..4f80bb65be4 100644 --- a/components/aws/sagemaker/train/README.md +++ b/components/aws/sagemaker/train/README.md @@ -36,14 +36,61 @@ traffic_encryption | Encrypts all communications between ML compute instances in spot_instance | Use managed spot training if true | No | Boolean | False, True | False | max_wait_time | The maximum time in seconds you are willing to wait for a managed spot training job to complete | Yes | Int | ≤ 432000 (5 days) | 86400 (1 day) | checkpoint_config | Dictionary of information about the output location for managed spot training checkpoint data | Yes | Dict | | {} | +debug_hook_config | Dictionary of configuration information for the debug hook parameters, collection configurations, and storage paths | Yes | Dict | | {} | +debug_rule_config | List of configuration information for debugging rules. | Yes | List of Dicts | | [] | tags | Key-value pairs to categorize AWS resources | Yes | Dict | | {} | +Notes: +* Please use the links in the [Resources section](#Resources) for detailed information on each input parameter and SageMaker APIs used in this component. 
+* The value of `RuleEvaluatorImage` will depend on two things: the region and whether the rule is a built-in or a custom rule. [Debugger Registry URLs](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html) in the [Resources section](#Resources) will lead you to the documentation which outlines what the value of `RuleEvaluatorImage` will be. +* The format for the [`debug_hook_config`](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DebugHookConfig.html) field is: +``` +{ + "CollectionConfigurations": [ + { + 'CollectionName': 'string', + 'CollectionParameters': { + 'string' : 'string' + } + } + ], + 'HookParameters': { + 'string' : 'string' + }, + 'LocalPath': 'string', + 'S3OutputPath': 'string' +} +``` +* The format for the [`debug_rule_config`](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DebugRuleConfiguration.html) field is: +``` +[ + { + 'InstanceType': 'string', + 'LocalPath': 'string', + 'RuleConfigurationName': 'string', + 'RuleEvaluatorImage': 'string', + 'RuleParameters': { + 'string' : 'string' + }, + 'S3OutputPath': 'string', + 'VolumeSizeInGB': number + } +] +``` + ## Output Stores the Model in the s3 bucket you specified # Example code Simple example pipeline with only Train component : [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline) +Sample Pipeline for Training Component with Debugger: [sagemaker_debugger_demo](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/sagemaker_debugger_demo) # Resources * [Using Amazon built-in algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html) +* [Amazon SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) +* [Available Frameworks to Use Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html#debugger-supported-aws-containers) +* [Debugger Built-In 
Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html) +* [Debugger Custom Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-custom-rules.html) +* [Debugger Registry URLs](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html) +* [Debugger API Examples](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html) diff --git a/components/aws/sagemaker/train/component.yaml b/components/aws/sagemaker/train/component.yaml index 9baf9340ccc..80020bea53e 100644 --- a/components/aws/sagemaker/train/component.yaml +++ b/components/aws/sagemaker/train/component.yaml @@ -94,6 +94,14 @@ inputs: description: 'The endpoint URL for the private link VPC endpoint.' default: '' type: String + - name: debug_hook_config + description: 'Configuration information for the debug hook parameters, collection configuration, and storage paths.' + default: '{}' + type: JsonObject + - name: debug_rule_config + description: 'Configuration information for debugging rules.' + default: '[]' + type: JsonArray - name: assume_role description: 'The ARN of an IAM role to assume when connecting to SageMaker.' 
default: '' @@ -108,7 +116,7 @@ outputs: - {name: training_image, description: 'The registry path of the Docker image that contains the training algorithm'} implementation: container: - image: amazon/aws-sagemaker-kfp-components:0.7.0 + image: amazon/aws-sagemaker-kfp-components:0.8.0 command: ['python3'] args: [ train.py, @@ -134,6 +142,8 @@ implementation: --vpc_subnets, {inputValue: vpc_subnets}, --network_isolation, {inputValue: network_isolation}, --traffic_encryption, {inputValue: traffic_encryption}, + --debug_hook_config, {inputValue: debug_hook_config}, + --debug_rule_config, {inputValue: debug_rule_config}, --spot_instance, {inputValue: spot_instance}, --max_wait_time, {inputValue: max_wait_time}, --checkpoint_config, {inputValue: checkpoint_config}, @@ -141,4 +151,4 @@ implementation: --model_artifact_url_output_path, {outputPath: model_artifact_url}, --job_name_output_path, {outputPath: job_name}, --training_image_output_path, {outputPath: training_image} - ] \ No newline at end of file + ] diff --git a/components/aws/sagemaker/train/src/train.py b/components/aws/sagemaker/train/src/train.py index 2dd333b362d..2095140110e 100644 --- a/components/aws/sagemaker/train/src/train.py +++ b/components/aws/sagemaker/train/src/train.py @@ -40,6 +40,8 @@ def create_parser(): parser.add_argument('--vpc_subnets', type=str, required=False, help='The ID of the subnets in the VPC to which you want to connect your hpo job.') parser.add_argument('--network_isolation', type=_utils.str_to_bool, required=False, help='Isolates the training container.', default=True) parser.add_argument('--traffic_encryption', type=_utils.str_to_bool, required=False, help='Encrypts all communications between ML compute instances in distributed training.', default=False) + parser.add_argument('--debug_hook_config', type=_utils.yaml_or_json_str, required=False, help='Configuration information for the debug hook parameters, collection configuration, and storage paths.', default={}) + 
parser.add_argument('--debug_rule_config', type=_utils.yaml_or_json_str, required=False, help='Configuration information for debugging rules.', default=[]) ### Start spot instance support parser.add_argument('--spot_instance', type=_utils.str_to_bool, required=False, help='Use managed spot training.', default=False) @@ -68,13 +70,15 @@ def main(argv=None): job_name = _utils.create_training_job(client, vars(args)) def signal_term_handler(signalNumber, frame): - _utils.stop_training_job(client, job_name) - logging.info(f"Training Job: {job_name} request submitted to Stop") + job_stopped = _utils.stop_training_job(client, job_name) + if job_stopped: + logging.info(f"Training Job: {job_stopped} request submitted to Stop") signal.signal(signal.SIGTERM, signal_term_handler) logging.info('Job request submitted. Waiting for completion...') try: _utils.wait_for_training_job(client, job_name) + _utils.wait_for_debug_rules(client, job_name) except: raise finally: diff --git a/components/aws/sagemaker/workteam/component.yaml b/components/aws/sagemaker/workteam/component.yaml index 4afa852a68b..7d3c0468df7 100644 --- a/components/aws/sagemaker/workteam/component.yaml +++ b/components/aws/sagemaker/workteam/component.yaml @@ -40,7 +40,7 @@ outputs: - {name: workteam_arn, description: 'The ARN of the workteam.'} implementation: container: - image: amazon/aws-sagemaker-kfp-components:0.7.0 + image: amazon/aws-sagemaker-kfp-components:0.8.0 command: ['python3'] args: [ workteam.py, @@ -55,4 +55,4 @@ implementation: --sns_topic, {inputValue: sns_topic}, --tags, {inputValue: tags}, --workteam_arn_output_path, {outputPath: workteam_arn} - ] \ No newline at end of file + ] diff --git a/samples/contrib/aws-samples/sagemaker_debugger_demo/README.md b/samples/contrib/aws-samples/sagemaker_debugger_demo/README.md new file mode 100644 index 00000000000..b3d0d0e359c --- /dev/null +++ b/samples/contrib/aws-samples/sagemaker_debugger_demo/README.md @@ -0,0 +1,56 @@ +# Sample Pipeline for 
Training Component with Debugger + +The `debugger-training-pipeline.py` sample creates a pipeline consisting of only a training component. In that component we are using the XGBoost algorithm but with poor hyperparameter choices. By enabling debugger rules and hooks, we can quickly learn that the model produced has issues. + +## Prerequisites + +This pipeline uses the exact same setup as [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline). For the purposes of this demonstration, all resources will be created in the `us-east-1` region. + +## Steps +1. Compile the pipeline: + `dsl-compile --py debugger-training-pipeline.py --output debugger-training-pipeline.tar.gz` +2. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file), fill in the necessary run parameters, and click create run. +3. Once the pipeline has finished running, you can view the results of each debugger rule under 'Logs'. + +Input format for `debug_hook_config` and `debug_rule_config`: +```buildoutcfg +debug_hook_config = { + "S3OutputPath": "s3://<your_bucket_name>/path/for/data/emission/", + "LocalPath": "/local/path/for/data/emission/", + "CollectionConfigurations": [ + { + "CollectionName": "losses", + "CollectionParameters": { + "start_step": "25", + "end_step": "150" + } + }, { + "CollectionName": "gradient", + "CollectionParameters": { + "start_step": "5", + "end_step": "100" + } + } + ], + "HookParameters": { + "save_interval": "10" + } +} + +debug_rule_config = [{ + "RuleConfigurationName": "rule_name", + "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest", + "RuleParameters": { + "rule_to_invoke": "VanishingGradient", + "threshold": "0.01" + } +}] +``` + +# Resources +* [Amazon SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) +* [Available Frameworks to Use 
Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html#debugger-supported-aws-containers) +* [Debugger Built-In Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html) +* [Debugger Custom Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-custom-rules.html) +* [Debugger API Examples](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html) + diff --git a/samples/contrib/aws-samples/sagemaker_debugger_demo/debugger-training-pipeline.py b/samples/contrib/aws-samples/sagemaker_debugger_demo/debugger-training-pipeline.py new file mode 100644 index 00000000000..5b83c0ad3ff --- /dev/null +++ b/samples/contrib/aws-samples/sagemaker_debugger_demo/debugger-training-pipeline.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +import kfp +import json +import os +import copy +from kfp import components +from kfp import dsl + + +cur_file_dir = os.path.dirname(__file__) +components_dir = os.path.join(cur_file_dir, '../../../../components/aws/sagemaker/') + +sagemaker_train_op = components.load_component_from_file(components_dir + '/train/component.yaml') + +def training_input(input_name, s3_uri, content_type): + return { + "ChannelName": input_name, + "DataSource": {"S3DataSource": {"S3Uri": s3_uri, "S3DataType": "S3Prefix"}}, + "ContentType": content_type + } + + +def training_debug_hook(s3_uri, collection_dict): + return { + 'S3OutputPath': s3_uri, + 'CollectionConfigurations': format_collection_config(collection_dict) + } + + +def format_collection_config(collection_dict): + output = [] + for key, val in collection_dict.items(): + output.append({'CollectionName': key, 'CollectionParameters': val}) + return output + + +def training_debug_rules(rule_name, parameters): + return { + 'RuleConfigurationName': rule_name, + 'RuleEvaluatorImage': '503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest', + 'RuleParameters': parameters + } + + +collections = { + 
'feature_importance' : { + 'save_interval': '5' + }, + 'losses' : { + 'save_interval': '10' + }, + 'average_shap': { + 'save_interval': '5' + }, + 'metrics': { + 'save_interval': '3' + } +} + + +bad_hyperparameters = { + 'max_depth': '5', + 'eta': '0', + 'gamma': '4', + 'min_child_weight': '6', + 'silent': '0', + 'subsample': '0.7', + 'num_round': '50' +} + + +@dsl.pipeline( + name='XGBoost Training Pipeline with bad hyperparameters', + description='SageMaker training job test with debugger' +) +def training(role_arn="", bucket_name="my-bucket"): + train_channels = [ + training_input("train", f"s3://{bucket_name}/mnist_kmeans_example/input/valid_data.csv", 'text/csv') + ] + train_debug_rules = [ + training_debug_rules("LossNotDecreasing", {"rule_to_invoke": "LossNotDecreasing", "tensor_regex": ".*"}), + training_debug_rules("Overtraining", {'rule_to_invoke': 'Overtraining', 'patience_train': '10', 'patience_validation': '20'}), + ] + training = sagemaker_train_op( + region='us-east-1', + image='683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3', + hyperparameters=bad_hyperparameters, + channels=train_channels, + instance_type='ml.m5.2xlarge', + model_artifact_path=f's3://{bucket_name}/mnist_kmeans_example/output/model', + debug_hook_config=training_debug_hook(f's3://{bucket_name}/mnist_kmeans_example/hook_config', collections), + debug_rule_config=train_debug_rules, + role=role_arn, + ) + + +if __name__ == '__main__': + kfp.compiler.Compiler().compile(training, __file__ + '.zip') diff --git a/samples/contrib/aws-samples/simple_train_pipeline/README.md b/samples/contrib/aws-samples/simple_train_pipeline/README.md index a12bb43a1ca..f4b9c1fb1e6 100644 --- a/samples/contrib/aws-samples/simple_train_pipeline/README.md +++ b/samples/contrib/aws-samples/simple_train_pipeline/README.md @@ -71,7 +71,7 @@ Run this file with the follow command: `python3 s3_sample_data_creator.py` 1. 
Compile the pipeline: `dsl-compile --py training-pipeline.py --output training-pipeline.tar.gz` 2. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run. -3. Once the pipeline completes, you can see the outputs under 'Output parameters' in the HPO component's Input/Output section. +3. Once the pipeline completes, you can see the outputs under 'Output parameters' in the Training component's Input/Output section. Example inputs to this pipeline : ```buildoutcfg @@ -118,4 +118,4 @@ role : Paste the role ARN that you noted down # Resources -* [Using Amazon built-in algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html) \ No newline at end of file +* [Using Amazon built-in algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html)