Skip to content

Commit

Permalink
[AWS SageMaker] Print SageMaker job logs in kfp UI (#3954)
Browse files Browse the repository at this point in the history
* Print logs for AWS SM Components on KFP UI

* address comments

* update version number to 0.5.0

* update yaml to version 0.5.0

* update changelog
  • Loading branch information
akartsky authored Jun 19, 2020
1 parent 6698fe7 commit b3d8e04
Show file tree
Hide file tree
Showing 14 changed files with 66 additions and 12 deletions.
5 changes: 5 additions & 0 deletions components/aws/sagemaker/Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ The version of the AWS SageMaker Components is determined by the docker image ta
Repository: https://hub.docker.com/repository/docker/amazon/aws-sagemaker-kfp-components

---------------------------------------------
**Change log for version 0.5.0**
- Print SageMaker logs in KFP UI for Train, Transform and Process components

> Pull requests : [#3954](https://github.com/kubeflow/pipelines/pull/3954)
**Change log for version 0.4.1**
- Fix breaking bug in HPO component

Expand Down
2 changes: 1 addition & 1 deletion components/aws/sagemaker/THIRD-PARTY-LICENSES.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
** Amazon SageMaker Components for Kubeflow Pipelines; version 0.4.1 --
** Amazon SageMaker Components for Kubeflow Pipelines; version 0.5.0 --
https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker
Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
** boto3; version 1.12.33 -- https://github.com/boto/boto3/
Expand Down
2 changes: 1 addition & 1 deletion components/aws/sagemaker/batch_transform/component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ outputs:
- {name: output_location, description: 'S3 URI of the transform job results.'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
batch_transform.py,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,14 @@ def main(argv=None):
logging.info('Submitting Batch Transformation request to SageMaker...')
batch_job_name = _utils.create_transform_job(client, vars(args))
logging.info('Batch Job request submitted. Waiting for completion...')
_utils.wait_for_transform_job(client, batch_job_name)

try:
_utils.wait_for_transform_job(client, batch_job_name)
except:
raise
finally:
cw_client = _utils.get_cloudwatch_client(args.region)
_utils.print_logs_for_job(cw_client, '/aws/sagemaker/TransformJobs', batch_job_name)

Path(args.output_location_file).parent.mkdir(parents=True, exist_ok=True)
with open(args.output_location_file, 'w') as f:
Expand Down
29 changes: 29 additions & 0 deletions components/aws/sagemaker/common/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,30 @@ def get_component_version():
return component_version


def print_logs_for_job(cw_client, log_grp, job_name):
    """Prints the CloudWatch logs of a SageMaker job through ``logging.info``.

    Every log stream in ``log_grp`` whose name starts with ``job_name + '/'``
    is emitted from its oldest event to its newest. Both the stream listing
    and the event listing follow their pagination tokens, so jobs with many
    streams or long logs are printed in full rather than cut off after the
    first page.

    This is a best-effort helper: any exception raised while fetching logs is
    reported with ``logging.error`` and swallowed, so a caller that invokes it
    from a ``finally`` block does not have its own exception replaced.

    Args:
        cw_client: CloudWatch Logs client providing ``describe_log_streams``
            and ``get_log_events``.
        log_grp: Name of the log group to read from.
        job_name: Job name; used as the log stream name prefix.

    Returns:
        None.
    """
    try:
        logging.info('\n******************** CloudWatch logs for {} {} ********************\n'.format(log_grp, job_name))

        # describe_log_streams is paginated: keep requesting pages until the
        # response no longer carries a nextToken.
        stream_names = []
        describe_kwargs = {
            'logGroupName': log_grp,
            'logStreamNamePrefix': job_name + '/',
        }
        while True:
            page = cw_client.describe_log_streams(**describe_kwargs)
            stream_names.extend(stream['logStreamName'] for stream in page['logStreams'])
            next_token = page.get('nextToken')
            if not next_token:
                break
            describe_kwargs['nextToken'] = next_token

        for stream_name in stream_names:
            logging.info('\n***** {} *****\n'.format(stream_name))

            # startFromHead makes the first page the oldest events, so that
            # following nextForwardToken walks the stream in order.
            events_kwargs = {
                'logGroupName': log_grp,
                'logStreamName': stream_name,
                'startFromHead': True,
            }
            while True:
                response = cw_client.get_log_events(**events_kwargs)
                for event in response['events']:
                    logging.info(event['message'])

                # The end of the stream is signalled by getting back the same
                # token that was sent (or no token at all).
                forward_token = response.get('nextForwardToken')
                if not forward_token or forward_token == events_kwargs.get('nextToken'):
                    break
                events_kwargs['nextToken'] = forward_token

        logging.info('\n******************** End of CloudWatch logs for {} {} ********************\n'.format(log_grp, job_name))
    except Exception as e:
        # Deliberately swallowed: failing to fetch logs must not fail the job.
        logging.error(e)


def get_sagemaker_client(region, endpoint_url=None):
"""Builds a client to the AWS SageMaker API."""
session_config = botocore.config.Config(
Expand All @@ -92,6 +116,11 @@ def get_sagemaker_client(region, endpoint_url=None):
return client


def get_cloudwatch_client(region):
    """Builds a client to the AWS CloudWatch Logs API for the given region."""
    return boto3.client('logs', region_name=region)


def create_training_job_request(args):
### Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_training_job
with open(os.path.join(__cwd__, 'train.template.yaml'), 'r') as f:
Expand Down
2 changes: 1 addition & 1 deletion components/aws/sagemaker/deploy/component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ outputs:
- {name: endpoint_name, description: 'Endpoint name'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
deploy.py,
Expand Down
2 changes: 1 addition & 1 deletion components/aws/sagemaker/ground_truth/component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ outputs:
- {name: active_learning_model_arn, description: 'The ARN for the most recent Amazon SageMaker model trained as part of automated data labeling.'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
ground_truth.py,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ outputs:
description: 'The registry path of the Docker image that contains the training algorithm'
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
hyperparameter_tuning.py,
Expand Down
2 changes: 1 addition & 1 deletion components/aws/sagemaker/model/component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ outputs:
- {name: model_name, description: 'The model name Sagemaker created'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
create_model.py,
Expand Down
2 changes: 1 addition & 1 deletion components/aws/sagemaker/process/component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ outputs:
- {name: output_artifacts, description: 'A dictionary containing the output S3 artifacts'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
process.py,
Expand Down
9 changes: 8 additions & 1 deletion components/aws/sagemaker/process/src/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,14 @@ def main(argv=None):
logging.info('Submitting Processing Job to SageMaker...')
job_name = _utils.create_processing_job(client, vars(args))
logging.info('Job request submitted. Waiting for completion...')
_utils.wait_for_processing_job(client, job_name)

try:
_utils.wait_for_processing_job(client, job_name)
except:
raise
finally:
cw_client = _utils.get_cloudwatch_client(args.region)
_utils.print_logs_for_job(cw_client, '/aws/sagemaker/ProcessingJobs', job_name)

outputs = _utils.get_processing_job_outputs(client, job_name)

Expand Down
2 changes: 1 addition & 1 deletion components/aws/sagemaker/train/component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ outputs:
- {name: training_image, description: 'The registry path of the Docker image that contains the training algorithm'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
train.py,
Expand Down
8 changes: 7 additions & 1 deletion components/aws/sagemaker/train/src/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,13 @@ def main(argv=None):
logging.info('Submitting Training Job to SageMaker...')
job_name = _utils.create_training_job(client, vars(args))
logging.info('Job request submitted. Waiting for completion...')
_utils.wait_for_training_job(client, job_name)
try:
_utils.wait_for_training_job(client, job_name)
except:
raise
finally:
cw_client = _utils.get_cloudwatch_client(args.region)
_utils.print_logs_for_job(cw_client, '/aws/sagemaker/TrainingJobs', job_name)

image = _utils.get_image_from_job(client, job_name)
model_artifact_url = _utils.get_model_artifacts_from_job(client, job_name)
Expand Down
2 changes: 1 addition & 1 deletion components/aws/sagemaker/workteam/component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ outputs:
- {name: workteam_arn, description: 'The ARN of the workteam.'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
workteam.py,
Expand Down

0 comments on commit b3d8e04

Please sign in to comment.