SDK - Components - Calculate component hash digest (#3726)
* SDK - Components - Calculate component hash digest

The digest is calculated when loading the component from a URL, file, or text.
Slightly refactored component loading - streams are no longer used, only bytes.
TODO: Calculate the digest if missing
TODO: Report possible digest conflicts

* Updated the test graph component

* Using the actual digest in the test
Ark-kun committed May 13, 2020
1 parent bd4be88 commit fe30d54
Showing 3 changed files with 40 additions and 19 deletions.
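
After this change, every loading path (URL, file, or text) ends up in _load_component_spec_from_component_text, which hashes the raw component text or bytes with SHA-256 and stores the result on the component spec; the task factory then copies that value onto the ComponentReference. A minimal sketch of the resulting behavior, assuming the post-commit SDK and the public kfp.components loaders:

import hashlib

import kfp.components as comp

component_text = '''\
implementation:
  container:
    image: busybox
'''

# Loading a component now computes its digest as a side effect.
task_factory = comp.load_component_from_text(component_text)
task = task_factory()

# The stored digest is simply SHA-256 over the raw component text/bytes.
expected_digest = hashlib.sha256(component_text.encode('utf-8')).hexdigest()
assert task.component_ref.digest == expected_digest
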
42 changes: 23 additions & 19 deletions sdk/python/kfp/components/_components.py
@@ -129,7 +129,7 @@ def _fix_component_uri(uri: str) -> str:

def _load_component_spec_from_file(path) -> ComponentSpec:
with open(path, 'rb') as component_stream:
-        return _load_component_spec_from_yaml_or_zip_stream(component_stream)
+        return _load_component_spec_from_yaml_or_zip_bytes(component_stream.read())


def _load_component_spec_from_url(url: str):
@@ -148,33 +148,30 @@ def _load_component_spec_from_url(url: str):


def _load_component_spec_from_yaml_or_zip_bytes(data: bytes):
-    import io
-    component_stream = io.BytesIO(data)
-    return _load_component_spec_from_yaml_or_zip_stream(component_stream)


-def _load_component_spec_from_yaml_or_zip_stream(stream) -> ComponentSpec:
-    '''Loads component spec from a stream.
+    '''Loads component spec from binary data.
-    The stream can be YAML or a zip file with a component.yaml file inside.
+    The data can be a YAML file or a zip file with a component.yaml file inside.
    '''
    import zipfile
-    stream.seek(0)
+    import io
+    stream = io.BytesIO(data)
    if zipfile.is_zipfile(stream):
        stream.seek(0)
        with zipfile.ZipFile(stream) as zip_obj:
-            with zip_obj.open(_COMPONENT_FILE_NAME_IN_ARCHIVE) as component_stream:
-                return _load_component_spec_from_component_text(
-                    text_or_file=component_stream,
-                )
-    else:
-        stream.seek(0)
-        return _load_component_spec_from_component_text(stream)
+            data = zip_obj.read(_COMPONENT_FILE_NAME_IN_ARCHIVE)
+    return _load_component_spec_from_component_text(data)


-def _load_component_spec_from_component_text(text_or_file) -> ComponentSpec:
-    component_dict = load_yaml(text_or_file)
+def _load_component_spec_from_component_text(text) -> ComponentSpec:
+    component_dict = load_yaml(text)
    component_spec = ComponentSpec.from_dict(component_dict)

+    # Calculating hash digest for the component
+    import hashlib
+    data = text if isinstance(text, bytes) else text.encode('utf-8')
+    digest = hashlib.sha256(data).hexdigest()
+    component_spec._digest = digest

return component_spec
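
Because the hash is taken over the exact bytes handed to _load_component_spec_from_component_text, a plain component.yaml hashes to the SHA-256 of its raw file contents, while for a .zip archive the SDK hashes the extracted component.yaml bytes (what zip_obj.read(...) returns above), not the archive itself. A standalone sketch of the equivalent calculation for a file on disk; the helper name is illustrative and not part of the SDK:

import hashlib
import io
import zipfile

def component_digest_from_file(path: str) -> str:
    # Mirror the SDK's post-commit hashing behavior for a component file.
    with open(path, 'rb') as f:
        data = f.read()
    stream = io.BytesIO(data)
    if zipfile.is_zipfile(stream):
        # For archives, hash the inner component file; the member name is
        # assumed to match _COMPONENT_FILE_NAME_IN_ARCHIVE ('component.yaml').
        with zipfile.ZipFile(stream) as zip_obj:
            data = zip_obj.read('component.yaml')
    return hashlib.sha256(data).hexdigest()
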


@@ -287,6 +284,13 @@ def _create_task_factory_from_component_spec(component_spec:ComponentSpec, compo
component_ref = ComponentReference(spec=component_spec, url=component_filename)
else:
component_ref.spec = component_spec

+    digest = getattr(component_spec, '_digest', None)
+    # TODO: Calculate the digest if missing
+    if digest:
+        # TODO: Report possible digest conflicts
+        component_ref.digest = digest


def create_task_from_component_and_arguments(pythonic_arguments):
arguments = {
12 changes: 12 additions & 0 deletions sdk/python/tests/components/test_components.py
@@ -87,6 +87,18 @@ def test_loading_minimal_component(self):

self.assertEqual(task_factory1.component_spec.implementation.container.image, component_dict['implementation']['container']['image'])

+    def test_digest_of_loaded_component(self):
+        component_text = textwrap.dedent('''\
+            implementation:
+              container:
+                image: busybox
+            '''
+        )
+        task_factory1 = comp.load_component_from_text(component_text)
+        task1 = task_factory1()

+        self.assertEqual(task1.component_ref.digest, '1ede211233e869581d098673962c2c1e8c1e4cebb7cf5d7332c2f73cb4900823')

def test_accessing_component_spec_from_task_factory(self):
component_text = '''\
implementation:
5 changes: 5 additions & 0 deletions (test graph component YAML)
@@ -38,6 +38,7 @@ implementation:
tasks:
Automl create dataset for tables:
componentRef:
+            digest: 98381958ba8b0d2b83a23a78f482f08b48e665409820b3a6254bccdbcf206df3
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_dataset_for_tables/component.yaml
arguments:
gcp_project_id:
@@ -51,6 +52,7 @@ implementation:
inputName: dataset_display_name
Automl import data from bigquery:
componentRef:
+            digest: a965621525a9081a8c7d4c12806bf4359a03b9842a7d3e891ab5b48422dbe527
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/import_data_from_bigquery/component.yaml
arguments:
dataset_path:
@@ -63,6 +65,7 @@ implementation:
inputName: dataset_bq_input_uri
Automl split dataset table column names:
componentRef:
+            digest: a77ef9ecb87e543290a02b3fa933bcd5e67947a12f6d011fdd37bf38c1b26ade
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/split_dataset_table_column_names/component.yaml
arguments:
dataset_path:
@@ -76,6 +79,7 @@ implementation:
table_index: '0'
Automl create model for tables:
componentRef:
+            digest: e52ee882685380988ee2f4de6beacdcd0d2ab21d37bef45c4e16a20a224d374e
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_model_for_tables/component.yaml
arguments:
gcp_project_id:
@@ -108,6 +112,7 @@ implementation:
inputName: train_budget_milli_node_hours
Automl prediction service batch predict:
componentRef:
+            digest: 908ea1855f5aa3d35f60145f0f15007ea437b35b0a1be2fd1d0db5a76221cad1
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/prediction_service_batch_predict/component.yaml
arguments:
model_path:
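
The "Report possible digest conflicts" TODO is not implemented in this commit, but the digests pinned in the graph component above make such a check possible: re-download the referenced component.yaml and compare its SHA-256 with the recorded value. A hypothetical sketch, not part of this commit (the function name and the use of the requests library are assumptions):

import hashlib

import requests

def check_component_digest(url: str, expected_digest: str) -> bool:
    # Hash the downloaded component bytes and compare them against the
    # digest pinned in the graph component's componentRef entry.
    data = requests.get(url).content
    return hashlib.sha256(data).hexdigest() == expected_digest

# Example with the first componentRef from the updated test graph:
url = 'https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_dataset_for_tables/component.yaml'
print(check_component_digest(url, '98381958ba8b0d2b83a23a78f482f08b48e665409820b3a6254bccdbcf206df3'))
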