SDK - Components - Calculate component hash digest (#3726)
* SDK - Components - Calculate component hash digest

The digest is calculated when loading the component from a URL, file, or text.
Slightly refactored component loading - streams are no longer used, only bytes.
TODO: Calculate the digest if missing
TODO: Report possible digest conflicts

* Updated the test graph component

* Using the actual digest in the test
Ark-kun committed May 13, 2020
1 parent bd4be88 commit fe30d54
Showing 3 changed files with 40 additions and 19 deletions.
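
After this change, every loading path (URL, file, or text) ends up in _load_component_spec_from_component_text, which hashes the raw component text or bytes with SHA-256 and stores the result on the component spec; the task factory then copies that value onto the ComponentReference. A minimal sketch of the resulting behavior, assuming the post-commit SDK and the public kfp.components loaders:

import hashlib

import kfp.components as comp

component_text = '''\
implementation:
  container:
    image: busybox
'''

# Loading a component now computes its digest as a side effect.
task_factory = comp.load_component_from_text(component_text)
task = task_factory()

# The stored digest is simply SHA-256 over the raw component text/bytes.
expected_digest = hashlib.sha256(component_text.encode('utf-8')).hexdigest()
assert task.component_ref.digest == expected_digest
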
42 changes: 23 additions & 19 deletions sdk/python/kfp/components/_components.py
@@ -129,7 +129,7 @@ def _fix_component_uri(uri: str) -> str:

def _load_component_spec_from_file(path) -> ComponentSpec:
with open(path, 'rb') as component_stream:
-        return _load_component_spec_from_yaml_or_zip_stream(component_stream)
+        return _load_component_spec_from_yaml_or_zip_bytes(component_stream.read())


def _load_component_spec_from_url(url: str):
@@ -148,33 +148,30 @@ def _load_component_spec_from_url(url: str):


def _load_component_spec_from_yaml_or_zip_bytes(data: bytes):
-    import io
-    component_stream = io.BytesIO(data)
-    return _load_component_spec_from_yaml_or_zip_stream(component_stream)


-def _load_component_spec_from_yaml_or_zip_stream(stream) -> ComponentSpec:
-    '''Loads component spec from a stream.
+    '''Loads component spec from binary data.
-    The stream can be YAML or a zip file with a component.yaml file inside.
+    The data can be a YAML file or a zip file with a component.yaml file inside.
    '''
    import zipfile
-    stream.seek(0)
+    import io
+    stream = io.BytesIO(data)
    if zipfile.is_zipfile(stream):
        stream.seek(0)
        with zipfile.ZipFile(stream) as zip_obj:
-            with zip_obj.open(_COMPONENT_FILE_NAME_IN_ARCHIVE) as component_stream:
-                return _load_component_spec_from_component_text(
-                    text_or_file=component_stream,
-                )
-    else:
-        stream.seek(0)
-        return _load_component_spec_from_component_text(stream)
+            data = zip_obj.read(_COMPONENT_FILE_NAME_IN_ARCHIVE)
+    return _load_component_spec_from_component_text(data)


-def _load_component_spec_from_component_text(text_or_file) -> ComponentSpec:
-    component_dict = load_yaml(text_or_file)
+def _load_component_spec_from_component_text(text) -> ComponentSpec:
+    component_dict = load_yaml(text)
    component_spec = ComponentSpec.from_dict(component_dict)

+    # Calculating hash digest for the component
+    import hashlib
+    data = text if isinstance(text, bytes) else text.encode('utf-8')
+    digest = hashlib.sha256(data).hexdigest()
+    component_spec._digest = digest

return component_spec
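
Because the hash is taken over the exact bytes handed to _load_component_spec_from_component_text, a plain component.yaml hashes to the SHA-256 of its raw file contents, while for a .zip archive the SDK hashes the extracted component.yaml bytes (what zip_obj.read(...) returns above), not the archive itself. A standalone sketch of the equivalent calculation for a file on disk; the helper name is illustrative and not part of the SDK:

import hashlib
import io
import zipfile

def component_digest_from_file(path: str) -> str:
    # Mirror the SDK's post-commit hashing behavior for a component file.
    with open(path, 'rb') as f:
        data = f.read()
    stream = io.BytesIO(data)
    if zipfile.is_zipfile(stream):
        # For archives, hash the inner component file; the member name is
        # assumed to match _COMPONENT_FILE_NAME_IN_ARCHIVE ('component.yaml').
        with zipfile.ZipFile(stream) as zip_obj:
            data = zip_obj.read('component.yaml')
    return hashlib.sha256(data).hexdigest()
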


@@ -287,6 +284,13 @@ def _create_task_factory_from_component_spec(component_spec:ComponentSpec, compo
component_ref = ComponentReference(spec=component_spec, url=component_filename)
else:
component_ref.spec = component_spec

+    digest = getattr(component_spec, '_digest', None)
+    # TODO: Calculate the digest if missing
+    if digest:
+        # TODO: Report possible digest conflicts
+        component_ref.digest = digest


def create_task_from_component_and_arguments(pythonic_arguments):
arguments = {
12 changes: 12 additions & 0 deletions sdk/python/tests/components/test_components.py
@@ -87,6 +87,18 @@ def test_loading_minimal_component(self):

self.assertEqual(task_factory1.component_spec.implementation.container.image, component_dict['implementation']['container']['image'])

+    def test_digest_of_loaded_component(self):
+        component_text = textwrap.dedent('''\
+            implementation:
+              container:
+                image: busybox
+            '''
+        )
+        task_factory1 = comp.load_component_from_text(component_text)
+        task1 = task_factory1()

+        self.assertEqual(task1.component_ref.digest, '1ede211233e869581d098673962c2c1e8c1e4cebb7cf5d7332c2f73cb4900823')

def test_accessing_component_spec_from_task_factory(self):
component_text = '''\
implementation:
5 changes: 5 additions & 0 deletions (test graph component YAML)
@@ -38,6 +38,7 @@ implementation:
tasks:
Automl create dataset for tables:
componentRef:
+            digest: 98381958ba8b0d2b83a23a78f482f08b48e665409820b3a6254bccdbcf206df3
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_dataset_for_tables/component.yaml
arguments:
gcp_project_id:
@@ -51,6 +52,7 @@ implementation:
inputName: dataset_display_name
Automl import data from bigquery:
componentRef:
+            digest: a965621525a9081a8c7d4c12806bf4359a03b9842a7d3e891ab5b48422dbe527
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/import_data_from_bigquery/component.yaml
arguments:
dataset_path:
@@ -63,6 +65,7 @@ implementation:
inputName: dataset_bq_input_uri
Automl split dataset table column names:
componentRef:
+            digest: a77ef9ecb87e543290a02b3fa933bcd5e67947a12f6d011fdd37bf38c1b26ade
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/split_dataset_table_column_names/component.yaml
arguments:
dataset_path:
@@ -76,6 +79,7 @@ implementation:
table_index: '0'
Automl create model for tables:
componentRef:
+            digest: e52ee882685380988ee2f4de6beacdcd0d2ab21d37bef45c4e16a20a224d374e
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_model_for_tables/component.yaml
arguments:
gcp_project_id:
@@ -108,6 +112,7 @@ implementation:
inputName: train_budget_milli_node_hours
Automl prediction service batch predict:
componentRef:
+            digest: 908ea1855f5aa3d35f60145f0f15007ea437b35b0a1be2fd1d0db5a76221cad1
url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/prediction_service_batch_predict/component.yaml
arguments:
model_path:
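
The "Report possible digest conflicts" TODO is not implemented in this commit, but the digests pinned in the graph component above make such a check possible: re-download the referenced component.yaml and compare its SHA-256 with the recorded value. A hypothetical sketch, not part of this commit (the function name and the use of the requests library are assumptions):

import hashlib

import requests

def check_component_digest(url: str, expected_digest: str) -> bool:
    # Hash the downloaded component bytes and compare them against the
    # digest pinned in the graph component's componentRef entry.
    data = requests.get(url).content
    return hashlib.sha256(data).hexdigest() == expected_digest

# Example with the first componentRef from the updated test graph:
url = 'https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_dataset_for_tables/component.yaml'
print(check_component_digest(url, '98381958ba8b0d2b83a23a78f482f08b48e665409820b3a6254bccdbcf206df3'))
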