# Kubeflow Pipelines component: generate TensorFlow Data Validation (TFDV)
# statistics from a CSV dataset, either locally (DirectRunner) or on
# Google Cloud Dataflow when `use_dataflow` == 'true'.
name: Generate tfdv stats
inputs:
- {name: input_data, type: String}
- {name: output_path, type: String}
- {name: job_name, type: String}
- {name: use_dataflow, type: String}
- {name: project_id, type: String}
- {name: region, type: String}
- {name: gcs_temp_location, type: String}
- {name: gcs_staging_location, type: String}
- {name: whl_location, type: String, default: '', optional: true}
- {name: requirements_file, type: String, default: requirements.txt, optional: true}
outputs:
- {name: stats_path, type: String}
implementation:
  container:
    image: gcr.io/google-samples/tfdv-tests:v1
    command:
    - sh
    - -ec
    - |
      program_path=$(mktemp)
      printf "%s" "$0" > "$program_path"
      python3 -u "$program_path" "$@"
    - |
      def generate_tfdv_stats(input_data, output_path, job_name, use_dataflow,
          project_id, region, gcs_temp_location, gcs_staging_location,
          whl_location = '', requirements_file = 'requirements.txt'
          ):
          """Generate TFDV statistics for a CSV dataset.

          Args:
            input_data: GCS/local path of the input CSV data.
            output_path: where the serialized statistics are written.
            job_name: base Dataflow job name (a timestamp suffix is added).
            use_dataflow: the string 'true' selects the DataflowRunner;
              anything else runs the Beam pipeline locally.
            project_id, region, gcs_temp_location, gcs_staging_location:
              Dataflow settings; only used when use_dataflow == 'true'.
            whl_location: path to a tfdv wheel, required for Dataflow so the
              workers can install tfdv as an extra package.
              # pip download tensorflow_data_validation --no-deps --platform manylinux2010_x86_64 --only-binary=:all:
              # CHANGE this if your download resulted in a different filename.
            requirements_file: pip requirements file shipped to Dataflow workers.

          Returns:
            A one-tuple containing the statistics output path.
          """
          import logging
          import sys
          import time

          import tensorflow_data_validation as tfdv
          import tensorflow_data_validation.statistics.stats_impl
          from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions

          logging.getLogger().setLevel(logging.INFO)
          logging.info("output path: %s", output_path)
          logging.info("Building pipeline options")
          # Create and set your PipelineOptions.
          options = PipelineOptions()

          if use_dataflow == 'true':
              logging.info("using Dataflow")
              if not whl_location:
                  # Without the wheel the Dataflow workers cannot install
                  # tfdv, so fail fast with a nonzero exit code.
                  logging.warning('tfdv whl file required with dataflow runner.')
                  sys.exit(1)
              # For Cloud execution, set the Cloud Platform project, job_name,
              # staging location, temp_location and specify DataflowRunner.
              google_cloud_options = options.view_as(GoogleCloudOptions)
              google_cloud_options.project = project_id
              # Timestamp suffix keeps repeated job names unique.
              google_cloud_options.job_name = '{}-{}'.format(job_name, str(int(time.time())))
              google_cloud_options.staging_location = gcs_staging_location
              google_cloud_options.temp_location = gcs_temp_location
              google_cloud_options.region = region
              options.view_as(StandardOptions).runner = 'DataflowRunner'

              setup_options = options.view_as(SetupOptions)
              # whl_location should point to the downloaded tfdv wheel file.
              setup_options.extra_packages = [whl_location]
              # BUGFIX: honor the requirements_file input; previously this
              # was hard-coded to 'requirements.txt', silently ignoring the
              # caller-supplied value (the default is unchanged, so existing
              # callers see identical behavior).
              setup_options.requirements_file = requirements_file

          tfdv.generate_statistics_from_csv(
              data_location=input_data, output_path=output_path,
              pipeline_options=options)

          return (output_path, )

      def _serialize_str(str_value: str) -> str:
          if not isinstance(str_value, str):
              raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
          return str_value

      import argparse
      _parser = argparse.ArgumentParser(prog='Generate tfdv stats', description='')
      _parser.add_argument("--input-data", dest="input_data", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--output-path", dest="output_path", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--job-name", dest="job_name", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--use-dataflow", dest="use_dataflow", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--project-id", dest="project_id", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--region", dest="region", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--gcs-temp-location", dest="gcs_temp_location", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--gcs-staging-location", dest="gcs_staging_location", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--whl-location", dest="whl_location", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--requirements-file", dest="requirements_file", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
      _parsed_args = vars(_parser.parse_args())
      _output_files = _parsed_args.pop("_output_paths", [])

      _outputs = generate_tfdv_stats(**_parsed_args)

      _output_serializers = [
          _serialize_str,

      ]

      import os
      for idx, output_file in enumerate(_output_files):
          try:
              os.makedirs(os.path.dirname(output_file))
          except OSError:
              pass
          with open(output_file, 'w') as f:
              f.write(_output_serializers[idx](_outputs[idx]))
    args:
    - --input-data
    - {inputValue: input_data}
    - --output-path
    - {inputValue: output_path}
    - --job-name
    - {inputValue: job_name}
    - --use-dataflow
    - {inputValue: use_dataflow}
    - --project-id
    - {inputValue: project_id}
    - --region
    - {inputValue: region}
    - --gcs-temp-location
    - {inputValue: gcs_temp_location}
    - --gcs-staging-location
    - {inputValue: gcs_staging_location}
    - if:
        cond: {isPresent: whl_location}
        then:
        - --whl-location
        - {inputValue: whl_location}
    - if:
        cond: {isPresent: requirements_file}
        then:
        - --requirements-file
        - {inputValue: requirements_file}
    - '----output-paths'
    - {outputPath: stats_path}
0 commit comments