Commit 96cfd3d

small notebook update + some initial TFDV example code (#80)
1 parent 1c16bc8 commit 96cfd3d

File tree

4 files changed: +122 additions, -2 deletions
Dockerfile
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
# Copyright 2020 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-3:latest

ADD requirements.txt /
ADD tfdv.py /
# Pre-download the TFDV wheel so it can be handed to Dataflow workers as an extra package.
RUN pip download tensorflow_data_validation --no-deps --platform manylinux2010_x86_64 --only-binary=:all:
RUN pip install -U "apache-beam[gcp]"
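
For context on what this image packages: TFDV statistics generation can also be exercised directly in a local Python session, without the container or Dataflow. A minimal sketch (not part of this commit; the CSV path is a hypothetical placeholder, and it assumes tensorflow_data_validation is installed):

import tensorflow_data_validation as tfdv

# Generate statistics from a local CSV file (hypothetical path) with the direct runner.
stats = tfdv.generate_statistics_from_csv(data_location='data/train.csv')
# Infer a starter schema from the statistics and display it (in a notebook environment).
schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema)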
requirements.txt
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
ipython==7.16.1
ipython-genutils==0.2.0
tfdv.py
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
# Copyright 2020 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging


def generate_tfdv_stats(input_data: str, output_path: str, job_name: str, use_dataflow: bool,
                        project_id: str, region: str, gcs_temp_location: str, gcs_staging_location: str,
                        whl_location: str = '', requirements_file: str = 'requirements.txt'):
    # Imports live inside the function so that the KFP lightweight component
    # built from it below is self-contained.
    import logging
    import time

    import tensorflow_data_validation as tfdv
    import tensorflow_data_validation.statistics.stats_impl
    from apache_beam.options.pipeline_options import (
        PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions)

    # The Dataflow runner needs the TFDV wheel, downloaded with:
    # pip download tensorflow_data_validation --no-deps --platform manylinux2010_x86_64 --only-binary=:all:
    # CHANGE whl_location if your download resulted in a different filename.

    # Create and set your PipelineOptions.
    options = PipelineOptions()

    if use_dataflow:
        if not whl_location:
            logging.warning('tfdv whl file required with dataflow runner.')
            exit(1)
        # For Cloud execution, set the Cloud Platform project, job_name,
        # staging location, temp_location, and specify DataflowRunner.
        google_cloud_options = options.view_as(GoogleCloudOptions)
        google_cloud_options.project = project_id
        google_cloud_options.job_name = '{}-{}'.format(job_name, str(int(time.time())))
        google_cloud_options.staging_location = gcs_staging_location
        google_cloud_options.temp_location = gcs_temp_location
        google_cloud_options.region = region
        options.view_as(StandardOptions).runner = 'DataflowRunner'

        setup_options = options.view_as(SetupOptions)
        # whl_location should point to the downloaded tfdv wheel file.
        setup_options.extra_packages = [whl_location]
        setup_options.requirements_file = requirements_file

    tfdv.generate_statistics_from_csv(
        data_location=input_data, output_path=output_path,
        pipeline_options=options)


def main():

    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser(description='TFDV')

    parser.add_argument(
        '--project_id', default='aju-vtests2')
    parser.add_argument(
        '--region', default='us-central1')
    parser.add_argument(
        '--job_name', required=True)
    parser.add_argument(
        '--gcs-staging-location', required=True)
    parser.add_argument(
        '--gcs-temp-location', required=True)
    parser.add_argument(
        '--output-path', required=True)
    parser.add_argument(
        '--data-path', required=True)
    # TFDV whl required for the Dataflow runner. Download the whl file with this command:
    # pip download tensorflow_data_validation --no-deps --platform manylinux2010_x86_64 --only-binary=:all:
    parser.add_argument('--whl-location', default='')
    parser.add_argument('--requirements_file', default='requirements.txt')
    parser.add_argument('--use-dataflow', dest='use_dataflow', default=False,
                        help='Run on Dataflow', action='store_true')
    parser.add_argument('--local', dest='use_dataflow',
                        help='Run locally', action='store_false')
    args = parser.parse_args()

    generate_tfdv_stats(args.data_path, args.output_path, args.job_name, args.use_dataflow,
                        args.project_id, args.region, args.gcs_temp_location, args.gcs_staging_location,
                        args.whl_location, args.requirements_file)


if __name__ == '__main__':
    import kfp
    kfp.components.func_to_container_op(generate_tfdv_stats,
        output_component_file='../tfdv_component.yaml', base_image='gcr.io/aju-vtests2/tfdv-tests:v6')
    # main()
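
The func_to_container_op call above writes a reusable component spec to ../tfdv_component.yaml. For reference, here is a minimal sketch (not part of this commit) of loading that component into a KFP pipeline; the pipeline name, project ID, and GCS paths are hypothetical placeholders:

import kfp
import kfp.dsl as dsl

# Load the component spec generated by func_to_container_op.
tfdv_op = kfp.components.load_component_from_file('tfdv_component.yaml')

@dsl.pipeline(name='tfdv-stats', description='Generate TFDV statistics')
def tfdv_pipeline(
        input_data: str = 'gs://YOUR_BUCKET/data/train.csv',  # hypothetical path
        output_path: str = 'gs://YOUR_BUCKET/tfdv/stats.pb'):  # hypothetical path
    # Each keyword argument maps to a parameter of generate_tfdv_stats.
    tfdv_op(
        input_data=input_data, output_path=output_path,
        job_name='tfdv-stats', use_dataflow=False,
        project_id='YOUR_PROJECT', region='us-central1',
        gcs_temp_location='gs://YOUR_BUCKET/tmp',
        gcs_staging_location='gs://YOUR_BUCKET/staging',
        whl_location='', requirements_file='requirements.txt')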

ml/notebook_examples/caipp/kfp_in_a_notebook.ipynb
Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@
     "\n",
     "### Create an AI Platform Notebooks instance\n",
     "\n",
-    "If you're not doing so already, run this notebook on an AI Platform Notebook instance. See setup instructions [here](https://cloud.google.com/ai-platform/notebooks/docs). (It's possible to run the notebook using other Jupyter environments, but that requires some additional auth setup that we won't cover here). Once your notebook instance is set up, you should be able to use [this link](https://console.cloud.google.com/ai-platform/notebooks/deploy-notebook?name=KFP%20from%20a%20notebook&download_url=https%3A%2F%2Fgist.githubusercontent.com%2Famygdala%2F871642b656c53949a13365ec903a41c9%2Fraw%2Fedaf23d967aa6a8d0297cd2548e80dca8b9b6a36%2Fjupytercon_kfp.ipynb&url=https%3A%2F%2Fgist.github.com%2Famygdala%2F871642b656c53949a13365ec903a41c9) to upload the notebook.\n",
+    "If you're not doing so already, run this notebook on an AI Platform Notebook instance. See setup instructions [here](https://cloud.google.com/ai-platform/notebooks/docs). (It's possible to run the notebook using other Jupyter environments, but that requires some additional auth setup that we won't cover here). Once your notebook instance is set up, you should be able to use [this link](https://console.cloud.google.com/ai-platform/notebooks/deploy-notebook?name=KFP%20from%20a%20notebook&download_url=https%3A%2F%2Fraw.githubusercontent.com%2Famygdala%2Fcode-snippets%2Fmaster%2Fml%2Fnotebook_examples%2Fcaipp%2Fkfp_in_a_notebook.ipynb&url=https%3A%2F%2Fgithub.com%2Famygdala%2Fcode-snippets%2Fblob%2Fmaster%2Fml%2Fnotebook_examples%2Fcaipp%2Fkfp_in_a_notebook.ipynb) to upload the notebook.\n",
     "\n",
     "### Install AI Platform Pipelines\n",
     "\n",
@@ -787,7 +787,7 @@
     "name": "python",
     "nbconvert_exporter": "python",
     "pygments_lexer": "ipython3",
-    "version": "3.7.8"
+    "version": "3.6.8"
     }
   },
   "nbformat": 4,
