fix(components): Update GCP component container to Python 3.7. Fixes #4959 (#4960)

* Update GCP component container to Python 3.7, fix Dataflow client

* fix test

* add back the http patch
chensun authored Jan 7, 2021
1 parent 7540ba5 commit 8463992
Showing 8 changed files with 49 additions and 40 deletions.
10 changes: 6 additions & 4 deletions components/gcp/container/Dockerfile
@@ -1,4 +1,4 @@
# Copyright 2018 The Kubeflow Authors
# Copyright 2021 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,19 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM python:2.7-slim-jessie
FROM python:3.7-slim

RUN apt-get update && apt-get install -y --no-install-recommends \
wget patch \
&& rm -rf /var/lib/apt/lists/*

RUN pip2 install apache-beam[gcp]==2.10.0
RUN pip install apache-beam[gcp]
RUN pip install pandas

ADD build /ml
WORKDIR /ml
RUN pip install .

RUN patch /usr/local/lib/python2.7/site-packages/googleapiclient/http.py < /ml/patches/http.patch
# The patch sets the User Agent for telemetry purposes.
# It is based on "google-api-python-client==1.7.8", and needs to be updated when upgrading the package.
RUN patch /usr/local/lib/python3.7/site-packages/googleapiclient/http.py < /ml/patches/http.patch

ENTRYPOINT ["python", "-u", "-m", "kfp_component.launcher"]
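For reference, a similar telemetry User-Agent could be attached at the call site instead of patching http.py; a minimal sketch using googleapiclient's set_user_agent helper, where 'kfp-component/1.0' is a placeholder value, not what the actual patch writes:

import httplib2
from googleapiclient import discovery
from googleapiclient.http import set_user_agent

# Wrap the transport so every API request carries the custom User-Agent header.
# Credentials are omitted for brevity; 'kfp-component/1.0' is a made-up string.
http = set_user_agent(httplib2.Http(), 'kfp-component/1.0')
df = discovery.build('dataflow', 'v1b3', http=http, cache_discovery=False)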
components/gcp/container/component_sdk/python/kfp_component/google/dataflow/_client.py
@@ -1,4 +1,4 @@
# Copyright 2018 Google LLC
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,9 +17,9 @@

class DataflowClient:
def __init__(self):
self._df = discovery.build('dataflow', 'v1b3')
self._df = discovery.build('dataflow', 'v1b3', cache_discovery=False)

def launch_template(self, project_id, gcs_path, location,
validate_only, launch_parameters):
return self._df.projects().locations().templates().launch(
projectId = project_id,
@@ -47,7 +47,7 @@ def cancel_job(self, project_id, job_id, location):
}
).execute()

def list_aggregated_jobs(self, project_id, filter=None,
view=None, page_size=None, page_token=None, location=None):
return self._df.projects().jobs().aggregated(
projectId = project_id,
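Context on the cache_discovery=False change: googleapiclient's discovery-document cache depends on oauth2client's file cache, which is usually absent on newer Python 3 installs and produces warnings (or failures) when the client is built. A minimal sketch of building and exercising the updated client, with 'project-1' as a placeholder project ID and Application Default Credentials assumed:

from googleapiclient import discovery

# Skip the discovery-document file cache that requires oauth2client.
df = discovery.build('dataflow', 'v1b3', cache_discovery=False)
resp = df.projects().jobs().aggregated(projectId='project-1', pageSize=5).execute()
for job in resp.get('jobs', []):
    print(job['id'], job.get('currentState'))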
components/gcp/container/component_sdk/python/kfp_component/google/dataflow/_launch_python.py
@@ -1,4 +1,4 @@
# Copyright 2018 Google LLC
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -19,12 +19,12 @@
from google.cloud import storage
from kfp_component.core import KfpExecutionContext
from ._client import DataflowClient
from ._common_ops import (wait_and_dump_job, stage_file, get_staging_location,
read_job_id_and_location, upload_job_id_and_location)
from ._process import Process
from ..storage import parse_blob_path

def launch_python(python_file_path, project_id, staging_dir=None, requirements_file_path=None,
def launch_python(python_file_path, project_id, region, staging_dir=None, requirements_file_path=None,
args=[], wait_interval=30,
job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
job_object_output_path='/tmp/kfp/output/dataflow/job.json',
@@ -33,12 +33,13 @@ def launch_python(python_file_path, project_id, staging_dir=None, requirements_f
Args:
python_file_path (str): The gcs or local path to the python file to run.
project_id (str): The ID of the parent project.
project_id (str): The ID of the GCP project to run the Dataflow job.
region (str): The GCP region to run the Dataflow job.
staging_dir (str): Optional. The GCS directory for keeping staging files.
A random subdirectory will be created under the directory to keep job info
for resuming the job in case of failure and it will be passed as
`staging_location` and `temp_location` command line args of the beam code.
requirements_file_path (str): Optional, the gcs or local path to the pip
requirements file.
args (list): The list of args to pass to the python file.
wait_interval (int): The wait seconds between polling.
@@ -70,7 +71,7 @@ def cancel():

_install_requirements(requirements_file_path)
python_file_path = stage_file(python_file_path)
cmd = _prepare_cmd(project_id, python_file_path, args, staging_location)
cmd = _prepare_cmd(project_id, region, python_file_path, args, staging_location)
sub_process = Process(cmd)
for line in sub_process.read_lines():
job_id, location = _extract_job_id_and_location(line)
@@ -83,35 +84,36 @@ def cancel():
logging.warning('No dataflow job was found when '
'running the python file.')
return None
job = df_client.get_job(project_id, job_id,
location=location)
return wait_and_dump_job(df_client, project_id, location, job,
wait_interval,
job_id_output_path=job_id_output_path,
job_object_output_path=job_object_output_path,
)

def _prepare_cmd(project_id, python_file_path, args, staging_location):
def _prepare_cmd(project_id, region, python_file_path, args, staging_location):
dataflow_args = [
'--runner', 'dataflow',
'--project', project_id]
'--runner', 'DataflowRunner',
'--project', project_id,
'--region', region]
if staging_location:
dataflow_args += ['--staging_location', staging_location, '--temp_location', staging_location]
return (['python', '-u', python_file_path] +
dataflow_args + args)
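
For concreteness, with the new region argument the assembled command looks like this (the values below are placeholders mirroring the unit tests, not fixed defaults):

# Illustrative call to the helper defined above; not part of the module.
cmd = _prepare_cmd('project-1', 'us-central1', '/tmp/test.py',
                   ['--output', 'gs://bucket/out'], 'gs://staging/dir')
# ['python', '-u', '/tmp/test.py',
#  '--runner', 'DataflowRunner', '--project', 'project-1', '--region', 'us-central1',
#  '--staging_location', 'gs://staging/dir', '--temp_location', 'gs://staging/dir',
#  '--output', 'gs://bucket/out']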

def _extract_job_id_and_location(line):
"""Returns (job_id, location) from matched log.
"""
job_id_pattern = re.compile(
br'.*console.cloud.google.com/dataflow.*/locations/([a-z|0-9|A-Z|\-|\_]+)/jobs/([a-z|0-9|A-Z|\-|\_]+).*')
br'.*console.cloud.google.com/dataflow/jobs/(?P<location>[a-z|0-9|A-Z|\-|\_]+)/(?P<job_id>[a-z|0-9|A-Z|\-|\_]+).*')
matched_job_id = job_id_pattern.search(line or '')
if matched_job_id:
return (matched_job_id.group(2).decode(), matched_job_id.group(1).decode())
return (matched_job_id.group('job_id').decode(), matched_job_id.group('location').decode())
return (None, None)
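
A quick check of the rewritten pattern against the console URL format that the Beam runner logs (the same line the unit tests below feed in):

line = b'https://console.cloud.google.com/dataflow/jobs/us-central1/job-1?project=project-1'
print(_extract_job_id_and_location(line))  # -> ('job-1', 'us-central1')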

def _install_requirements(requirements_file_path):
if not requirements_file_path:
return
requirements_file_path = stage_file(requirements_file_path)
subprocess.check_call(['pip', 'install', '-r', requirements_file_path])
2 changes: 0 additions & 2 deletions components/gcp/container/component_sdk/python/setup.py
@@ -36,8 +36,6 @@
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
@@ -1,4 +1,4 @@
# Copyright 2018 Google LLC
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -29,27 +29,27 @@
@mock.patch(MODULE + '.subprocess')
class LaunchPythonTest(unittest.TestCase):

def test_launch_python_succeed(self, mock_subprocess, mock_process,
mock_client, mock_context, mock_stage_file, mock_display, mock_storage):
mock_context().__enter__().context_id.return_value = 'ctx-1'
mock_storage.Client().bucket().blob().exists.return_value = False
mock_process().read_lines.return_value = [
b'https://console.cloud.google.com/dataflow/locations/us-central1/jobs/job-1?project=project-1'
b'https://console.cloud.google.com/dataflow/jobs/us-central1/job-1?project=project-1'
]
expected_job = {
'id': 'job-1',
'currentState': 'JOB_STATE_DONE'
}
mock_client().get_job.return_value = expected_job

result = launch_python('/tmp/test.py', 'project-1', staging_dir='gs://staging/dir')
result = launch_python('/tmp/test.py', 'project-1', 'us-central1', staging_dir='gs://staging/dir')

self.assertEqual(expected_job, result)
mock_storage.Client().bucket().blob().upload_from_string.assert_called_with(
'job-1,us-central1'
)

def test_launch_python_retry_succeed(self, mock_subprocess, mock_process,
mock_client, mock_context, mock_stage_file, mock_display, mock_storage):
mock_context().__enter__().context_id.return_value = 'ctx-1'
mock_storage.Client().bucket().blob().exists.return_value = True
@@ -60,20 +60,19 @@ def test_launch_python_retry_succeed(self, mock_subprocess, mock_process,
}
mock_client().get_job.return_value = expected_job

result = launch_python('/tmp/test.py', 'project-1', staging_dir='gs://staging/dir')
result = launch_python('/tmp/test.py', 'project-1', 'us-central1', staging_dir='gs://staging/dir')

self.assertEqual(expected_job, result)
mock_process.assert_not_called()

def test_launch_python_no_job_created(self, mock_subprocess, mock_process,
mock_client, mock_context, mock_stage_file, mock_display, mock_storage):
mock_context().__enter__().context_id.return_value = 'ctx-1'
mock_process().read_lines.return_value = [
b'no job id',
b'no job id'
]

result = launch_python('/tmp/test.py', 'project-1')
result = launch_python('/tmp/test.py', 'project-1', 'us-central1')

self.assertEqual(None, result)

2 changes: 1 addition & 1 deletion components/gcp/container/component_sdk/python/tox.ini
@@ -1,5 +1,5 @@
[tox]
envlist = py27,py35,py36,py37,py38
envlist = py35,py36,py37,py38
skip_missing_interpreters = true

[testenv]
8 changes: 6 additions & 2 deletions components/gcp/dataflow/launch_python/component.yaml
@@ -23,8 +23,11 @@ inputs:
description: 'The gcs or local path to the python file to run.'
type: String
- name: project_id
description: 'The ID of the parent project.'
type: GCPProjectID
description: 'The ID of the GCP project to run the Dataflow job.'
type: String
- name: region
description: 'The GCP region to run the Dataflow job.'
type: String
- name: staging_dir
description: >-
Optional. The GCS directory for keeping staging files.
@@ -59,6 +62,7 @@ implementation:
kfp_component.google.dataflow, launch_python,
--python_file_path, {inputValue: python_file_path},
--project_id, {inputValue: project_id},
--region, {inputValue: region},
--staging_dir, {inputValue: staging_dir},
--requirements_file_path, {inputValue: requirements_file_path},
--args, {inputValue: args},
8 changes: 6 additions & 2 deletions components/gcp/dataflow/launch_python/sample.ipynb
@@ -22,7 +22,8 @@
"Name | Description | Optional | Data type| Accepted values | Default |\n",
":--- | :----------| :----------| :----------| :----------| :---------- |\n",
"python_file_path | The path to the Cloud Storage bucket or local directory containing the Python file to be run. | | GCSPath | | |\n",
"project_id | The ID of the Google Cloud Platform (GCP) project containing the Cloud Dataflow job.| | GCPProjectID | | |\n",
"project_id | The ID of the Google Cloud Platform (GCP) project to run the Cloud Dataflow job.| | String | | |\n",
"region | The Google Cloud Platform (GCP) region to run the Cloud Dataflow job.| | String | | |\n",
"staging_dir | The path to the Cloud Storage directory where the staging files are stored. A random subdirectory will be created under the staging directory to keep the job information.This is done so that you can resume the job in case of failure. `staging_dir` is passed as the command line arguments (`staging_location` and `temp_location`) of the Beam code. | Yes | GCSPath | | None |\n",
"requirements_file_path | The path to the Cloud Storage bucket or local directory containing the pip requirements file. | Yes | GCSPath | | None |\n",
"args | The list of arguments to pass to the Python file. | No | List | A list of string arguments | None |\n",
@@ -263,6 +264,7 @@
"source": [
"# Required Parameters\n",
"PROJECT_ID = '<Please put your project ID here>'\n",
"REGION = '<Please put a GCP region here>'\n"
"GCS_STAGING_DIR = 'gs://<Please put your GCS path here>' # No ending slash"
]
},
@@ -299,6 +301,7 @@
"def pipeline(\n",
" python_file_path = 'gs://ml-pipeline-playground/samples/dataflow/wc/wc.py',\n",
" project_id = PROJECT_ID,\n",
" region = REGION,\n",
" staging_dir = GCS_STAGING_DIR,\n",
" requirements_file_path = 'gs://ml-pipeline-playground/samples/dataflow/wc/requirements.txt',\n",
" args = json.dumps([\n",
@@ -309,6 +312,7 @@
" dataflow_python_op(\n",
" python_file_path = python_file_path, \n",
" project_id = project_id, \n",
" region = region, \n",
" staging_dir = staging_dir, \n",
" requirements_file_path = requirements_file_path, \n",
" args = args,\n",
@@ -412,4 +416,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
