fix(sample): Fix missing --region in dataflow sample notebook. Fixes kubeflow#5007 (kubeflow#5036)

* Update dataflow.ipynb

* Update dataflow.ipynb

* Update dataflow.config.yaml

* Update dataflow.config.yaml
chensun committed Jan 26, 2021
1 parent a8b7fc9 commit 0fd6580
Showing 2 changed files with 17 additions and 9 deletions.
23 changes: 15 additions & 8 deletions samples/core/dataflow/dataflow.ipynb
@@ -15,7 +15,8 @@
 "Name | Description | Optional | Data type| Accepted values | Default |\n",
 ":--- | :----------| :----------| :----------| :----------| :---------- |\n",
 "python_file_path | The path to the Cloud Storage bucket or local directory containing the Python file to be run. | | GCSPath | | |\n",
-"project_id | The ID of the Google Cloud Platform (GCP) project containing the Cloud Dataflow job.| | GCPProjectID | | |\n",
+"project_id | The ID of the Google Cloud Platform (GCP) project containing the Cloud Dataflow job.| | String | | |\n",
+"region | The Google Cloud Platform (GCP) region to run the Cloud Dataflow job.| | String | | |\n",
 "staging_dir | The path to the Cloud Storage directory where the staging files are stored. A random subdirectory will be created under the staging directory to keep the job information.This is done so that you can resume the job in case of failure. `staging_dir` is passed as the command line arguments (`staging_location` and `temp_location`) of the Beam code. | Yes | GCSPath | | None |\n",
 "requirements_file_path | The path to the Cloud Storage bucket or local directory containing the pip requirements file. | Yes | GCSPath | | None |\n",
 "args | The list of arguments to pass to the Python file. | No | List | A list of string arguments | None |\n",
@@ -28,7 +29,7 @@
 "- A `requirements.txt` file which includes a list of dependent packages.\n",
 "\n",
 "The Beam Python code should follow the [Beam programming guide](https://beam.apache.org/documentation/programming-guide/) as well as the following additional requirements to be compatible with this component:\n",
-"- It accepts the command line arguments `--project`, `--temp_location`, `--staging_location`, which are [standard Dataflow Runner options](https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-pipeline-options).\n",
+"- It accepts the command line arguments `--project`, `--region`, `--temp_location`, `--staging_location`, which are [standard Dataflow Runner options](https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-pipeline-options).\n",
 "- It enables `info logging` before the start of a Cloud Dataflow job in the Python code. This is important to allow the component to track the status and ID of the job that is created. For example, calling `logging.getLogger().setLevel(logging.INFO)` before any other code.\n",
 "\n",
 "\n",
@@ -74,6 +75,7 @@
 "outputs": [],
 "source": [
 "project = 'Input your PROJECT ID'\n",
+"region = 'Input GCP region' # For example, 'us-central1'\n",
 "output = 'Input your GCS bucket name' # No ending slash\n"
 ]
 },
@@ -124,7 +126,7 @@
 "text": [
 "Help on function Launch Python:\n",
 "\n",
-"Launch Python(python_file_path:str, project_id:'GCPProjectID', staging_dir:'GCSPath'='', requirements_file_path:'GCSPath'='', args:list='[]', wait_interval:int='30')\n",
+"Launch Python(python_file_path:str, project_id:str, region:str, staging_dir:'GCSPath'='', requirements_file_path:'GCSPath'='', args:list='[]', wait_interval:int='30')\n",
 "    Launch Python\n",
 "    Launch a self-executing beam python file.\n",
 "\n"
@@ -227,22 +229,25 @@
 " help='Input file to process.')\r\n",
 " parser.add_argument('--output',\r\n",
 " dest='output',\r\n",
-" # CHANGE 1/5: The Google Cloud Storage path is required\r\n",
+" # CHANGE 1/6: The Google Cloud Storage path is required\r\n",
 " # for outputting the results.\r\n",
 " default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',\r\n",
 " help='Output file to write results to.')\r\n",
 " known_args, pipeline_args = parser.parse_known_args(argv)\r\n",
 " # pipeline_args.extend([\r\n",
-" # # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to\r\n",
+" # # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to\r\n",
 " # # run your pipeline on the Google Cloud Dataflow Service.\r\n",
 " # '--runner=DirectRunner',\r\n",
-" # # CHANGE 3/5: Your project ID is required in order to run your pipeline on\r\n",
+" # # CHANGE 3/6: Your project ID is required in order to run your pipeline on\r\n",
 " # # the Google Cloud Dataflow Service.\r\n",
 " # '--project=SET_YOUR_PROJECT_ID_HERE',\r\n",
-" # # CHANGE 4/5: Your Google Cloud Storage path is required for staging local\r\n",
+" # # CHANGE 4/6: A GCP region is required in order to run your pipeline on\r\n",
+" # # the Google Cloud Dataflow Service.\r\n",
+" # '--region=SET_GCP_REGION_HERE',\r\n",
+" # # CHANGE 5/6: Your Google Cloud Storage path is required for staging local\r\n",
 " # # files.\r\n",
 " # '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',\r\n",
-" # # CHANGE 5/5: Your Google Cloud Storage path is required for temporary\r\n",
+" # # CHANGE 6/6: Your Google Cloud Storage path is required for temporary\r\n",
 " # # files.\r\n",
 " # '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',\r\n",
 " # '--job_name=your-wordcount-job',\r\n",
@@ -311,6 +316,7 @@
 "def pipeline(\n",
 " python_file_path = 'gs://ml-pipeline/sample-pipeline/word-count/wc.py',\n",
 " project_id = project,\n",
+" region = region,\n",
 " staging_dir = output,\n",
 " requirements_file_path = 'gs://ml-pipeline/sample-pipeline/word-count/requirements.txt',\n",
 " args = json.dumps([\n",
@@ -321,6 +327,7 @@
 " dataflow_python_op(\n",
 " python_file_path = python_file_path, \n",
 " project_id = project_id, \n",
+" region = region, \n",
 " staging_dir = staging_dir, \n",
 " requirements_file_path = requirements_file_path, \n",
 " args = args,\n",
3 changes: 2 additions & 1 deletion test/sample-test/configs/dataflow.config.yaml
@@ -16,4 +16,5 @@ test_name: dataflow
 notebook_params:
   output:
   project: ml-pipeline-test
-  run_pipeline: True
+  region: us-central1
+  run_pipeline: True
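These notebook_params are injected when the sample test executes the notebook. A sketch of the equivalent manual invocation, assuming papermill is the executor (parameter names mirror the config above; paths and bucket are placeholders):

import papermill as pm

# Hypothetical direct call; the sample-test harness normally drives this.
pm.execute_notebook(
    'samples/core/dataflow/dataflow.ipynb',
    'dataflow.out.ipynb',
    parameters={'project': 'ml-pipeline-test',
                'region': 'us-central1',
                'output': 'gs://YOUR_TEST_BUCKET'})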
