diff --git a/samples/core/dataflow/dataflow.ipynb b/samples/core/dataflow/dataflow.ipynb
index 2d671283a5a..4cb427be562 100644
--- a/samples/core/dataflow/dataflow.ipynb
+++ b/samples/core/dataflow/dataflow.ipynb
@@ -15,7 +15,8 @@
 "Name | Description | Optional | Data type| Accepted values | Default |\n",
 ":--- | :----------| :----------| :----------| :----------| :---------- |\n",
 "python_file_path | The path to the Cloud Storage bucket or local directory containing the Python file to be run. | | GCSPath | | |\n",
- "project_id | The ID of the Google Cloud Platform (GCP) project containing the Cloud Dataflow job.| | GCPProjectID | | |\n",
+ "project_id | The ID of the Google Cloud Platform (GCP) project containing the Cloud Dataflow job.| | String | | |\n",
+ "region | The Google Cloud Platform (GCP) region to run the Cloud Dataflow job.| | String | | |\n",
 "staging_dir | The path to the Cloud Storage directory where the staging files are stored. A random subdirectory will be created under the staging directory to keep the job information.This is done so that you can resume the job in case of failure. `staging_dir` is passed as the command line arguments (`staging_location` and `temp_location`) of the Beam code. | Yes | GCSPath | | None |\n",
 "requirements_file_path | The path to the Cloud Storage bucket or local directory containing the pip requirements file. | Yes | GCSPath | | None |\n",
 "args | The list of arguments to pass to the Python file. | No | List | A list of string arguments | None |\n",
@@ -28,7 +29,7 @@
 "- A `requirements.txt` file which includes a list of dependent packages.\n",
 "\n",
 "The Beam Python code should follow the [Beam programming guide](https://beam.apache.org/documentation/programming-guide/) as well as the following additional requirements to be compatible with this component:\n",
- "- It accepts the command line arguments `--project`, `--temp_location`, `--staging_location`, which are [standard Dataflow Runner options](https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-pipeline-options).\n",
+ "- It accepts the command line arguments `--project`, `--region`, `--temp_location`, `--staging_location`, which are [standard Dataflow Runner options](https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-pipeline-options).\n",
 "- It enables `info logging` before the start of a Cloud Dataflow job in the Python code. This is important to allow the component to track the status and ID of the job that is created. For example, calling `logging.getLogger().setLevel(logging.INFO)` before any other code.\n",
 "\n",
 "\n",
@@ -74,6 +75,7 @@
 "outputs": [],
 "source": [
 "project = 'Input your PROJECT ID'\n",
+ "region = 'Input GCP region' # For example, 'us-central1'\n",
 "output = 'Input your GCS bucket name' # No ending slash\n"
 ]
 },
@@ -124,7 +126,7 @@
 "text": [
 "Help on function Launch Python:\n",
 "\n",
- "Launch Python(python_file_path:str, project_id:'GCPProjectID', staging_dir:'GCSPath'='', requirements_file_path:'GCSPath'='', args:list='[]', wait_interval:int='30')\n",
+ "Launch Python(python_file_path:str, project_id:str, region:str, staging_dir:'GCSPath'='', requirements_file_path:'GCSPath'='', args:list='[]', wait_interval:int='30')\n",
 " Launch Python\n",
 " Launch a self-executing beam python file.\n",
 "\n"
@@ -227,22 +229,25 @@
 " help='Input file to process.')\r\n",
 " parser.add_argument('--output',\r\n",
 " dest='output',\r\n",
- " # CHANGE 1/5: The Google Cloud Storage path is required\r\n",
+ " # CHANGE 1/6: The Google Cloud Storage path is required\r\n",
 " # for outputting the results.\r\n",
 " default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',\r\n",
 " help='Output file to write results to.')\r\n",
 " known_args, pipeline_args = parser.parse_known_args(argv)\r\n",
 " # pipeline_args.extend([\r\n",
- " # # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to\r\n",
+ " # # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to\r\n",
 " # # run your pipeline on the Google Cloud Dataflow Service.\r\n",
 " # '--runner=DirectRunner',\r\n",
- " # # CHANGE 3/5: Your project ID is required in order to run your pipeline on\r\n",
+ " # # CHANGE 3/6: Your project ID is required in order to run your pipeline on\r\n",
 " # # the Google Cloud Dataflow Service.\r\n",
 " # '--project=SET_YOUR_PROJECT_ID_HERE',\r\n",
- " # # CHANGE 4/5: Your Google Cloud Storage path is required for staging local\r\n",
+ " # # CHANGE 4/6: A GCP region is required in order to run your pipeline on\r\n",
+ " # # the Google Cloud Dataflow Service.\r\n",
+ " # '--region=SET_GCP_REGION_HERE',\r\n",
+ " # # CHANGE 5/6: Your Google Cloud Storage path is required for staging local\r\n",
 " # # files.\r\n",
 " # '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',\r\n",
- " # # CHANGE 5/5: Your Google Cloud Storage path is required for temporary\r\n",
+ " # # CHANGE 6/6: Your Google Cloud Storage path is required for temporary\r\n",
 " # # files.\r\n",
 " # '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',\r\n",
 " # '--job_name=your-wordcount-job',\r\n",
@@ -311,6 +316,7 @@
 "def pipeline(\n",
 " python_file_path = 'gs://ml-pipeline/sample-pipeline/word-count/wc.py',\n",
 " project_id = project,\n",
+ " region = region,\n",
 " staging_dir = output,\n",
 " requirements_file_path = 'gs://ml-pipeline/sample-pipeline/word-count/requirements.txt',\n",
 " args = json.dumps([\n",
@@ -321,6 +327,7 @@
 " dataflow_python_op(\n",
 " python_file_path = python_file_path, \n",
 " project_id = project_id, \n",
+ " region = region, \n",
 " staging_dir = staging_dir, \n",
 " requirements_file_path = requirements_file_path, \n",
 " args = args,\n",
diff --git a/test/sample-test/configs/dataflow.config.yaml b/test/sample-test/configs/dataflow.config.yaml
index 60938377cbc..8228f120817 100644
--- a/test/sample-test/configs/dataflow.config.yaml
+++ b/test/sample-test/configs/dataflow.config.yaml
@@ -16,4 +16,5 @@ test_name: dataflow
 notebook_params:
   output:
   project: ml-pipeline-test
-run_pipeline: True
\ No newline at end of file
+  region: us-central1
+run_pipeline: True
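
For orientation, below is a minimal, self-contained sketch of the sample pipeline after this change, with `region` threaded from a pipeline parameter into the `dataflow_python_op` call, matching the notebook diff above. The component URL and the placeholder project, region, and bucket values are illustrative assumptions, not taken from this diff.

```python
import json

import kfp
import kfp.components as comp
import kfp.dsl as dsl

# Assumption: the Dataflow launch_python component is loaded from a
# component.yaml URL; the exact path/version below is a placeholder.
dataflow_python_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/'
    'components/gcp/dataflow/launch_python/component.yaml')

project = 'Input your PROJECT ID'          # placeholder
region = 'Input GCP region'                # for example, 'us-central1'
output = 'Input your GCS bucket name'      # no ending slash


@dsl.pipeline(
    name='Dataflow launch python pipeline',
    description='Launch a self-executing Beam Python file on Cloud Dataflow.')
def pipeline(
    python_file_path='gs://ml-pipeline/sample-pipeline/word-count/wc.py',
    project_id=project,
    region=region,
    staging_dir=output,
    requirements_file_path='gs://ml-pipeline/sample-pipeline/word-count/requirements.txt',
    args=json.dumps(['--output', '%s/wc/wordcount.out' % output]),
    wait_interval=30,
):
    # `region` is now passed alongside `project_id`, per the updated
    # component signature (Launch Python(..., project_id:str, region:str, ...)).
    dataflow_python_op(
        python_file_path=python_file_path,
        project_id=project_id,
        region=region,
        staging_dir=staging_dir,
        requirements_file_path=requirements_file_path,
        args=args,
        wait_interval=wait_interval)


if __name__ == '__main__':
    # One way to exercise the definition locally: compile it to a package.
    kfp.compiler.Compiler().compile(pipeline, 'dataflow_launch_python.zip')
```

Submitting the pipeline (for example via `kfp.Client().create_run_from_pipeline_func`) is unchanged; the only new requirement is supplying the extra `region` argument.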