fixed code bugs

EC2 Default User · EC2 Default User · commit cdbdfddde6bf · 2019-10-30T06:58:13.000Z
diff --git a/content/_index.md b/content/_index.md
@@ -6,9 +6,7 @@ weight: 1
 
 # Distributed Training Workshop
 
-Welcome to the distributed training workshop with TensorFlow on Amazon SageMaker and Amazon Elastic Kubernetes Service (EKS)
-
-
+### Welcome to the distributed training workshop with TensorFlow on Amazon SageMaker and Amazon Elastic Kubernetes Service (EKS)
 ### At the end of this workshop, you'll be able to:
 
 #### Identify when to consider distributed training
diff --git a/notebooks/part-1-horovod/cifar10-distributed.ipynb b/notebooks/part-1-horovod/cifar10-distributed.ipynb
@@ -186,9 +186,9 @@
     "if not os.path.exists(checkpoint_dir):\n",
     "    os.makedirs(checkpoint_dir)\n",
     "\n",
-    "train_dir = '../data/train'\n",
-    "validation_dir = '../data/validation'\n",
-    "eval_dir = '../data/eval'\n",
+    "train_dir = '../dataset/train'\n",
+    "validation_dir = '../dataset/validation'\n",
+    "eval_dir = '../dataset/eval'\n",
     "\n",
     "train_dataset = make_batch(train_dir+'/train.tfrecords',  batch_size)\n",
     "val_dataset = make_batch(validation_dir+'/validation.tfrecords', batch_size)\n",
diff --git a/notebooks/part-2-sagemaker/cifar10-sagemaker-distributed.ipynb b/notebooks/part-2-sagemaker/cifar10-sagemaker-distributed.ipynb
@@ -19,7 +19,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -30,8 +30,8 @@
     "\n",
     "sagemaker_session = sagemaker.Session()\n",
     "role = sagemaker.get_execution_role()\n",
-    "#bucket_name = 'tfworld2019-<your_bucket_name>'\n",
-    "bucket_name = 'tfworld2019'"
+    "\n",
+    "bucket_name = '<your_bucket_name>'"
    ]
   },
   {
@@ -41,7 +41,7 @@
     "**Step 2:** Specify hyperparameters, instance type and number of instances to distribute training to. The `hvd_processes_per_host` corrosponds to number of GPUs per instances. \n",
     "For example, if you choose:\n",
     "```\n",
-    "hvd_instance_type = 'ml.p3.8large'\n",
+    "hvd_instance_type = 'ml.p3.8xlarge'\n",
     "hvd_instance_count = 2\n",
     "hvd_processes_per_host = 4\n",
     "```\n",
@@ -138,6 +138,15 @@
     "                  job_name=job_name, wait=False)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Note**: in the `estimator_hvd.fit()` call above, set `wait=True` if you want to see the training output in the Jupyter notebook.\n",
+    "The advantage of setting `wait=False` is that you can continue to run cells.\n",
+    "Because `wait=False` leaves the notebook unblocked, we can now launch TensorBoard in the notebook and monitor progress."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -147,22 +156,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "TensorBoard 1.14.0 at http://ip-172-16-89-111:6006/ (Press CTRL+C to quit)\n",
-      "W1028 20:55:37.536751 140564607526656 core_plugin.py:172] Unable to get first event timestamp for run sm-dist-1x1-gpu-instances2019-10-24-10-08-55-297: No event timestamp could be found\n",
-      "W1028 20:55:37.777247 140564607526656 core_plugin.py:172] Unable to get first event timestamp for run sm-dist-1x8-gpu-instances2019-10-24-07-43-40-297: No event timestamp could be found\n",
-      "W1028 20:55:37.984411 140564607526656 core_plugin.py:172] Unable to get first event timestamp for run sm-dist-2x1-gpu-instances2019-10-28-10-24-06-301: No event timestamp could be found\n",
-      "W1028 20:55:38.320934 140564607526656 core_plugin.py:172] Unable to get first event timestamp for run sm-dist-2x1-workers2019-10-28-20-28-23-301: No event timestamp could be found\n",
-      "^C\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!S3_REGION=us-west-2 tensorboard --logdir s3://{bucket_name}/tensorboard_logs/"
    ]
@@ -171,11 +167,19 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Open a new browser tan and navigate to the folloiwng link to access TensorBoard:\n",
-    "<br> https://tfworld2019.notebook.us-west-2.sagemaker.aws/proxy/6006/\n",
-    "<br> Make sure that the name of the notebook instance is correct in the link above.\n",
+    "Open a new browser tab and navigate to the following link to access TensorBoard:\n",
+    "<br> https://***your_notebook_name***.notebook.us-west-2.sagemaker.aws/proxy/6006/\n",
+    "<br> <br> \n",
+    "**Note:** Make sure to replace `your_notebook_name` with the name of your notebook instance. You can find the name of your notebook instance in the browser's URL.\n",
     "<br> Don't forget the slash at the end of the URL 6006/"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/notebooks/part-3-kubernetes/specs/eks_tf_training_job-cpu.yaml b/notebooks/part-3-kubernetes/specs/eks_tf_training_job-cpu.yaml
@@ -12,7 +12,7 @@ spec:
       restartPolicy: Never
       containers:
       - name: eks-tf-dist-job
-        image: 453691756499.dkr.ecr.us-west-2.amazonaws.com/tfworld2019:latest
+        image: <YOUR_DOCKER_IMAGE>
         env:
         - name: HDF5_USE_FILE_LOCKING
           value: 'FALSE'
diff --git a/notebooks/part-3-kubernetes/specs/eks_tf_training_job-gpu.yaml b/notebooks/part-3-kubernetes/specs/eks_tf_training_job-gpu.yaml
@@ -12,7 +12,7 @@ spec:
       restartPolicy: Never
       containers:
       - name: eks-tf-dist-job
-        image: 453691756499.dkr.ecr.us-west-2.amazonaws.com/tfworld2019:latest
+        image: <YOUR_DOCKER_IMAGE>
         env:
         - name: HDF5_USE_FILE_LOCKING
           value: 'FALSE'