chore: Update notebooks to use the DecodeDetections layer

All Jupyter notebooks were updated to reflect the use of the new `DecodeDetections` layer that replaces `decode_y()` during inference time.
nicocurat · Mar 5, 2018 · a1764e9 · a1764e9
1 parent 8ef6a1f
commit a1764e9
Show file tree

Hide file tree

Showing 7 changed files with 128 additions and 75 deletions.
diff --git a/ssd300_evaluation_COCO.ipynb b/ssd300_evaluation_COCO.ipynb
@@ -18,7 +18,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from keras import backend as K\n",
@@ -41,6 +43,8 @@
     "from keras_ssd_loss import SSDLoss\n",
     "from keras_layer_AnchorBoxes import AnchorBoxes\n",
     "from keras_layer_L2Normalization import L2Normalization\n",
+    "from keras_layer_DecodeDetections import DecodeDetections\n",
+    "from keras_layer_DecodeDetections2 import DecodeDetections2\n",
     "from ssd_batch_generator import BatchGenerator\n",
     "from coco_utils import get_coco_category_maps, predict_all_to_json\n",
     "\n",
@@ -81,7 +85,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "# 1: Build the Keras model\n",
@@ -90,6 +96,7 @@
     "\n",
     "model = ssd_300(image_size=(img_height, img_width, 3),\n",
     "                n_classes=80,\n",
+    "                mode='inference',\n",
     "                l2_regularization=0.0005,\n",
     "                scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05], # The scales for Pascal VOC are [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]\n",
     "                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],\n",
@@ -106,7 +113,11 @@
     "                coords='centroids',\n",
     "                normalize_coords=True,\n",
     "                subtract_mean=[123, 117, 104],\n",
-    "                swap_channels=True)\n",
+    "                swap_channels=True,\n",
+    "                confidence_thresh=0.5,\n",
+    "                iou_threshold=0.45,\n",
+    "                top_k=200,\n",
+    "                nms_max_output_size=400)\n",
     "\n",
     "# 2: Load the trained weights into the model.\n",
     "\n",
@@ -157,6 +168,7 @@
     "\n",
     "model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,\n",
     "                                               'L2Normalization': L2Normalization,\n",
+    "                                               'DecodeDetections': DecodeDetections,\n",
     "                                               'compute_loss': ssd_loss.compute_loss})"
    ]
   },
@@ -241,6 +253,7 @@
     "                    batch_generator=dataset,\n",
     "                    batch_size=batch_size,\n",
     "                    batch_generator_mode='resize',\n",
+    "                    model_mode='inference',\n",
     "                    confidence_thresh=0.01,\n",
     "                    iou_threshold=0.45,\n",
     "                    top_k=200,\n",

diff --git a/ssd300_evaluation_Pascal_VOC.ipynb b/ssd300_evaluation_Pascal_VOC.ipynb
@@ -14,7 +14,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from keras import backend as K\n",
@@ -28,6 +30,8 @@
     "from keras_ssd_loss import SSDLoss\n",
     "from keras_layer_AnchorBoxes import AnchorBoxes\n",
     "from keras_layer_L2Normalization import L2Normalization\n",
+    "from keras_layer_DecodeDetections import DecodeDetections\n",
+    "from keras_layer_DecodeDetections2 import DecodeDetections2\n",
     "from ssd_box_encode_decode_utils import SSDBoxEncoder\n",
     "from ssd_batch_generator import BatchGenerator\n",
     "from pascal_voc_utils import predict_all_to_txt\n",
@@ -69,7 +73,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "# 1: Build the Keras model\n",
@@ -78,6 +84,7 @@
     "\n",
     "model = ssd_300(image_size=(img_height, img_width, 3),\n",
     "                n_classes=20,\n",
+    "                mode='inference',\n",
     "                l2_regularization=0.0005,\n",
     "                scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05], # The scales for Pascal VOC are [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]\n",
     "                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],\n",
@@ -94,7 +101,11 @@
     "                coords='centroids',\n",
     "                normalize_coords=True,\n",
     "                subtract_mean=[123, 117, 104],\n",
-    "                swap_channels=True)\n",
+    "                swap_channels=True,\n",
+    "                confidence_thresh=0.5,\n",
+    "                iou_threshold=0.45,\n",
+    "                top_k=200,\n",
+    "                nms_max_output_size=400)\n",
     "\n",
     "# 2: Load the trained weights into the model.\n",
     "\n",
@@ -144,6 +155,7 @@
     "\n",
     "model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,\n",
     "                                               'L2Normalization': L2Normalization,\n",
+    "                                               'DecodeDetections': DecodeDetections,\n",
     "                                               'compute_loss': ssd_loss.compute_loss})"
    ]
   },
@@ -233,6 +245,7 @@
     "                            'horse', 'motorbike', 'person', 'pottedplant',\n",
     "                            'sheep', 'sofa', 'train', 'tvmonitor'],\n",
     "                   out_file_prefix='ssd300_07+12_2007_test_eval/comp3_det_test_',\n",
+    "                   model_mode='inference',\n",
     "                   confidence_thresh=0.01,\n",
     "                   iou_threshold=0.45,\n",
     "                   top_k=200,\n",

diff --git a/ssd300_inference.ipynb b/ssd300_inference.ipynb
@@ -29,6 +29,8 @@
     "from keras_ssd_loss import SSDLoss\n",
     "from keras_layer_AnchorBoxes import AnchorBoxes\n",
     "from keras_layer_L2Normalization import L2Normalization\n",
+    "from keras_layer_DecodeDetections import DecodeDetections\n",
+    "from keras_layer_DecodeDetections2 import DecodeDetections2\n",
     "from ssd_box_encode_decode_utils import decode_y, decode_y2\n",
     "from ssd_batch_generator import BatchGenerator\n",
     "\n",
@@ -80,6 +82,7 @@
     "\n",
     "model = ssd_300(image_size=(img_height, img_width, 3),\n",
     "                n_classes=20,\n",
+    "                mode='inference',\n",
     "                l2_regularization=0.0005,\n",
     "                scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05], # The scales for MS COCO are [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]\n",
     "                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],\n",
@@ -96,7 +99,11 @@
     "                coords='centroids',\n",
     "                normalize_coords=True,\n",
     "                subtract_mean=[123, 117, 104],\n",
-    "                swap_channels=True)\n",
+    "                swap_channels=True,\n",
+    "                confidence_thresh=0.5,\n",
+    "                iou_threshold=0.45,\n",
+    "                top_k=200,\n",
+    "                nms_max_output_size=400)\n",
     "\n",
     "# 2: Load the trained weights into the model.\n",
     "\n",
@@ -147,6 +154,7 @@
     "\n",
     "model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,\n",
     "                                               'L2Normalization': L2Normalization,\n",
+    "                                               'DecodeDetections': DecodeDetections,\n",
     "                                               'compute_loss': ssd_loss.compute_loss})"
    ]
   },
@@ -184,7 +192,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 3. Make predictions and decode them"
+    "## 3. Make predictions"
    ]
   },
   {
@@ -198,6 +206,13 @@
     "y_pred = model.predict(input_images)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`y_pred` contains a fixed number of predictions per batch item (200 if you use the original model configuration), many of which are low-confidence predictions or dummy entries. We therefore need to apply a confidence threshold to filter out the bad predictions. Set this confidence threshold value as you see fit."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 43,
@@ -216,19 +231,14 @@
     }
    ],
    "source": [
-    "y_pred_decoded = decode_y(y_pred,\n",
-    "                          confidence_thresh=0.5,\n",
-    "                          iou_threshold=0.45,\n",
-    "                          top_k=200,\n",
-    "                          input_coords='centroids',\n",
-    "                          normalize_coords=True,\n",
-    "                          img_height=img_height,\n",
-    "                          img_width=img_width)\n",
+    "confidence_threshold = 0.5\n",
+    "\n",
+    "y_pred = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]\n",
     "\n",
     "np.set_printoptions(precision=2, suppress=True, linewidth=90)\n",
     "print(\"Predicted boxes:\\n\")\n",
-    "print('    class    conf  xmin     ymin   xmax    ymax')\n",
-    "print(y_pred_decoded[0])"
+    "print('    class    conf  xmin    ymin    xmax    ymax')\n",
+    "print(y_pred[0])"
    ]
   },
   {
@@ -273,7 +283,7 @@
     "\n",
     "current_axis = plt.gca()\n",
     "\n",
-    "for box in y_pred_decoded[0]:\n",
+    "for box in y_pred[0]:\n",
     "    # Transform the predicted bounding boxes for the 300x300 image to the original image dimensions.\n",
     "    xmin = box[-4] * orig_images[0].shape[1] / img_width\n",
     "    ymin = box[-3] * orig_images[0].shape[0] / img_height\n",
@@ -308,9 +318,9 @@
     "\n",
     "# TODO: Set the paths to the datasets here.\n",
     "\n",
-    "VOC_2007_test_images_dir         = '../../datasets/VOCdevkit/VOC2007_Test/JPEGImages/'\n",
-    "VOC_2007_test_annotations_dir    = '../../datasets/VOCdevkit/VOC2007_Test/Annotations/'\n",
-    "VOC_2007_test_image_set_filename = '../../datasets/VOCdevkit/VOC2007_Test/ImageSets/Main/test.txt'\n",
+    "VOC_2007_test_images_dir         = '../../datasets/VOCdevkit/VOC2007/JPEGImages/'\n",
+    "VOC_2007_test_annotations_dir    = '../../datasets/VOCdevkit/VOC2007/Annotations/'\n",
+    "VOC_2007_test_image_set_filename = '../../datasets/VOCdevkit/VOC2007/ImageSets/Main/test.txt'\n",
     "\n",
     "# The XML parser needs to know what object class names to look for and in which order to map them to integers.\n",
     "classes = ['background',\n",
@@ -404,21 +414,14 @@
     }
    ],
    "source": [
-    "# Decode the predictions.\n",
+    "confidence_threshold = 0.5\n",
     "\n",
-    "y_pred_decoded = decode_y(y_pred,\n",
-    "                          confidence_thresh=0.5,\n",
-    "                          iou_threshold=0.45,\n",
-    "                          top_k=200,\n",
-    "                          input_coords='centroids',\n",
-    "                          normalize_coords=True,\n",
-    "                          img_height=img_height,\n",
-    "                          img_width=img_width)\n",
+    "y_pred = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]\n",
     "\n",
     "np.set_printoptions(precision=2, suppress=True, linewidth=90)\n",
     "print(\"Predicted boxes:\\n\")\n",
     "print('    class    conf  xmin    ymin    xmax    ymax')\n",
-    "print(y_pred_decoded[i])"
+    "print(y_pred[0])"
    ]
   },
   {
@@ -457,7 +460,7 @@
     "    current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax-xmin, ymax-ymin, color='green', fill=False, linewidth=2))  \n",
     "    current_axis.text(xmin, ymin, label, size='x-large', color='white', bbox={'facecolor':'green', 'alpha':1.0})\n",
     "\n",
-    "for k, box in enumerate(y_pred_decoded[i]):\n",
+    "for k, box in enumerate(y_pred[i]):\n",
     "    # Transform the predicted bounding boxes for the 300x300 image to the original image dimensions.\n",
     "    xmin = (box[-4] * batch_inverse_coord_transform[i,0,1]) + batch_inverse_coord_transform[i,0,0]\n",
     "    ymin = (box[-3] * batch_inverse_coord_transform[i,1,1]) + batch_inverse_coord_transform[i,1,0]\n",

diff --git a/ssd300_training.ipynb b/ssd300_training.ipynb
@@ -35,6 +35,8 @@
     "from keras_ssd_loss import SSDLoss\n",
     "from keras_layer_AnchorBoxes import AnchorBoxes\n",
     "from keras_layer_L2Normalization import L2Normalization\n",
+    "from keras_layer_DecodeDetections import DecodeDetections\n",
+    "from keras_layer_DecodeDetections2 import DecodeDetections2\n",
     "from ssd_box_encode_decode_utils import SSDBoxEncoder, decode_y, decode_y2\n",
     "from ssd_batch_generator import BatchGenerator\n",
     "\n",
@@ -138,6 +140,7 @@
     "\n",
     "model = ssd_300(image_size=(img_height, img_width, img_channels),\n",
     "                n_classes=n_classes,\n",
+    "                mode='training',\n",
     "                l2_regularization=0.0005,\n",
     "                scales=scales,\n",
     "                aspect_ratios_per_layer=aspect_ratios,\n",
@@ -176,7 +179,9 @@
     "\n",
     "If you have previously created and saved a model and would now like to load it, simply execute the next code cell. The only thing you need to do is to set the path to the saved model HDF5 file that you would like to load.\n",
     "\n",
-    "The SSD model contains custom objects: Neither the loss function nor the anchor box or L2-normalization layer types are contained in the Keras core library, so we need to provide them to the model loader."
+    "The SSD model contains custom objects: Neither the loss function nor the anchor box or L2-normalization layer types are contained in the Keras core library, so we need to provide them to the model loader.\n",
+    "\n",
+    "This next code cell assumes that you want to load a model that was created in 'training' mode. If you want to load a model that was created in 'inference' or 'inference_fast' mode, you'll have to add the `DecodeDetections` or `DecodeDetections2` layer type to the `custom_objects` dictionary below."
    ]
   },
   {
@@ -545,7 +550,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now let's decode the raw prediction `y_pred`. The function `decode_y()` with default argument values follows the procedure of the original implementation: First a very low confidence threshold of 0.01 is applied to filter out the majority of the predicted boxes, then greedy non-maximum suppression is performed per class with an intersection-over-union threshold of 0.45, and out of what is left after that, the top 200 highest confidence boxes are returned. Those settings are for precision-recall scoring purposes though, in order to get some usable final predictions, the confidence threshold must be set higher, e.g. to 0.5.\n",
+    "Now let's decode the raw prediction `y_pred`.\n",
+    "\n",
+    "Had we created the model in 'inference' or 'inference_fast' mode, then the model's final layer would be the `DecodeDetections` or `DecodeDetections2` layer and `y_pred` would already be the decoded predictions, but since we created the model in 'training' mode, the model outputs raw predictions that we still need to decode and filter. The two functions `decode_y()` and `decode_y2()` do exactly what the `DecodeDetections` and `DecodeDetections2` layers would do, just in NumPy instead of TensorFlow (i.e. on the CPU instead of the GPU).\n",
+    "\n",
+    "The function `decode_y()` with default argument values follows the procedure of the original implementation: First a very low confidence threshold of 0.01 is applied to filter out the majority of the predicted boxes, then greedy non-maximum suppression is performed per class with an intersection-over-union threshold of 0.45, and out of what is left after that, the top 200 highest confidence boxes are returned. Those settings are for precision-recall scoring purposes though, in order to get some usable final predictions, the confidence threshold must be set higher, e.g. to 0.5.\n",
     "\n",
     "The function `decode_y2()` uses an alternative procedure that performs NMS globally instead of per-class. `decode_y()` performs NMS per class. It is important to understand what difference that makes. One point is that doing NMS per class for 20 classes will take roughly 20-times the time to do NMS just once over all classes, but this slow-down doesn't matter much when decoding a single batch. The more important point is to understand what difference it can make for the resulting final predictions. Performing NMS globally means that the strongest candidate box will eliminate all close boxes around it regardless of their predicted class. This can be good and bad. For example, if one box correctly predicts a sheep and another box incorrectly predicts a cow at similar coordinates, then global NMS would eliminate the incorrect cow box (because it is too close to the correct sheep box), while per-class NMS would not eliminate the incorrect cow box (because boxes are only compared within the same object class). On the other hand, if two objects of different classes are very close together and overlapping and are predicted correctly, then global NMS might eliminate one of the two correct predictions because they are too close together, while per-class NMS will keep both predictions. It's up to you which decoder you use, but I'll preset the `decode_y()` decoder that follows the paper."
    ]