Skip to content

Commit

Permalink
chore: Update notebooks to use the DecodeDetections layer
Browse files Browse the repository at this point in the history
All Jupyter notebooks were updated to reflect the use of the new `DecodeDetections` layer that replaces `decode_y()` during inference time.
  • Loading branch information
pierluigiferrari committed Mar 5, 2018
1 parent 8ef6a1f commit a1764e9
Show file tree
Hide file tree
Showing 7 changed files with 128 additions and 75 deletions.
19 changes: 16 additions & 3 deletions ssd300_evaluation_COCO.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from keras import backend as K\n",
Expand All @@ -41,6 +43,8 @@
"from keras_ssd_loss import SSDLoss\n",
"from keras_layer_AnchorBoxes import AnchorBoxes\n",
"from keras_layer_L2Normalization import L2Normalization\n",
"from keras_layer_DecodeDetections import DecodeDetections\n",
"from keras_layer_DecodeDetections2 import DecodeDetections2\n",
"from ssd_batch_generator import BatchGenerator\n",
"from coco_utils import get_coco_category_maps, predict_all_to_json\n",
"\n",
Expand Down Expand Up @@ -81,7 +85,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# 1: Build the Keras model\n",
Expand All @@ -90,6 +96,7 @@
"\n",
"model = ssd_300(image_size=(img_height, img_width, 3),\n",
" n_classes=80,\n",
" mode='inference',\n",
" l2_regularization=0.0005,\n",
" scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05], # The scales for Pascal VOC are [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]\n",
" aspect_ratios_per_layer=[[1.0, 2.0, 0.5],\n",
Expand All @@ -106,7 +113,11 @@
" coords='centroids',\n",
" normalize_coords=True,\n",
" subtract_mean=[123, 117, 104],\n",
" swap_channels=True)\n",
" swap_channels=True,\n",
" confidence_thresh=0.5,\n",
" iou_threshold=0.45,\n",
" top_k=200,\n",
" nms_max_output_size=400)\n",
"\n",
"# 2: Load the trained weights into the model.\n",
"\n",
Expand Down Expand Up @@ -157,6 +168,7 @@
"\n",
"model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,\n",
" 'L2Normalization': L2Normalization,\n",
" 'DecodeDetections': DecodeDetections,\n",
" 'compute_loss': ssd_loss.compute_loss})"
]
},
Expand Down Expand Up @@ -241,6 +253,7 @@
" batch_generator=dataset,\n",
" batch_size=batch_size,\n",
" batch_generator_mode='resize',\n",
" model_mode='inference',\n",
" confidence_thresh=0.01,\n",
" iou_threshold=0.45,\n",
" top_k=200,\n",
Expand Down
19 changes: 16 additions & 3 deletions ssd300_evaluation_Pascal_VOC.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from keras import backend as K\n",
Expand All @@ -28,6 +30,8 @@
"from keras_ssd_loss import SSDLoss\n",
"from keras_layer_AnchorBoxes import AnchorBoxes\n",
"from keras_layer_L2Normalization import L2Normalization\n",
"from keras_layer_DecodeDetections import DecodeDetections\n",
"from keras_layer_DecodeDetections2 import DecodeDetections2\n",
"from ssd_box_encode_decode_utils import SSDBoxEncoder\n",
"from ssd_batch_generator import BatchGenerator\n",
"from pascal_voc_utils import predict_all_to_txt\n",
Expand Down Expand Up @@ -69,7 +73,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# 1: Build the Keras model\n",
Expand All @@ -78,6 +84,7 @@
"\n",
"model = ssd_300(image_size=(img_height, img_width, 3),\n",
" n_classes=20,\n",
" mode='inference',\n",
" l2_regularization=0.0005,\n",
" scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05], # The scales for Pascal VOC are [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]\n",
" aspect_ratios_per_layer=[[1.0, 2.0, 0.5],\n",
Expand All @@ -94,7 +101,11 @@
" coords='centroids',\n",
" normalize_coords=True,\n",
" subtract_mean=[123, 117, 104],\n",
" swap_channels=True)\n",
" swap_channels=True,\n",
" confidence_thresh=0.5,\n",
" iou_threshold=0.45,\n",
" top_k=200,\n",
" nms_max_output_size=400)\n",
"\n",
"# 2: Load the trained weights into the model.\n",
"\n",
Expand Down Expand Up @@ -144,6 +155,7 @@
"\n",
"model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,\n",
" 'L2Normalization': L2Normalization,\n",
" 'DecodeDetections': DecodeDetections,\n",
" 'compute_loss': ssd_loss.compute_loss})"
]
},
Expand Down Expand Up @@ -233,6 +245,7 @@
" 'horse', 'motorbike', 'person', 'pottedplant',\n",
" 'sheep', 'sofa', 'train', 'tvmonitor'],\n",
" out_file_prefix='ssd300_07+12_2007_test_eval/comp3_det_test_',\n",
" model_mode='inference',\n",
" confidence_thresh=0.01,\n",
" iou_threshold=0.45,\n",
" top_k=200,\n",
Expand Down
57 changes: 30 additions & 27 deletions ssd300_inference.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
"from keras_ssd_loss import SSDLoss\n",
"from keras_layer_AnchorBoxes import AnchorBoxes\n",
"from keras_layer_L2Normalization import L2Normalization\n",
"from keras_layer_DecodeDetections import DecodeDetections\n",
"from keras_layer_DecodeDetections2 import DecodeDetections2\n",
"from ssd_box_encode_decode_utils import decode_y, decode_y2\n",
"from ssd_batch_generator import BatchGenerator\n",
"\n",
Expand Down Expand Up @@ -80,6 +82,7 @@
"\n",
"model = ssd_300(image_size=(img_height, img_width, 3),\n",
" n_classes=20,\n",
" mode='inference',\n",
" l2_regularization=0.0005,\n",
" scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05], # The scales for MS COCO are [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]\n",
" aspect_ratios_per_layer=[[1.0, 2.0, 0.5],\n",
Expand All @@ -96,7 +99,11 @@
" coords='centroids',\n",
" normalize_coords=True,\n",
" subtract_mean=[123, 117, 104],\n",
" swap_channels=True)\n",
" swap_channels=True,\n",
" confidence_thresh=0.5,\n",
" iou_threshold=0.45,\n",
" top_k=200,\n",
" nms_max_output_size=400)\n",
"\n",
"# 2: Load the trained weights into the model.\n",
"\n",
Expand Down Expand Up @@ -147,6 +154,7 @@
"\n",
"model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,\n",
" 'L2Normalization': L2Normalization,\n",
" 'DecodeDetections': DecodeDetections,\n",
" 'compute_loss': ssd_loss.compute_loss})"
]
},
Expand Down Expand Up @@ -184,7 +192,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Make predictions and decode them"
"## 3. Make predictions"
]
},
{
Expand All @@ -198,6 +206,13 @@
"y_pred = model.predict(input_images)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`y_pred` contains a fixed number of predictions per batch item (200 if you use the original model configuration), many of which are low-confidence predictions or dummy entries. We therefore need to apply a confidence threshold to filter out the bad predictions. Set this confidence threshold value how you see fit."
]
},
{
"cell_type": "code",
"execution_count": 43,
Expand All @@ -216,19 +231,14 @@
}
],
"source": [
"y_pred_decoded = decode_y(y_pred,\n",
" confidence_thresh=0.5,\n",
" iou_threshold=0.45,\n",
" top_k=200,\n",
" input_coords='centroids',\n",
" normalize_coords=True,\n",
" img_height=img_height,\n",
" img_width=img_width)\n",
"confidence_threshold = 0.5\n",
"\n",
"y_pred = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]\n",
"\n",
"np.set_printoptions(precision=2, suppress=True, linewidth=90)\n",
"print(\"Predicted boxes:\\n\")\n",
"print(' class conf xmin ymin xmax ymax')\n",
"print(y_pred_decoded[0])"
"print(' class conf xmin ymin xmax ymax')\n",
"print(y_pred[0])"
]
},
{
Expand Down Expand Up @@ -273,7 +283,7 @@
"\n",
"current_axis = plt.gca()\n",
"\n",
"for box in y_pred_decoded[0]:\n",
"for box in y_pred[0]:\n",
" # Transform the predicted bounding boxes for the 300x300 image to the original image dimensions.\n",
" xmin = box[-4] * orig_images[0].shape[1] / img_width\n",
" ymin = box[-3] * orig_images[0].shape[0] / img_height\n",
Expand Down Expand Up @@ -308,9 +318,9 @@
"\n",
"# TODO: Set the paths to the datasets here.\n",
"\n",
"VOC_2007_test_images_dir = '../../datasets/VOCdevkit/VOC2007_Test/JPEGImages/'\n",
"VOC_2007_test_annotations_dir = '../../datasets/VOCdevkit/VOC2007_Test/Annotations/'\n",
"VOC_2007_test_image_set_filename = '../../datasets/VOCdevkit/VOC2007_Test/ImageSets/Main/test.txt'\n",
"VOC_2007_test_images_dir = '../../datasets/VOCdevkit/VOC2007/JPEGImages/'\n",
"VOC_2007_test_annotations_dir = '../../datasets/VOCdevkit/VOC2007/Annotations/'\n",
"VOC_2007_test_image_set_filename = '../../datasets/VOCdevkit/VOC2007/ImageSets/Main/test.txt'\n",
"\n",
"# The XML parser needs to now what object class names to look for and in which order to map them to integers.\n",
"classes = ['background',\n",
Expand Down Expand Up @@ -404,21 +414,14 @@
}
],
"source": [
"# Decode the predictions.\n",
"confidence_threshold = 0.5\n",
"\n",
"y_pred_decoded = decode_y(y_pred,\n",
" confidence_thresh=0.5,\n",
" iou_threshold=0.45,\n",
" top_k=200,\n",
" input_coords='centroids',\n",
" normalize_coords=True,\n",
" img_height=img_height,\n",
" img_width=img_width)\n",
"y_pred = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]\n",
"\n",
"np.set_printoptions(precision=2, suppress=True, linewidth=90)\n",
"print(\"Predicted boxes:\\n\")\n",
"print(' class conf xmin ymin xmax ymax')\n",
"print(y_pred_decoded[i])"
"print(y_pred[0])"
]
},
{
Expand Down Expand Up @@ -457,7 +460,7 @@
" current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax-xmin, ymax-ymin, color='green', fill=False, linewidth=2)) \n",
" current_axis.text(xmin, ymin, label, size='x-large', color='white', bbox={'facecolor':'green', 'alpha':1.0})\n",
"\n",
"for k, box in enumerate(y_pred_decoded[i]):\n",
"for k, box in enumerate(y_pred[i]):\n",
" # Transform the predicted bounding boxes for the 300x300 image to the original image dimensions.\n",
" xmin = (box[-4] * batch_inverse_coord_transform[i,0,1]) + batch_inverse_coord_transform[i,0,0]\n",
" ymin = (box[-3] * batch_inverse_coord_transform[i,1,1]) + batch_inverse_coord_transform[i,1,0]\n",
Expand Down
13 changes: 11 additions & 2 deletions ssd300_training.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
"from keras_ssd_loss import SSDLoss\n",
"from keras_layer_AnchorBoxes import AnchorBoxes\n",
"from keras_layer_L2Normalization import L2Normalization\n",
"from keras_layer_DecodeDetections import DecodeDetections\n",
"from keras_layer_DecodeDetections2 import DecodeDetections2\n",
"from ssd_box_encode_decode_utils import SSDBoxEncoder, decode_y, decode_y2\n",
"from ssd_batch_generator import BatchGenerator\n",
"\n",
Expand Down Expand Up @@ -138,6 +140,7 @@
"\n",
"model = ssd_300(image_size=(img_height, img_width, img_channels),\n",
" n_classes=n_classes,\n",
" mode='training',\n",
" l2_regularization=0.0005,\n",
" scales=scales,\n",
" aspect_ratios_per_layer=aspect_ratios,\n",
Expand Down Expand Up @@ -176,7 +179,9 @@
"\n",
"If you have previously created and saved a model and would now like to load it, simply execute the next code cell. The only thing you need to do is to set the path to the saved model HDF5 file that you would like to load.\n",
"\n",
"The SSD model contains custom objects: Neither the loss function nor the anchor box or L2-normalization layer types are contained in the Keras core library, so we need to provide them to the model loader."
"The SSD model contains custom objects: Neither the loss function nor the anchor box or L2-normalization layer types are contained in the Keras core library, so we need to provide them to the model loader.\n",
"\n",
"This next code cell assumes that you want to load a model that was created in 'training' mode. If you want to load a model that was created in 'inference' or 'inference_fast' mode, you'll have to add the `DecodeDetections` or `DecodeDetections2` layer type to the `custom_objects` dictionary below."
]
},
{
Expand Down Expand Up @@ -545,7 +550,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's decode the raw prediction `y_pred`. The function `decode_y()` with default argument values follows the procedure of the original implementation: First a very low confidence threshold of 0.01 is applied to filter out the majority of the predicted boxes, then greedy non-maximum suppression is performed per class with an intersection-over-union threshold of 0.45, and out of what is left after that, the top 200 highest confidence boxes are returned. Those settings are for precision-recall scoring purposes though, in order to get some usable final predictions, the confidence threshold must be set higher, e.g. to 0.5.\n",
"Now let's decode the raw prediction `y_pred`.\n",
"\n",
"Had we created the model in 'inference' or 'inference_fast' mode, then the model's final layer would be the `DecodeDetection` layer and `y_pred` would already be the decoded predictions, but since we created the model in 'training' mode, the model outputs raw predictions that we still need to decode and filter. The two functions `decode_y()` and `decode_y2()` do exactly what the `DecodeDetection` and `DecodeDetection2` layers would do, just in Numpy instead of TensorFlow (i.e. on the CPU instead of the GPU).\n",
"\n",
"The function `decode_y()` with default argument values follows the procedure of the original implementation: First a very low confidence threshold of 0.01 is applied to filter out the majority of the predicted boxes, then greedy non-maximum suppression is performed per class with an intersection-over-union threshold of 0.45, and out of what is left after that, the top 200 highest confidence boxes are returned. Those settings are for precision-recall scoring purposes though, in order to get some usable final predictions, the confidence threshold must be set higher, e.g. to 0.5.\n",
"\n",
"The function `decode_y2()` uses an alternative procedure that performs NMS globally instead of per-class. `decode_y()` performs NMS per class. It is important to understand what difference that makes. One point is that doing NMS per class for 20 classes will take roughly 20-times the time to do NMS just once over all classes, but this slow-down doesn't matter much when decoding a single batch. The more important point is to understand what difference it can make for the resulting final predictions. Performing NMS globally means that the strongest candidate box will eliminate all close boxes around it regardless of their predicted class. This can be good and bad. For example, if one box correctly predicts a sheep and another box incorrectly predicts a cow at similar coordinates, then global NMS would eliminate the incorrect cow box (because it is too close to the correct sheep box), while per-class NMS would not eliminate the incorrect cow box (because boxes are only compared within the same object class). On the other hand, if two objects of different classes are very close together and overlapping and are predicted correctly, then global NMS might eliminate one of the two correct predictions because they are too close together, while per-class NMS will keep both predictions. It's up to you which decoder you use, but I'll preset the `decode_y()` decoder that follows the paper."
]
Expand Down
Loading

0 comments on commit a1764e9

Please sign in to comment.