From d183dd927b5e2fcf773b5bc685a185e8f79d409f Mon Sep 17 00:00:00 2001
From: Pierluigi Ferrari
Date: Fri, 19 Jan 2018 00:44:13 +0100
Subject: [PATCH] fix: Correct mean normalization channel order

The order of the mean normalization values was BGR when it should have
been RGB for this implementation. Fixed now.
---
 keras_ssd300.py              |  2 +-
 keras_ssd512.py              |  2 +-
 ssd300_evaluation_COCO.ipynb | 52 +++++++++++++++++-------------------
 ssd300_inference.ipynb       |  2 +-
 ssd300_training.ipynb        |  7 ++---
 ssd512_inference.ipynb       |  2 +-
 6 files changed, 32 insertions(+), 35 deletions(-)

diff --git a/keras_ssd300.py b/keras_ssd300.py
index 8740a307..8988a0b1 100644
--- a/keras_ssd300.py
+++ b/keras_ssd300.py
@@ -45,7 +45,7 @@ def ssd_300(image_size,
             variances=[0.1, 0.1, 0.2, 0.2],
             coords='centroids',
             normalize_coords=False,
-            subtract_mean=[104, 117, 123],
+            subtract_mean=[123, 117, 104],
             divide_by_stddev=None,
             swap_channels=True,
             return_predictor_sizes=False):
diff --git a/keras_ssd512.py b/keras_ssd512.py
index f075da80..ae90a831 100644
--- a/keras_ssd512.py
+++ b/keras_ssd512.py
@@ -46,7 +46,7 @@ def ssd_512(image_size,
             variances=[0.1, 0.1, 0.2, 0.2],
             coords='centroids',
             normalize_coords=False,
-            subtract_mean=[104, 117, 123],
+            subtract_mean=[123, 117, 104],
             divide_by_stddev=None,
             swap_channels=True,
             return_predictor_sizes=False):
diff --git a/ssd300_evaluation_COCO.ipynb b/ssd300_evaluation_COCO.ipynb
index a9f17fb8..aa9ee199 100644
--- a/ssd300_evaluation_COCO.ipynb
+++ b/ssd300_evaluation_COCO.ipynb
@@ -18,9 +18,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from keras import backend as K\n",
@@ -83,9 +81,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# 1: Build the Keras model\n",
@@ -109,7 +105,7 @@
     "                variances=[0.1, 0.1, 0.2, 0.2],\n",
     "                coords='centroids',\n",
     "                normalize_coords=True,\n",
-    "                subtract_mean=[104, 117, 123],\n",
+    "                subtract_mean=[123, 117, 104],\n",
     "                swap_channels=True)\n",
     "\n",
     "# 2: Load the trained weights into the model.\n",
@@ -210,7 +206,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {
     "collapsed": true
    },
@@ -223,7 +219,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Number of images in the evaluation dataset: 5000\n",
-       "Producing results file: 100%|██████████| 250/250 [15:02<00:00, 3.79s/it]\n",
+       "Producing results file: 100%|██████████| 250/250 [17:07<00:00, 4.29s/it]\n",
       "Prediction results saved in 'detections_val2017_ssd300_results.json'\n"
      ]
     }
@@ -263,7 +259,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "loading annotations into memory...\n",
-       "Done (t=0.54s)\n",
+       "Done (t=0.41s)\n",
       "creating index...\n",
       "index created!\n",
       "Loading and preparing results...\n",
-       "DONE (t=5.32s)\n",
+       "DONE (t=5.34s)\n",
       "creating index...\n",
       "index created!\n"
      ]
     }
@@ -289,7 +285,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -298,21 +294,21 @@
      "text": [
       "Running per image evaluation...\n",
       "Evaluate annotation type *bbox*\n",
-       "DONE (t=68.27s).\n",
+       "DONE (t=69.19s).\n",
       "Accumulating evaluation results...\n",
-       "DONE (t=10.48s).\n",
-       " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.241\n",
-       " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.415\n",
-       " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.247\n",
-       " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.057\n",
-       " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.257\n",
-       " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.405\n",
-       " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.229\n",
-       " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.335\n",
-       " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.356\n",
-       " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.097\n",
-       " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.395\n",
-       " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.568\n"
+       "DONE (t=14.10s).\n",
+       " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.247\n",
+       " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.424\n",
+       " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.253\n",
+       " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.059\n",
+       " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.264\n",
+       " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.414\n",
+       " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.232\n",
+       " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.341\n",
+       " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.362\n",
+       " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.102\n",
+       " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.401\n",
+       " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.577\n"
      ]
     }
    ],
diff --git a/ssd300_inference.ipynb b/ssd300_inference.ipynb
index fc6b100d..9ba95da1 100644
--- a/ssd300_inference.ipynb
+++ b/ssd300_inference.ipynb
@@ -95,7 +95,7 @@
     "                variances=[0.1, 0.1, 0.2, 0.2],\n",
     "                coords='centroids',\n",
     "                normalize_coords=True,\n",
-    "                subtract_mean=[104, 117, 123],\n",
+    "                subtract_mean=[123, 117, 104],\n",
     "                swap_channels=True)\n",
     "\n",
     "# 2: Load the trained weights into the model.\n",
diff --git a/ssd300_training.ipynb b/ssd300_training.ipynb
index 90f31ad1..94a8ca5a 100644
--- a/ssd300_training.ipynb
+++ b/ssd300_training.ipynb
@@ -76,10 +76,11 @@
     "img_height = 300 # Height of the input images\n",
     "img_width = 300 # Width of the input images\n",
     "img_channels = 3 # Number of color channels of the input images\n",
-    "subtract_mean = [104, 117, 123] # The per-channel mean of the images in the dataset\n",
+    "subtract_mean = [123, 117, 104] # The per-channel mean of the images in the dataset\n",
     "swap_channels = True # The color channel order in the original SSD is BGR\n",
     "n_classes = 20 # Number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO\n",
-    "scales = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05] # The anchor box scaling factors used in the original SSD300 for the Pascal VOC datasets, the factors for the MS COCO dataset are smaller, namely [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]\n",
+    "scales_voc = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05] # The anchor box scaling factors used in the original SSD300 for the Pascal VOC datasets\n",
+    "scales_coco = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05] # The anchor box scaling factors used in the original SSD300 for the MS COCO datasets\n",
     "aspect_ratios = [[1.0, 2.0, 0.5],\n",
     "                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],\n",
     "                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],\n",
@@ -137,7 +138,7 @@
     "model = ssd_300(image_size=(img_height, img_width, img_channels),\n",
     "                n_classes=n_classes,\n",
     "                l2_regularization=0.0005,\n",
-    "                scales=scales,\n",
+    "                scales=scales_voc,\n",
     "                aspect_ratios_per_layer=aspect_ratios,\n",
     "                two_boxes_for_ar1=two_boxes_for_ar1,\n",
     "                steps=steps,\n",
diff --git a/ssd512_inference.ipynb b/ssd512_inference.ipynb
index 4a53c66b..741b6b7e 100644
--- a/ssd512_inference.ipynb
+++ b/ssd512_inference.ipynb
@@ -96,7 +96,7 @@
     "                variances=[0.1, 0.1, 0.2, 0.2],\n",
     "                coords='centroids',\n",
     "                normalize_coords=True,\n",
-    "                subtract_mean=[104, 117, 123],\n",
+    "                subtract_mean=[123, 117, 104],\n",
     "                swap_channels=True)\n",
     "\n",
     "# 2: Load the trained weights into the model.\n",
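
Why RGB order is the fix: the model builders in keras_ssd300.py and
keras_ssd512.py subtract `subtract_mean` from the input tensor first and
only afterwards apply the optional BGR channel swap (`swap_channels=True`),
so the mean must be given in the RGB order of the incoming images. Below is
a minimal NumPy sketch of the failure mode; it is not part of the patch,
and the pixel values are illustrative only.

import numpy as np

# A toy pixel in RGB order, as produced by most Python image loaders.
rgb_pixel = np.array([255.0, 128.0, 64.0])   # [R, G, B]

# Correct: the mean is in RGB order, matching the input tensor. The
# channel swap to BGR happens only after this subtraction.
mean_rgb = np.array([123.0, 117.0, 104.0])
centered = rgb_pixel - mean_rgb              # each channel gets its own mean
bgr = centered[::-1]                         # swap to BGR afterwards

# Old behavior: a BGR-ordered mean subtracted from an RGB tensor crosses
# the channels, shifting R by the B mean and B by the R mean.
mean_bgr = np.array([104.0, 117.0, 123.0])
miscentered = rgb_pixel - mean_bgr           # R off by +19, B off by -19

Note that the mismatch only affects the R and B channels, each by
123 - 104 = 19 units, which fits the modest rather than dramatic AP/AR
improvements in the updated COCO evaluation output above.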