PaddlePaddle · guoshengCS · Aug 15, 2017 · Aug 15, 2017 · qingqing01 · Aug 23, 2017
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
@@ -474,6 +474,11 @@ multibox_loss
 ..  autoclass:: paddle.v2.layer.multibox_loss
     :noindex:
 
+rcnn_loss
+--------------
+..  autoclass:: paddle.v2.layer.rcnn_loss
+    :noindex:
+
 Check Layer
 ============
 
@@ -510,3 +515,8 @@ detection_output
 ----------------
 ..  autoclass:: paddle.v2.layer.detection_output
     :noindex:
+
+rcnn_detection
+----------------
+..  autoclass:: paddle.v2.layer.rcnn_detection
+    :noindex:
diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/gserver/layers/DetectionUtil.cpp
@@ -573,4 +573,147 @@ NormalizedBBox clipBBox(const NormalizedBBox& bbox) {
   return clippedBBox;
 }
 
+/**
+ * Greedy non-maximum suppression over scored bboxes.
+ *
+ * Candidates whose score is not strictly greater than confThreshold are
+ * dropped. The remaining ones are visited from the highest score down (at
+ * most topK of them when topK > 0), and a candidate is kept only if its
+ * Jaccard overlap with every bbox already referenced by *indices is
+ * <= nmsThreshold.
+ *
+ * @param bboxes (score, bbox) pairs to filter
+ * @param topK Maximum number of candidates examined; 0 means no limit
+ * @param confThreshold Candidates must score strictly above this value
+ * @param nmsThreshold Largest overlap allowed between two kept bboxes
+ * @param indices Positions (into bboxes) of kept entries are appended here.
+ *                Existing entries are not cleared and are treated as
+ *                already-kept bboxes of the same bboxes vector.
+ */
+void applyNMSFast(const vector<pair<real, NormalizedBBox>>& bboxes,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  vector<size_t>* indices) {
+  vector<pair<real, size_t>> scores;
+  scores.reserve(bboxes.size());
+  for (size_t i = 0; i < bboxes.size(); ++i) {
+    // Drop low-confidence candidates before sorting.
+    if (bboxes[i].first > confThreshold) {
+      scores.push_back(std::make_pair(bboxes[i].first, i));
+    }
+  }
+  std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>);
+  if (topK > 0 && topK < scores.size()) scores.resize(topK);
+  // Walk the sorted candidates in place; erasing the front element on every
+  // round would make this loop quadratic in the number of candidates.
+  for (size_t s = 0; s < scores.size(); ++s) {
+    const size_t idx = scores[s].second;
+    bool keep = true;
+    for (size_t i = 0; keep && i < indices->size(); ++i) {
+      const size_t savedIdx = (*indices)[i];
+      const real overlap =
+          jaccardOverlap(bboxes[idx].second, bboxes[savedIdx].second);
+      keep = overlap <= nmsThreshold;
+    }
+    if (keep) indices->push_back(idx);
+  }
+}
+
+/**
+ * Run per-class NMS for every image and, when keepTopK > 0, keep only the
+ * keepTopK best-scoring survivors of each image.
+ *
+ * The background class is skipped during NMS. Results are written to
+ * (*allDetectionIndices)[batchIdx][classId] as positions into the matching
+ * vector of allDecodedBBoxes.
+ *
+ * NOTE(review): confThreshold is declared as size_t, so a fractional
+ * threshold supplied by a caller is truncated before it is forwarded to
+ * applyNMSFast -- confirm whether `real` was intended (the header
+ * declaration would have to change together with this definition).
+ *
+ * @return Total number of bboxes kept over all images
+ */
+size_t getDetectionIndices(
+    const size_t backgroundId,
+    const size_t confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const map<size_t, map<size_t, vector<pair<real, NormalizedBBox>>>>&
+        allDecodedBBoxes,
+    map<size_t, map<size_t, vector<size_t>>>* allDetectionIndices) {
+  size_t keptOverall = 0;
+  for (auto imageIt = allDecodedBBoxes.begin();
+       imageIt != allDecodedBBoxes.end();
+       ++imageIt) {
+    const auto& bboxesByClass = imageIt->second;
+    std::map<size_t, std::vector<size_t>>& keptByClass =
+        (*allDetectionIndices)[imageIt->first];
+
+    // Stage 1: NMS for each foreground class of this image.
+    size_t keptInImage = 0;
+    for (auto classIt = bboxesByClass.begin(); classIt != bboxesByClass.end();
+         ++classIt) {
+      if (classIt->first == backgroundId) continue;
+      vector<size_t>& survivors = keptByClass[classIt->first];
+      applyNMSFast(
+          classIt->second, nmsTopK, confThreshold, nmsThreshold, &survivors);
+      keptInImage += survivors.size();
+    }
+
+    // Stage 2: enforce the per-image cap by ranking survivors of all classes.
+    if (keepTopK > 0 && keptInImage > keepTopK) {
+      vector<pair<real, pair<size_t, size_t>>> ranked;
+      for (auto classIt = bboxesByClass.begin();
+           classIt != bboxesByClass.end();
+           ++classIt) {
+        const size_t classId = classIt->first;
+        const vector<size_t>& survivors = keptByClass[classId];
+        for (const size_t boxIdx : survivors) {
+          const real score = classIt->second[boxIdx].first;
+          ranked.push_back(
+              std::make_pair(score, std::make_pair(classId, boxIdx)));
+        }
+      }
+      std::sort(ranked.begin(),
+                ranked.end(),
+                sortScorePairDescend<pair<size_t, size_t>>);
+      ranked.resize(keepTopK);
+
+      // Rebuild the per-class index lists from the winners only.
+      keptByClass.clear();
+      for (const auto& winner : ranked) {
+        keptByClass[winner.second.first].push_back(winner.second.second);
+      }
+      keptInImage = keepTopK;
+    }
+    keptOverall += keptInImage;
+  }
+  return keptOverall;
+}
+
+/**
+ * Serialize the kept detections into `out`, one row of 7 values each:
+ * image number | label | confidence score | xMin | yMin | xMax | yMax
+ *
+ * Rows are first assembled in a temporary matrix created with the last
+ * resizeOrCreate argument set to false, then copied into `out` in one call.
+ *
+ * @param numKept Number of rows to emit; the caller is expected to pass the
+ *                total number of indices stored in allIndices
+ * @param allIndices batchIdx -> classId -> positions of kept bboxes
+ * @param allDecodedBBoxes batchIdx -> classId -> (score, bbox) candidates
+ * @param out Destination matrix
+ */
+void getDetectionOutput(
+    const size_t numKept,
+    const map<size_t, map<size_t, vector<size_t>>>& allIndices,
+    const map<size_t, map<size_t, vector<pair<real, NormalizedBBox>>>>&
+        allDecodedBBoxes,
+    Matrix& out) {
+  MatrixPtr staging;
+  Matrix::resizeOrCreate(staging, numKept, 7, false, false);
+  real* const rowsBegin = staging->getData();
+  real* row = rowsBegin;
+  for (const auto& imageEntry : allIndices) {
+    const size_t batchIdx = imageEntry.first;
+    for (const auto& classEntry : imageEntry.second) {
+      const size_t classId = classEntry.first;
+      const vector<pair<real, NormalizedBBox>>& candidates =
+          allDecodedBBoxes.at(batchIdx).at(classId);
+      for (const size_t boxIdx : classEntry.second) {
+        const pair<real, NormalizedBBox>& hit = candidates[boxIdx];
+        row[0] = batchIdx;
+        row[1] = classId;
+        row[2] = hit.first;
+        row[3] = hit.second.xMin;
+        row[4] = hit.second.yMin;
+        row[5] = hit.second.xMax;
+        row[6] = hit.second.yMax;
+        row += 7;  // advance to the next output row
+      }
+    }
+  }
+  out.copyFrom(rowsBegin, numKept * 7);
+}
+
+/**
+ * Decode a bbox from a prior bbox and predicted offsets.
+ *
+ * The prior is read as (xMin, yMin, xMax, yMax) and its extent is computed
+ * with a "+ 1" on both axes. The offsets are read as (dx, dy, dw, dh): dx/dy
+ * shift the center in units of the prior's width/height, and dw/dh scale the
+ * width/height through exp().
+ *
+ * @param priorBBoxData Prior bbox; elements 0..3 are read
+ * @param locPredData Predicted offsets; elements 0..3 are read
+ * @return The decoded bbox (corner form)
+ */
+NormalizedBBox decodeBBox(const vector<real>& priorBBoxData,
+                          const vector<real>& locPredData) {
+  // Prior box in center/size form.
+  const real priorW = priorBBoxData[2] - priorBBoxData[0] + 1;
+  const real priorH = priorBBoxData[3] - priorBBoxData[1] + 1;
+  const real priorCx = priorBBoxData[0] + priorW / 2;
+  const real priorCy = priorBBoxData[1] + priorH / 2;
+
+  // Apply the predicted offsets.
+  const real cx = locPredData[0] * priorW + priorCx;
+  const real cy = locPredData[1] * priorH + priorCy;
+  const real w = std::exp(locPredData[2]) * priorW;
+  const real h = std::exp(locPredData[3]) * priorH;
+
+  // Back to corner form.
+  NormalizedBBox result;
+  result.xMin = cx - w / 2;
+  result.yMin = cy - h / 2;
+  result.xMax = cx + w / 2;
+  result.yMax = cy + h / 2;
+  return result;
+}
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionUtil.h b/paddle/gserver/layers/DetectionUtil.h
@@ -304,4 +304,66 @@ void getDetectionOutput(const real* confData,
 
 NormalizedBBox clipBBox(const NormalizedBBox& bbox);
 
+/**
+ * @brief Do NMS for bboxes to remove duplicated bboxes
+ * @param bboxes (score, bbox) pairs to apply NMS to
+ * @param topK Number of top-scoring candidates to consider, 0 means all
+ * @param confThreshold Low boundary of confidence score
+ * @param nmsThreshold Threshold of overlap
+ * @param indices Indices of high quality bboxes
+ */
+void applyNMSFast(const vector<pair<real, NormalizedBBox>>& bboxes,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  vector<size_t>* indices);
+
+/**
+ * @brief Get detection results which satisfy requirements
+ * @param backgroundId Background class, skipped during NMS
+ * @param confThreshold Threshold of class confidence
+ * @param nmsTopK Used in NMS operation to keep top k bbox
+ * @param nmsThreshold Used in NMS, threshold of overlap
+ * @param keepTopK How many bboxes are kept per image, 0 means no limit
+ * @param allDecodedBBoxes Decoded bboxes for all images
+ * @param allDetectionIndices Save detection bbox indices
+ * @return Total number of bboxes kept over all images
+ */
+size_t getDetectionIndices(
+    const size_t backgroundId,
+    const size_t confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const map<size_t, map<size_t, vector<pair<real, NormalizedBBox>>>>&
+        allDecodedBBoxes,
+    map<size_t, map<size_t, vector<size_t>>>* allDetectionIndices);
+
+/**
+ * @brief Get detection results
+ * @param numKept Number of detections to output, i.e. number of output rows
+ * @param allIndices Indices of predicted bboxes,
+ * organized as image number -> label -> bbox indices
+ * @param allDecodedBBoxes BBoxes decoded,
+ * organized as image number -> label -> (confidence score, bbox)
+ * @param out Output matrix, each row holds 7 values:
+ * image number | label | confidence score | xMin | yMin | xMax | yMax
+ */
+void getDetectionOutput(
+    const size_t numKept,
+    const map<size_t, map<size_t, vector<size_t>>>& allIndices,
+    const map<size_t, map<size_t, vector<pair<real, NormalizedBBox>>>>&
+        allDecodedBBoxes,
+    Matrix& out);
+
+/**
+ * @brief Decode a bbox from a prior bbox and offset parameters;
+ * this overload takes no variance parameters
+ * @param priorBBoxData Prior bbox to be decoded: xMin, yMin, xMax, yMax
+ * @param locPredData Offset parameters: dx, dy, dw, dh
+ * @return The decoded bbox
+ */
+NormalizedBBox decodeBBox(const vector<real>& priorBBoxData,
+                          const vector<real>& locPredData);
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/RCNNDetectionLayer.cpp b/paddle/gserver/layers/RCNNDetectionLayer.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RCNNDetectionLayer.h"
+
+namespace paddle {
+
+// Ties the type name rcnn_detection to RCNNDetectionLayer.
+// NOTE(review): presumably this is what lets a layer config with that type
+// instantiate this class -- confirm against the REGISTER_LAYER macro.
+REGISTER_LAYER(rcnn_detection, RCNNDetectionLayer);
+
+/**
+ * Initialize the layer and cache the rcnn_detection settings carried by the
+ * configuration of input 0.
+ *
+ * @param layerMap Forwarded to Layer::init
+ * @param parameterMap Forwarded to Layer::init
+ * @return false if the base-class initialization fails, true otherwise
+ */
+bool RCNNDetectionLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  // Do not report success when the base class could not be initialized.
+  if (!Layer::init(layerMap, parameterMap)) return false;
+
+  const auto& layerConf = config_.inputs(0).rcnn_detection_conf();
+  nmsThreshold_ = layerConf.nms_threshold();
+  confidenceThreshold_ = layerConf.confidence_threshold();
+  nmsTopK_ = layerConf.nms_top_k();
+  keepTopK_ = layerConf.keep_top_k();
+  numClasses_ = layerConf.num_classes();
+  backgroundId_ = layerConf.background_id();
+  return true;
+}
+
+/**
+ * Turn RoIs, per-class location predictions and class scores into the final
+ * detections.
+ *
+ * Inputs: 0 = RoIs, 1 = location predictions, 2 = confidence predictions.
+ * The confidences go through softmax, every non-background class of every
+ * RoI is decoded into a scored bbox, and getDetectionIndices /
+ * getDetectionOutput select and serialize the survivors into an output with
+ * 7 columns.
+ */
+void RCNNDetectionLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr roiValue = getInputValue(0);
+  MatrixPtr locPredValue = getInputValue(1);
+  MatrixPtr confPredValue = getInputValue(2);
+
+  // Softmax is applied on a scratch copy, not on the input matrix itself.
+  MatrixPtr confPredNormValue;
+  Matrix::resizeOrCreate(confPredNormValue,
+                         confPredValue->getHeight(),
+                         confPredValue->getWidth(),
+                         false,
+                         useGpu_);
+  confPredNormValue->copyFrom(*confPredValue);
+  confPredNormValue->softmax(*confPredNormValue);
+  confPredValue = confPredNormValue;
+
+  if (useGpu_) {  // copy data from GPU
+    MatrixPtr roiCpuBuffer;
+    Matrix::resizeOrCreate(roiCpuBuffer,
+                           roiValue->getHeight(),
+                           roiValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr locCpuBuffer;
+    Matrix::resizeOrCreate(locCpuBuffer,
+                           locPredValue->getHeight(),
+                           locPredValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr confCpuBuffer;
+    Matrix::resizeOrCreate(confCpuBuffer,
+                           confPredValue->getHeight(),
+                           confPredValue->getWidth(),
+                           false,
+                           false);
+    roiCpuBuffer->copyFrom(*roiValue);
+    locCpuBuffer->copyFrom(*locPredValue);
+    confCpuBuffer->copyFrom(*confPredValue);
+    roiValue = roiCpuBuffer;
+    locPredValue = locCpuBuffer;
+    confPredValue = confCpuBuffer;
+  }
+
+  // The format of the RoI is:
+  // | batch_idx | xmin | ymin | xmax | ymax |
+  // NOTE(review): assumes each RoI row has at least 5 values and that the
+  // location / confidence inputs have numClasses_ * 4 and numClasses_ values
+  // per RoI -- nothing here validates that.
+  real* roisData = roiValue->getData();
+  size_t roiDim = roiValue->getWidth();
+  size_t roiNum = roiValue->getHeight();
+  real* locPredData = locPredValue->getData();
+  real* confPredData = confPredValue->getData();
+
+  // <batchIdx, <classIdx, <(score, box)>>>
+  std::map<size_t,
+           std::map<size_t, std::vector<std::pair<real, NormalizedBBox>>>>
+      allDecodedBBoxes;
+  // Scratch buffers reused for every RoI / class instead of reallocating.
+  std::vector<real> roiLocData(4);   // RoI location
+  std::vector<real> predLocData(4);  // location prediction of one class
+  for (size_t n = 0; n < roiNum; ++n) {
+    const real* roi = roisData + n * roiDim;
+    // The image index is stored as a real; make the narrowing explicit.
+    const int batchIdx = static_cast<int>(roi[0]);
+    for (size_t j = 0; j < 4; ++j) {
+      roiLocData[j] = roi[1 + j];
+    }
+    // location predictions for each class
+    for (size_t c = 0; c < numClasses_; ++c) {
+      if (c == backgroundId_) continue;
+      for (size_t j = 0; j < 4; ++j) {
+        predLocData[j] = *(locPredData + n * numClasses_ * 4 + c * 4 + j);
+      }
+      real predConfData = *(confPredData + n * numClasses_ + c);
+      allDecodedBBoxes[batchIdx][c].push_back(
+          std::make_pair(predConfData, decodeBBox(roiLocData, predLocData)));
+    }
+  }
+  // <batchIdx, <classIdx, <bboxIdxes>>
+  std::map<size_t, std::map<size_t, std::vector<size_t>>> allIndices;
+  size_t numKept = getDetectionIndices(backgroundId_,
+                                       confidenceThreshold_,
+                                       nmsTopK_,
+                                       nmsThreshold_,
+                                       keepTopK_,
+                                       allDecodedBBoxes,
+                                       &allIndices);
+  if (numKept == 0) {
+    // Nothing was detected: expose an empty output rather than requesting
+    // zero-row matrices below.
+    // NOTE(review): assumes zero-sized matrix creation is not supported by
+    // resetOutput / Matrix::resizeOrCreate -- confirm.
+    MatrixPtr emptyOut = getOutputValue();
+    if (emptyOut) emptyOut->resize(0, 0);
+    return;
+  }
+  resetOutput(numKept, 7);
+  MatrixPtr outV = getOutputValue();
+  getDetectionOutput(numKept, allIndices, allDecodedBBoxes, *outV);
+}
+
+}  // namespace paddle