Perception: upgrade PointPillars model & reduce time delay
jeroldchen authored and jinghaomiao committed Sep 17, 2020
1 parent 0fe6d35 commit 8456b8d
Showing 21 changed files with 206 additions and 182 deletions.
4 changes: 2 additions & 2 deletions modules/perception/base/object.cc
@@ -27,7 +27,7 @@ Object::Object() {
type_probs.resize(static_cast<int>(ObjectType::MAX_OBJECT_TYPE), 0);
sub_type_probs.resize(static_cast<int>(ObjectSubType::MAX_OBJECT_TYPE), 0.0f);
b_cipv = false;
- feature.reset();
+ // feature.reset();
}

void Object::Reset() {
@@ -68,7 +68,7 @@ void Object::Reset() {
radar_supplement.Reset();
camera_supplement.Reset();
fusion_supplement.Reset();
- feature.reset();
+ // feature.reset();
}

std::string Object::ToString() const {
4 changes: 2 additions & 2 deletions modules/perception/base/object.h
@@ -25,7 +25,7 @@
#include "modules/perception/base/object_types.h"
#include "modules/perception/base/point_cloud.h"
#include "modules/perception/base/vehicle_struct.h"
- #include "modules/prediction/proto/feature.pb.h"
+ // #include "modules/prediction/proto/feature.pb.h"

namespace apollo {
namespace perception {
@@ -117,7 +117,7 @@ struct alignas(16) Object {
FusionObjectSupplement fusion_supplement;

// @debug feature to be used for semantic mapping
- std::shared_ptr<apollo::prediction::Feature> feature;
+ // std::shared_ptr<apollo::prediction::Feature> feature;
};

using ObjectPtr = std::shared_ptr<Object>;
6 changes: 3 additions & 3 deletions modules/perception/common/perception_gflags.cc
@@ -39,10 +39,10 @@ DEFINE_string(work_root, "", "Project work root direcotry.");

// lidar_point_pillars
DEFINE_int32(gpu_id, 0, "The id of gpu used for inference.");
- DEFINE_string(pfe_onnx_file,
+ DEFINE_string(pfe_torch_file,
"/apollo/modules/perception/production/data/perception/lidar/"
- "models/detection/point_pillars/pfe.onnx",
- "The path of pillars feature extractor onnx file.");
+ "models/detection/point_pillars/pfe.pt",
+ "The path of pillars feature extractor torch file.");
DEFINE_string(rpn_onnx_file,
"/apollo/modules/perception/production/data/perception/lidar/"
"models/detection/point_pillars/rpn.onnx",
2 changes: 1 addition & 1 deletion modules/perception/common/perception_gflags.h
@@ -33,7 +33,7 @@ DECLARE_string(work_root);

// lidar_point_pillars
DECLARE_int32(gpu_id);
- DECLARE_string(pfe_onnx_file);
+ DECLARE_string(pfe_torch_file);
DECLARE_string(rpn_onnx_file);
DECLARE_double(normalizing_factor);
DECLARE_int32(num_point_feature);
modules/perception/lidar/lib/detection/lidar_point_pillars/BUILD
@@ -26,6 +26,7 @@ cc_library(
":preprocess_points",
":scatter_cuda",
"//modules/perception/common:perception_gflags",
"//third_party:libtorch",
"@local_config_cuda//cuda:cudart",
"@local_config_tensorrt//:tensorrt",
],
34 changes: 15 additions & 19 deletions modules/perception/lidar/lib/detection/lidar_point_pillars/params.h
@@ -27,68 +27,64 @@ class Params {
static constexpr float kPillarXSize = 0.25f;
static constexpr float kPillarYSize = 0.25f;
static constexpr float kPillarZSize = 8.0f;
- static constexpr float kMinXRange = -50.0f;
- static constexpr float kMinYRange = -50.0f;
+ static constexpr float kMinXRange = -100.0f;
+ static constexpr float kMinYRange = -70.0f;
static constexpr float kMinZRange = -5.0f;
- static constexpr float kMaxXRange = 50.0f;
- static constexpr float kMaxYRange = 50.0f;
+ static constexpr float kMaxXRange = 100.0f;
+ static constexpr float kMaxYRange = 70.0f;
static constexpr float kMaxZRange = 3.0f;
- static constexpr int kNumClass = 10;
+ static constexpr int kNumClass = 9;
static constexpr int kMaxNumPillars = 30000;
static constexpr int kMaxNumPointsPerPillar = 60;
static constexpr int kNumPointFeature = 5; // x, y, z, i, delta of time
- static constexpr int kNumAnchor = 100 * 100 * 12 + 160 * 160 * 8;
+ static constexpr int kNumAnchor = 200 * 140 * 8 + 220 * 140 * 8;
static constexpr int kNumOutputBoxFeature = 7;
static constexpr int kBatchSize = 1;
- static constexpr int kNumIndsForScan = 512;
+ static constexpr int kNumIndsForScan = 1024;
static constexpr int kNumThreads = 64;
static constexpr int kNumBoxCorners = 4;

static std::vector<int> AnchorStrides() { return std::vector<int>{4, 2}; }

- static std::vector<int> NumAnchorSets() { return std::vector<int>{12, 8}; }
+ static std::vector<int> NumAnchorSets() { return std::vector<int>{8, 8}; }

static std::vector<std::vector<float>> AnchorDxSizes() {
return std::vector<std::vector<float>>{
- std::vector<float>{2.94046906f, 1.95017717f, 2.73050468f, 3.0f, 2.0f,
- 2.4560939f},
+ std::vector<float>{2.94046906f, 1.95017717f, 2.73050468f, 2.4560939f},
std::vector<float>{2.49008838f, 0.60058911f, 0.76279481f, 0.66344886f,
0.39694519f}};
}

static std::vector<std::vector<float>> AnchorDySizes() {
return std::vector<std::vector<float>>{
- std::vector<float>{11.1885991, 4.60718145f, 6.38352896f, 15.0f, 3.0f,
- 6.73778078f},
+ std::vector<float>{11.1885991, 4.60718145f, 6.38352896f, 6.73778078f},
std::vector<float>{0.48578221f, 1.68452161f, 2.09973778f, 0.7256437f,
0.40359262f}};
}

static std::vector<std::vector<float>> AnchorDzSizes() {
return std::vector<std::vector<float>>{
- std::vector<float>{3.47030982f, 1.72270761f, 3.13312415f, 3.8f, 3.8f,
- 2.73004906f},
+ std::vector<float>{3.47030982f, 1.72270761f, 3.13312415f, 2.73004906f},
std::vector<float>{0.98297065f, 1.27192197f, 1.44403034f, 1.75748069f,
1.06232151f}};
}

static std::vector<std::vector<float>> AnchorZCoors() {
return std::vector<std::vector<float>>{
- std::vector<float>{-0.0715754f, -0.93897414f, -0.08168083f, 0.22228277f,
- 0.22228277f, -0.37937912f},
+ std::vector<float>{-0.0715754f, -0.93897414f, -0.08168083f,
+ -0.37937912f},
std::vector<float>{-1.27247968f, -1.03743013f, -0.99194854f,
-0.73911038f, -1.27868911f}};
}

static std::vector<std::vector<int>> NumAnchorRo() {
- return std::vector<std::vector<int>>{std::vector<int>{2, 2, 2, 2, 2, 2},
+ return std::vector<std::vector<int>>{std::vector<int>{2, 2, 2, 2},
std::vector<int>{2, 2, 2, 1, 1}};
}

static std::vector<std::vector<float>> AnchorRo() {
return std::vector<std::vector<float>>{
- std::vector<float>{0, M_PI / 2, 0, M_PI / 2, 0, M_PI / 2, 0, M_PI / 2,
- 0, M_PI / 2, 0, M_PI / 2},
+ std::vector<float>{0, M_PI / 2, 0, M_PI / 2, 0, M_PI / 2, 0, M_PI / 2},
std::vector<float>{0, M_PI / 2, 0, M_PI / 2, 0, M_PI / 2, 0, 0}};
}

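For reference, the new ranges grow the detection area from 100 m x 100 m to 200 m x 140 m at the same 0.25 m pillar resolution, which is what drives the other constants in this hunk. A minimal sketch of the arithmetic, using the same grid formulas that appear in point_pillars.cc below (the remark about kNumIndsForScan is an inference, not something stated in the commit):

    #include <cstdio>

    int main() {
      // New geometry from params.h above.
      constexpr float kPillarXSize = 0.25f, kPillarYSize = 0.25f;
      constexpr float kMinXRange = -100.0f, kMaxXRange = 100.0f;
      constexpr float kMinYRange = -70.0f, kMaxYRange = 70.0f;

      // Same derivation PointPillars uses for its grid.
      constexpr int grid_x =
          static_cast<int>((kMaxXRange - kMinXRange) / kPillarXSize);  // 800 (was 400)
      constexpr int grid_y =
          static_cast<int>((kMaxYRange - kMinYRange) / kPillarYSize);  // 560 (was 400)

      // First anchor set: stride 4 over the full grid with 8 anchors per location,
      // which reproduces the first term of the new kNumAnchor.
      static_assert((grid_x / 4) * (grid_y / 4) * 8 == 200 * 140 * 8, "first kNumAnchor term");

      // A 512-wide scan no longer covers the 800-cell x dimension, which is
      // presumably why kNumIndsForScan was doubled to 1024.
      std::printf("grid: %d x %d cells\n", grid_x, grid_y);
      return 0;
    }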
modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.cc
@@ -54,7 +54,6 @@ const int PointPillars::kNumClass = Params::kNumClass;
const int PointPillars::kMaxNumPillars = Params::kMaxNumPillars;
const int PointPillars::kMaxNumPointsPerPillar = Params::kMaxNumPointsPerPillar;
const int PointPillars::kNumPointFeature = Params::kNumPointFeature;
- const int PointPillars::kPfeOutputSize = kMaxNumPillars * 64;
const int PointPillars::kGridXSize =
static_cast<int>((kMaxXRange - kMinXRange) / kPillarXSize);
const int PointPillars::kGridYSize =
@@ -79,10 +78,10 @@ const std::vector<int> PointPillars::kAnchorRanges{
kGridXSize,
0,
kGridYSize,
- static_cast<int>(kGridXSize * 0.1),
- static_cast<int>(kGridXSize * 0.9),
- static_cast<int>(kGridYSize * 0.1),
- static_cast<int>(kGridYSize * 0.9)};
+ static_cast<int>(kGridXSize * 0.25),
+ static_cast<int>(kGridXSize * 0.75),
+ static_cast<int>(kGridYSize * 0.25),
+ static_cast<int>(kGridYSize * 0.75)};
const std::vector<int> PointPillars::kNumAnchorSets = Params::NumAnchorSets();
const std::vector<std::vector<float>> PointPillars::kAnchorDxSizes =
Params::AnchorDxSizes();
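On the new 800 x 560 grid these bounds place the second (small-object) anchor set over the central half of each axis rather than the former 10 %-90 % band; a quick illustration of the cell and metre values (not code from the commit):

    constexpr int kGridXSize = 800;  // (100 - (-100)) / 0.25, see params.h above
    constexpr int kGridYSize = 560;  // (70 - (-70)) / 0.25
    constexpr int x_begin = static_cast<int>(kGridXSize * 0.25);  // cell 200, i.e. -50 m
    constexpr int x_end   = static_cast<int>(kGridXSize * 0.75);  // cell 600, i.e. +50 m
    constexpr int y_begin = static_cast<int>(kGridYSize * 0.25);  // cell 140, i.e. -35 m
    constexpr int y_end   = static_cast<int>(kGridYSize * 0.75);  // cell 420, i.e. +35 m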
@@ -100,12 +99,12 @@ const std::vector<std::vector<float>> PointPillars::kAnchorRo =
PointPillars::PointPillars(const bool reproduce_result_mode,
const float score_threshold,
const float nms_overlap_threshold,
- const std::string pfe_onnx_file,
+ const std::string pfe_torch_file,
const std::string rpn_onnx_file)
: reproduce_result_mode_(reproduce_result_mode),
score_threshold_(score_threshold),
nms_overlap_threshold_(nms_overlap_threshold),
- pfe_onnx_file_(pfe_onnx_file),
+ pfe_torch_file_(pfe_torch_file),
rpn_onnx_file_(rpn_onnx_file) {
if (reproduce_result_mode_) {
preprocess_points_ptr_.reset(new PreprocessPoints(
@@ -134,6 +133,7 @@ PointPillars::PointPillars(const bool reproduce_result_mode,
kNumBoxCorners, kNumOutputBoxFeature));

DeviceMemoryMalloc();
+ InitTorch();
InitTRT();
InitAnchors();
}
@@ -170,7 +170,6 @@ PointPillars::~PointPillars() {
GPU_CHECK(cudaFree(pfe_buffers_[0]));
GPU_CHECK(cudaFree(pfe_buffers_[1]));
GPU_CHECK(cudaFree(pfe_buffers_[2]));
- GPU_CHECK(cudaFree(pfe_buffers_[3]));

GPU_CHECK(cudaFree(rpn_buffers_[0]));
GPU_CHECK(cudaFree(rpn_buffers_[1]));
@@ -193,12 +192,8 @@ GPU_CHECK(cudaFree(dev_box_for_nms_));
GPU_CHECK(cudaFree(dev_box_for_nms_));
GPU_CHECK(cudaFree(dev_filter_count_));

- pfe_context_->destroy();
rpn_context_->destroy();

- pfe_runtime_->destroy();
rpn_runtime_->destroy();
- pfe_engine_->destroy();
rpn_engine_->destroy();
}

@@ -242,7 +237,6 @@ void PointPillars::DeviceMemoryMalloc() {
kNumPointFeature * sizeof(float)));
GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * sizeof(float)));
GPU_CHECK(cudaMalloc(&pfe_buffers_[2], kMaxNumPillars * 4 * sizeof(float)));
- GPU_CHECK(cudaMalloc(&pfe_buffers_[3], kPfeOutputSize * sizeof(float)));

GPU_CHECK(cudaMalloc(&rpn_buffers_[0], kRpnInputSize * sizeof(float)));
GPU_CHECK(cudaMalloc(&rpn_buffers_[1], kRpnBoxOutputSize * sizeof(float)));
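With the pillar feature extractor now executed through LibTorch, only its three input buffers remain; the former fourth buffer (the ONNX PFE output of kPfeOutputSize = kMaxNumPillars * 64 floats) is no longer allocated because the TorchScript output tensor is handed to the scatter step directly. For scale, the sizes of the remaining allocations (a side calculation, not code from the commit):

    #include <cstddef>

    constexpr std::size_t kMaxNumPillars = 30000;
    constexpr std::size_t kMaxNumPointsPerPillar = 60;
    constexpr std::size_t kNumPointFeature = 5;

    // pfe_buffers_[0]: per-point features of every pillar, ~36 MB.
    constexpr std::size_t pillar_points_bytes =
        kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float);  // 36,000,000
    // pfe_buffers_[1]: point count per pillar.
    constexpr std::size_t point_count_bytes = kMaxNumPillars * sizeof(float);        //    120,000
    // pfe_buffers_[2]: pillar coordinates (4 values per pillar).
    constexpr std::size_t pillar_coors_bytes = kMaxNumPillars * 4 * sizeof(float);   //    480,000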
@@ -449,34 +443,40 @@ void PointPillars::ConvertAnchors2BoxAnchors(float* anchors_px,
}
}

+ void PointPillars::InitTorch() {
+ if (gpu_id_ >= 0) {
+ device_type_ = torch::kCUDA;
+ device_id_ = gpu_id_;
+ } else {
+ device_type_ = torch::kCPU;
+ }
+
+ // Init torch net
+ torch::Device device(device_type_, device_id_);
+ pfe_net_ = torch::jit::load(pfe_torch_file_, device);
+ }

void PointPillars::InitTRT() {
// create a TensorRT model from the onnx model and serialize it to a stream
- nvinfer1::IHostMemory* pfe_trt_model_stream{nullptr};
nvinfer1::IHostMemory* rpn_trt_model_stream{nullptr};
- OnnxToTRTModel(pfe_onnx_file_, &pfe_trt_model_stream);
OnnxToTRTModel(rpn_onnx_file_, &rpn_trt_model_stream);
- if (pfe_trt_model_stream == nullptr || rpn_trt_model_stream == nullptr) {
+ if (rpn_trt_model_stream == nullptr) {
std::cerr << "Failed to load ONNX file " << std::endl;
}

// deserialize the engine
- pfe_runtime_ = nvinfer1::createInferRuntime(g_logger_);
rpn_runtime_ = nvinfer1::createInferRuntime(g_logger_);
- if (pfe_runtime_ == nullptr || rpn_runtime_ == nullptr) {
+ if (rpn_runtime_ == nullptr) {
std::cerr << "Failed to create TensorRT Runtime object." << std::endl;
}
- pfe_engine_ = pfe_runtime_->deserializeCudaEngine(
- pfe_trt_model_stream->data(), pfe_trt_model_stream->size(), nullptr);
rpn_engine_ = rpn_runtime_->deserializeCudaEngine(
rpn_trt_model_stream->data(), rpn_trt_model_stream->size(), nullptr);
- if (pfe_engine_ == nullptr || rpn_engine_ == nullptr) {
+ if (rpn_engine_ == nullptr) {
std::cerr << "Failed to create TensorRT Engine." << std::endl;
}
- pfe_trt_model_stream->destroy();
rpn_trt_model_stream->destroy();
- pfe_context_ = pfe_engine_->createExecutionContext();
rpn_context_ = rpn_engine_->createExecutionContext();
- if (pfe_context_ == nullptr || rpn_context_ == nullptr) {
+ if (rpn_context_ == nullptr) {
std::cerr << "Failed to create TensorRT Execution Context." << std::endl;
}
std::cerr << "Failed to create TensorRT Execution Context." << std::endl;
}
}
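For readers new to the LibTorch calls used here: torch::jit::load deserializes a TorchScript module straight onto the chosen device, and Module::forward takes a vector of IValues. A self-contained sketch with dummy inputs of the shapes this pipeline uses (only the file path and tensor shapes come from the diff; the rest is illustrative):

    #include <torch/script.h>
    #include <torch/torch.h>

    int main() {
      torch::Device device(torch::kCUDA, 0);

      // Load the serialized TorchScript PFE model directly onto the GPU.
      torch::jit::script::Module pfe_net = torch::jit::load(
          "/apollo/modules/perception/production/data/perception/lidar/"
          "models/detection/point_pillars/pfe.pt",
          device);

      // Dummy inputs shaped like the real buffers:
      // [kMaxNumPillars, kMaxNumPointsPerPillar, kNumPointFeature],
      // [kMaxNumPillars], [kMaxNumPillars, 4].
      auto pillar_point_feature  = torch::zeros({30000, 60, 5}, torch::TensorOptions().device(device));
      auto num_points_per_pillar = torch::zeros({30000}, torch::TensorOptions().device(device));
      auto pillar_coors          = torch::zeros({30000, 4}, torch::TensorOptions().device(device));

      torch::NoGradGuard no_grad;
      torch::Tensor pfe_output =
          pfe_net.forward({pillar_point_feature, num_points_per_pillar, pillar_coors}).toTensor();
      // pfe_output.data_ptr<float>() is what DoInference() hands to the scatter kernel.
      return 0;
    }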
@@ -634,13 +634,32 @@ void PointPillars::DoInference(const float* in_points_array,
kMaxNumPillars * 4 * sizeof(float),
cudaMemcpyDeviceToDevice, stream));

- pfe_context_->enqueueV2(pfe_buffers_, stream, nullptr);
+ torch::Tensor tensor_pillar_point_feature = torch::from_blob(pfe_buffers_[0],
+ {kMaxNumPillars, kMaxNumPointsPerPillar, kNumPointFeature},
+ torch::kCUDA);
+ torch::Tensor tensor_num_points_per_pillar = torch::from_blob(
+ pfe_buffers_[1], {kMaxNumPillars}, torch::kCUDA);
+ torch::Tensor tensor_pillar_coors = torch::from_blob(pfe_buffers_[2],
+ {kMaxNumPillars, 4}, torch::kCUDA);
+
+ torch::Device device(device_type_, device_id_);
+ if (device_id_ >= 0) {
+ tensor_pillar_point_feature.to(device);
+ tensor_num_points_per_pillar.to(device);
+ tensor_pillar_coors.to(device);
+ } else {
+ return;
+ }
+
+ auto pfe_output = pfe_net_.forward({tensor_pillar_point_feature,
+ tensor_num_points_per_pillar,
+ tensor_pillar_coors}).toTensor();

GPU_CHECK(
cudaMemset(dev_scattered_feature_, 0, kRpnInputSize * sizeof(float)));
scatter_cuda_ptr_->DoScatterCuda(
host_pillar_count_[0], dev_x_coors_, dev_y_coors_,
- reinterpret_cast<float*>(pfe_buffers_[3]), dev_scattered_feature_);
+ pfe_output.data_ptr<float>(), dev_scattered_feature_);

GPU_CHECK(cudaMemcpyAsync(rpn_buffers_[0], dev_scattered_feature_,
kBatchSize * kRpnInputSize * sizeof(float),
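Two LibTorch details are worth noting in the DoInference changes above: torch::from_blob wraps the existing device pointers without copying (the options argument is what marks them as CUDA memory), and Tensor::to returns a new tensor rather than converting in place, so a caller that actually needs the data on another device has to keep the returned value. A small standalone sketch of the pattern (buffer name and shape are illustrative):

    #include <cuda_runtime.h>
    #include <torch/torch.h>

    void WrapDeviceBuffer() {
      // Illustrative stand-in for one of the pre-filled PFE device buffers.
      float* dev_buffer = nullptr;
      cudaMalloc(&dev_buffer, 30000 * 60 * 5 * sizeof(float));

      // No copy and no ownership transfer; the options tell LibTorch that the
      // pointer is already CUDA memory (dtype defaults to float32 anyway).
      torch::Tensor t = torch::from_blob(
          dev_buffer, {30000, 60, 5},
          torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA, 0));

      // Tensor::to is not in-place; keep the result if a transfer is required.
      torch::Tensor t_cuda = t.to(torch::Device(torch::kCUDA, 0));
      (void)t_cuda;

      cudaFree(dev_buffer);  // 't' must not be used after the buffer is freed
    }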
modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.h
@@ -53,6 +53,9 @@
#include "NvInfer.h"
#include "NvOnnxParser.h"

+ #include "torch/script.h"
+ #include "torch/torch.h"

// headers in local files
#include "modules/perception/lidar/lib/detection/lidar_point_pillars/anchor_mask_cuda.h"
#include "modules/perception/lidar/lib/detection/lidar_point_pillars/common.h"
@@ -115,7 +118,6 @@ class PointPillars {
static const int kMaxNumPillars;
static const int kMaxNumPointsPerPillar;
static const int kNumPointFeature;
- static const int kPfeOutputSize;
static const int kGridXSize;
static const int kGridYSize;
static const int kGridZSize;
@@ -145,7 +147,7 @@ class PointPillars {
const bool reproduce_result_mode_;
const float score_threshold_;
const float nms_overlap_threshold_;
- const std::string pfe_onnx_file_;
+ const std::string pfe_torch_file_;
const std::string rpn_onnx_file_;
// end initializer list

@@ -180,7 +182,7 @@
float* dev_box_anchors_max_y_;
int* dev_anchor_mask_;

- void* pfe_buffers_[4];
+ void* pfe_buffers_[3];
void* rpn_buffers_[4];

float* dev_scattered_feature_;
Expand All @@ -206,11 +208,12 @@ class PointPillars {
std::unique_ptr<PostprocessCuda> postprocess_cuda_ptr_;

Logger g_logger_;
- nvinfer1::IExecutionContext* pfe_context_;
+ int device_id_ = -1;
+ int gpu_id_ = 0;
+ torch::DeviceType device_type_;
+ torch::jit::script::Module pfe_net_;
nvinfer1::IExecutionContext* rpn_context_;
- nvinfer1::IRuntime* pfe_runtime_;
nvinfer1::IRuntime* rpn_runtime_;
- nvinfer1::ICudaEngine* pfe_engine_;
nvinfer1::ICudaEngine* rpn_engine_;

/**
@@ -225,6 +228,12 @@
*/
void InitAnchors();

+ /**
+ * @brief Initializing LibTorch net
+ * @details Called in the constructor
+ */
+ void InitTorch();

/**
* @brief Initializing TensorRT instances
* @details Called in the constructor
@@ -323,13 +332,13 @@
* reproducible for the same input
* @param[in] score_threshold Score threshold for filtering output
* @param[in] nms_overlap_threshold IOU threshold for NMS
- * @param[in] pfe_onnx_file Pillar Feature Extractor ONNX file path
+ * @param[in] pfe_torch_file Pillar Feature Extractor Torch file path
* @param[in] rpn_onnx_file Region Proposal Network ONNX file path
* @details Variables could be changed through point_pillars_detection
*/
PointPillars(const bool reproduce_result_mode, const float score_threshold,
const float nms_overlap_threshold,
- const std::string pfe_onnx_file,
+ const std::string pfe_torch_file,
const std::string rpn_onnx_file);
~PointPillars();

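For context, a caller now passes the TorchScript path where the ONNX path used to go, while the RPN still runs through TensorRT. A minimal construction sketch using the gflags updated earlier in this commit (the namespace, the header include path, and the threshold values are assumptions, not taken from the diff):

    #include <memory>

    #include "modules/perception/common/perception_gflags.h"
    #include "modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.h"

    // Illustrative wiring only; thresholds are placeholders.
    std::unique_ptr<apollo::perception::lidar::PointPillars> MakeDetector() {
      return std::make_unique<apollo::perception::lidar::PointPillars>(
          /*reproduce_result_mode=*/false,
          /*score_threshold=*/0.5f,
          /*nms_overlap_threshold=*/0.5f,
          FLAGS_pfe_torch_file,  // TorchScript pillar feature extractor (new)
          FLAGS_rpn_onnx_file);  // region proposal network, still ONNX + TensorRT
    }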
(Diffs for the remaining changed files were not loaded.)
