diff --git a/modules/perception/common/perception_gflags.cc b/modules/perception/common/perception_gflags.cc
index 6350d2ca4f4..009812fd4c8 100644
--- a/modules/perception/common/perception_gflags.cc
+++ b/modules/perception/common/perception_gflags.cc
@@ -39,10 +39,10 @@ DEFINE_string(work_root, "", "Project work root direcotry.");
 
 // lidar_point_pillars
 DEFINE_int32(gpu_id, 0, "The id of gpu used for inference.");
-DEFINE_string(pfe_torch_file,
+DEFINE_string(pfe_onnx_file,
               "/apollo/modules/perception/production/data/perception/lidar/"
-              "models/detection/point_pillars/pfe.pt",
-              "The path of pillars feature extractor torch file.");
+              "models/detection/point_pillars/pfe.onnx",
+              "The path of PFE onnx file.");
 DEFINE_string(rpn_onnx_file,
               "/apollo/modules/perception/production/data/perception/lidar/"
               "models/detection/point_pillars/rpn.onnx",
diff --git a/modules/perception/common/perception_gflags.h b/modules/perception/common/perception_gflags.h
index f3639cc50f3..5bcd08f9b3b 100644
--- a/modules/perception/common/perception_gflags.h
+++ b/modules/perception/common/perception_gflags.h
@@ -33,7 +33,7 @@ DECLARE_string(work_root);
 
 // lidar_point_pillars
 DECLARE_int32(gpu_id);
-DECLARE_string(pfe_torch_file);
+DECLARE_string(pfe_onnx_file);
 DECLARE_string(rpn_onnx_file);
 DECLARE_double(normalizing_factor);
 DECLARE_int32(num_point_feature);
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/BUILD b/modules/perception/lidar/lib/detection/lidar_point_pillars/BUILD
index 696fcae52b6..4cf57f39b93 100644
--- a/modules/perception/lidar/lib/detection/lidar_point_pillars/BUILD
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/BUILD
@@ -22,12 +22,12 @@ cc_library(
         ":common",
         ":nms_cuda",
         ":params",
+        ":pfe_cuda",
         ":postprocess_cuda",
         ":preprocess_points",
         ":scatter_cuda",
         "//cyber/common",
         "//modules/perception/common:perception_gflags",
-        "//third_party:libtorch",
         "@local_config_cuda//cuda:cudart",
         "@local_config_tensorrt//:tensorrt",
     ],
@@ -93,6 +93,16 @@ cuda_library(
     ],
 )
 
+cuda_library(
+    name = "pfe_cuda",
+    srcs = ["pfe_cuda.cu"],
+    hdrs = ["pfe_cuda.h"],
+    deps = [
+        ":common",
+        "@local_config_cuda//cuda:cudart",
+    ],
+)
+
 cuda_library(
     name = "postprocess_cuda",
     srcs = ["postprocess_cuda.cu"],
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/params.h b/modules/perception/lidar/lib/detection/lidar_point_pillars/params.h
index c1532fbab80..7bb042fb58b 100644
--- a/modules/perception/lidar/lib/detection/lidar_point_pillars/params.h
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/params.h
@@ -37,6 +37,7 @@ class Params {
   static constexpr int kMaxNumPillars = 30000;
   static constexpr int kMaxNumPointsPerPillar = 60;
   static constexpr int kNumPointFeature = 5;  // x, y, z, i, delta of time
+  static constexpr int kNumGatherPointFeature = 9;
   static constexpr int kNumAnchor = 200 * 140 * 8 + 220 * 140 * 8;
   static constexpr int kNumOutputBoxFeature = 7;
   static constexpr int kBatchSize = 1;
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/pfe_cuda.cu b/modules/perception/lidar/lib/detection/lidar_point_pillars/pfe_cuda.cu
new file mode 100644
index 00000000000..7fafb966e44
--- /dev/null
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/pfe_cuda.cu
@@ -0,0 +1,99 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+#include "modules/perception/lidar/lib/detection/lidar_point_pillars/pfe_cuda.h"
+
+namespace apollo {
+namespace perception {
+namespace lidar {
+
+__global__ void gather_point_feature_kernel(
+    float* dev_pillar_point_feature, float* dev_num_points_per_pillar,
+    float* dev_pillar_coors, float* dev_pfe_gather_feature,
+    int max_num_points_per_pillar, int num_point_feature,
+    int num_gather_point_feature, float pillar_x_size, float pillar_y_size,
+    float min_x_range, float min_y_range) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  float point_mean[3];
+  for (int point_id = 0; point_id < dev_num_points_per_pillar[tid];
+       ++point_id) {
+    for (int i = 0; i < 3; ++i) {
+      int feature_id = i + point_id * num_point_feature +
+                       tid * max_num_points_per_pillar * num_point_feature;
+      point_mean[i] += dev_pillar_point_feature[feature_id];
+    }
+  }
+  for (int i = 0; i < 3; ++i) {
+    point_mean[i] = point_mean[i] / dev_num_points_per_pillar[tid];
+  }
+
+  float x_offset = pillar_x_size / 2 + min_x_range;
+  float y_offset = pillar_y_size / 2 + min_y_range;
+  for (int point_id = 0; point_id < dev_num_points_per_pillar[tid];
+       ++point_id) {
+    int point_head_id = point_id * num_point_feature +
+                        tid * max_num_points_per_pillar * num_point_feature;
+    int pfe_gather_head_id = point_id * num_gather_point_feature +
+        tid * max_num_points_per_pillar * num_gather_point_feature;
+    float point_x = dev_pillar_point_feature[point_head_id];
+    float point_y = dev_pillar_point_feature[point_head_id + 1];
+    dev_pfe_gather_feature[pfe_gather_head_id] =
+        sqrt(point_x * point_x + point_y * point_y);
+    for (int i = 2; i < num_point_feature; ++i) {
+      dev_pfe_gather_feature[pfe_gather_head_id + i - 1] =
+          dev_pillar_point_feature[point_head_id + i];
+    }
+    for (int i = 4; i < 7; ++i) {
+      dev_pfe_gather_feature[pfe_gather_head_id + i] =
+          dev_pillar_point_feature[point_head_id + i - 4] - point_mean[i - 4];
+    }
+    dev_pfe_gather_feature[pfe_gather_head_id + 7] =
+        dev_pillar_point_feature[point_head_id] -
+        (dev_pillar_coors[tid * 4 + 3] * pillar_x_size + x_offset);
+    dev_pfe_gather_feature[pfe_gather_head_id + 8] =
+        dev_pillar_point_feature[point_head_id + 1] -
+        (dev_pillar_coors[tid * 4 + 2] * pillar_y_size + y_offset);
+  }
+}
+
+PfeCuda::PfeCuda(int max_num_pillars, int max_num_points_per_pillar,
+                 int num_point_feature, int num_gather_point_feature,
+                 float pillar_x_size, float pillar_y_size, float min_x_range,
+                 float min_y_range, int num_threads)
+    : max_num_pillars_(max_num_pillars),
+      max_num_points_per_pillar_(max_num_points_per_pillar),
+      num_point_feature_(num_point_feature),
+      num_gather_point_feature_(num_gather_point_feature),
+      pillar_x_size_(pillar_x_size),
+      pillar_y_size_(pillar_y_size),
+      min_x_range_(min_x_range),
+      min_y_range_(min_y_range),
+      num_threads_(num_threads) {}
+
+void PfeCuda::GatherPointFeature(float* dev_pillar_point_feature,
+                                 float* dev_num_points_per_pillar,
+                                 float* dev_pillar_coors,
+                                 float* dev_pfe_gather_feature) {
+  const int num_blocks = DIVUP(max_num_pillars_, num_threads_);
+  gather_point_feature_kernel<<<num_blocks, num_threads_>>>(
+      dev_pillar_point_feature, dev_num_points_per_pillar, dev_pillar_coors,
+      dev_pfe_gather_feature, max_num_points_per_pillar_, num_point_feature_,
+      num_gather_point_feature_, pillar_x_size_, pillar_y_size_, min_x_range_,
+      min_y_range_);
+}
+}  // namespace lidar
+}  // namespace perception
+}  // namespace apollo
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/pfe_cuda.h b/modules/perception/lidar/lib/detection/lidar_point_pillars/pfe_cuda.h
new file mode 100644
index 00000000000..48de3eecd60
--- /dev/null
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/pfe_cuda.h
@@ -0,0 +1,51 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+#pragma once
+
+#include "modules/perception/lidar/lib/detection/lidar_point_pillars/common.h"
+
+namespace apollo {
+namespace perception {
+namespace lidar {
+
+class PfeCuda {
+ public:
+  PfeCuda(int max_num_pillars, int max_num_points_per_pillar,
+          int num_point_feature, int num_gather_point_feature,
+          float pillar_x_size, float pillar_y_size, float min_x_range,
+          float min_y_range, int num_threads);
+  ~PfeCuda() = default;
+
+  void GatherPointFeature(float* dev_pillar_point_feature,
+                          float* dev_num_points_per_pillar,
+                          float* dev_pillar_coors,
+                          float* dev_pfe_gather_feature);
+
+ private:
+  int max_num_pillars_;
+  int max_num_points_per_pillar_;
+  int num_point_feature_;
+  int num_gather_point_feature_;
+  float pillar_x_size_;
+  float pillar_y_size_;
+  float min_x_range_;
+  float min_y_range_;
+  int num_threads_;
+};
+}  // namespace lidar
+}  // namespace perception
+}  // namespace apollo
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.cc b/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.cc
index 3d3cd6c16bf..37a8f718f49 100644
--- a/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.cc
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.cc
@@ -29,16 +29,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.h"
 
-// headers in STL
 #include <chrono>
 #include <iostream>
 
 #include "cyber/common/log.h"
 
-// headers in local files
-#include "modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.h"
-
 namespace apollo {
 namespace perception {
 namespace lidar {
@@ -56,6 +53,7 @@ const int PointPillars::kNumClass = Params::kNumClass;
 const int PointPillars::kMaxNumPillars = Params::kMaxNumPillars;
 const int PointPillars::kMaxNumPointsPerPillar = Params::kMaxNumPointsPerPillar;
 const int PointPillars::kNumPointFeature = Params::kNumPointFeature;
+const int PointPillars::kNumGatherPointFeature = Params::kNumGatherPointFeature;
 const int PointPillars::kGridXSize =
     static_cast<int>((kMaxXRange - kMinXRange) / kPillarXSize);
 const int PointPillars::kGridYSize =
@@ -101,13 +99,15 @@ const std::vector<std::vector<float>> PointPillars::kAnchorRo =
 PointPillars::PointPillars(const bool reproduce_result_mode,
                            const float score_threshold,
                            const float nms_overlap_threshold,
-                           const std::string pfe_torch_file,
+                           const std::string pfe_onnx_file,
                            const std::string rpn_onnx_file)
     : reproduce_result_mode_(reproduce_result_mode),
       score_threshold_(score_threshold),
       nms_overlap_threshold_(nms_overlap_threshold),
-      pfe_torch_file_(pfe_torch_file),
-      rpn_onnx_file_(rpn_onnx_file) {
+      pfe_onnx_file_(pfe_onnx_file),
+      rpn_onnx_file_(rpn_onnx_file),
+      pfe_engine_(nullptr),
+      rpn_engine_(nullptr) {
   if (reproduce_result_mode_) {
     preprocess_points_ptr_.reset(new PreprocessPoints(
         kMaxNumPillars, kMaxNumPointsPerPillar, kNumPointFeature, kGridXSize,
@@ -124,8 +124,12 @@ PointPillars::PointPillars(const bool reproduce_result_mode,
       kNumThreads, kNumIndsForScan, kNumAnchor, kMinXRange, kMinYRange,
       kPillarXSize, kPillarYSize, kGridXSize, kGridYSize));
 
-  scatter_cuda_ptr_.reset(
-      new ScatterCuda(kNumThreads, kMaxNumPillars, kGridXSize, kGridYSize));
+  pfe_cuda_ptr_.reset(new PfeCuda(kMaxNumPillars, kMaxNumPointsPerPillar,
+                                  kNumPointFeature, kNumGatherPointFeature,
+                                  kPillarXSize, kPillarYSize, kMinXRange,
+                                  kMinYRange, kNumThreads));
+
+  scatter_cuda_ptr_.reset(new ScatterCuda(kNumThreads, kGridXSize, kGridYSize));
 
   const float float_min = std::numeric_limits<float>::lowest();
   const float float_max = std::numeric_limits<float>::max();
@@ -135,7 +139,6 @@ PointPillars::PointPillars(const bool reproduce_result_mode,
                           kNumBoxCorners, kNumOutputBoxFeature));
 
   DeviceMemoryMalloc();
-  InitTorch();
   InitTRT();
   InitAnchors();
 }
@@ -169,9 +172,9 @@ PointPillars::~PointPillars() {
   GPU_CHECK(cudaFree(dev_box_anchors_max_y_));
   GPU_CHECK(cudaFree(dev_anchor_mask_));
 
+  GPU_CHECK(cudaFree(dev_pfe_gather_feature_));
   GPU_CHECK(cudaFree(pfe_buffers_[0]));
   GPU_CHECK(cudaFree(pfe_buffers_[1]));
-  GPU_CHECK(cudaFree(pfe_buffers_[2]));
 
   GPU_CHECK(cudaFree(rpn_buffers_[0]));
   GPU_CHECK(cudaFree(rpn_buffers_[1]));
@@ -194,8 +197,9 @@ PointPillars::~PointPillars() {
   GPU_CHECK(cudaFree(dev_box_for_nms_));
   GPU_CHECK(cudaFree(dev_filter_count_));
 
+  pfe_context_->destroy();
   rpn_context_->destroy();
-  rpn_runtime_->destroy();
+  pfe_engine_->destroy();
   rpn_engine_->destroy();
 }
 
@@ -234,11 +238,13 @@ void PointPillars::DeviceMemoryMalloc() {
 
   // for trt inference
   // create GPU buffers and a stream
+  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pfe_gather_feature_),
+                       kMaxNumPillars * kMaxNumPointsPerPillar *
+                           kNumGatherPointFeature * sizeof(float)));
   GPU_CHECK(
       cudaMalloc(&pfe_buffers_[0], kMaxNumPillars * kMaxNumPointsPerPillar *
-                                       kNumPointFeature * sizeof(float)));
-  GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * sizeof(float)));
-  GPU_CHECK(cudaMalloc(&pfe_buffers_[2], kMaxNumPillars * 4 * sizeof(float)));
+                                       kNumGatherPointFeature * sizeof(float)));
+  GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * 64 * sizeof(float)));
 
   GPU_CHECK(cudaMalloc(&rpn_buffers_[0], kRpnInputSize * sizeof(float)));
   GPU_CHECK(cudaMalloc(&rpn_buffers_[1], kRpnBoxOutputSize * sizeof(float)));
@@ -445,48 +451,25 @@ void PointPillars::ConvertAnchors2BoxAnchors(float* anchors_px,
   }
 }
 
-void PointPillars::InitTorch() {
-  if (gpu_id_ >= 0) {
-    device_type_ = torch::kCUDA;
-    device_id_ = gpu_id_;
-  } else {
-    device_type_ = torch::kCPU;
-  }
-
-  // Init torch net
-  torch::Device device(device_type_, device_id_);
-  pfe_net_ = torch::jit::load(pfe_torch_file_, device);
-}
-
 void PointPillars::InitTRT() {
-  // create a TensorRT model from the onnx model and serialize it to a stream
-  nvinfer1::IHostMemory* rpn_trt_model_stream{nullptr};
-  OnnxToTRTModel(rpn_onnx_file_, &rpn_trt_model_stream);
-  if (rpn_trt_model_stream == nullptr) {
-    std::cerr << "Failed to load ONNX file " << std::endl;
+  // create a TensorRT model from the onnx model and load it into an engine
+  OnnxToTRTModel(pfe_onnx_file_, &pfe_engine_);
+  OnnxToTRTModel(rpn_onnx_file_, &rpn_engine_);
+  if (pfe_engine_ == nullptr || rpn_engine_ == nullptr) {
+    AERROR << "Failed to load ONNX file.";
   }
 
-  // deserialize the engine
-  rpn_runtime_ = nvinfer1::createInferRuntime(g_logger_);
-  if (rpn_runtime_ == nullptr) {
-    std::cerr << "Failed to create TensorRT Runtime object." << std::endl;
-  }
-  rpn_engine_ = rpn_runtime_->deserializeCudaEngine(
-      rpn_trt_model_stream->data(), rpn_trt_model_stream->size(), nullptr);
-  if (rpn_engine_ == nullptr) {
-    std::cerr << "Failed to create TensorRT Engine." << std::endl;
-  }
-  rpn_trt_model_stream->destroy();
+  // create execution context from the engine
+  pfe_context_ = pfe_engine_->createExecutionContext();
   rpn_context_ = rpn_engine_->createExecutionContext();
-  if (rpn_context_ == nullptr) {
-    std::cerr << "Failed to create TensorRT Execution Context." << std::endl;
+  if (pfe_context_ == nullptr || rpn_context_ == nullptr) {
+    AERROR << "Failed to create TensorRT Execution Context.";
   }
 }
 
 void PointPillars::OnnxToTRTModel(
     const std::string& model_file,  // name of the onnx model
-    nvinfer1::IHostMemory**
-        trt_model_stream) {  // output buffer for the TensorRT model
+    nvinfer1::ICudaEngine** engine_ptr) {
   int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
 
   // create the builder
@@ -497,6 +480,7 @@ void PointPillars::OnnxToTRTModel(
   nvinfer1::INetworkDefinition* network =
       builder->createNetworkV2(explicit_batch);
 
+  // parse onnx model
   auto parser = nvonnxparser::createParser(*network, g_logger_);
   if (!parser->parseFromFile(model_file.c_str(), verbosity)) {
     std::string msg("failed to parse onnx file");
@@ -511,11 +495,8 @@ void PointPillars::OnnxToTRTModel(
   nvinfer1::ICudaEngine* engine =
       builder->buildEngineWithConfig(*network, *config);
 
+  *engine_ptr = engine;
   parser->destroy();
-
-  // serialize the engine, then close everything down
-  *trt_model_stream = engine->serialize();
-  engine->destroy();
   network->destroy();
   config->destroy();
   builder->destroy();
@@ -616,11 +597,6 @@ void PointPillars::DoInference(const float* in_points_array,
                                const int in_num_points,
                                std::vector<float>* out_detections,
                                std::vector<int>* out_labels) {
-  if (device_id_ < 0) {
-    AERROR << "Torch is not using GPU!";
-    return;
-  }
-
   Preprocess(in_points_array, in_num_points);
 
   anchor_mask_cuda_ptr_->DoAnchorMaskCuda(
@@ -630,39 +606,24 @@ void PointPillars::DoInference(const float* in_points_array,
 
   cudaStream_t stream;
   GPU_CHECK(cudaStreamCreate(&stream));
-  GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[0], dev_pillar_point_feature_,
+
+  GPU_CHECK(cudaMemset(dev_pfe_gather_feature_, 0,
+                       kMaxNumPillars * kMaxNumPointsPerPillar *
+                           kNumGatherPointFeature * sizeof(float)));
+  pfe_cuda_ptr_->GatherPointFeature(dev_pillar_point_feature_,
+                                    dev_num_points_per_pillar_,
+                                    dev_pillar_coors_, dev_pfe_gather_feature_);
+  GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[0], dev_pfe_gather_feature_,
                             kMaxNumPillars * kMaxNumPointsPerPillar *
-                                kNumPointFeature * sizeof(float),
+                                kNumGatherPointFeature * sizeof(float),
                             cudaMemcpyDeviceToDevice, stream));
-  GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[1], dev_num_points_per_pillar_,
-                            kMaxNumPillars * sizeof(float),
-                            cudaMemcpyDeviceToDevice, stream));
-  GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[2], dev_pillar_coors_,
-                            kMaxNumPillars * 4 * sizeof(float),
-                            cudaMemcpyDeviceToDevice, stream));
-
-  torch::Tensor tensor_pillar_point_feature = torch::from_blob(pfe_buffers_[0],
-      {kMaxNumPillars, kMaxNumPointsPerPillar, kNumPointFeature},
-      torch::kCUDA);
-  torch::Tensor tensor_num_points_per_pillar = torch::from_blob(
-      pfe_buffers_[1], {kMaxNumPillars}, torch::kCUDA);
-  torch::Tensor tensor_pillar_coors = torch::from_blob(pfe_buffers_[2],
-      {kMaxNumPillars, 4}, torch::kCUDA);
-
-  torch::Device device(device_type_, device_id_);
-  tensor_pillar_point_feature.to(device);
-  tensor_num_points_per_pillar.to(device);
-  tensor_pillar_coors.to(device);
-
-  auto pfe_output = pfe_net_.forward({tensor_pillar_point_feature,
-                                      tensor_num_points_per_pillar,
-                                      tensor_pillar_coors}).toTensor();
+  pfe_context_->enqueueV2(pfe_buffers_, stream, nullptr);
 
   GPU_CHECK(
       cudaMemset(dev_scattered_feature_, 0, kRpnInputSize * sizeof(float)));
   scatter_cuda_ptr_->DoScatterCuda(
       host_pillar_count_[0], dev_x_coors_, dev_y_coors_,
-      pfe_output.data_ptr<float>(), dev_scattered_feature_);
+      reinterpret_cast<float*>(pfe_buffers_[1]), dev_scattered_feature_);
 
   GPU_CHECK(cudaMemcpyAsync(rpn_buffers_[0], dev_scattered_feature_,
                             kBatchSize * kRpnInputSize * sizeof(float),
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.h b/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.h
index d22a089c590..9b79c080079 100644
--- a/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.h
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars.h
@@ -53,13 +53,11 @@
 #include "NvInfer.h"
 #include "NvOnnxParser.h"
 
-#include "torch/script.h"
-#include "torch/torch.h"
-
 // headers in local files
 #include "modules/perception/lidar/lib/detection/lidar_point_pillars/anchor_mask_cuda.h"
 #include "modules/perception/lidar/lib/detection/lidar_point_pillars/common.h"
 #include "modules/perception/lidar/lib/detection/lidar_point_pillars/params.h"
+#include "modules/perception/lidar/lib/detection/lidar_point_pillars/pfe_cuda.h"
 #include "modules/perception/lidar/lib/detection/lidar_point_pillars/postprocess_cuda.h"
 #include "modules/perception/lidar/lib/detection/lidar_point_pillars/preprocess_points.h"
 #include "modules/perception/lidar/lib/detection/lidar_point_pillars/preprocess_points_cuda.h"
@@ -118,6 +116,7 @@ class PointPillars {
   static const int kMaxNumPillars;
   static const int kMaxNumPointsPerPillar;
   static const int kNumPointFeature;
+  static const int kNumGatherPointFeature;
   static const int kGridXSize;
   static const int kGridYSize;
   static const int kGridZSize;
@@ -147,7 +146,7 @@ class PointPillars {
   const bool reproduce_result_mode_;
   const float score_threshold_;
   const float nms_overlap_threshold_;
-  const std::string pfe_torch_file_;
+  const std::string pfe_onnx_file_;
   const std::string rpn_onnx_file_;
   // end initializer list
 
@@ -182,7 +181,9 @@ class PointPillars {
   float* dev_box_anchors_max_y_;
   int* dev_anchor_mask_;
 
-  void* pfe_buffers_[3];
+  float* dev_pfe_gather_feature_;
+  void* pfe_buffers_[2];
+
   void* rpn_buffers_[4];
 
   float* dev_scattered_feature_;
@@ -204,17 +205,15 @@ class PointPillars {
   std::unique_ptr<PreprocessPoints> preprocess_points_ptr_;
   std::unique_ptr<PreprocessPointsCuda> preprocess_points_cuda_ptr_;
   std::unique_ptr<AnchorMaskCuda> anchor_mask_cuda_ptr_;
+  std::unique_ptr<PfeCuda> pfe_cuda_ptr_;
   std::unique_ptr<ScatterCuda> scatter_cuda_ptr_;
   std::unique_ptr<PostprocessCuda> postprocess_cuda_ptr_;
 
   Logger g_logger_;
-  int device_id_ = -1;
-  int gpu_id_ = 0;
-  torch::DeviceType device_type_;
-  torch::jit::script::Module pfe_net_;
-  nvinfer1::IExecutionContext* rpn_context_;
-  nvinfer1::IRuntime* rpn_runtime_;
+  nvinfer1::ICudaEngine* pfe_engine_;
   nvinfer1::ICudaEngine* rpn_engine_;
+  nvinfer1::IExecutionContext* pfe_context_;
+  nvinfer1::IExecutionContext* rpn_context_;
 
   /**
    * @brief Memory allocation for device memory
@@ -228,12 +227,6 @@ class PointPillars {
    */
   void InitAnchors();
 
-  /**
-   * @brief Initializing LibTorch net
-   * @details Called in the constructor
-   */
-  void InitTorch();
-
   /**
    * @brief Initializing TensorRT instances
    * @details Called in the constructor
@@ -266,11 +259,11 @@ class PointPillars {
   /**
    * @brief Convert ONNX to TensorRT model
    * @param[in] model_file ONNX model file path
-   * @param[out] trt_model_stream TensorRT model made out of ONNX model
+   * @param[out] engine_ptr TensorRT model engine made out of ONNX model
    * @details Load ONNX model, and convert it to TensorRT model
    */
   void OnnxToTRTModel(const std::string& model_file,
-                      nvinfer1::IHostMemory** trt_model_stream);
+                      nvinfer1::ICudaEngine** engine_ptr);
 
   /**
    * @brief Preproces points
@@ -332,13 +325,13 @@ class PointPillars {
    * reproducible for the same input
    * @param[in] score_threshold Score threshold for filtering output
    * @param[in] nms_overlap_threshold IOU threshold for NMS
-   * @param[in] pfe_torch_file Pillar Feature Extractor Torch file path
+   * @param[in] pfe_onnx_file Pillar Feature Extractor ONNX file path
    * @param[in] rpn_onnx_file Region Proposal Network ONNX file path
    * @details Variables could be changed through point_pillars_detection
    */
   PointPillars(const bool reproduce_result_mode, const float score_threshold,
                const float nms_overlap_threshold,
-               const std::string pfe_torch_file,
+               const std::string pfe_onnx_file,
                const std::string rpn_onnx_file);
   ~PointPillars();
 
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars_detection.cc b/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars_detection.cc
index 0d38c00c534..8181802c2dd 100644
--- a/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars_detection.cc
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars_detection.cc
@@ -22,7 +22,6 @@
 #include <cuda_runtime_api.h>
 
 #include "cyber/common/log.h"
-
 #include "modules/perception/base/object_pool_types.h"
 #include "modules/perception/base/point_cloud_util.h"
 #include "modules/perception/common/perception_gflags.h"
@@ -55,7 +54,7 @@ PointPillarsDetection::PointPillarsDetection()
 bool PointPillarsDetection::Init(const DetectionInitOptions& options) {
   point_pillars_ptr_.reset(new PointPillars(
       FLAGS_reproduce_result_mode, FLAGS_score_threshold,
-      FLAGS_nms_overlap_threshold, FLAGS_pfe_torch_file, FLAGS_rpn_onnx_file));
+      FLAGS_nms_overlap_threshold, FLAGS_pfe_onnx_file, FLAGS_rpn_onnx_file));
   return true;
 }
 
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars_test.cc b/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars_test.cc
index c9a6dee0f45..e775e0eb5c2 100644
--- a/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars_test.cc
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/point_pillars_test.cc
@@ -150,7 +150,7 @@ TestClass::TestClass()
 
   point_pillars_ptr_.reset(new PointPillars(
       reproduce_result_mode, score_threshold, nms_overlap_threshold,
-      FLAGS_pfe_torch_file, FLAGS_rpn_onnx_file));
+      FLAGS_pfe_onnx_file, FLAGS_rpn_onnx_file));
 }
 
 TestClass::TestClass(const int num_class, const int max_num_pillars,
@@ -192,7 +192,7 @@ TestClass::TestClass(const int num_class, const int max_num_pillars,
 
   point_pillars_ptr_.reset(new PointPillars(
       reproduce_result_mode, score_threshold, nms_overlap_threshold,
-      FLAGS_pfe_torch_file, FLAGS_rpn_onnx_file));
+      FLAGS_pfe_onnx_file, FLAGS_rpn_onnx_file));
 }
 
 void TestClass::Preprocess(const float* in_points_array, int in_num_points,
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/scatter_cuda.cu b/modules/perception/lidar/lib/detection/lidar_point_pillars/scatter_cuda.cu
index b2c8d349bb1..8264a9b36c6 100644
--- a/modules/perception/lidar/lib/detection/lidar_point_pillars/scatter_cuda.cu
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/scatter_cuda.cu
@@ -38,31 +38,29 @@ namespace perception {
 namespace lidar {
 
 __global__ void scatter_kernel(int *x_coors, int *y_coors, float *pfe_output,
-                               float *scattered_feature,
-                               const int max_num_pillars_,
-                               const int grid_x_size, const int grid_y_size) {
+                               float *scattered_feature, const int grid_x_size,
+                               const int grid_y_size) {
   int i_pillar = blockIdx.x;
   int i_feature = threadIdx.x;
   int x_ind = x_coors[i_pillar];
   int y_ind = y_coors[i_pillar];
-  float feature = pfe_output[i_feature * max_num_pillars_ + i_pillar];
+  float feature = pfe_output[i_pillar * 64 + i_feature];
   scattered_feature[i_feature * grid_y_size * grid_x_size +
                     y_ind * grid_x_size + x_ind] = feature;
 }
 
-ScatterCuda::ScatterCuda(const int num_threads, const int max_num_pillars,
-                         const int grid_x_size, const int grid_y_size)
+ScatterCuda::ScatterCuda(const int num_threads, const int grid_x_size,
+                         const int grid_y_size)
     : num_threads_(num_threads),
-      max_num_pillars_(max_num_pillars),
       grid_x_size_(grid_x_size),
       grid_y_size_(grid_y_size) {}
 
 void ScatterCuda::DoScatterCuda(const int pillar_count, int *x_coors,
                                 int *y_coors, float *pfe_output,
                                 float *scattered_feature) {
-  scatter_kernel<<<pillar_count, num_threads_>>>(
-      x_coors, y_coors, pfe_output, scattered_feature, max_num_pillars_,
-      grid_x_size_, grid_y_size_);
+  scatter_kernel<<<pillar_count, num_threads_>>>(x_coors, y_coors, pfe_output,
+                                                 scattered_feature,
+                                                 grid_x_size_, grid_y_size_);
 }
 
 }  // namespace lidar
diff --git a/modules/perception/lidar/lib/detection/lidar_point_pillars/scatter_cuda.h b/modules/perception/lidar/lib/detection/lidar_point_pillars/scatter_cuda.h
index adc36e4cd4c..f6dee1f931a 100644
--- a/modules/perception/lidar/lib/detection/lidar_point_pillars/scatter_cuda.h
+++ b/modules/perception/lidar/lib/detection/lidar_point_pillars/scatter_cuda.h
@@ -46,7 +46,6 @@ namespace lidar {
 class ScatterCuda {
  private:
   const int num_threads_;
-  const int max_num_pillars_;
   const int grid_x_size_;
   const int grid_y_size_;
 
@@ -54,13 +53,12 @@ class ScatterCuda {
   /**
    * @brief Constructor
    * @param[in] num_threads The number of threads to launch cuda kernel
-   * @param[in] max_num_pillars Maximum number of pillars
    * @param[in] grid_x_size Number of pillars in x-coordinate
    * @param[in] grid_y_size Number of pillars in y-coordinate
    * @details Captital variables never change after the compile
    */
-  ScatterCuda(const int num_threads, const int max_num_pillars,
-              const int grid_x_size, const int grid_y_size);
+  ScatterCuda(const int num_threads, const int grid_x_size,
+              const int grid_y_size);
 
   /**
    * @brief Call scatter cuda kernel
diff --git a/modules/perception/production/data/perception/lidar/models/detection/point_pillars/pfe.onnx b/modules/perception/production/data/perception/lidar/models/detection/point_pillars/pfe.onnx
new file mode 100644
index 00000000000..dc0027294d9
Binary files /dev/null and b/modules/perception/production/data/perception/lidar/models/detection/point_pillars/pfe.onnx differ
diff --git a/modules/perception/production/data/perception/lidar/models/detection/point_pillars/pfe.pt b/modules/perception/production/data/perception/lidar/models/detection/point_pillars/pfe.pt
deleted file mode 100644
index b813ccad2ce..00000000000
Binary files a/modules/perception/production/data/perception/lidar/models/detection/point_pillars/pfe.pt and /dev/null differ