Trt10 v5 seg #1556

Merged
15 commits merged on Jul 29, 2024
6 changes: 6 additions & 0 deletions yolov5/yolov5_trt10/CMakeLists.txt
@@ -45,3 +45,9 @@ target_link_libraries(yolov5_det nvinfer)
target_link_libraries(yolov5_det cudart)
target_link_libraries(yolov5_det myplugins)
target_link_libraries(yolov5_det ${OpenCV_LIBS})

add_executable(yolov5_seg yolov5_seg.cpp ${SRCS})
target_link_libraries(yolov5_seg nvinfer)
target_link_libraries(yolov5_seg cudart)
target_link_libraries(yolov5_seg myplugins)
target_link_libraries(yolov5_seg ${OpenCV_LIBS})
27 changes: 26 additions & 1 deletion yolov5/yolov5_trt10/README.md
@@ -12,6 +12,7 @@ TensorRT: TensorRT-10.2.0.19

* [x] YOLOv5-cls support FP32/FP16/INT8 and Python/C++ API
* [x] YOLOv5-det support FP32/FP16/INT8 and Python/C++ API
* [x] YOLOv5-seg support FP32/FP16/INT8 and Python/C++ API

## Config

@@ -28,9 +29,11 @@ git clone -b trt10 https://github.com/wang-xinyu/tensorrtx.git
cd yolov5/
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n-cls.pt
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n.pt
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n-seg.pt
cp [PATH-TO-TENSORRTX]/yolov5/gen_wts.py .
python gen_wts.py -w yolov5n-cls.pt -o yolov5n-cls.wts -t cls
python gen_wts.py -w yolov5n.pt -o yolov5n.wts
python gen_wts.py -w yolov5n-seg.pt -o yolov5n-seg.wts -t seg
# The files 'yolov5n-cls.wts', 'yolov5n.wts' and 'yolov5n-seg.wts' will be generated.
```

@@ -89,11 +92,33 @@ make
# The results are displayed in the console
```

#### Segmentation

```shell
cd [PATH-TO-TENSORRTX]/yolov5/yolov5_trt10
# Update kNumClass in src/config.h if your model is trained on a custom dataset
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov5]/yolov5n-seg.wts .
cmake ..
make

# Build and serialize TensorRT engine
./yolov5_seg -s yolov5n-seg.wts yolov5n-seg.engine [n/s/m/l/x]

# Download the labels file
wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt

# Run inference
./yolov5_seg -d yolov5n-seg.engine ../../images coco.txt
# The results are displayed in the console
```
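The serialized segmentation engine has one input and two outputs; the binding names come from `src/config.h`. A quick reference, mirroring the constants in this PR (the shape notes are derived from the network definition in `src/model.cpp`):

```cpp
// Tensor names for the segmentation engine (see src/config.h)
const static char* kInputTensorName = "data";    // input image, [batch, 3, kInputH, kInputW]
const static char* kOutputTensorName = "prob";   // decoded detections, each carrying 32 mask coefficients
const static char* kProtoTensorName = "proto";   // mask prototypes, [batch, 32, kInputH / 4, kInputW / 4]
```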

## INT8 Quantization
1. Prepare calibration images; you can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh
2. Unzip it in `yolov5_trt10/build`
3. Enable the macro `USE_INT8` in `src/config.h` (see the sketch after this list) and run make again
4. Serialize the model and test
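Precision is selected at compile time in `src/config.h`. A minimal sketch of the switch, assuming the macro names used in `src/model.cpp` (`USE_FP16`/`USE_INT8`); treat it as an illustration rather than the exact file contents:

```cpp
// src/config.h -- pick exactly one precision mode before running make.
// USE_INT8 enables the Int8EntropyCalibrator2 path in src/model.cpp, which
// reads calibration images from ./coco_calib/ relative to the build directory.
//#define USE_FP16
#define USE_INT8
```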

## More Information
See the readme in the [home page](https://github.com/wang-xinyu/tensorrtx).
1 change: 1 addition & 0 deletions yolov5/yolov5_trt10/src/config.h
@@ -13,6 +13,7 @@
// you can set them to whatever you want.
const static char* kInputTensorName = "data";
const static char* kOutputTensorName = "prob";
const static char* kProtoTensorName = "proto";

// Number of classes for the detection and segmentation models
constexpr static int kNumClass = 80;
159 changes: 156 additions & 3 deletions yolov5/yolov5_trt10/src/model.cpp
@@ -226,18 +226,59 @@ static ILayer* SPPF(INetworkDefinition* network, std::map<std::string, Weights>&
return cv2;
}

static ILayer* convBlockProto(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
int outch, int ksize, int s, int g, std::string lname) {
Weights emptywts{DataType::kFLOAT, nullptr, 0};
int p = ksize / 3;  // padding rule shared with convBlock: k=1 -> p=0, k=3 -> p=1, k=6 -> p=2
IConvolutionLayer* conv1 =
network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts);
assert(conv1);
conv1->setStrideNd(DimsHW{s, s});
conv1->setPaddingNd(DimsHW{p, p});
conv1->setNbGroups(g);
conv1->setName((lname + ".conv").c_str());
IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3);
assert(bn1);
bn1->setName((lname + ".bn").c_str());

// This concat performs no real computation; it only breaks up the conv + bn + SiLU pattern
// so that operator fusion does not fail during INT8 quantization with:
// Error Code 10: Internal Error (Could not find any implementation for node
// model.24.proto.cv3.conv + model.24.proto.cv3.bn + PWN(PWN(model.24.proto.cv3.sigmoid), PWN(model.24.proto.cv3.silu)).)
#if defined(USE_INT8)
ITensor* inputTensors[] = {bn1->getOutput(0)};
auto concat = network->addConcatenation(inputTensors, 1);

// silu = x * sigmoid
auto sig = network->addActivation(*concat->getOutput(0), ActivationType::kSIGMOID);
assert(sig);
sig->setName((lname + ".sigmoid").c_str());
auto ew = network->addElementWise(*concat->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD);
assert(ew);
ew->setName((lname + ".silu").c_str());
#else
// silu = x * sigmoid
auto sig = network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID);
assert(sig);
sig->setName((lname + ".sigmoid").c_str());
auto ew = network->addElementWise(*bn1->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD);
assert(ew);
ew->setName((lname + ".silu").c_str());
#endif
return ew;
}

static ILayer* Proto(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c_,
int c2, std::string lname) {
auto cv1 = convBlock(network, weightMap, input, c_, 3, 1, 1, lname + ".cv1");
auto cv1 = convBlockProto(network, weightMap, input, c_, 3, 1, 1, lname + ".cv1");

auto upsample = network->addResize(*cv1->getOutput(0));
assert(upsample);
upsample->setResizeMode(nvinfer1::InterpolationMode::kNEAREST);
const float scales[] = {1, 1, 2, 2};
upsample->setScales(scales, 4);

auto cv2 = convBlock(network, weightMap, *upsample->getOutput(0), c_, 3, 1, 1, lname + ".cv2");
auto cv3 = convBlock(network, weightMap, *cv2->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
auto cv2 = convBlockProto(network, weightMap, *upsample->getOutput(0), c_, 3, 1, 1, lname + ".cv2");
auto cv3 = convBlockProto(network, weightMap, *cv2->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
assert(cv3);
return cv3;
}
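
// Illustrative sketch, not part of this file: how the "proto" output is consumed after
// inference. Each detection in the "prob" output carries 32 mask coefficients; the proto
// tensor is laid out as [32, kInputH / 4, kInputW / 4]. A per-pixel linear combination of
// the prototypes followed by a sigmoid yields the instance mask. The function name and
// argument layout here are assumptions, not code from this PR.
static void assembleMask(const float* proto, int protoC, int protoH, int protoW,
                         const float* coeffs /* protoC values per detection */, float* maskOut) {
    for (int y = 0; y < protoH; y++) {
        for (int x = 0; x < protoW; x++) {
            float e = 0.f;
            for (int c = 0; c < protoC; c++) {
                e += coeffs[c] * proto[(c * protoH + y) * protoW + x];
            }
            maskOut[y * protoW + x] = 1.f / (1.f + std::exp(-e));  // sigmoid; requires <cmath>
        }
    }
}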
@@ -610,3 +651,115 @@ nvinfer1::IHostMemory* build_det_p6_engine(unsigned int maxBatchSize, IBuilder*

return serialized_model;
}

IHostMemory* build_seg_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt,
float& gd, float& gw, std::string& wts_name) {
// INetworkDefinition *network = builder->createNetworkV2(0U);
INetworkDefinition* network =
builder->createNetworkV2(1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
ITensor* data = network->addInput(kInputTensorName, dt, Dims4{maxBatchSize, 3, kInputH, kInputW});
assert(data);
std::map<std::string, Weights> weightMap = loadWeights(wts_name);

// Backbone
auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0");
assert(conv0);
auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw),
get_depth(3, gd), true, 1, 0.5, "model.2");
auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw),
get_depth(6, gd), true, 1, 0.5, "model.4");
auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw),
get_depth(9, gd), true, 1, 0.5, "model.6");
auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw),
get_depth(3, gd), true, 1, 0.5, "model.8");
auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5,
"model.9");

// Head
auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");

auto upsample11 = network->addResize(*conv10->getOutput(0));
assert(upsample11);
upsample11->setResizeMode(InterpolationMode::kNEAREST);
upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());

ITensor* inputTensors12[] = {upsample11->getOutput(0), bottleneck_csp6->getOutput(0)};
auto cat12 = network->addConcatenation(inputTensors12, 2);
auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw),
get_depth(3, gd), false, 1, 0.5, "model.13");
auto conv14 =
convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");

auto upsample15 = network->addResize(*conv14->getOutput(0));
assert(upsample15);
upsample15->setResizeMode(InterpolationMode::kNEAREST);
upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());

ITensor* inputTensors16[] = {upsample15->getOutput(0), bottleneck_csp4->getOutput(0)};
auto cat16 = network->addConcatenation(inputTensors16, 2);

auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw),
get_depth(3, gd), false, 1, 0.5, "model.17");

// Segmentation
IConvolutionLayer* det0 =
network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (32 + kNumClass + 5), DimsHW{1, 1},
weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
auto conv18 =
convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
ITensor* inputTensors19[] = {conv18->getOutput(0), conv14->getOutput(0)};
auto cat19 = network->addConcatenation(inputTensors19, 2);
auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw),
get_depth(3, gd), false, 1, 0.5, "model.20");
IConvolutionLayer* det1 =
network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (32 + kNumClass + 5), DimsHW{1, 1},
weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
auto conv21 =
convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
ITensor* inputTensors22[] = {conv21->getOutput(0), conv10->getOutput(0)};
auto cat22 = network->addConcatenation(inputTensors22, 2);
auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw),
get_depth(3, gd), false, 1, 0.5, "model.23");
IConvolutionLayer* det2 =
network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (32 + kNumClass + 5), DimsHW{1, 1},
weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);

auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2}, true);
yolo->getOutput(0)->setName(kOutputTensorName);
network->markOutput(*yolo->getOutput(0));

auto proto = Proto(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 32, "model.24.proto");
proto->getOutput(0)->setName(kProtoTensorName);
network->markOutput(*proto->getOutput(0));

// Engine config
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));
#if defined(USE_FP16)
config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
assert(builder->platformHasFastInt8());
config->setFlag(BuilderFlag::kINT8);
Int8EntropyCalibrator2* calibrator =
new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
config->setInt8Calibrator(calibrator);
#endif

std::cout << "Building engine, please wait for a while..." << std::endl;
nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
std::cout << "Build engine successfully!" << std::endl;

// Don't need the network any more
delete network;

// Release host memory
for (auto& mem : weightMap) {
free((void*)(mem.second.values));
}

return serialized_model;
}
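
// Illustrative sketch (hypothetical driver, not part of this PR): how the "-s" path of
// yolov5_seg.cpp presumably calls build_seg_engine and writes the engine to disk.
// gLogger and kBatchSize are assumed to be defined elsewhere in the project; gd/gw are
// the depth/width multiples selected by the n/s/m/l/x flag; requires <fstream> and <cassert>.
static void serializeSegEngine(std::string& wts_name, const std::string& engine_name, float gd, float gw) {
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    nvinfer1::IHostMemory* serialized =
            build_seg_engine(kBatchSize, builder, config, nvinfer1::DataType::kFLOAT, gd, gw, wts_name);
    assert(serialized != nullptr);
    std::ofstream p(engine_name, std::ios::binary);
    p.write(reinterpret_cast<const char*>(serialized->data()), serialized->size());
    delete serialized;
    delete config;
    delete builder;
}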
4 changes: 4 additions & 0 deletions yolov5/yolov5_trt10/src/model.h
Expand Up @@ -14,3 +14,7 @@ nvinfer1::IHostMemory* build_det_engine(unsigned int maxBatchSize, nvinfer1::IBu
nvinfer1::IHostMemory* build_det_p6_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd,
float& gw, std::string& wts_name);

nvinfer1::IHostMemory* build_seg_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw,
std::string& wts_name);
14 changes: 14 additions & 0 deletions yolov5/yolov5_trt10/src/postprocess.cpp
@@ -24,6 +24,11 @@ cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
t = t / r_h;
b = b / r_h;
}
// Clamp the box to the image bounds to prevent out-of-range access
l = (std::max)(0.f, l);
r = (std::min)((float)img.cols, r);
t = (std::max)(0.f, t);
b = (std::min)((float)img.rows, b);
return cv::Rect(round(l), round(t), round(r - l), round(b - t));
}

@@ -54,6 +59,15 @@ void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nm
continue;
Detection det;
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
// Clamp the box to the network input bounds to prevent out-of-range access
auto left = (std::max)(det.bbox[0] - det.bbox[2] / 2.f, 0.f);
auto top = (std::max)(det.bbox[1] - det.bbox[3] / 2.f, 0.f);
auto right = (std::min)(det.bbox[0] + det.bbox[2] / 2.f, kInputW - 1.f);
auto bottom = (std::min)(det.bbox[1] + det.bbox[3] / 2.f, kInputH - 1.f);
det.bbox[2] = right - left;
det.bbox[3] = bottom - top;
det.bbox[0] = left + det.bbox[2] / 2.f;
det.bbox[1] = top + det.bbox[3] / 2.f;
if (m.count(det.class_id) == 0)
m.emplace(det.class_id, std::vector<Detection>());
m[det.class_id].push_back(det);