@@ -81,7 +81,7 @@ void AKAZE_homograpy_check(
8181 }
8282}
8383
84- #define UNSUPPORTED_YOLO_VERSION " Unsupported yolo version. Supported versions are v3, v5 , v8."
84+ #define UNSUPPORTED_YOLO_VERSION " Unsupported yolo version. Supported versions are v3, v4, v5, v6, v7 , v8."
8585
8686void yolo_postprocess (
8787 const int spatial_width,
@@ -104,7 +104,7 @@ void yolo_postprocess(
104104 for (auto out : outs)
105105 {
106106 int offset;
107- float scale_x, scale_y ;
107+ float box_scale_w, box_scale_h ;
108108
109109 if (out.dims != 2 && out.dims != 3 ) {
110110 CV_Error (cv::Error::StsAssert, UNSUPPORTED_YOLO_VERSION " out.dims != 2 && out.dims != 3" );
@@ -120,17 +120,17 @@ void yolo_postprocess(
120120 }
121121
122122 // relative coordinates
123- scale_x = (float )img_width * scale;
124- scale_y = (float )img_height * scale;
123+ box_scale_w = (float )img_width * scale;
124+ box_scale_h = (float )img_height * scale;
125125 }
126126 else {
127127 if (out.size [0 ] != 1 ) {
128128 CV_Error (cv::Error::StsAssert, UNSUPPORTED_YOLO_VERSION " out.size[0] != 1" );
129129 }
130130
131131 out = out.reshape (1 , out.size [1 ]);
132- scale_x = (float )img_width / spatial_width * scale;
133- scale_y = (float )img_height / spatial_height * scale;
132+ box_scale_w = (float )img_width / spatial_width * scale;
133+ box_scale_h = (float )img_height / spatial_height * scale;
134134
135135 // yolov5 has an output of shape (batchSize, 25200, 85) (Num classes + box[x,y,w,h] + confidence[c])
136136 // yolov8 has an output of shape (batchSize, 84, 8400) (Num classes + box[x,y,w,h])
@@ -169,10 +169,10 @@ void yolo_postprocess(
169169
170170 if (maxScore >= score_threshold)
171171 {
172- double centerX = (double )data[0 ] * scale_x ;
173- double centerY = (double )data[1 ] * scale_y ;
174- double width = (double )data[2 ] * scale_x ;
175- double height = (double )data[3 ] * scale_y ;
172+ double centerX = (double )data[0 ] * box_scale_w ;
173+ double centerY = (double )data[1 ] * box_scale_h ;
174+ double width = (double )data[2 ] * box_scale_w ;
175+ double height = (double )data[3 ] * box_scale_h ;
176176 double left = centerX - width / 2 ;
177177 double top = centerY - height / 2 ;
178178
@@ -184,6 +184,58 @@ void yolo_postprocess(
184184 }
185185}
186186
187+ namespace {
     // Shared row-scanning helper for YOLO-style outputs; it is called from both the
     // "Region" and the "Identity" branches of object_detection_postprocess().
     //
     // Each row of `out` is read as:
     //   [0..3]        box as center_x, center_y, width, height
     //   [offset..]    per-class scores (out.cols - offset values)
     // With offset == 5, element [4] is additionally compared against the threshold
     // (presumably the objectness/box confidence -- TODO confirm against the model).
     //
     // Parameters:
     //   box_scale_w, box_scale_h  multipliers applied to the x/width and y/height values
     //   confidence_threshold      rows whose best class score is <= this value are dropped
     //   out                       detections, one per row; read through a float pointer
     //                             (assumes CV_32F data -- TODO confirm)
     //   classes_scores            scratch header; its `cols` and `data` fields are
     //                             overwritten here so it points into the current row of `out`
     //   class_ids, confidences,
     //   bboxes                    output vectors; one entry is appended per kept detection
     //   offset                    index of the first class score within a row
     //   background_label_id       when >= 0 the element-[4] pre-filter is disabled
188+ void yolo_object_detection_postprocess (
189+ const float box_scale_w,
190+ const float box_scale_h,
191+ const float confidence_threshold,
192+ const cv::Mat& out,
193+ cv::Mat& classes_scores,
194+ std::vector<int >& class_ids,
195+ std::vector<float >& confidences,
196+ std::vector<cv::Rect2d>& bboxes,
197+ const int offset,
198+ const int background_label_id
199+ )
200+ {
     // Width of the score slice: everything in a row after the first `offset` values.
201+ classes_scores.cols = out.cols - offset;
202+
203+ // Scan through all the bounding boxes output from the network and keep only the
204+ // ones with high confidence scores. Assign the box's class label as the class
205+ // with the highest score for the box.
206+
     // NOTE(review): rows are walked by advancing the pointer out.cols floats at a time,
     // which assumes `out` is stored contiguously without row padding -- confirm.
207+ float * detection = (float *) out.data ;
208+
209+ for (int i = 0 ; i < out.rows ; ++i, detection += out.cols )
210+ {
     // Early reject on element [4]; only applies to the offset == 5 layout and only
     // when no background label is configured (background_label_id < 0).
211+ if (background_label_id < 0 && offset == 5 && detection[4 ] < confidence_threshold) {
212+ continue ;
213+ }
214+
     // Re-point the scratch header at this row's class scores; nothing is copied.
215+ classes_scores.data = reinterpret_cast <uchar*>(detection + offset);
216+
217+ // Get the value and location of the maximum score
218+ double confidence;
219+ Point maxClassLoc;
220+ minMaxLoc (classes_scores, 0 , &confidence, 0 , &maxClassLoc);
     // Strict comparison: a score exactly equal to the threshold is discarded here,
     // whereas the element-[4] check above lets an equal value through.
221+ if (confidence <= confidence_threshold) {
222+ continue ;
223+ }
224+
     // Scale the center/size box and convert it to a top-left anchored rectangle.
225+ double centerX = (double )detection[0 ] * box_scale_w;
226+ double centerY = (double )detection[1 ] * box_scale_h;
227+ double width = (double )detection[2 ] * box_scale_w;
228+ double height = (double )detection[3 ] * box_scale_h;
229+ double left = centerX - width / 2 ;
230+ double top = centerY - height / 2 ;
231+
     // maxClassLoc.x is the column index inside the score slice, used as the class id.
232+ class_ids.push_back (maxClassLoc.x );
233+ confidences.push_back ((float )confidence);
234+ bboxes.push_back (Rect2d (left, top, width, height));
235+ }
236+ }
237+ }
238+
187239void object_detection_postprocess (
188240 const cv::dnn::Net& net,
189241 const int inpWidth,
@@ -204,7 +256,7 @@ void object_detection_postprocess(
204256 auto outLayerType = lastLayer->type ;
205257
206258 Mat classes_scores (1 , 0 , CV_32FC1);
207- float scale_x, scale_y ;
259+ float box_scale_w, box_scale_h ;
208260
209261 if (outLayerType == " DetectionOutput" )
210262 {
@@ -228,19 +280,19 @@ void object_detection_postprocess(
228280
229281 if (data[i + 5 ] - data[i + 3 ] < 1 ) {
230282 // relative coordinates
231- scale_x = inpWidth * imgScale;
232- scale_y = inpHeight * imgScale;
283+ box_scale_w = inpWidth * imgScale;
284+ box_scale_h = inpHeight * imgScale;
233285 }
234286 else {
235287 // absolute coordinate
236- scale_x = imgScale;
237- scale_y = imgScale;
288+ box_scale_w = imgScale;
289+ box_scale_h = imgScale;
238290 }
239291
240- double left = (double )data[i + 3 ] * scale_x ;
241- double top = (double )data[i + 4 ] * scale_y ;
242- double width = (double )data[i + 5 ] * scale_x - left + 1 ;
243- double height = (double )data[i + 6 ] * scale_y - top + 1 ;
292+ double left = (double )data[i + 3 ] * box_scale_w ;
293+ double top = (double )data[i + 4 ] * box_scale_h ;
294+ double width = (double )data[i + 5 ] * box_scale_w - left + 1 ;
295+ double height = (double )data[i + 6 ] * box_scale_h - top + 1 ;
244296
245297 int class_id = (int )(data[i + 1 ]);
246298 if (background_label_id >= 0 && background_label_id <= class_id) {
@@ -254,125 +306,77 @@ void object_detection_postprocess(
254306 }
255307 else if (outLayerType == " Region" )
256308 {
257- // yolo v4
309+ // yolo v3, v4
258310
259311 // relative coordinates
260- scale_x = inpWidth * imgScale;
261- scale_y = inpHeight * imgScale;
312+ box_scale_w = inpWidth * imgScale;
313+ box_scale_h = inpHeight * imgScale;
314+ int offset = 5 ;
262315
263316 // Network produces output blob with a shape NxC where N is a number of
264317 // detected objects and C is a number of classes + 4 where the first 4
265318 // numbers are [center_x, center_y, width, height]
266319 for (auto out : outs)
267320 {
268- classes_scores.cols = out.cols - 5 ;
269- float * data = (float *)out.data ;
270-
271- for (int j = 0 ; j < out.rows ; ++j, data += out.cols )
272- {
273- classes_scores.data = reinterpret_cast <uchar*>(data + 5 );
274-
275- // Get the value and location of the maximum score
276- double confidence;
277- Point maxClassLoc;
278- minMaxLoc (classes_scores, 0 , &confidence, 0 , &maxClassLoc);
279- if (confidence <= confidence_threshold) {
280- continue ;
281- }
282-
283- double centerX = (double )data[0 ] * scale_x;
284- double centerY = (double )data[1 ] * scale_y;
285- double width = (double )data[2 ] * scale_x;
286- double height = (double )data[3 ] * scale_y;
287- double left = centerX - width / 2 ;
288- double top = centerY - height / 2 ;
289-
290- class_ids.push_back (maxClassLoc.x );
291- confidences.push_back ((float )confidence);
292- bboxes.push_back (Rect2d (left, top, width, height));
293- }
321+ yolo_object_detection_postprocess (
322+ box_scale_w,
323+ box_scale_h,
324+ confidence_threshold,
325+ out,
326+ classes_scores,
327+ class_ids,
328+ confidences,
329+ bboxes,
330+ offset,
331+ background_label_id
332+ );
294333 }
295334 }
296335 else if (outLayerType == " Identity" ) {
297336 for (auto out : outs)
298337 {
299338 int offset;
300- float scale_x, scale_y;
301-
302- if (out.dims != 2 && out.dims != 3 ) {
303- CV_Error (cv::Error::StsAssert, UNSUPPORTED_YOLO_VERSION " out.dims != 2 && out.dims != 3" );
304- }
339+ float box_scale_w, box_scale_h;
305340
306- if (out.dims == 2 ) {
307- // yolo v3
308- offset = 5 ;
309-
310- // relative coordinates
311- scale_x = inpWidth * imgScale;
312- scale_y = inpHeight * imgScale;
313- }
314- else {
341+ if (out.dims == 3 ) {
315342 if (out.size [0 ] != 1 ) {
316343 CV_Error (cv::Error::StsAssert, UNSUPPORTED_YOLO_VERSION " out.size[0] != 1" );
317344 }
318345
319346 out = out.reshape (1 , out.size [1 ]);
320347
321348 // absolute coordinate
322- scale_x = imgScale;
323- scale_y = imgScale;
349+ box_scale_w = imgScale;
350+ box_scale_h = imgScale;
324351
325- // yolov5 has an output of shape (batchSize, 25200, 85) (Num classes + box[x,y,w,h] + confidence[c])
326- // yolov8 has an output of shape (batchSize, 84, 8400) (Num classes + box[x,y,w,h])
327- if (out.rows == num_classes + 4 ) {
328- // yolo v8
352+ if (out.cols == num_classes + 5 ) {
353+ // yolo v5, v6, v7 has an output of shape (batchSize, 25200, 85) (Num classes + box[x,y,w,h] + confidence[c])
354+ offset = 5 ;
355+ }
356+ else if (out.rows == num_classes + 4 ) {
357+ // yolo v8 has an output of shape (batchSize, 84, 8400) (Num classes + box[x,y,w,h])
329358 offset = 4 ;
330359 cv::transpose (out, out);
331360 }
332- else if (out.cols == num_classes + 5 ) {
333- // yolo v5
334- offset = 5 ;
335- }
336361 else {
337- CV_Error (cv::Error::StsAssert, UNSUPPORTED_YOLO_VERSION);
362+ CV_Error (cv::Error::StsAssert, UNSUPPORTED_YOLO_VERSION " out.rows != num_classes + 4 && out.cols != num_classes + 5 " );
338363 }
364+ } else {
365+ CV_Error (cv::Error::StsAssert, UNSUPPORTED_YOLO_VERSION " out.dims != 3" );
339366 }
340367
341- classes_scores.cols = out.cols - offset;
342-
343- // Scan through all the bounding boxes output from the network and keep only the
344- // ones with high confidence scores. Assign the box's class label as the class
345- // with the highest score for the box.
346-
347- float * data = (float *)out.data ;
348-
349- for (int i = 0 ; i < out.rows ; ++i, data += out.cols )
350- {
351- if (offset == 5 && data[4 ] < confidence_threshold) {
352- continue ;
353- }
354-
355- classes_scores.data = reinterpret_cast <uchar*>(data + offset);
356-
357- // Get the value and location of the maximum score
358- double confidence;
359- Point maxClassLoc;
360- minMaxLoc (classes_scores, 0 , &confidence, 0 , &maxClassLoc);
361-
362- if (confidence >= confidence_threshold)
363- {
364- double centerX = (double )data[0 ] * scale_x;
365- double centerY = (double )data[1 ] * scale_y;
366- double width = (double )data[2 ] * scale_x;
367- double height = (double )data[3 ] * scale_y;
368- double left = centerX - width / 2 ;
369- double top = centerY - height / 2 ;
370-
371- class_ids.push_back (maxClassLoc.x );
372- confidences.push_back ((float )confidence);
373- bboxes.push_back (Rect2d (left, top, width, height));
374- }
375- }
368+ yolo_object_detection_postprocess (
369+ box_scale_w,
370+ box_scale_h,
371+ confidence_threshold,
372+ out,
373+ classes_scores,
374+ class_ids,
375+ confidences,
376+ bboxes,
377+ offset,
378+ background_label_id
379+ );
376380 }
377381 }
378382 else {
0 commit comments