1 /*M ///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
47 #include "../nms.inl.hpp"
48 #include "opencl_kernels_dnn.hpp"
61 float xmin, ymin, xmax, ymax;
64 : xmin(0), ymin(0), xmax(0), ymax(0), has_size_(false), size_(0) {}
66 float size() const { return size_; }
68 bool has_size() const { return has_size_; }
70 void set_size(float value) { size_ = value; has_size_ = true; }
72 void clear_size() { size_ = 0; has_size_ = false; }
80 static inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
81 const std::pair<float, T>& pair2)
83 return pair1.first > pair2.first;
86 static inline float caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);
90 class DetectionOutputLayerImpl : public DetectionOutputLayer
97 int _backgroundLabelId;
101 bool _varianceEncodedInTarget;
103 float _confidenceThreshold;
107 // Whenever predicted bounding boxes are respresented in YXHW instead of XYWH layout.
108 bool _locPredTransposed;
110 enum { _numAxes = 4 };
111 static const std::string _layerName;
113 typedef std::map<int, std::vector<util::NormalizedBBox> > LabelBBox;
115 bool getParameterDict(const LayerParams ¶ms,
116 const std::string ¶meterName,
119 if (!params.has(parameterName))
124 result = params.get(parameterName);
129 T getParameter(const LayerParams ¶ms,
130 const std::string ¶meterName,
132 const bool required=true,
133 const T& defaultValue=T())
136 bool success = getParameterDict(params, parameterName, dictValue);
141 std::string message = _layerName;
142 message += " layer parameter does not contain ";
143 message += parameterName;
144 message += " parameter.";
145 CV_ErrorNoReturn(Error::StsBadArg, message);
152 return dictValue.get<T>(idx);
155 void getCodeType(const LayerParams ¶ms)
157 String codeTypeString = params.get<String>("code_type").toLowerCase();
158 if (codeTypeString == "center_size")
159 _codeType = "CENTER_SIZE";
161 _codeType = "CORNER";
164 DetectionOutputLayerImpl(const LayerParams ¶ms)
166 _numClasses = getParameter<unsigned>(params, "num_classes");
167 _shareLocation = getParameter<bool>(params, "share_location");
168 _numLocClasses = _shareLocation ? 1 : _numClasses;
169 _backgroundLabelId = getParameter<int>(params, "background_label_id");
170 _varianceEncodedInTarget = getParameter<bool>(params, "variance_encoded_in_target", 0, false, false);
171 _keepTopK = getParameter<int>(params, "keep_top_k");
172 _confidenceThreshold = getParameter<float>(params, "confidence_threshold", 0, false, -FLT_MAX);
173 _topK = getParameter<int>(params, "top_k", 0, false, -1);
174 _locPredTransposed = getParameter<bool>(params, "loc_pred_transposed", 0, false, false);
178 // Parameters used in nms.
179 _nmsThreshold = getParameter<float>(params, "nms_threshold");
180 CV_Assert(_nmsThreshold > 0.);
182 setParamsFrom(params);
185 void checkInputs(const std::vector<Mat*> &inputs)
187 for (size_t i = 1; i < inputs.size(); i++)
189 CV_Assert(inputs[i]->size == inputs[0]->size);
193 bool getMemoryShapes(const std::vector<MatShape> &inputs,
194 const int requiredOutputs,
195 std::vector<MatShape> &outputs,
196 std::vector<MatShape> &internals) const
198 CV_Assert(inputs.size() > 0);
199 CV_Assert(inputs[0][0] == inputs[1][0]);
201 int numPriors = inputs[2][2] / 4;
202 CV_Assert((numPriors * _numLocClasses * 4) == inputs[0][1]);
203 CV_Assert(int(numPriors * _numClasses) == inputs[1][1]);
205 // num() and channels() are 1.
206 // Since the number of bboxes to be kept is unknown before nms, we manually
207 // set it to (fake) 1.
208 // Each row is a 7 dimension std::vector, which stores
209 // [image_id, label, confidence, xmin, ymin, xmax, ymax]
210 outputs.resize(1, shape(1, 1, 1, 7));
216 // Decode all bboxes in a batch
217 bool ocl_DecodeBBoxesAll(UMat& loc_mat, UMat& prior_mat,
218 const int num, const int numPriors, const bool share_location,
219 const int num_loc_classes, const int background_label_id,
220 const cv::String& code_type, const bool variance_encoded_in_target,
221 const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
223 UMat outmat = UMat(loc_mat.dims, loc_mat.size, CV_32F);
224 size_t nthreads = loc_mat.total();
227 if (code_type == "CORNER")
228 kernel_name = "DecodeBBoxesCORNER";
229 else if (code_type == "CENTER_SIZE")
230 kernel_name = "DecodeBBoxesCENTER_SIZE";
234 for (int i = 0; i < num; ++i)
236 ocl::Kernel kernel(kernel_name.c_str(), ocl::dnn::detection_output_oclsrc);
237 kernel.set(0, (int)nthreads);
238 kernel.set(1, ocl::KernelArg::PtrReadOnly(loc_mat));
239 kernel.set(2, ocl::KernelArg::PtrReadOnly(prior_mat));
240 kernel.set(3, (int)variance_encoded_in_target);
241 kernel.set(4, (int)numPriors);
242 kernel.set(5, (int)share_location);
243 kernel.set(6, (int)num_loc_classes);
244 kernel.set(7, (int)background_label_id);
245 kernel.set(8, (int)clip);
246 kernel.set(9, ocl::KernelArg::PtrWriteOnly(outmat));
248 if (!kernel.run(1, &nthreads, NULL, false))
252 all_decode_bboxes.clear();
253 all_decode_bboxes.resize(num);
255 Mat mat = outmat.getMat(ACCESS_READ);
256 const float* decode_data = mat.ptr<float>();
257 for (int i = 0; i < num; ++i)
259 LabelBBox& decode_bboxes = all_decode_bboxes[i];
260 for (int c = 0; c < num_loc_classes; ++c)
262 int label = share_location ? -1 : c;
263 decode_bboxes[label].resize(numPriors);
264 for (int p = 0; p < numPriors; ++p)
266 int startIdx = p * num_loc_classes * 4;
267 util::NormalizedBBox& bbox = decode_bboxes[label][p];
268 bbox.xmin = decode_data[startIdx + c * 4];
269 bbox.ymin = decode_data[startIdx + c * 4 + 1];
270 bbox.xmax = decode_data[startIdx + c * 4 + 2];
271 bbox.ymax = decode_data[startIdx + c * 4 + 3];
279 void ocl_GetConfidenceScores(const UMat& inp1, const int num,
280 const int numPredsPerClass, const int numClasses,
281 std::vector<Mat>& confPreds)
283 int shape[] = { numClasses, numPredsPerClass };
284 for (int i = 0; i < num; i++)
285 confPreds.push_back(Mat(2, shape, CV_32F));
287 UMat umat = inp1.reshape(1, num * numPredsPerClass);
288 for (int i = 0; i < num; ++i)
290 Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() };
291 transpose(umat(ranges), confPreds[i]);
295 bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
297 std::vector<UMat> inputs;
298 std::vector<UMat> outputs;
300 inps.getUMatVector(inputs);
301 outs.getUMatVector(outputs);
303 std::vector<LabelBBox> allDecodedBBoxes;
304 std::vector<Mat> allConfidenceScores;
306 int num = inputs[0].size[0];
308 // extract predictions from input layers
310 int numPriors = inputs[2].size[2] / 4;
312 // Retrieve all confidences
313 ocl_GetConfidenceScores(inputs[1], num, numPriors, _numClasses, allConfidenceScores);
315 // Decode all loc predictions to bboxes
316 bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors,
317 _shareLocation, _numLocClasses, _backgroundLabelId,
318 _codeType, _varianceEncodedInTarget, false,
325 std::vector<std::map<int, std::vector<int> > > allIndices;
326 for (int i = 0; i < num; ++i)
328 numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
333 // Set confidences to zeros.
334 Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
335 outputs[0](ranges).setTo(0);
338 int outputShape[] = {1, 1, (int)numKept, 7};
339 UMat umat = UMat(4, outputShape, CV_32F);
341 Mat mat = umat.getMat(ACCESS_WRITE);
342 float* outputsData = mat.ptr<float>();
345 for (int i = 0; i < num; ++i)
347 count += outputDetections_(i, &outputsData[count * 7],
348 allDecodedBBoxes[i], allConfidenceScores[i],
351 CV_Assert(count == numKept);
354 outputs.push_back(umat);
355 outs.assign(outputs);
360 void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
363 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
365 CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
366 OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
367 forward_ocl(inputs_arr, outputs_arr, internals_arr))
369 Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
372 void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
375 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
377 std::vector<LabelBBox> allDecodedBBoxes;
378 std::vector<Mat> allConfidenceScores;
380 int num = inputs[0]->size[0];
382 // extract predictions from input layers
384 int numPriors = inputs[2]->size[2] / 4;
386 const float* locationData = inputs[0]->ptr<float>();
387 const float* confidenceData = inputs[1]->ptr<float>();
388 const float* priorData = inputs[2]->ptr<float>();
390 // Retrieve all location predictions
391 std::vector<LabelBBox> allLocationPredictions;
392 GetLocPredictions(locationData, num, numPriors, _numLocClasses,
393 _shareLocation, _locPredTransposed, allLocationPredictions);
395 // Retrieve all confidences
396 GetConfidenceScores(confidenceData, num, numPriors, _numClasses, allConfidenceScores);
398 // Retrieve all prior bboxes
399 std::vector<util::NormalizedBBox> priorBBoxes;
400 std::vector<std::vector<float> > priorVariances;
401 GetPriorBBoxes(priorData, numPriors, priorBBoxes, priorVariances);
403 // Decode all loc predictions to bboxes
404 DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,
405 _shareLocation, _numLocClasses, _backgroundLabelId,
406 _codeType, _varianceEncodedInTarget, false, allDecodedBBoxes);
410 std::vector<std::map<int, std::vector<int> > > allIndices;
411 for (int i = 0; i < num; ++i)
413 numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
418 // Set confidences to zeros.
419 Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
420 outputs[0](ranges).setTo(0);
423 int outputShape[] = {1, 1, (int)numKept, 7};
424 outputs[0].create(4, outputShape, CV_32F);
425 float* outputsData = outputs[0].ptr<float>();
428 for (int i = 0; i < num; ++i)
430 count += outputDetections_(i, &outputsData[count * 7],
431 allDecodedBBoxes[i], allConfidenceScores[i],
434 CV_Assert(count == numKept);
437 size_t outputDetections_(
438 const int i, float* outputsData,
439 const LabelBBox& decodeBBoxes, Mat& confidenceScores,
440 const std::map<int, std::vector<int> >& indicesMap
444 for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
446 int label = it->first;
447 if (confidenceScores.rows <= label)
448 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
449 const std::vector<float>& scores = confidenceScores.row(label);
450 int locLabel = _shareLocation ? -1 : label;
451 LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(locLabel);
452 if (label_bboxes == decodeBBoxes.end())
453 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", locLabel));
454 const std::vector<int>& indices = it->second;
456 for (size_t j = 0; j < indices.size(); ++j, ++count)
458 int idx = indices[j];
459 const util::NormalizedBBox& decode_bbox = label_bboxes->second[idx];
460 outputsData[count * 7] = i;
461 outputsData[count * 7 + 1] = label;
462 outputsData[count * 7 + 2] = scores[idx];
463 outputsData[count * 7 + 3] = decode_bbox.xmin;
464 outputsData[count * 7 + 4] = decode_bbox.ymin;
465 outputsData[count * 7 + 5] = decode_bbox.xmax;
466 outputsData[count * 7 + 6] = decode_bbox.ymax;
472 size_t processDetections_(
473 const LabelBBox& decodeBBoxes, Mat& confidenceScores,
474 std::vector<std::map<int, std::vector<int> > >& allIndices
477 std::map<int, std::vector<int> > indices;
478 size_t numDetections = 0;
479 for (int c = 0; c < (int)_numClasses; ++c)
481 if (c == _backgroundLabelId)
482 continue; // Ignore background class.
483 if (c >= confidenceScores.rows)
484 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", c));
486 const std::vector<float> scores = confidenceScores.row(c);
487 int label = _shareLocation ? -1 : c;
489 LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label);
490 if (label_bboxes == decodeBBoxes.end())
491 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
492 NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,
493 indices[c], util::caffe_box_overlap);
494 numDetections += indices[c].size();
496 if (_keepTopK > -1 && numDetections > (size_t)_keepTopK)
498 std::vector<std::pair<float, std::pair<int, int> > > scoreIndexPairs;
499 for (std::map<int, std::vector<int> >::iterator it = indices.begin();
500 it != indices.end(); ++it)
502 int label = it->first;
503 const std::vector<int>& labelIndices = it->second;
504 if (label >= confidenceScores.rows)
505 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
506 const std::vector<float>& scores = confidenceScores.row(label);
507 for (size_t j = 0; j < labelIndices.size(); ++j)
509 size_t idx = labelIndices[j];
510 CV_Assert(idx < scores.size());
511 scoreIndexPairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
514 // Keep outputs k results per image.
515 std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(),
516 util::SortScorePairDescend<std::pair<int, int> >);
517 scoreIndexPairs.resize(_keepTopK);
519 std::map<int, std::vector<int> > newIndices;
520 for (size_t j = 0; j < scoreIndexPairs.size(); ++j)
522 int label = scoreIndexPairs[j].second.first;
523 int idx = scoreIndexPairs[j].second.second;
524 newIndices[label].push_back(idx);
526 allIndices.push_back(newIndices);
527 return (size_t)_keepTopK;
531 allIndices.push_back(indices);
532 return numDetections;
537 // **************************************************************
539 // **************************************************************
542 template<bool normalized>
543 static float BBoxSize(const util::NormalizedBBox& bbox)
545 if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin)
547 return 0; // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
557 float width = bbox.xmax - bbox.xmin;
558 float height = bbox.ymax - bbox.ymin;
561 return width * height;
565 // If bbox is not within range [0, 1].
566 return (width + 1) * (height + 1);
573 // Decode a bbox according to a prior bbox
574 template<bool variance_encoded_in_target>
575 static void DecodeBBox(
576 const util::NormalizedBBox& prior_bbox, const std::vector<float>& prior_variance,
577 const cv::String& code_type,
578 const bool clip_bbox, const util::NormalizedBBox& bbox,
579 util::NormalizedBBox& decode_bbox)
581 float bbox_xmin = variance_encoded_in_target ? bbox.xmin : prior_variance[0] * bbox.xmin;
582 float bbox_ymin = variance_encoded_in_target ? bbox.ymin : prior_variance[1] * bbox.ymin;
583 float bbox_xmax = variance_encoded_in_target ? bbox.xmax : prior_variance[2] * bbox.xmax;
584 float bbox_ymax = variance_encoded_in_target ? bbox.ymax : prior_variance[3] * bbox.ymax;
585 if (code_type == "CORNER")
587 decode_bbox.xmin = prior_bbox.xmin + bbox_xmin;
588 decode_bbox.ymin = prior_bbox.ymin + bbox_ymin;
589 decode_bbox.xmax = prior_bbox.xmax + bbox_xmax;
590 decode_bbox.ymax = prior_bbox.ymax + bbox_ymax;
592 else if (code_type == "CENTER_SIZE")
594 float prior_width = prior_bbox.xmax - prior_bbox.xmin;
595 CV_Assert(prior_width > 0);
596 float prior_height = prior_bbox.ymax - prior_bbox.ymin;
597 CV_Assert(prior_height > 0);
598 float prior_center_x = (prior_bbox.xmin + prior_bbox.xmax) * .5;
599 float prior_center_y = (prior_bbox.ymin + prior_bbox.ymax) * .5;
601 float decode_bbox_center_x, decode_bbox_center_y;
602 float decode_bbox_width, decode_bbox_height;
603 decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
604 decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
605 decode_bbox_width = exp(bbox_xmax) * prior_width;
606 decode_bbox_height = exp(bbox_ymax) * prior_height;
607 decode_bbox.xmin = decode_bbox_center_x - decode_bbox_width * .5;
608 decode_bbox.ymin = decode_bbox_center_y - decode_bbox_height * .5;
609 decode_bbox.xmax = decode_bbox_center_x + decode_bbox_width * .5;
610 decode_bbox.ymax = decode_bbox_center_y + decode_bbox_height * .5;
613 CV_ErrorNoReturn(Error::StsBadArg, "Unknown type.");
617 // Clip the util::NormalizedBBox such that the range for each corner is [0, 1]
618 decode_bbox.xmin = std::max(std::min(decode_bbox.xmin, 1.f), 0.f);
619 decode_bbox.ymin = std::max(std::min(decode_bbox.ymin, 1.f), 0.f);
620 decode_bbox.xmax = std::max(std::min(decode_bbox.xmax, 1.f), 0.f);
621 decode_bbox.ymax = std::max(std::min(decode_bbox.ymax, 1.f), 0.f);
623 decode_bbox.clear_size();
624 decode_bbox.set_size(BBoxSize<true>(decode_bbox));
627 // Decode a set of bboxes according to a set of prior bboxes
628 static void DecodeBBoxes(
629 const std::vector<util::NormalizedBBox>& prior_bboxes,
630 const std::vector<std::vector<float> >& prior_variances,
631 const cv::String& code_type, const bool variance_encoded_in_target,
632 const bool clip_bbox, const std::vector<util::NormalizedBBox>& bboxes,
633 std::vector<util::NormalizedBBox>& decode_bboxes)
635 CV_Assert(prior_bboxes.size() == prior_variances.size());
636 CV_Assert(prior_bboxes.size() == bboxes.size());
637 size_t num_bboxes = prior_bboxes.size();
638 CV_Assert(num_bboxes == 0 || prior_variances[0].size() == 4);
639 decode_bboxes.clear(); decode_bboxes.resize(num_bboxes);
640 if(variance_encoded_in_target)
642 for (int i = 0; i < num_bboxes; ++i)
643 DecodeBBox<true>(prior_bboxes[i], prior_variances[i], code_type,
644 clip_bbox, bboxes[i], decode_bboxes[i]);
648 for (int i = 0; i < num_bboxes; ++i)
649 DecodeBBox<false>(prior_bboxes[i], prior_variances[i], code_type,
650 clip_bbox, bboxes[i], decode_bboxes[i]);
654 // Decode all bboxes in a batch
655 static void DecodeBBoxesAll(const std::vector<LabelBBox>& all_loc_preds,
656 const std::vector<util::NormalizedBBox>& prior_bboxes,
657 const std::vector<std::vector<float> >& prior_variances,
658 const int num, const bool share_location,
659 const int num_loc_classes, const int background_label_id,
660 const cv::String& code_type, const bool variance_encoded_in_target,
661 const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
663 CV_Assert(all_loc_preds.size() == num);
664 all_decode_bboxes.clear();
665 all_decode_bboxes.resize(num);
666 for (int i = 0; i < num; ++i)
668 // Decode predictions into bboxes.
669 const LabelBBox& loc_preds = all_loc_preds[i];
670 LabelBBox& decode_bboxes = all_decode_bboxes[i];
671 for (int c = 0; c < num_loc_classes; ++c)
673 int label = share_location ? -1 : c;
674 if (label == background_label_id)
675 continue; // Ignore background class.
676 LabelBBox::const_iterator label_loc_preds = loc_preds.find(label);
677 if (label_loc_preds == loc_preds.end())
678 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
679 DecodeBBoxes(prior_bboxes, prior_variances,
680 code_type, variance_encoded_in_target, clip,
681 label_loc_preds->second, decode_bboxes[label]);
686 // Get prior bounding boxes from prior_data
687 // prior_data: 1 x 2 x num_priors * 4 x 1 blob.
688 // num_priors: number of priors.
689 // prior_bboxes: stores all the prior bboxes in the format of util::NormalizedBBox.
690 // prior_variances: stores all the variances needed by prior bboxes.
691 static void GetPriorBBoxes(const float* priorData, const int& numPriors,
692 std::vector<util::NormalizedBBox>& priorBBoxes,
693 std::vector<std::vector<float> >& priorVariances)
695 priorBBoxes.clear(); priorBBoxes.resize(numPriors);
696 priorVariances.clear(); priorVariances.resize(numPriors);
697 for (int i = 0; i < numPriors; ++i)
699 int startIdx = i * 4;
700 util::NormalizedBBox& bbox = priorBBoxes[i];
701 bbox.xmin = priorData[startIdx];
702 bbox.ymin = priorData[startIdx + 1];
703 bbox.xmax = priorData[startIdx + 2];
704 bbox.ymax = priorData[startIdx + 3];
705 bbox.set_size(BBoxSize<true>(bbox));
708 for (int i = 0; i < numPriors; ++i)
710 int startIdx = (numPriors + i) * 4;
711 // not needed here: priorVariances[i].clear();
712 for (int j = 0; j < 4; ++j)
714 priorVariances[i].push_back(priorData[startIdx + j]);
719 // Get location predictions from loc_data.
720 // loc_data: num x num_preds_per_class * num_loc_classes * 4 blob.
721 // num: the number of images.
722 // num_preds_per_class: number of predictions per class.
723 // num_loc_classes: number of location classes. It is 1 if share_location is
724 // true; and is equal to number of classes needed to predict otherwise.
725 // share_location: if true, all classes share the same location prediction.
726 // loc_pred_transposed: if true, represent four bounding box values as
727 // [y,x,height,width] or [x,y,width,height] otherwise.
728 // loc_preds: stores the location prediction, where each item contains
729 // location prediction for an image.
730 static void GetLocPredictions(const float* locData, const int num,
731 const int numPredsPerClass, const int numLocClasses,
732 const bool shareLocation, const bool locPredTransposed,
733 std::vector<LabelBBox>& locPreds)
738 CV_Assert(numLocClasses == 1);
740 locPreds.resize(num);
741 for (int i = 0; i < num; ++i, locData += numPredsPerClass * numLocClasses * 4)
743 LabelBBox& labelBBox = locPreds[i];
744 for (int p = 0; p < numPredsPerClass; ++p)
746 int startIdx = p * numLocClasses * 4;
747 for (int c = 0; c < numLocClasses; ++c)
749 int label = shareLocation ? -1 : c;
750 if (labelBBox.find(label) == labelBBox.end())
752 labelBBox[label].resize(numPredsPerClass);
754 util::NormalizedBBox& bbox = labelBBox[label][p];
755 if (locPredTransposed)
757 bbox.ymin = locData[startIdx + c * 4];
758 bbox.xmin = locData[startIdx + c * 4 + 1];
759 bbox.ymax = locData[startIdx + c * 4 + 2];
760 bbox.xmax = locData[startIdx + c * 4 + 3];
764 bbox.xmin = locData[startIdx + c * 4];
765 bbox.ymin = locData[startIdx + c * 4 + 1];
766 bbox.xmax = locData[startIdx + c * 4 + 2];
767 bbox.ymax = locData[startIdx + c * 4 + 3];
774 // Get confidence predictions from conf_data.
775 // conf_data: num x num_preds_per_class * num_classes blob.
776 // num: the number of images.
777 // num_preds_per_class: number of predictions per class.
778 // num_classes: number of classes.
779 // conf_preds: stores the confidence prediction, where each item contains
780 // confidence prediction for an image.
781 static void GetConfidenceScores(const float* confData, const int num,
782 const int numPredsPerClass, const int numClasses,
783 std::vector<Mat>& confPreds)
785 int shape[] = { numClasses, numPredsPerClass };
786 for (int i = 0; i < num; i++)
787 confPreds.push_back(Mat(2, shape, CV_32F));
789 for (int i = 0; i < num; ++i, confData += numPredsPerClass * numClasses)
791 Mat labelScores = confPreds[i];
792 for (int c = 0; c < numClasses; ++c)
794 for (int p = 0; p < numPredsPerClass; ++p)
796 labelScores.at<float>(c, p) = confData[p * numClasses + c];
802 // Compute the jaccard (intersection over union IoU) overlap between two bboxes.
803 template<bool normalized>
804 static float JaccardOverlap(const util::NormalizedBBox& bbox1,
805 const util::NormalizedBBox& bbox2)
807 util::NormalizedBBox intersect_bbox;
808 if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin ||
809 bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin)
811 // Return [0, 0, 0, 0] if there is no intersection.
812 intersect_bbox.xmin = 0;
813 intersect_bbox.ymin = 0;
814 intersect_bbox.xmax = 0;
815 intersect_bbox.ymax = 0;
819 intersect_bbox.xmin = std::max(bbox1.xmin, bbox2.xmin);
820 intersect_bbox.ymin = std::max(bbox1.ymin, bbox2.ymin);
821 intersect_bbox.xmax = std::min(bbox1.xmax, bbox2.xmax);
822 intersect_bbox.ymax = std::min(bbox1.ymax, bbox2.ymax);
825 float intersect_width, intersect_height;
826 intersect_width = intersect_bbox.xmax - intersect_bbox.xmin;
827 intersect_height = intersect_bbox.ymax - intersect_bbox.ymin;
828 if (intersect_width > 0 && intersect_height > 0)
835 float intersect_size = intersect_width * intersect_height;
836 float bbox1_size = BBoxSize<true>(bbox1);
837 float bbox2_size = BBoxSize<true>(bbox2);
838 return intersect_size / (bbox1_size + bbox2_size - intersect_size);
847 float util::caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)
849 return DetectionOutputLayerImpl::JaccardOverlap<true>(a, b);
852 const std::string DetectionOutputLayerImpl::_layerName = std::string("DetectionOutput");
854 Ptr<DetectionOutputLayer> DetectionOutputLayer::create(const LayerParams ¶ms)
856 return Ptr<DetectionOutputLayer>(new DetectionOutputLayerImpl(params));