modules/dnn/src/layers/detection_output_layer.cpp

   1 /*M ///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include "../precomp.hpp"
  44 #include "layers_common.hpp"
  45 #include <float.h>
  46 #include <string>
  47 #include "../nms.inl.hpp"
  48 #include "opencl_kernels_dnn.hpp"
  49
  50 namespace cv
  51 {
  52 namespace dnn
  53 {
  54
  55 namespace util
  56 {
  57
  58 class NormalizedBBox
  59 {
  60 public:
  61     float xmin, ymin, xmax, ymax;
  62
  63     NormalizedBBox()
  64         : xmin(0), ymin(0), xmax(0), ymax(0), has_size_(false), size_(0) {}
  65
  66     float size() const { return size_; }
  67
  68     bool has_size() const { return has_size_; }
  69
  70     void set_size(float value) { size_ = value; has_size_ = true; }
  71
  72     void clear_size() { size_ = 0; has_size_ = false; }
  73
  74 private:
  75     bool has_size_;
  76     float size_;
  77 };
  78
  79 template <typename T>
  80 static inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
  81                           const std::pair<float, T>& pair2)
  82 {
  83     return pair1.first > pair2.first;
  84 }
  85
  86 static inline float caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);
  87
  88 } // namespace
  89
  90 class DetectionOutputLayerImpl : public DetectionOutputLayer
  91 {
  92 public:
  93     unsigned _numClasses;
  94     bool _shareLocation;
  95     int _numLocClasses;
  96
  97     int _backgroundLabelId;
  98
  99     cv::String _codeType;
 100
 101     bool _varianceEncodedInTarget;
 102     int _keepTopK;
 103     float _confidenceThreshold;
 104
 105     float _nmsThreshold;
 106     int _topK;
 107     // Whenever predicted bounding boxes are respresented in YXHW instead of XYWH layout.
 108     bool _locPredTransposed;
 109
 110     enum { _numAxes = 4 };
 111     static const std::string _layerName;
 112
 113     typedef std::map<int, std::vector<util::NormalizedBBox> > LabelBBox;
 114
 115     bool getParameterDict(const LayerParams &params,
 116                           const std::string &parameterName,
 117                           DictValue& result)
 118     {
 119         if (!params.has(parameterName))
 120         {
 121             return false;
 122         }
 123
 124         result = params.get(parameterName);
 125         return true;
 126     }
 127
 128     template<typename T>
 129     T getParameter(const LayerParams &params,
 130                    const std::string &parameterName,
 131                    const size_t &idx=0,
 132                    const bool required=true,
 133                    const T& defaultValue=T())
 134     {
 135         DictValue dictValue;
 136         bool success = getParameterDict(params, parameterName, dictValue);
 137         if(!success)
 138         {
 139             if(required)
 140             {
 141                 std::string message = _layerName;
 142                 message += " layer parameter does not contain ";
 143                 message += parameterName;
 144                 message += " parameter.";
 145                 CV_ErrorNoReturn(Error::StsBadArg, message);
 146             }
 147             else
 148             {
 149                 return defaultValue;
 150             }
 151         }
 152         return dictValue.get<T>(idx);
 153     }
 154
 155     void getCodeType(const LayerParams &params)
 156     {
 157         String codeTypeString = params.get<String>("code_type").toLowerCase();
 158         if (codeTypeString == "center_size")
 159             _codeType = "CENTER_SIZE";
 160         else
 161             _codeType = "CORNER";
 162     }
 163
 164     DetectionOutputLayerImpl(const LayerParams &params)
 165     {
 166         _numClasses = getParameter<unsigned>(params, "num_classes");
 167         _shareLocation = getParameter<bool>(params, "share_location");
 168         _numLocClasses = _shareLocation ? 1 : _numClasses;
 169         _backgroundLabelId = getParameter<int>(params, "background_label_id");
 170         _varianceEncodedInTarget = getParameter<bool>(params, "variance_encoded_in_target", 0, false, false);
 171         _keepTopK = getParameter<int>(params, "keep_top_k");
 172         _confidenceThreshold = getParameter<float>(params, "confidence_threshold", 0, false, -FLT_MAX);
 173         _topK = getParameter<int>(params, "top_k", 0, false, -1);
 174         _locPredTransposed = getParameter<bool>(params, "loc_pred_transposed", 0, false, false);
 175
 176         getCodeType(params);
 177
 178         // Parameters used in nms.
 179         _nmsThreshold = getParameter<float>(params, "nms_threshold");
 180         CV_Assert(_nmsThreshold > 0.);
 181
 182         setParamsFrom(params);
 183     }
 184
 185     void checkInputs(const std::vector<Mat*> &inputs)
 186     {
 187         for (size_t i = 1; i < inputs.size(); i++)
 188         {
 189             CV_Assert(inputs[i]->size == inputs[0]->size);
 190         }
 191     }
 192
 193     bool getMemoryShapes(const std::vector<MatShape> &inputs,
 194                          const int requiredOutputs,
 195                          std::vector<MatShape> &outputs,
 196                          std::vector<MatShape> &internals) const
 197     {
 198         CV_Assert(inputs.size() > 0);
 199         CV_Assert(inputs[0][0] == inputs[1][0]);
 200
 201         int numPriors = inputs[2][2] / 4;
 202         CV_Assert((numPriors * _numLocClasses * 4) == inputs[0][1]);
 203         CV_Assert(int(numPriors * _numClasses) == inputs[1][1]);
 204
 205         // num() and channels() are 1.
 206         // Since the number of bboxes to be kept is unknown before nms, we manually
 207         // set it to (fake) 1.
 208         // Each row is a 7 dimension std::vector, which stores
 209         // [image_id, label, confidence, xmin, ymin, xmax, ymax]
 210         outputs.resize(1, shape(1, 1, 1, 7));
 211
 212         return false;
 213     }
 214
 215 #ifdef HAVE_OPENCL
 216     // Decode all bboxes in a batch
 217     bool ocl_DecodeBBoxesAll(UMat& loc_mat, UMat& prior_mat,
 218                              const int num, const int numPriors, const bool share_location,
 219                              const int num_loc_classes, const int background_label_id,
 220                              const cv::String& code_type, const bool variance_encoded_in_target,
 221                              const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
 222     {
 223         UMat outmat = UMat(loc_mat.dims, loc_mat.size, CV_32F);
 224         size_t nthreads = loc_mat.total();
 225         String kernel_name;
 226
 227         if (code_type == "CORNER")
 228             kernel_name = "DecodeBBoxesCORNER";
 229         else if (code_type == "CENTER_SIZE")
 230             kernel_name = "DecodeBBoxesCENTER_SIZE";
 231         else
 232             return false;
 233
 234         for (int i = 0; i < num; ++i)
 235         {
 236             ocl::Kernel kernel(kernel_name.c_str(), ocl::dnn::detection_output_oclsrc);
 237             kernel.set(0, (int)nthreads);
 238             kernel.set(1, ocl::KernelArg::PtrReadOnly(loc_mat));
 239             kernel.set(2, ocl::KernelArg::PtrReadOnly(prior_mat));
 240             kernel.set(3, (int)variance_encoded_in_target);
 241             kernel.set(4, (int)numPriors);
 242             kernel.set(5, (int)share_location);
 243             kernel.set(6, (int)num_loc_classes);
 244             kernel.set(7, (int)background_label_id);
 245             kernel.set(8, (int)clip);
 246             kernel.set(9, ocl::KernelArg::PtrWriteOnly(outmat));
 247
 248             if (!kernel.run(1, &nthreads, NULL, false))
 249                 return false;
 250         }
 251
 252         all_decode_bboxes.clear();
 253         all_decode_bboxes.resize(num);
 254         {
 255             Mat mat = outmat.getMat(ACCESS_READ);
 256             const float* decode_data = mat.ptr<float>();
 257             for (int i = 0; i < num; ++i)
 258             {
 259                 LabelBBox& decode_bboxes = all_decode_bboxes[i];
 260                 for (int c = 0; c < num_loc_classes; ++c)
 261                 {
 262                     int label = share_location ? -1 : c;
 263                     decode_bboxes[label].resize(numPriors);
 264                     for (int p = 0; p < numPriors; ++p)
 265                     {
 266                         int startIdx = p * num_loc_classes * 4;
 267                         util::NormalizedBBox& bbox = decode_bboxes[label][p];
 268                         bbox.xmin = decode_data[startIdx + c * 4];
 269                         bbox.ymin = decode_data[startIdx + c * 4 + 1];
 270                         bbox.xmax = decode_data[startIdx + c * 4 + 2];
 271                         bbox.ymax = decode_data[startIdx + c * 4 + 3];
 272                     }
 273                 }
 274             }
 275         }
 276         return true;
 277     }
 278
 279     void ocl_GetConfidenceScores(const UMat& inp1, const int num,
 280                                  const int numPredsPerClass, const int numClasses,
 281                                  std::vector<Mat>& confPreds)
 282     {
 283         int shape[] = { numClasses, numPredsPerClass };
 284         for (int i = 0; i < num; i++)
 285             confPreds.push_back(Mat(2, shape, CV_32F));
 286
 287         UMat umat = inp1.reshape(1, num * numPredsPerClass);
 288         for (int i = 0; i < num; ++i)
 289         {
 290             Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() };
 291             transpose(umat(ranges), confPreds[i]);
 292         }
 293     }
 294
 295     bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
 296     {
 297         std::vector<UMat> inputs;
 298         std::vector<UMat> outputs;
 299
 300         inps.getUMatVector(inputs);
 301         outs.getUMatVector(outputs);
 302
 303         std::vector<LabelBBox> allDecodedBBoxes;
 304         std::vector<Mat> allConfidenceScores;
 305
 306         int num = inputs[0].size[0];
 307
 308         // extract predictions from input layers
 309         {
 310             int numPriors = inputs[2].size[2] / 4;
 311
 312             // Retrieve all confidences
 313             ocl_GetConfidenceScores(inputs[1], num, numPriors, _numClasses, allConfidenceScores);
 314
 315             // Decode all loc predictions to bboxes
 316             bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors,
 317                                            _shareLocation, _numLocClasses, _backgroundLabelId,
 318                                            _codeType, _varianceEncodedInTarget, false,
 319                                            allDecodedBBoxes);
 320             if (!ret)
 321                 return false;
 322         }
 323
 324         size_t numKept = 0;
 325         std::vector<std::map<int, std::vector<int> > > allIndices;
 326         for (int i = 0; i < num; ++i)
 327         {
 328             numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
 329         }
 330
 331         if (numKept == 0)
 332         {
 333             // Set confidences to zeros.
 334             Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
 335             outputs[0](ranges).setTo(0);
 336             return true;
 337         }
 338         int outputShape[] = {1, 1, (int)numKept, 7};
 339         UMat umat = UMat(4, outputShape, CV_32F);
 340         {
 341             Mat mat = umat.getMat(ACCESS_WRITE);
 342             float* outputsData = mat.ptr<float>();
 343
 344             size_t count = 0;
 345             for (int i = 0; i < num; ++i)
 346             {
 347                 count += outputDetections_(i, &outputsData[count * 7],
 348                                            allDecodedBBoxes[i], allConfidenceScores[i],
 349                                            allIndices[i]);
 350             }
 351             CV_Assert(count == numKept);
 352         }
 353         outputs.clear();
 354         outputs.push_back(umat);
 355         outs.assign(outputs);
 356         return true;
 357     }
 358 #endif
 359
 360     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
 361     {
 362         CV_TRACE_FUNCTION();
 363         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 364
 365         CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
 366                    OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
 367                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 368
 369         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
 370     }
 371
 372     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
 373     {
 374         CV_TRACE_FUNCTION();
 375         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 376
 377         std::vector<LabelBBox> allDecodedBBoxes;
 378         std::vector<Mat> allConfidenceScores;
 379
 380         int num = inputs[0]->size[0];
 381
 382         // extract predictions from input layers
 383         {
 384             int numPriors = inputs[2]->size[2] / 4;
 385
 386             const float* locationData = inputs[0]->ptr<float>();
 387             const float* confidenceData = inputs[1]->ptr<float>();
 388             const float* priorData = inputs[2]->ptr<float>();
 389
 390             // Retrieve all location predictions
 391             std::vector<LabelBBox> allLocationPredictions;
 392             GetLocPredictions(locationData, num, numPriors, _numLocClasses,
 393                               _shareLocation, _locPredTransposed, allLocationPredictions);
 394
 395             // Retrieve all confidences
 396             GetConfidenceScores(confidenceData, num, numPriors, _numClasses, allConfidenceScores);
 397
 398             // Retrieve all prior bboxes
 399             std::vector<util::NormalizedBBox> priorBBoxes;
 400             std::vector<std::vector<float> > priorVariances;
 401             GetPriorBBoxes(priorData, numPriors, priorBBoxes, priorVariances);
 402
 403             // Decode all loc predictions to bboxes
 404             DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,
 405                             _shareLocation, _numLocClasses, _backgroundLabelId,
 406                             _codeType, _varianceEncodedInTarget, false, allDecodedBBoxes);
 407         }
 408
 409         size_t numKept = 0;
 410         std::vector<std::map<int, std::vector<int> > > allIndices;
 411         for (int i = 0; i < num; ++i)
 412         {
 413             numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
 414         }
 415
 416         if (numKept == 0)
 417         {
 418             // Set confidences to zeros.
 419             Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
 420             outputs[0](ranges).setTo(0);
 421             return;
 422         }
 423         int outputShape[] = {1, 1, (int)numKept, 7};
 424         outputs[0].create(4, outputShape, CV_32F);
 425         float* outputsData = outputs[0].ptr<float>();
 426
 427         size_t count = 0;
 428         for (int i = 0; i < num; ++i)
 429         {
 430             count += outputDetections_(i, &outputsData[count * 7],
 431                                        allDecodedBBoxes[i], allConfidenceScores[i],
 432                                        allIndices[i]);
 433         }
 434         CV_Assert(count == numKept);
 435     }
 436
 437     size_t outputDetections_(
 438             const int i, float* outputsData,
 439             const LabelBBox& decodeBBoxes, Mat& confidenceScores,
 440             const std::map<int, std::vector<int> >& indicesMap
 441     )
 442     {
 443         size_t count = 0;
 444         for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
 445         {
 446             int label = it->first;
 447             if (confidenceScores.rows <= label)
 448                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
 449             const std::vector<float>& scores = confidenceScores.row(label);
 450             int locLabel = _shareLocation ? -1 : label;
 451             LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(locLabel);
 452             if (label_bboxes == decodeBBoxes.end())
 453                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", locLabel));
 454             const std::vector<int>& indices = it->second;
 455
 456             for (size_t j = 0; j < indices.size(); ++j, ++count)
 457             {
 458                 int idx = indices[j];
 459                 const util::NormalizedBBox& decode_bbox = label_bboxes->second[idx];
 460                 outputsData[count * 7] = i;
 461                 outputsData[count * 7 + 1] = label;
 462                 outputsData[count * 7 + 2] = scores[idx];
 463                 outputsData[count * 7 + 3] = decode_bbox.xmin;
 464                 outputsData[count * 7 + 4] = decode_bbox.ymin;
 465                 outputsData[count * 7 + 5] = decode_bbox.xmax;
 466                 outputsData[count * 7 + 6] = decode_bbox.ymax;
 467             }
 468         }
 469         return count;
 470     }
 471
 472     size_t processDetections_(
 473             const LabelBBox& decodeBBoxes, Mat& confidenceScores,
 474             std::vector<std::map<int, std::vector<int> > >& allIndices
 475     )
 476     {
 477         std::map<int, std::vector<int> > indices;
 478         size_t numDetections = 0;
 479         for (int c = 0; c < (int)_numClasses; ++c)
 480         {
 481             if (c == _backgroundLabelId)
 482                 continue; // Ignore background class.
 483             if (c >= confidenceScores.rows)
 484                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", c));
 485
 486             const std::vector<float> scores = confidenceScores.row(c);
 487             int label = _shareLocation ? -1 : c;
 488
 489             LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label);
 490             if (label_bboxes == decodeBBoxes.end())
 491                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
 492             NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,
 493                 indices[c], util::caffe_box_overlap);
 494             numDetections += indices[c].size();
 495         }
 496         if (_keepTopK > -1 && numDetections > (size_t)_keepTopK)
 497         {
 498             std::vector<std::pair<float, std::pair<int, int> > > scoreIndexPairs;
 499             for (std::map<int, std::vector<int> >::iterator it = indices.begin();
 500                  it != indices.end(); ++it)
 501             {
 502                 int label = it->first;
 503                 const std::vector<int>& labelIndices = it->second;
 504                 if (label >= confidenceScores.rows)
 505                     CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
 506                 const std::vector<float>& scores = confidenceScores.row(label);
 507                 for (size_t j = 0; j < labelIndices.size(); ++j)
 508                 {
 509                     size_t idx = labelIndices[j];
 510                     CV_Assert(idx < scores.size());
 511                     scoreIndexPairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
 512                 }
 513             }
 514             // Keep outputs k results per image.
 515             std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(),
 516                       util::SortScorePairDescend<std::pair<int, int> >);
 517             scoreIndexPairs.resize(_keepTopK);
 518
 519             std::map<int, std::vector<int> > newIndices;
 520             for (size_t j = 0; j < scoreIndexPairs.size(); ++j)
 521             {
 522                 int label = scoreIndexPairs[j].second.first;
 523                 int idx = scoreIndexPairs[j].second.second;
 524                 newIndices[label].push_back(idx);
 525             }
 526             allIndices.push_back(newIndices);
 527             return (size_t)_keepTopK;
 528         }
 529         else
 530         {
 531             allIndices.push_back(indices);
 532             return numDetections;
 533         }
 534     }
 535
 536
 537     // **************************************************************
 538     // Utility functions
 539     // **************************************************************
 540
 541     // Compute bbox size
 542     template<bool normalized>
 543     static float BBoxSize(const util::NormalizedBBox& bbox)
 544     {
 545         if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin)
 546         {
 547             return 0; // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
 548         }
 549         else
 550         {
 551             if (bbox.has_size())
 552             {
 553                 return bbox.size();
 554             }
 555             else
 556             {
 557                 float width = bbox.xmax - bbox.xmin;
 558                 float height = bbox.ymax - bbox.ymin;
 559                 if (normalized)
 560                 {
 561                     return width * height;
 562                 }
 563                 else
 564                 {
 565                     // If bbox is not within range [0, 1].
 566                     return (width + 1) * (height + 1);
 567                 }
 568             }
 569         }
 570     }
 571
 572
 573     // Decode a bbox according to a prior bbox
 574     template<bool variance_encoded_in_target>
 575     static void DecodeBBox(
 576         const util::NormalizedBBox& prior_bbox, const std::vector<float>& prior_variance,
 577         const cv::String& code_type,
 578         const bool clip_bbox, const util::NormalizedBBox& bbox,
 579         util::NormalizedBBox& decode_bbox)
 580     {
 581         float bbox_xmin = variance_encoded_in_target ? bbox.xmin : prior_variance[0] * bbox.xmin;
 582         float bbox_ymin = variance_encoded_in_target ? bbox.ymin : prior_variance[1] * bbox.ymin;
 583         float bbox_xmax = variance_encoded_in_target ? bbox.xmax : prior_variance[2] * bbox.xmax;
 584         float bbox_ymax = variance_encoded_in_target ? bbox.ymax : prior_variance[3] * bbox.ymax;
 585         if (code_type == "CORNER")
 586         {
 587             decode_bbox.xmin = prior_bbox.xmin + bbox_xmin;
 588             decode_bbox.ymin = prior_bbox.ymin + bbox_ymin;
 589             decode_bbox.xmax = prior_bbox.xmax + bbox_xmax;
 590             decode_bbox.ymax = prior_bbox.ymax + bbox_ymax;
 591         }
 592         else if (code_type == "CENTER_SIZE")
 593         {
 594             float prior_width = prior_bbox.xmax - prior_bbox.xmin;
 595             CV_Assert(prior_width > 0);
 596             float prior_height = prior_bbox.ymax - prior_bbox.ymin;
 597             CV_Assert(prior_height > 0);
 598             float prior_center_x = (prior_bbox.xmin + prior_bbox.xmax) * .5;
 599             float prior_center_y = (prior_bbox.ymin + prior_bbox.ymax) * .5;
 600
 601             float decode_bbox_center_x, decode_bbox_center_y;
 602             float decode_bbox_width, decode_bbox_height;
 603             decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
 604             decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
 605             decode_bbox_width = exp(bbox_xmax) * prior_width;
 606             decode_bbox_height = exp(bbox_ymax) * prior_height;
 607             decode_bbox.xmin = decode_bbox_center_x - decode_bbox_width * .5;
 608             decode_bbox.ymin = decode_bbox_center_y - decode_bbox_height * .5;
 609             decode_bbox.xmax = decode_bbox_center_x + decode_bbox_width * .5;
 610             decode_bbox.ymax = decode_bbox_center_y + decode_bbox_height * .5;
 611         }
 612         else
 613             CV_ErrorNoReturn(Error::StsBadArg, "Unknown type.");
 614
 615         if (clip_bbox)
 616         {
 617             // Clip the util::NormalizedBBox such that the range for each corner is [0, 1]
 618             decode_bbox.xmin = std::max(std::min(decode_bbox.xmin, 1.f), 0.f);
 619             decode_bbox.ymin = std::max(std::min(decode_bbox.ymin, 1.f), 0.f);
 620             decode_bbox.xmax = std::max(std::min(decode_bbox.xmax, 1.f), 0.f);
 621             decode_bbox.ymax = std::max(std::min(decode_bbox.ymax, 1.f), 0.f);
 622         }
 623         decode_bbox.clear_size();
 624         decode_bbox.set_size(BBoxSize<true>(decode_bbox));
 625     }
 626
 627     // Decode a set of bboxes according to a set of prior bboxes
 628     static void DecodeBBoxes(
 629         const std::vector<util::NormalizedBBox>& prior_bboxes,
 630         const std::vector<std::vector<float> >& prior_variances,
 631         const cv::String& code_type, const bool variance_encoded_in_target,
 632         const bool clip_bbox, const std::vector<util::NormalizedBBox>& bboxes,
 633         std::vector<util::NormalizedBBox>& decode_bboxes)
 634     {
 635         CV_Assert(prior_bboxes.size() == prior_variances.size());
 636         CV_Assert(prior_bboxes.size() == bboxes.size());
 637         size_t num_bboxes = prior_bboxes.size();
 638         CV_Assert(num_bboxes == 0 || prior_variances[0].size() == 4);
 639         decode_bboxes.clear(); decode_bboxes.resize(num_bboxes);
 640         if(variance_encoded_in_target)
 641         {
 642             for (int i = 0; i < num_bboxes; ++i)
 643                 DecodeBBox<true>(prior_bboxes[i], prior_variances[i], code_type,
 644                                  clip_bbox, bboxes[i], decode_bboxes[i]);
 645         }
 646         else
 647         {
 648             for (int i = 0; i < num_bboxes; ++i)
 649                 DecodeBBox<false>(prior_bboxes[i], prior_variances[i], code_type,
 650                                   clip_bbox, bboxes[i], decode_bboxes[i]);
 651         }
 652     }
 653
 654     // Decode all bboxes in a batch
 655     static void DecodeBBoxesAll(const std::vector<LabelBBox>& all_loc_preds,
 656         const std::vector<util::NormalizedBBox>& prior_bboxes,
 657         const std::vector<std::vector<float> >& prior_variances,
 658         const int num, const bool share_location,
 659         const int num_loc_classes, const int background_label_id,
 660         const cv::String& code_type, const bool variance_encoded_in_target,
 661         const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
 662     {
 663         CV_Assert(all_loc_preds.size() == num);
 664         all_decode_bboxes.clear();
 665         all_decode_bboxes.resize(num);
 666         for (int i = 0; i < num; ++i)
 667         {
 668             // Decode predictions into bboxes.
 669             const LabelBBox& loc_preds = all_loc_preds[i];
 670             LabelBBox& decode_bboxes = all_decode_bboxes[i];
 671             for (int c = 0; c < num_loc_classes; ++c)
 672             {
 673                 int label = share_location ? -1 : c;
 674                 if (label == background_label_id)
 675                     continue; // Ignore background class.
 676                 LabelBBox::const_iterator label_loc_preds = loc_preds.find(label);
 677                 if (label_loc_preds == loc_preds.end())
 678                     CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
 679                 DecodeBBoxes(prior_bboxes, prior_variances,
 680                              code_type, variance_encoded_in_target, clip,
 681                              label_loc_preds->second, decode_bboxes[label]);
 682             }
 683         }
 684     }
 685
 686     // Get prior bounding boxes from prior_data
 687     //    prior_data: 1 x 2 x num_priors * 4 x 1 blob.
 688     //    num_priors: number of priors.
 689     //    prior_bboxes: stores all the prior bboxes in the format of util::NormalizedBBox.
 690     //    prior_variances: stores all the variances needed by prior bboxes.
 691     static void GetPriorBBoxes(const float* priorData, const int& numPriors,
 692                         std::vector<util::NormalizedBBox>& priorBBoxes,
 693                         std::vector<std::vector<float> >& priorVariances)
 694     {
 695         priorBBoxes.clear(); priorBBoxes.resize(numPriors);
 696         priorVariances.clear(); priorVariances.resize(numPriors);
 697         for (int i = 0; i < numPriors; ++i)
 698         {
 699             int startIdx = i * 4;
 700             util::NormalizedBBox& bbox = priorBBoxes[i];
 701             bbox.xmin = priorData[startIdx];
 702             bbox.ymin = priorData[startIdx + 1];
 703             bbox.xmax = priorData[startIdx + 2];
 704             bbox.ymax = priorData[startIdx + 3];
 705             bbox.set_size(BBoxSize<true>(bbox));
 706         }
 707
 708         for (int i = 0; i < numPriors; ++i)
 709         {
 710             int startIdx = (numPriors + i) * 4;
 711             // not needed here: priorVariances[i].clear();
 712             for (int j = 0; j < 4; ++j)
 713             {
 714                 priorVariances[i].push_back(priorData[startIdx + j]);
 715             }
 716         }
 717     }
 718
 719     // Get location predictions from loc_data.
 720     //    loc_data: num x num_preds_per_class * num_loc_classes * 4 blob.
 721     //    num: the number of images.
 722     //    num_preds_per_class: number of predictions per class.
 723     //    num_loc_classes: number of location classes. It is 1 if share_location is
 724     //      true; and is equal to number of classes needed to predict otherwise.
 725     //    share_location: if true, all classes share the same location prediction.
 726     //    loc_pred_transposed: if true, represent four bounding box values as
 727     //                         [y,x,height,width] or [x,y,width,height] otherwise.
 728     //    loc_preds: stores the location prediction, where each item contains
 729     //      location prediction for an image.
 730     static void GetLocPredictions(const float* locData, const int num,
 731                            const int numPredsPerClass, const int numLocClasses,
 732                            const bool shareLocation, const bool locPredTransposed,
 733                            std::vector<LabelBBox>& locPreds)
 734     {
 735         locPreds.clear();
 736         if (shareLocation)
 737         {
 738             CV_Assert(numLocClasses == 1);
 739         }
 740         locPreds.resize(num);
 741         for (int i = 0; i < num; ++i, locData += numPredsPerClass * numLocClasses * 4)
 742         {
 743             LabelBBox& labelBBox = locPreds[i];
 744             for (int p = 0; p < numPredsPerClass; ++p)
 745             {
 746                 int startIdx = p * numLocClasses * 4;
 747                 for (int c = 0; c < numLocClasses; ++c)
 748                 {
 749                     int label = shareLocation ? -1 : c;
 750                     if (labelBBox.find(label) == labelBBox.end())
 751                     {
 752                         labelBBox[label].resize(numPredsPerClass);
 753                     }
 754                     util::NormalizedBBox& bbox = labelBBox[label][p];
 755                     if (locPredTransposed)
 756                     {
 757                         bbox.ymin = locData[startIdx + c * 4];
 758                         bbox.xmin = locData[startIdx + c * 4 + 1];
 759                         bbox.ymax = locData[startIdx + c * 4 + 2];
 760                         bbox.xmax = locData[startIdx + c * 4 + 3];
 761                     }
 762                     else
 763                     {
 764                         bbox.xmin = locData[startIdx + c * 4];
 765                         bbox.ymin = locData[startIdx + c * 4 + 1];
 766                         bbox.xmax = locData[startIdx + c * 4 + 2];
 767                         bbox.ymax = locData[startIdx + c * 4 + 3];
 768                     }
 769                 }
 770             }
 771         }
 772     }
 773
 774     // Get confidence predictions from conf_data.
 775     //    conf_data: num x num_preds_per_class * num_classes blob.
 776     //    num: the number of images.
 777     //    num_preds_per_class: number of predictions per class.
 778     //    num_classes: number of classes.
 779     //    conf_preds: stores the confidence prediction, where each item contains
 780     //      confidence prediction for an image.
 781     static void GetConfidenceScores(const float* confData, const int num,
 782                              const int numPredsPerClass, const int numClasses,
 783                              std::vector<Mat>& confPreds)
 784     {
 785         int shape[] = { numClasses, numPredsPerClass };
 786         for (int i = 0; i < num; i++)
 787             confPreds.push_back(Mat(2, shape, CV_32F));
 788
 789         for (int i = 0; i < num; ++i, confData += numPredsPerClass * numClasses)
 790         {
 791             Mat labelScores = confPreds[i];
 792             for (int c = 0; c < numClasses; ++c)
 793             {
 794                 for (int p = 0; p < numPredsPerClass; ++p)
 795                 {
 796                     labelScores.at<float>(c, p) = confData[p * numClasses + c];
 797                 }
 798             }
 799         }
 800     }
 801
 802     // Compute the jaccard (intersection over union IoU) overlap between two bboxes.
 803     template<bool normalized>
 804     static float JaccardOverlap(const util::NormalizedBBox& bbox1,
 805                          const util::NormalizedBBox& bbox2)
 806     {
 807         util::NormalizedBBox intersect_bbox;
 808         if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin ||
 809             bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin)
 810         {
 811             // Return [0, 0, 0, 0] if there is no intersection.
 812             intersect_bbox.xmin = 0;
 813             intersect_bbox.ymin = 0;
 814             intersect_bbox.xmax = 0;
 815             intersect_bbox.ymax = 0;
 816         }
 817         else
 818         {
 819             intersect_bbox.xmin = std::max(bbox1.xmin, bbox2.xmin);
 820             intersect_bbox.ymin = std::max(bbox1.ymin, bbox2.ymin);
 821             intersect_bbox.xmax = std::min(bbox1.xmax, bbox2.xmax);
 822             intersect_bbox.ymax = std::min(bbox1.ymax, bbox2.ymax);
 823         }
 824
 825         float intersect_width, intersect_height;
 826         intersect_width = intersect_bbox.xmax - intersect_bbox.xmin;
 827         intersect_height = intersect_bbox.ymax - intersect_bbox.ymin;
 828         if (intersect_width > 0 && intersect_height > 0)
 829         {
 830             if (!normalized)
 831             {
 832                 intersect_width++;
 833                 intersect_height++;
 834             }
 835             float intersect_size = intersect_width * intersect_height;
 836             float bbox1_size = BBoxSize<true>(bbox1);
 837             float bbox2_size = BBoxSize<true>(bbox2);
 838             return intersect_size / (bbox1_size + bbox2_size - intersect_size);
 839         }
 840         else
 841         {
 842             return 0.;
 843         }
 844     }
 845 };
 846
 847 float util::caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)
 848 {
 849     return DetectionOutputLayerImpl::JaccardOverlap<true>(a, b);
 850 }
 851
 852 const std::string DetectionOutputLayerImpl::_layerName = std::string("DetectionOutput");
 853
 854 Ptr<DetectionOutputLayer> DetectionOutputLayer::create(const LayerParams &params)
 855 {
 856     return Ptr<DetectionOutputLayer>(new DetectionOutputLayerImpl(params));
 857 }
 858
 859 }
 860 }