modules/dnn/src/layers/detection_output_layer.cpp

   1 /*M ///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include "../precomp.hpp"
  44 #include "layers_common.hpp"
  45 #include <float.h>
  46 #include <string>
  47 #include "../nms.inl.hpp"
  48 #include "opencl_kernels_dnn.hpp"
  49
  50 namespace cv
  51 {
  52 namespace dnn
  53 {
  54
  55 namespace util
  56 {
  57
  58 class NormalizedBBox
  59 {
  60 public:
  61     float xmin, ymin, xmax, ymax;
  62
  63     NormalizedBBox()
  64         : xmin(0), ymin(0), xmax(0), ymax(0), has_size_(false), size_(0) {}
  65
  66     float size() const { return size_; }
  67
  68     bool has_size() const { return has_size_; }
  69
  70     void set_size(float value) { size_ = value; has_size_ = true; }
  71
  72     void clear_size() { size_ = 0; has_size_ = false; }
  73
  74 private:
  75     bool has_size_;
  76     float size_;
  77 };
  78
  79 template <typename T>
  80 static inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
  81                           const std::pair<float, T>& pair2)
  82 {
  83     return pair1.first > pair2.first;
  84 }
  85
  86 static inline float caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);
  87
  88 static inline float caffe_norm_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);
  89
  90 } // namespace
  91
  92 class DetectionOutputLayerImpl : public DetectionOutputLayer
  93 {
  94 public:
  95     unsigned _numClasses;
  96     bool _shareLocation;
  97     int _numLocClasses;
  98
  99     int _backgroundLabelId;
 100
 101     cv::String _codeType;
 102
 103     bool _varianceEncodedInTarget;
 104     int _keepTopK;
 105     float _confidenceThreshold;
 106
 107     float _nmsThreshold;
 108     int _topK;
 109     // Whenever predicted bounding boxes are respresented in YXHW instead of XYWH layout.
 110     bool _locPredTransposed;
 111     // It's true whenever predicted bounding boxes and proposals are normalized to [0, 1].
 112     bool _bboxesNormalized;
 113     bool _clip;
 114
 115     enum { _numAxes = 4 };
 116     static const std::string _layerName;
 117
 118     typedef std::map<int, std::vector<util::NormalizedBBox> > LabelBBox;
 119
 120     bool getParameterDict(const LayerParams &params,
 121                           const std::string &parameterName,
 122                           DictValue& result)
 123     {
 124         if (!params.has(parameterName))
 125         {
 126             return false;
 127         }
 128
 129         result = params.get(parameterName);
 130         return true;
 131     }
 132
 133     template<typename T>
 134     T getParameter(const LayerParams &params,
 135                    const std::string &parameterName,
 136                    const size_t &idx=0,
 137                    const bool required=true,
 138                    const T& defaultValue=T())
 139     {
 140         DictValue dictValue;
 141         bool success = getParameterDict(params, parameterName, dictValue);
 142         if(!success)
 143         {
 144             if(required)
 145             {
 146                 std::string message = _layerName;
 147                 message += " layer parameter does not contain ";
 148                 message += parameterName;
 149                 message += " parameter.";
 150                 CV_ErrorNoReturn(Error::StsBadArg, message);
 151             }
 152             else
 153             {
 154                 return defaultValue;
 155             }
 156         }
 157         return dictValue.get<T>(idx);
 158     }
 159
 160     void getCodeType(const LayerParams &params)
 161     {
 162         String codeTypeString = params.get<String>("code_type").toLowerCase();
 163         if (codeTypeString == "center_size")
 164             _codeType = "CENTER_SIZE";
 165         else
 166             _codeType = "CORNER";
 167     }
 168
 169     DetectionOutputLayerImpl(const LayerParams &params)
 170     {
 171         _numClasses = getParameter<unsigned>(params, "num_classes");
 172         _shareLocation = getParameter<bool>(params, "share_location");
 173         _numLocClasses = _shareLocation ? 1 : _numClasses;
 174         _backgroundLabelId = getParameter<int>(params, "background_label_id");
 175         _varianceEncodedInTarget = getParameter<bool>(params, "variance_encoded_in_target", 0, false, false);
 176         _keepTopK = getParameter<int>(params, "keep_top_k");
 177         _confidenceThreshold = getParameter<float>(params, "confidence_threshold", 0, false, -FLT_MAX);
 178         _topK = getParameter<int>(params, "top_k", 0, false, -1);
 179         _locPredTransposed = getParameter<bool>(params, "loc_pred_transposed", 0, false, false);
 180         _bboxesNormalized = getParameter<bool>(params, "normalized_bbox", 0, false, true);
 181         _clip = getParameter<bool>(params, "clip", 0, false, false);
 182
 183         getCodeType(params);
 184
 185         // Parameters used in nms.
 186         _nmsThreshold = getParameter<float>(params, "nms_threshold");
 187         CV_Assert(_nmsThreshold > 0.);
 188
 189         setParamsFrom(params);
 190     }
 191
 192     bool getMemoryShapes(const std::vector<MatShape> &inputs,
 193                          const int requiredOutputs,
 194                          std::vector<MatShape> &outputs,
 195                          std::vector<MatShape> &internals) const
 196     {
 197         CV_Assert(inputs.size() >= 3);
 198         CV_Assert(inputs[0][0] == inputs[1][0]);
 199
 200         int numPriors = inputs[2][2] / 4;
 201         CV_Assert((numPriors * _numLocClasses * 4) == inputs[0][1]);
 202         CV_Assert(int(numPriors * _numClasses) == inputs[1][1]);
 203
 204         // num() and channels() are 1.
 205         // Since the number of bboxes to be kept is unknown before nms, we manually
 206         // set it to (fake) 1.
 207         // Each row is a 7 dimension std::vector, which stores
 208         // [image_id, label, confidence, xmin, ymin, xmax, ymax]
 209         outputs.resize(1, shape(1, 1, 1, 7));
 210
 211         return false;
 212     }
 213
 214 #ifdef HAVE_OPENCL
 215     // Decode all bboxes in a batch
 216     bool ocl_DecodeBBoxesAll(UMat& loc_mat, UMat& prior_mat,
 217                              const int num, const int numPriors, const bool share_location,
 218                              const int num_loc_classes, const int background_label_id,
 219                              const cv::String& code_type, const bool variance_encoded_in_target,
 220                              const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
 221     {
 222         UMat outmat = UMat(loc_mat.dims, loc_mat.size, CV_32F);
 223         size_t nthreads = loc_mat.total();
 224         String kernel_name;
 225
 226         if (code_type == "CORNER")
 227             kernel_name = "DecodeBBoxesCORNER";
 228         else if (code_type == "CENTER_SIZE")
 229             kernel_name = "DecodeBBoxesCENTER_SIZE";
 230         else
 231             return false;
 232
 233         for (int i = 0; i < num; ++i)
 234         {
 235             ocl::Kernel kernel(kernel_name.c_str(), ocl::dnn::detection_output_oclsrc);
 236             kernel.set(0, (int)nthreads);
 237             kernel.set(1, ocl::KernelArg::PtrReadOnly(loc_mat));
 238             kernel.set(2, ocl::KernelArg::PtrReadOnly(prior_mat));
 239             kernel.set(3, (int)variance_encoded_in_target);
 240             kernel.set(4, (int)numPriors);
 241             kernel.set(5, (int)share_location);
 242             kernel.set(6, (int)num_loc_classes);
 243             kernel.set(7, (int)background_label_id);
 244             kernel.set(8, (int)clip);
 245             kernel.set(9, ocl::KernelArg::PtrWriteOnly(outmat));
 246
 247             if (!kernel.run(1, &nthreads, NULL, false))
 248                 return false;
 249         }
 250
 251         all_decode_bboxes.clear();
 252         all_decode_bboxes.resize(num);
 253         {
 254             Mat mat = outmat.getMat(ACCESS_READ);
 255             const float* decode_data = mat.ptr<float>();
 256             for (int i = 0; i < num; ++i)
 257             {
 258                 LabelBBox& decode_bboxes = all_decode_bboxes[i];
 259                 for (int c = 0; c < num_loc_classes; ++c)
 260                 {
 261                     int label = share_location ? -1 : c;
 262                     decode_bboxes[label].resize(numPriors);
 263                     for (int p = 0; p < numPriors; ++p)
 264                     {
 265                         int startIdx = p * num_loc_classes * 4;
 266                         util::NormalizedBBox& bbox = decode_bboxes[label][p];
 267                         bbox.xmin = decode_data[startIdx + c * 4];
 268                         bbox.ymin = decode_data[startIdx + c * 4 + 1];
 269                         bbox.xmax = decode_data[startIdx + c * 4 + 2];
 270                         bbox.ymax = decode_data[startIdx + c * 4 + 3];
 271                     }
 272                 }
 273             }
 274         }
 275         return true;
 276     }
 277
 278     void ocl_GetConfidenceScores(const UMat& inp1, const int num,
 279                                  const int numPredsPerClass, const int numClasses,
 280                                  std::vector<Mat>& confPreds)
 281     {
 282         int shape[] = { numClasses, numPredsPerClass };
 283         for (int i = 0; i < num; i++)
 284             confPreds.push_back(Mat(2, shape, CV_32F));
 285
 286         UMat umat = inp1.reshape(1, num * numPredsPerClass);
 287         for (int i = 0; i < num; ++i)
 288         {
 289             Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() };
 290             transpose(umat(ranges), confPreds[i]);
 291         }
 292     }
 293
 294     bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
 295     {
 296         std::vector<UMat> inputs;
 297         std::vector<UMat> outputs;
 298
 299         inps.getUMatVector(inputs);
 300         outs.getUMatVector(outputs);
 301
 302         std::vector<LabelBBox> allDecodedBBoxes;
 303         std::vector<Mat> allConfidenceScores;
 304
 305         int num = inputs[0].size[0];
 306
 307         // extract predictions from input layers
 308         {
 309             int numPriors = inputs[2].size[2] / 4;
 310
 311             // Retrieve all confidences
 312             ocl_GetConfidenceScores(inputs[1], num, numPriors, _numClasses, allConfidenceScores);
 313
 314             // Decode all loc predictions to bboxes
 315             bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors,
 316                                            _shareLocation, _numLocClasses, _backgroundLabelId,
 317                                            _codeType, _varianceEncodedInTarget, false,
 318                                            allDecodedBBoxes);
 319             if (!ret)
 320                 return false;
 321         }
 322
 323         size_t numKept = 0;
 324         std::vector<std::map<int, std::vector<int> > > allIndices;
 325         for (int i = 0; i < num; ++i)
 326         {
 327             numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
 328         }
 329
 330         if (numKept == 0)
 331         {
 332             // Set confidences to zeros.
 333             Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
 334             outputs[0](ranges).setTo(0);
 335             return true;
 336         }
 337         int outputShape[] = {1, 1, (int)numKept, 7};
 338         UMat umat = UMat(4, outputShape, CV_32F);
 339         {
 340             Mat mat = umat.getMat(ACCESS_WRITE);
 341             float* outputsData = mat.ptr<float>();
 342
 343             size_t count = 0;
 344             for (int i = 0; i < num; ++i)
 345             {
 346                 count += outputDetections_(i, &outputsData[count * 7],
 347                                            allDecodedBBoxes[i], allConfidenceScores[i],
 348                                            allIndices[i]);
 349             }
 350             CV_Assert(count == numKept);
 351         }
 352         outputs.clear();
 353         outputs.push_back(umat);
 354         outs.assign(outputs);
 355         return true;
 356     }
 357 #endif
 358
 359     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
 360     {
 361         CV_TRACE_FUNCTION();
 362         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 363
 364         CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
 365                    OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
 366                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 367
 368         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
 369     }
 370
 371     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
 372     {
 373         CV_TRACE_FUNCTION();
 374         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 375
 376         std::vector<LabelBBox> allDecodedBBoxes;
 377         std::vector<Mat> allConfidenceScores;
 378
 379         int num = inputs[0]->size[0];
 380
 381         // extract predictions from input layers
 382         {
 383             int numPriors = inputs[2]->size[2] / 4;
 384
 385             const float* locationData = inputs[0]->ptr<float>();
 386             const float* confidenceData = inputs[1]->ptr<float>();
 387             const float* priorData = inputs[2]->ptr<float>();
 388
 389             // Retrieve all location predictions
 390             std::vector<LabelBBox> allLocationPredictions;
 391             GetLocPredictions(locationData, num, numPriors, _numLocClasses,
 392                               _shareLocation, _locPredTransposed, allLocationPredictions);
 393
 394             // Retrieve all confidences
 395             GetConfidenceScores(confidenceData, num, numPriors, _numClasses, allConfidenceScores);
 396
 397             // Retrieve all prior bboxes
 398             std::vector<util::NormalizedBBox> priorBBoxes;
 399             std::vector<std::vector<float> > priorVariances;
 400             GetPriorBBoxes(priorData, numPriors, _bboxesNormalized, priorBBoxes, priorVariances);
 401
 402             // Decode all loc predictions to bboxes
 403             util::NormalizedBBox clipBounds;
 404             if (_clip)
 405             {
 406                 CV_Assert(_bboxesNormalized || inputs.size() >= 4);
 407                 clipBounds.xmin = clipBounds.ymin = 0.0f;
 408                 if (_bboxesNormalized)
 409                     clipBounds.xmax = clipBounds.ymax = 1.0f;
 410                 else
 411                 {
 412                     // Input image sizes;
 413                     CV_Assert(inputs[3]->dims == 4);
 414                     clipBounds.xmax = inputs[3]->size[3] - 1;
 415                     clipBounds.ymax = inputs[3]->size[2] - 1;
 416                 }
 417             }
 418             DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,
 419                             _shareLocation, _numLocClasses, _backgroundLabelId,
 420                             _codeType, _varianceEncodedInTarget, _clip, clipBounds,
 421                             _bboxesNormalized, allDecodedBBoxes);
 422         }
 423
 424         size_t numKept = 0;
 425         std::vector<std::map<int, std::vector<int> > > allIndices;
 426         for (int i = 0; i < num; ++i)
 427         {
 428             numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
 429         }
 430
 431         if (numKept == 0)
 432         {
 433             // Set confidences to zeros.
 434             Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
 435             outputs[0](ranges).setTo(0);
 436             return;
 437         }
 438         int outputShape[] = {1, 1, (int)numKept, 7};
 439         outputs[0].create(4, outputShape, CV_32F);
 440         float* outputsData = outputs[0].ptr<float>();
 441
 442         size_t count = 0;
 443         for (int i = 0; i < num; ++i)
 444         {
 445             count += outputDetections_(i, &outputsData[count * 7],
 446                                        allDecodedBBoxes[i], allConfidenceScores[i],
 447                                        allIndices[i]);
 448         }
 449         CV_Assert(count == numKept);
 450     }
 451
 452     size_t outputDetections_(
 453             const int i, float* outputsData,
 454             const LabelBBox& decodeBBoxes, Mat& confidenceScores,
 455             const std::map<int, std::vector<int> >& indicesMap
 456     )
 457     {
 458         size_t count = 0;
 459         for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
 460         {
 461             int label = it->first;
 462             if (confidenceScores.rows <= label)
 463                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
 464             const std::vector<float>& scores = confidenceScores.row(label);
 465             int locLabel = _shareLocation ? -1 : label;
 466             LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(locLabel);
 467             if (label_bboxes == decodeBBoxes.end())
 468                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", locLabel));
 469             const std::vector<int>& indices = it->second;
 470
 471             for (size_t j = 0; j < indices.size(); ++j, ++count)
 472             {
 473                 int idx = indices[j];
 474                 const util::NormalizedBBox& decode_bbox = label_bboxes->second[idx];
 475                 outputsData[count * 7] = i;
 476                 outputsData[count * 7 + 1] = label;
 477                 outputsData[count * 7 + 2] = scores[idx];
 478                 outputsData[count * 7 + 3] = decode_bbox.xmin;
 479                 outputsData[count * 7 + 4] = decode_bbox.ymin;
 480                 outputsData[count * 7 + 5] = decode_bbox.xmax;
 481                 outputsData[count * 7 + 6] = decode_bbox.ymax;
 482             }
 483         }
 484         return count;
 485     }
 486
 487     size_t processDetections_(
 488             const LabelBBox& decodeBBoxes, Mat& confidenceScores,
 489             std::vector<std::map<int, std::vector<int> > >& allIndices
 490     )
 491     {
 492         std::map<int, std::vector<int> > indices;
 493         size_t numDetections = 0;
 494         for (int c = 0; c < (int)_numClasses; ++c)
 495         {
 496             if (c == _backgroundLabelId)
 497                 continue; // Ignore background class.
 498             if (c >= confidenceScores.rows)
 499                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", c));
 500
 501             const std::vector<float> scores = confidenceScores.row(c);
 502             int label = _shareLocation ? -1 : c;
 503
 504             LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label);
 505             if (label_bboxes == decodeBBoxes.end())
 506                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
 507             if (_bboxesNormalized)
 508                 NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,
 509                          indices[c], util::caffe_norm_box_overlap);
 510             else
 511                 NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,
 512                          indices[c], util::caffe_box_overlap);
 513             numDetections += indices[c].size();
 514         }
 515         if (_keepTopK > -1 && numDetections > (size_t)_keepTopK)
 516         {
 517             std::vector<std::pair<float, std::pair<int, int> > > scoreIndexPairs;
 518             for (std::map<int, std::vector<int> >::iterator it = indices.begin();
 519                  it != indices.end(); ++it)
 520             {
 521                 int label = it->first;
 522                 const std::vector<int>& labelIndices = it->second;
 523                 if (label >= confidenceScores.rows)
 524                     CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
 525                 const std::vector<float>& scores = confidenceScores.row(label);
 526                 for (size_t j = 0; j < labelIndices.size(); ++j)
 527                 {
 528                     size_t idx = labelIndices[j];
 529                     CV_Assert(idx < scores.size());
 530                     scoreIndexPairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
 531                 }
 532             }
 533             // Keep outputs k results per image.
 534             std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(),
 535                       util::SortScorePairDescend<std::pair<int, int> >);
 536             scoreIndexPairs.resize(_keepTopK);
 537
 538             std::map<int, std::vector<int> > newIndices;
 539             for (size_t j = 0; j < scoreIndexPairs.size(); ++j)
 540             {
 541                 int label = scoreIndexPairs[j].second.first;
 542                 int idx = scoreIndexPairs[j].second.second;
 543                 newIndices[label].push_back(idx);
 544             }
 545             allIndices.push_back(newIndices);
 546             return (size_t)_keepTopK;
 547         }
 548         else
 549         {
 550             allIndices.push_back(indices);
 551             return numDetections;
 552         }
 553     }
 554
 555
 556     // **************************************************************
 557     // Utility functions
 558     // **************************************************************
 559
 560     // Compute bbox size
 561     static float BBoxSize(const util::NormalizedBBox& bbox, bool normalized)
 562     {
 563         if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin)
 564         {
 565             return 0; // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
 566         }
 567         else
 568         {
 569             if (bbox.has_size())
 570             {
 571                 return bbox.size();
 572             }
 573             else
 574             {
 575                 float width = bbox.xmax - bbox.xmin;
 576                 float height = bbox.ymax - bbox.ymin;
 577                 if (normalized)
 578                 {
 579                     return width * height;
 580                 }
 581                 else
 582                 {
 583                     // If bbox is not within range [0, 1].
 584                     return (width + 1) * (height + 1);
 585                 }
 586             }
 587         }
 588     }
 589
 590
 591     // Decode a bbox according to a prior bbox
 592     template<bool variance_encoded_in_target>
 593     static void DecodeBBox(
 594         const util::NormalizedBBox& prior_bbox, const std::vector<float>& prior_variance,
 595         const cv::String& code_type,
 596         const bool clip_bbox, const util::NormalizedBBox& clip_bounds,
 597         const bool normalized_bbox, const util::NormalizedBBox& bbox,
 598         util::NormalizedBBox& decode_bbox)
 599     {
 600         float bbox_xmin = variance_encoded_in_target ? bbox.xmin : prior_variance[0] * bbox.xmin;
 601         float bbox_ymin = variance_encoded_in_target ? bbox.ymin : prior_variance[1] * bbox.ymin;
 602         float bbox_xmax = variance_encoded_in_target ? bbox.xmax : prior_variance[2] * bbox.xmax;
 603         float bbox_ymax = variance_encoded_in_target ? bbox.ymax : prior_variance[3] * bbox.ymax;
 604         if (code_type == "CORNER")
 605         {
 606             decode_bbox.xmin = prior_bbox.xmin + bbox_xmin;
 607             decode_bbox.ymin = prior_bbox.ymin + bbox_ymin;
 608             decode_bbox.xmax = prior_bbox.xmax + bbox_xmax;
 609             decode_bbox.ymax = prior_bbox.ymax + bbox_ymax;
 610         }
 611         else if (code_type == "CENTER_SIZE")
 612         {
 613             float prior_width = prior_bbox.xmax - prior_bbox.xmin;
 614             float prior_height = prior_bbox.ymax - prior_bbox.ymin;
 615             if (!normalized_bbox)
 616             {
 617                 prior_width += 1.0f;
 618                 prior_height += 1.0f;
 619             }
 620             CV_Assert(prior_width > 0);
 621             CV_Assert(prior_height > 0);
 622             float prior_center_x = prior_bbox.xmin + prior_width * .5;
 623             float prior_center_y = prior_bbox.ymin + prior_height * .5;
 624
 625             float decode_bbox_center_x, decode_bbox_center_y;
 626             float decode_bbox_width, decode_bbox_height;
 627             decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
 628             decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
 629             decode_bbox_width = exp(bbox_xmax) * prior_width;
 630             decode_bbox_height = exp(bbox_ymax) * prior_height;
 631             decode_bbox.xmin = decode_bbox_center_x - decode_bbox_width * .5;
 632             decode_bbox.ymin = decode_bbox_center_y - decode_bbox_height * .5;
 633             decode_bbox.xmax = decode_bbox_center_x + decode_bbox_width * .5;
 634             decode_bbox.ymax = decode_bbox_center_y + decode_bbox_height * .5;
 635         }
 636         else
 637             CV_ErrorNoReturn(Error::StsBadArg, "Unknown type.");
 638
 639         if (clip_bbox)
 640         {
 641             // Clip the util::NormalizedBBox.
 642             decode_bbox.xmin = std::max(std::min(decode_bbox.xmin, clip_bounds.xmax), clip_bounds.xmin);
 643             decode_bbox.ymin = std::max(std::min(decode_bbox.ymin, clip_bounds.ymax), clip_bounds.ymin);
 644             decode_bbox.xmax = std::max(std::min(decode_bbox.xmax, clip_bounds.xmax), clip_bounds.xmin);
 645             decode_bbox.ymax = std::max(std::min(decode_bbox.ymax, clip_bounds.ymax), clip_bounds.ymin);
 646         }
 647         decode_bbox.clear_size();
 648         decode_bbox.set_size(BBoxSize(decode_bbox, normalized_bbox));
 649     }
 650
 651     // Decode a set of bboxes according to a set of prior bboxes
 652     static void DecodeBBoxes(
 653         const std::vector<util::NormalizedBBox>& prior_bboxes,
 654         const std::vector<std::vector<float> >& prior_variances,
 655         const cv::String& code_type, const bool variance_encoded_in_target,
 656         const bool clip_bbox, const util::NormalizedBBox& clip_bounds,
 657         const bool normalized_bbox, const std::vector<util::NormalizedBBox>& bboxes,
 658         std::vector<util::NormalizedBBox>& decode_bboxes)
 659     {
 660         CV_Assert(prior_bboxes.size() == prior_variances.size());
 661         CV_Assert(prior_bboxes.size() == bboxes.size());
 662         size_t num_bboxes = prior_bboxes.size();
 663         CV_Assert(num_bboxes == 0 || prior_variances[0].size() == 4);
 664         decode_bboxes.clear(); decode_bboxes.resize(num_bboxes);
 665         if(variance_encoded_in_target)
 666         {
 667             for (int i = 0; i < num_bboxes; ++i)
 668                 DecodeBBox<true>(prior_bboxes[i], prior_variances[i], code_type,
 669                                  clip_bbox, clip_bounds, normalized_bbox,
 670                                  bboxes[i], decode_bboxes[i]);
 671         }
 672         else
 673         {
 674             for (int i = 0; i < num_bboxes; ++i)
 675                 DecodeBBox<false>(prior_bboxes[i], prior_variances[i], code_type,
 676                                   clip_bbox, clip_bounds, normalized_bbox,
 677                                   bboxes[i], decode_bboxes[i]);
 678         }
 679     }
 680
 681     // Decode all bboxes in a batch
 682     static void DecodeBBoxesAll(const std::vector<LabelBBox>& all_loc_preds,
 683         const std::vector<util::NormalizedBBox>& prior_bboxes,
 684         const std::vector<std::vector<float> >& prior_variances,
 685         const int num, const bool share_location,
 686         const int num_loc_classes, const int background_label_id,
 687         const cv::String& code_type, const bool variance_encoded_in_target,
 688         const bool clip, const util::NormalizedBBox& clip_bounds,
 689         const bool normalized_bbox, std::vector<LabelBBox>& all_decode_bboxes)
 690     {
 691         CV_Assert(all_loc_preds.size() == num);
 692         all_decode_bboxes.clear();
 693         all_decode_bboxes.resize(num);
 694         for (int i = 0; i < num; ++i)
 695         {
 696             // Decode predictions into bboxes.
 697             const LabelBBox& loc_preds = all_loc_preds[i];
 698             LabelBBox& decode_bboxes = all_decode_bboxes[i];
 699             for (int c = 0; c < num_loc_classes; ++c)
 700             {
 701                 int label = share_location ? -1 : c;
 702                 if (label == background_label_id)
 703                     continue; // Ignore background class.
 704                 LabelBBox::const_iterator label_loc_preds = loc_preds.find(label);
 705                 if (label_loc_preds == loc_preds.end())
 706                     CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
 707                 DecodeBBoxes(prior_bboxes, prior_variances,
 708                              code_type, variance_encoded_in_target, clip, clip_bounds,
 709                              normalized_bbox, label_loc_preds->second, decode_bboxes[label]);
 710             }
 711         }
 712     }
 713
 714     // Get prior bounding boxes from prior_data
 715     //    prior_data: 1 x 2 x num_priors * 4 x 1 blob.
 716     //    num_priors: number of priors.
 717     //    prior_bboxes: stores all the prior bboxes in the format of util::NormalizedBBox.
 718     //    prior_variances: stores all the variances needed by prior bboxes.
 719     static void GetPriorBBoxes(const float* priorData, const int& numPriors,
 720                         bool normalized_bbox, std::vector<util::NormalizedBBox>& priorBBoxes,
 721                         std::vector<std::vector<float> >& priorVariances)
 722     {
 723         priorBBoxes.clear(); priorBBoxes.resize(numPriors);
 724         priorVariances.clear(); priorVariances.resize(numPriors);
 725         for (int i = 0; i < numPriors; ++i)
 726         {
 727             int startIdx = i * 4;
 728             util::NormalizedBBox& bbox = priorBBoxes[i];
 729             bbox.xmin = priorData[startIdx];
 730             bbox.ymin = priorData[startIdx + 1];
 731             bbox.xmax = priorData[startIdx + 2];
 732             bbox.ymax = priorData[startIdx + 3];
 733             bbox.set_size(BBoxSize(bbox, normalized_bbox));
 734         }
 735
 736         for (int i = 0; i < numPriors; ++i)
 737         {
 738             int startIdx = (numPriors + i) * 4;
 739             // not needed here: priorVariances[i].clear();
 740             for (int j = 0; j < 4; ++j)
 741             {
 742                 priorVariances[i].push_back(priorData[startIdx + j]);
 743             }
 744         }
 745     }
 746
 747     // Get location predictions from loc_data.
 748     //    loc_data: num x num_preds_per_class * num_loc_classes * 4 blob.
 749     //    num: the number of images.
 750     //    num_preds_per_class: number of predictions per class.
 751     //    num_loc_classes: number of location classes. It is 1 if share_location is
 752     //      true; and is equal to number of classes needed to predict otherwise.
 753     //    share_location: if true, all classes share the same location prediction.
 754     //    loc_pred_transposed: if true, represent four bounding box values as
 755     //                         [y,x,height,width] or [x,y,width,height] otherwise.
 756     //    loc_preds: stores the location prediction, where each item contains
 757     //      location prediction for an image.
 758     static void GetLocPredictions(const float* locData, const int num,
 759                            const int numPredsPerClass, const int numLocClasses,
 760                            const bool shareLocation, const bool locPredTransposed,
 761                            std::vector<LabelBBox>& locPreds)
 762     {
 763         locPreds.clear();
 764         if (shareLocation)
 765         {
 766             CV_Assert(numLocClasses == 1);
 767         }
 768         locPreds.resize(num);
 769         for (int i = 0; i < num; ++i, locData += numPredsPerClass * numLocClasses * 4)
 770         {
 771             LabelBBox& labelBBox = locPreds[i];
 772             for (int p = 0; p < numPredsPerClass; ++p)
 773             {
 774                 int startIdx = p * numLocClasses * 4;
 775                 for (int c = 0; c < numLocClasses; ++c)
 776                 {
 777                     int label = shareLocation ? -1 : c;
 778                     if (labelBBox.find(label) == labelBBox.end())
 779                     {
 780                         labelBBox[label].resize(numPredsPerClass);
 781                     }
 782                     util::NormalizedBBox& bbox = labelBBox[label][p];
 783                     if (locPredTransposed)
 784                     {
 785                         bbox.ymin = locData[startIdx + c * 4];
 786                         bbox.xmin = locData[startIdx + c * 4 + 1];
 787                         bbox.ymax = locData[startIdx + c * 4 + 2];
 788                         bbox.xmax = locData[startIdx + c * 4 + 3];
 789                     }
 790                     else
 791                     {
 792                         bbox.xmin = locData[startIdx + c * 4];
 793                         bbox.ymin = locData[startIdx + c * 4 + 1];
 794                         bbox.xmax = locData[startIdx + c * 4 + 2];
 795                         bbox.ymax = locData[startIdx + c * 4 + 3];
 796                     }
 797                 }
 798             }
 799         }
 800     }
 801
 802     // Get confidence predictions from conf_data.
 803     //    conf_data: num x num_preds_per_class * num_classes blob.
 804     //    num: the number of images.
 805     //    num_preds_per_class: number of predictions per class.
 806     //    num_classes: number of classes.
 807     //    conf_preds: stores the confidence prediction, where each item contains
 808     //      confidence prediction for an image.
 809     static void GetConfidenceScores(const float* confData, const int num,
 810                              const int numPredsPerClass, const int numClasses,
 811                              std::vector<Mat>& confPreds)
 812     {
 813         int shape[] = { numClasses, numPredsPerClass };
 814         for (int i = 0; i < num; i++)
 815             confPreds.push_back(Mat(2, shape, CV_32F));
 816
 817         for (int i = 0; i < num; ++i, confData += numPredsPerClass * numClasses)
 818         {
 819             Mat labelScores = confPreds[i];
 820             for (int c = 0; c < numClasses; ++c)
 821             {
 822                 for (int p = 0; p < numPredsPerClass; ++p)
 823                 {
 824                     labelScores.at<float>(c, p) = confData[p * numClasses + c];
 825                 }
 826             }
 827         }
 828     }
 829
 830     // Compute the jaccard (intersection over union IoU) overlap between two bboxes.
 831     template<bool normalized>
 832     static float JaccardOverlap(const util::NormalizedBBox& bbox1,
 833                          const util::NormalizedBBox& bbox2)
 834     {
 835         util::NormalizedBBox intersect_bbox;
 836         intersect_bbox.xmin = std::max(bbox1.xmin, bbox2.xmin);
 837         intersect_bbox.ymin = std::max(bbox1.ymin, bbox2.ymin);
 838         intersect_bbox.xmax = std::min(bbox1.xmax, bbox2.xmax);
 839         intersect_bbox.ymax = std::min(bbox1.ymax, bbox2.ymax);
 840
 841         float intersect_size = BBoxSize(intersect_bbox, normalized);
 842         if (intersect_size > 0)
 843         {
 844             float bbox1_size = BBoxSize(bbox1, normalized);
 845             float bbox2_size = BBoxSize(bbox2, normalized);
 846             return intersect_size / (bbox1_size + bbox2_size - intersect_size);
 847         }
 848         else
 849         {
 850             return 0.;
 851         }
 852     }
 853 };
 854
 855 float util::caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)
 856 {
 857     return DetectionOutputLayerImpl::JaccardOverlap<false>(a, b);
 858 }
 859
 860 float util::caffe_norm_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)
 861 {
 862     return DetectionOutputLayerImpl::JaccardOverlap<true>(a, b);
 863 }
 864
 865 const std::string DetectionOutputLayerImpl::_layerName = std::string("DetectionOutput");
 866
 867 Ptr<DetectionOutputLayer> DetectionOutputLayer::create(const LayerParams &params)
 868 {
 869     return Ptr<DetectionOutputLayer>(new DetectionOutputLayerImpl(params));
 870 }
 871
 872 }
 873 }