1 // Copyright (C) 2018 Intel Corporation
3 // SPDX-License-Identifier: Apache-2.0
6 #include "ext_list.hpp"
7 #include "ext_base.hpp"
16 namespace InferenceEngine {
17 namespace Extensions {
/**
 * @brief Comparator that orders (score, payload) pairs by descending score.
 *
 * Only the `first` member (the score) takes part in the comparison; the payload
 * type T is carried along untouched, so pairs with equal scores compare as
 * equivalent (neither is "before" the other).
 *
 * @tparam T     payload type stored next to the score
 * @param pair1  left-hand (score, payload) pair
 * @param pair2  right-hand (score, payload) pair
 * @return true when pair1 has a strictly greater score than pair2
 */
template <typename T>
static bool SortScorePairDescend(const std::pair<float, T>& pair1,
                                 const std::pair<float, T>& pair2) {
    return pair1.first > pair2.first;
}
// DetectionOutput layer implementation, derived from ExtLayerBase.
// NOTE(review): this view of the file carries embedded line-number prefixes and omits
// some original lines (access specifiers, `try {`, closing braces, a few continuations);
// the comments below only describe what the visible lines establish.
26 class DetectionOutputImpl: public ExtLayerBase {
// Constructor: validates the layer's edges, reads the layer parameters, derives the
// prior/class counts, allocates the scratch blobs used by execute(), and registers the
// supported data configuration. An InferenceEngineException raised on the way is caught
// at the end and its message is stored in errorMsg.
28 explicit DetectionOutputImpl(const CNNLayer* layer) {
// The layer must have exactly three inputs and at least one output.
30 if (layer->insData.size() != 3)
31 THROW_IE_EXCEPTION << "Incorrect number of input edges.";
32 if (layer->outData.empty())
33 THROW_IE_EXCEPTION << "Incorrect number of output edges.";
// Layer parameters. Calls without a second argument ("num_classes", "nms_threshold")
// supply no default here.
35 _num_classes = layer->GetParamAsInt("num_classes");
36 _background_label_id = layer->GetParamAsInt("background_label_id", 0);
37 _top_k = layer->GetParamAsInt("top_k", -1);
// NOTE(review): the member receiving this bool is declared as int further down.
38 _variance_encoded_in_target = layer->GetParamsAsBool("variance_encoded_in_target", false);
39 _keep_top_k = layer->GetParamAsInt("keep_top_k", -1);
40 _nms_threshold = layer->GetParamAsFloat("nms_threshold");
41 _confidence_threshold = layer->GetParamAsFloat("confidence_threshold", -FLT_MAX);
42 _share_location = layer->GetParamsAsBool("share_location", true);
43 _clip = layer->GetParamsAsBool("clip", false);
44 _decrease_label_id = layer->GetParamsAsBool("decrease_label_id", false);
45 _normalized = layer->GetParamsAsBool("normalized", true);
46 _image_height = layer->GetParamAsInt("input_height", 1);
47 _image_width = layer->GetParamAsInt("input_width", 1);
// A prior record has 4 values when normalized, otherwise 5; in the 5-value case the box
// coordinates start at element 1 (_offset) and element 0 is read as a batch id by
// decodeBBoxes().
48 _prior_size = _normalized ? 4 : 5;
49 _offset = _normalized ? 0 : 1;
// With share_location a single set of boxes is used for all classes.
50 _num_loc_classes = _share_location ? 1 : _num_classes;
52 std::string code_type_str = layer->GetParamAsString("code_type", "caffe.PriorBoxParameter.CORNER");
// The remainder of this conditional expression is not visible in this view;
// presumably the fallback is CodeType::CORNER — confirm.
53 _code_type = (code_type_str == "caffe.PriorBoxParameter.CENTER_SIZE" ? CodeType::CENTER_SIZE
// Number of priors is derived from dims[0] of the priors input divided by the record size.
56 _num_priors = static_cast<int>(layer->insData[idx_priors].lock()->dims[0] / _prior_size);
// Cross-check the derived prior count against dims[0] of the location and confidence inputs.
58 if (_num_priors * _num_loc_classes * 4 != layer->insData[idx_location].lock()->dims[0])
59 THROW_IE_EXCEPTION << "Number of priors must match number of location predictions.";
61 if (_num_priors * _num_classes != layer->insData[idx_confidence].lock()->dims[0])
62 THROW_IE_EXCEPTION << "Number of priors must match number of confidence predictions.";
// decrease_label_id is only accepted together with background_label_id == 0.
64 if (_decrease_label_id && _background_label_id != 0)
65 THROW_IE_EXCEPTION << "Cannot use decrease_label_id and background_label_id parameter simultaneously.";
// Batch size as seen at construction time, used to size the scratch blobs below.
67 _num = static_cast<int>(layer->insData[idx_confidence].lock()->getTensorDesc().getDims()[0]);
// Scratch blob for decoded boxes. The last element of this size list is not visible in
// this view; execute() addresses the blob with 4 floats per box.
69 InferenceEngine::SizeVector bboxes_size{static_cast<size_t>(_num),
70 static_cast<size_t>(_num_classes),
71 static_cast<size_t>(_num_priors),
73 _decoded_bboxes = InferenceEngine::make_shared_blob<float>({Precision::UNSPECIFIED, bboxes_size, NCHW});
74 _decoded_bboxes->allocate();
// Integer scratch blob handed to nms() as its sort buffer.
// NOTE(review): no allocate() call for _buffer is visible in this view — confirm it exists.
76 InferenceEngine::SizeVector buf_size{static_cast<size_t>(_num),
77 static_cast<size_t>(_num_classes),
78 static_cast<size_t>(_num_priors)};
79 _buffer = InferenceEngine::make_shared_blob<int>({Precision::UNSPECIFIED, buf_size, {buf_size, {0, 1, 2}}});
// Per-image, per-class lists of kept prior indices.
// NOTE(review): no allocate() call for _indices is visible in this view — confirm it exists.
82 InferenceEngine::SizeVector indices_size{static_cast<size_t>(_num),
83 static_cast<size_t>(_num_classes),
84 static_cast<size_t>(_num_priors)};
85 _indices = InferenceEngine::make_shared_blob<int>(
86 {Precision::UNSPECIFIED, indices_size, {indices_size, {0, 1, 2}}});
// One detection counter per (image, class).
89 InferenceEngine::SizeVector detections_size{static_cast<size_t>(_num * _num_classes)};
90 _detections_count = InferenceEngine::make_shared_blob<int>({Precision::UNSPECIFIED, detections_size, C});
91 _detections_count->allocate();
// Scratch copy of the confidence input, same dims, used for the reordered scores.
93 InferenceEngine::SizeVector conf_size = layer->insData[idx_confidence].lock()->dims;
94 _reordered_conf = InferenceEngine::make_shared_blob<float>({Precision::FP32, conf_size, ANY});
95 _reordered_conf->allocate();
// One area value per decoded box.
97 InferenceEngine::SizeVector decoded_bboxes_size{static_cast<size_t>(_num),
98 static_cast<size_t>(_num_priors),
99 static_cast<size_t>(_num_classes)};
100 _bbox_sizes = InferenceEngine::make_shared_blob<float>(
101 {Precision::FP32, decoded_bboxes_size, {decoded_bboxes_size, {0, 1, 2}}});
102 _bbox_sizes->allocate();
// One "actual number of priors" value per image, filled in by decodeBBoxes().
104 InferenceEngine::SizeVector num_priors_actual_size{static_cast<size_t>(_num)};
105 _num_priors_actual = InferenceEngine::make_shared_blob<int>({Precision::UNSPECIFIED, num_priors_actual_size, C});
106 _num_priors_actual->allocate();
// Register the configuration: three planar inputs, one planar output.
108 addConfig(layer, {DataConfigurator(ConfLayout::PLN),
109 DataConfigurator(ConfLayout::PLN),
110 DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)});
// The matching `try {` is not visible in this view; the exception text is kept in
// errorMsg instead of propagating out of the constructor.
111 } catch (InferenceEngine::details::InferenceEngineException &ex) {
112 errorMsg = ex.what();
// Runs the layer: decodes boxes, reorders confidences, applies per-class NMS, optionally
// prunes to keep_top_k detections per image, and writes DETECTION_SIZE-float records to
// outputs[0].
// Returns NOT_IMPLEMENTED when dimension 3 of the output is not 7 and OUT_OF_BOUNDS when
// the computed output size exceeds the output blob; the success return is not visible in
// this view.
// NOTE(review): several control-flow lines (else branches, continue, closing braces, the
// declaration and increment of `count`) are missing from this view.
116 StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,
117 ResponseDesc *resp) noexcept override {
118 float *dst_data = outputs[0]->buffer();
120 const float *loc_data = inputs[idx_location]->buffer();
121 const float *conf_data = inputs[idx_confidence]->buffer();
122 const float *prior_data = inputs[idx_priors]->buffer();
// Batch size taken from the confidence input at execution time.
124 const int N = inputs[idx_confidence]->getTensorDesc().getDims()[0];
// Scratch buffers allocated in the constructor.
126 float *decoded_bboxes_data = _decoded_bboxes->buffer();
127 float *reordered_conf_data = _reordered_conf->buffer();
128 float *bbox_sizes_data = _bbox_sizes->buffer();
129 int *detections_data = _detections_count->buffer();
130 int *buffer_data = _buffer->buffer();
131 int *indices_data = _indices->buffer();
132 int *num_priors_actual = _num_priors_actual->buffer();
// Variances are read from the priors input right after the _num_priors prior records.
134 const float *prior_variances = prior_data + _num_priors*_prior_size;
135 const float *ppriors = prior_data;
// Step 1: decode boxes for every image. The same prior pointer is used for all images.
137 for (int n = 0; n < N; ++n) {
138 if (_share_location) {
// Shared location: one set of 4-float boxes per image.
139 const float *ploc = loc_data + n*4*_num_priors;
140 float *pboxes = decoded_bboxes_data + n*4*_num_priors;
141 float *psizes = bbox_sizes_data + n*_num_priors;
142 decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n);
// Per-class location (the `else` line is not visible): one set of boxes per class.
144 for (int c = 0; c < _num_loc_classes; ++c) {
// Body of this check is not visible; presumably the background class is skipped — confirm.
145 if (c == _background_label_id) {
// Input locations are addressed with the class offset c*4 inside each prior's group,
// while decoded boxes and sizes are stored class-major (c*_num_priors).
149 const float *ploc = loc_data + n*4*_num_loc_classes*_num_priors + c*4;
150 float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors + c*4*_num_priors;
151 float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors + c*_num_priors;
152 decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n);
// Step 2: transpose confidences from [n][prior][class] to [n][class][prior] so that each
// class has a contiguous run of scores.
157 for (int n = 0; n < N; ++n) {
158 for (int c = 0; c < _num_classes; ++c) {
159 for (int p = 0; p < _num_priors; ++p) {
160 reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c];
// Step 3: reset all per-(image, class) detection counters.
165 memset(detections_data, 0, N*_num_classes*sizeof(int));
// Step 4: per image, run NMS for every class and count the surviving detections.
167 for (int n = 0; n < N; ++n) {
168 int detections_total = 0;
// Classes are processed by an OpenMP parallel loop; each iteration works on its own
// class-indexed slices of the index, buffer and counter arrays.
170 #pragma omp parallel for schedule(static)
171 for (int c = 0; c < _num_classes; ++c) {
172 if (c == _background_label_id) {
173 // Ignore background class.
177 int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors;
// The sort buffer slice depends on the class only, not on the image index.
178 int *pbuffer = buffer_data + c*_num_priors;
179 int *pdetections = detections_data + n*_num_classes + c;
181 const float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors;
// Select the decoded boxes and sizes for this class (declarations of pboxes/psizes and the
// `else` line are not visible in this view).
184 if (_share_location) {
185 pboxes = decoded_bboxes_data + n*4*_num_priors;
186 psizes = bbox_sizes_data + n*_num_priors;
188 pboxes = decoded_bboxes_data + n*4*_num_classes*_num_priors + c*4*_num_priors;
189 psizes = bbox_sizes_data + n*_num_classes*_num_priors + c*_num_priors;
192 nms(pconf, pboxes, psizes, pbuffer, pindices, *pdetections, num_priors_actual[n]);
// Total number of detections for this image across all classes.
195 for (int c = 0; c < _num_classes; ++c) {
196 detections_total += detections_data[n*_num_classes + c];
// Step 5: when keep_top_k is set and exceeded, keep only the best-scoring detections of
// this image across all classes.
199 if (_keep_top_k > -1 && detections_total > _keep_top_k) {
// Entries are (confidence, (class, prior index)).
200 std::vector<std::pair<float, std::pair<int, int>>> conf_index_class_map;
202 for (int c = 0; c < _num_classes; ++c) {
203 int detections = detections_data[n*_num_classes + c];
204 int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors;
205 float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors;
207 for (int i = 0; i < detections; ++i) {
208 int idx = pindices[i];
209 conf_index_class_map.push_back(std::make_pair(pconf[idx], std::make_pair(c, idx)));
// Sort by descending confidence and truncate to keep_top_k entries.
213 std::sort(conf_index_class_map.begin(), conf_index_class_map.end(),
214 SortScorePairDescend<std::pair<int, int>>);
215 conf_index_class_map.resize(_keep_top_k);
217 // Store the new indices.
218 memset(detections_data + n*_num_classes, 0, _num_classes * sizeof(int));
// Rebuild the per-class index lists and counters from the surviving entries.
// NOTE(review): `j` is a signed int compared against an unsigned size().
220 for (int j = 0; j < conf_index_class_map.size(); ++j) {
221 int label = conf_index_class_map[j].second.first;
222 int idx = conf_index_class_map[j].second.second;
223 int *pindices = indices_data + n * _num_classes * _num_priors + label * _num_priors;
224 pindices[detections_data[n*_num_classes + label]] = idx;
225 detections_data[n*_num_classes + label]++;
// Step 6: validate the output layout; only 7-float detection records are supported.
230 const int DETECTION_SIZE = outputs[0]->getTensorDesc().getDims()[3];
231 if (DETECTION_SIZE != 7) {
232 return NOT_IMPLEMENTED;
// NOTE(review): with the default keep_top_k of -1 the int product is negative and becomes
// a very large unsigned value once multiplied by sizeof(float), so the check below would
// return OUT_OF_BOUNDS — confirm keep_top_k is always set by callers.
235 auto dst_data_size = N * _keep_top_k * DETECTION_SIZE * sizeof(float);
237 if (dst_data_size > outputs[0]->byteSize()) {
238 return OUT_OF_BOUNDS;
241 memset(dst_data, 0, dst_data_size);
// Step 7: emit one record per kept detection:
// [image index, label, confidence, xmin, ymin, xmax, ymax].
// The declaration and increment of `count` are not visible in this view.
244 for (int n = 0; n < N; ++n) {
245 const float *pconf = reordered_conf_data + n * _num_priors * _num_classes;
246 const float *pboxes = decoded_bboxes_data + n*_num_priors*4*_num_loc_classes;
247 const int *pindices = indices_data + n*_num_classes*_num_priors;
249 for (int c = 0; c < _num_classes; ++c) {
250 for (int i = 0; i < detections_data[n*_num_classes + c]; ++i) {
251 int idx = pindices[c*_num_priors + i];
253 dst_data[count * DETECTION_SIZE + 0] = n;
// With decrease_label_id the reported label is shifted down by one.
254 dst_data[count * DETECTION_SIZE + 1] = _decrease_label_id ? c-1 : c;
255 dst_data[count * DETECTION_SIZE + 2] = pconf[c*_num_priors + idx];
// Shared location reads the single per-image box set; otherwise the class-specific set.
257 float xmin = _share_location ? pboxes[idx*4 + 0] :
258 pboxes[c*4*_num_priors + idx*4 + 0];
259 float ymin = _share_location ? pboxes[idx*4 + 1] :
260 pboxes[c*4*_num_priors + idx*4 + 1];
261 float xmax = _share_location ? pboxes[idx*4 + 2] :
262 pboxes[c*4*_num_priors + idx*4 + 2];
263 float ymax = _share_location ? pboxes[idx*4 + 3] :
264 pboxes[c*4*_num_priors + idx*4 + 3];
266 dst_data[count * DETECTION_SIZE + 3] = xmin;
267 dst_data[count * DETECTION_SIZE + 4] = ymin;
268 dst_data[count * DETECTION_SIZE + 5] = xmax;
269 dst_data[count * DETECTION_SIZE + 6] = ymax;
// If the output is not completely filled, the image-index field of the next record is set
// to -1 as an end marker.
276 if (count < N*_keep_top_k) {
277 // marker at end of boxes list
278 dst_data[count * DETECTION_SIZE + 0] = -1;
// Positions of the three inputs in the `inputs` vector / layer->insData.
285 const int idx_location = 0;
286 const int idx_confidence = 1;
287 const int idx_priors = 2;
// Layer parameters and derived values, filled in by the constructor.
// NOTE(review): some members used elsewhere in the file (_top_k, _keep_top_k, _clip,
// _prior_size, _offset, _num_priors, _num, _code_type) are not declared in the lines
// visible in this view.
290 int _num_classes = 0;
291 int _background_label_id = 0;
// Declared as int although the constructor assigns it from GetParamsAsBool.
293 int _variance_encoded_in_target = 0;
297 bool _share_location = false;
299 bool _decrease_label_id = false;
// Used by decodeBBoxes() as divisors for the prior coordinates.
301 int _image_width = 0;
302 int _image_height = 0;
304 bool _normalized = true;
307 float _nms_threshold = 0.0f;
308 float _confidence_threshold = 0.0f;
// 1 when share_location is set, otherwise equal to _num_classes.
311 int _num_loc_classes = 0;
// Decodes the boxes of image n into decoded_bboxes and their areas into decoded_bbox_sizes;
// also writes num_priors_actual[n].
319 void decodeBBoxes(const float *prior_data, const float *loc_data, const float *variance_data,
320 float *decoded_bboxes, float *decoded_bbox_sizes, int* num_priors_actual, int n);
// Non-maximum suppression for one class of one image; kept prior indices are written to
// `indices` and their number is reported through `detections`.
322 void nms(const float *conf_data, const float *bboxes, const float *sizes,
323 int *buffer, int *indices, int &detections, int num_priors_actual);
// Scratch blobs created in the constructor and reused by every execute() call.
325 InferenceEngine::Blob::Ptr _decoded_bboxes;
326 InferenceEngine::Blob::Ptr _buffer;
327 InferenceEngine::Blob::Ptr _indices;
328 InferenceEngine::Blob::Ptr _detections_count;
329 InferenceEngine::Blob::Ptr _reordered_conf;
330 InferenceEngine::Blob::Ptr _bbox_sizes;
331 InferenceEngine::Blob::Ptr _num_priors_actual;
/**
 * @brief Orders prior indices by descending confidence.
 *
 * Indices refer to positions in the confidence array given at construction;
 * the array is borrowed, not owned, and must outlive the comparator.
 * Equal (or unordered) confidences fall back to ascending index so that the
 * comparator is a strict weak ordering and the resulting order is deterministic.
 */
struct ConfidenceComparator {
    /// @param conf_data confidence values indexed by the ids being compared
    explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {}

    /// @return true when idx1 must be ordered before idx2
    bool operator()(int idx1, int idx2) const {
        if (_conf_data[idx1] > _conf_data[idx2]) return true;
        if (_conf_data[idx1] < _conf_data[idx2]) return false;
        // Tie: every path must return a value; the lower index wins.
        return idx1 < idx2;
    }

    const float* _conf_data;
};
/**
 * @brief Intersection-over-union of two boxes taken from a packed box array.
 *
 * @param decoded_bbox packed boxes, 4 floats each in the order xmin, ymin, xmax, ymax
 * @param bbox_sizes   area of each box, indexed like the boxes
 * @param idx1         index of the first box
 * @param idx2         index of the second box
 * @return intersection area divided by (area1 + area2 - intersection area),
 *         or 0 when the boxes do not overlap with a positive area
 */
static inline float JaccardOverlap(const float *decoded_bbox,
                                   const float *bbox_sizes,
                                   const int idx1,
                                   const int idx2) {
    float xmin1 = decoded_bbox[idx1*4 + 0];
    float ymin1 = decoded_bbox[idx1*4 + 1];
    float xmax1 = decoded_bbox[idx1*4 + 2];
    float ymax1 = decoded_bbox[idx1*4 + 3];

    float xmin2 = decoded_bbox[idx2*4 + 0];
    float ymin2 = decoded_bbox[idx2*4 + 1];
    float ymax2 = decoded_bbox[idx2*4 + 3];
    float xmax2 = decoded_bbox[idx2*4 + 2];

    // Cheap rejection: the boxes are completely apart on at least one axis.
    if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) {
        return 0.0f;
    }

    float intersect_xmin = std::max(xmin1, xmin2);
    float intersect_ymin = std::max(ymin1, ymin2);
    float intersect_xmax = std::min(xmax1, xmax2);
    float intersect_ymax = std::min(ymax1, ymax2);

    float intersect_width  = intersect_xmax - intersect_xmin;
    float intersect_height = intersect_ymax - intersect_ymin;

    // Boxes that merely touch (zero-area intersection) do not overlap.
    if (intersect_width <= 0 || intersect_height <= 0) {
        return 0.0f;
    }

    float intersect_size = intersect_width * intersect_height;
    float bbox1_size = bbox_sizes[idx1];
    float bbox2_size = bbox_sizes[idx2];

    return intersect_size / (bbox1_size + bbox2_size - intersect_size);
}
// Decodes the location predictions of image n against the priors and stores, for each
// prior p, the resulting box in decoded_bboxes[p*4 .. p*4+3] (xmin, ymin, xmax, ymax) and
// its area in decoded_bbox_sizes[p]. Also records in num_priors_actual[n] how many priors
// are actually processed.
// NOTE(review): the last parameter line, the declaration of `num`, and several guards and
// closing braces are not visible in this view.
383 void DetectionOutputImpl::decodeBBoxes(const float *prior_data,
384 const float *loc_data,
385 const float *variance_data,
386 float *decoded_bboxes,
387 float *decoded_bbox_sizes,
388 int* num_priors_actual,
// Default: all priors are valid.
390 num_priors_actual[n] = _num_priors;
// Scan the priors; a record whose first element equals -1 sets the actual prior count to
// its position. Whether the scan stops there is not visible here (presumably a break), and
// so is any guard restricting the scan to the non-normalized layout — confirm.
393 for (; num < _num_priors; ++num) {
394 float batch_id = prior_data[num * _prior_size + 0];
395 if (batch_id == -1.f) {
396 num_priors_actual[n] = num;
// Each prior is decoded independently inside an OpenMP parallel loop.
402 #pragma omp parallel for schedule(static)
403 for (int p = 0; p < num_priors_actual[n]; ++p) {
404 float new_xmin = 0.0f;
405 float new_ymin = 0.0f;
406 float new_xmax = 0.0f;
407 float new_ymax = 0.0f;
// Prior corners; _offset skips the leading element of 5-value prior records.
409 float prior_xmin = prior_data[p*_prior_size + 0 + _offset];
410 float prior_ymin = prior_data[p*_prior_size + 1 + _offset];
411 float prior_xmax = prior_data[p*_prior_size + 2 + _offset];
412 float prior_ymax = prior_data[p*_prior_size + 3 + _offset];
// Location prediction for this prior; consecutive priors are 4*_num_loc_classes floats apart.
414 float loc_xmin = loc_data[4*p*_num_loc_classes + 0];
415 float loc_ymin = loc_data[4*p*_num_loc_classes + 1];
416 float loc_xmax = loc_data[4*p*_num_loc_classes + 2];
417 float loc_ymax = loc_data[4*p*_num_loc_classes + 3];
// Scale prior coordinates by the image size. The guard for this block is not visible;
// presumably it applies only when the priors are not normalized — confirm.
420 prior_xmin /= _image_width;
421 prior_ymin /= _image_height;
422 prior_xmax /= _image_width;
423 prior_ymax /= _image_height;
// CORNER encoding: predictions are offsets added to the prior corners.
426 if (_code_type == CodeType::CORNER) {
427 if (_variance_encoded_in_target) {
428 // variance is encoded in target, we simply need to add the offset predictions.
429 new_xmin = prior_xmin + loc_xmin;
430 new_ymin = prior_ymin + loc_ymin;
431 new_xmax = prior_xmax + loc_xmax;
432 new_ymax = prior_ymax + loc_ymax;
// Otherwise (the `else` line is not visible) each offset is scaled by its variance first.
434 new_xmin = prior_xmin + variance_data[p*4 + 0] * loc_xmin;
435 new_ymin = prior_ymin + variance_data[p*4 + 1] * loc_ymin;
436 new_xmax = prior_xmax + variance_data[p*4 + 2] * loc_xmax;
437 new_ymax = prior_ymax + variance_data[p*4 + 3] * loc_ymax;
// CENTER_SIZE encoding: predictions are a centre shift relative to the prior size and a
// log-scale factor for width and height.
439 } else if (_code_type == CodeType::CENTER_SIZE) {
440 float prior_width = prior_xmax - prior_xmin;
441 float prior_height = prior_ymax - prior_ymin;
442 float prior_center_x = (prior_xmin + prior_xmax) / 2.0f;
443 float prior_center_y = (prior_ymin + prior_ymax) / 2.0f;
445 float decode_bbox_center_x, decode_bbox_center_y;
446 float decode_bbox_width, decode_bbox_height;
448 if (_variance_encoded_in_target) {
449 // variance is encoded in target, we simply need to restore the offset predictions.
450 decode_bbox_center_x = loc_xmin * prior_width + prior_center_x;
451 decode_bbox_center_y = loc_ymin * prior_height + prior_center_y;
452 decode_bbox_width = std::exp(loc_xmax) * prior_width;
453 decode_bbox_height = std::exp(loc_ymax) * prior_height;
455 // variance is encoded in bbox, we need to scale the offset accordingly.
456 decode_bbox_center_x = variance_data[p*4 + 0] * loc_xmin * prior_width + prior_center_x;
457 decode_bbox_center_y = variance_data[p*4 + 1] * loc_ymin * prior_height + prior_center_y;
458 decode_bbox_width = std::exp(variance_data[p*4 + 2] * loc_xmax) * prior_width;
459 decode_bbox_height = std::exp(variance_data[p*4 + 3] * loc_ymax) * prior_height;
// Convert centre/size back to corner coordinates.
462 new_xmin = decode_bbox_center_x - decode_bbox_width / 2.0f;
463 new_ymin = decode_bbox_center_y - decode_bbox_height / 2.0f;
464 new_xmax = decode_bbox_center_x + decode_bbox_width / 2.0f;
465 new_ymax = decode_bbox_center_y + decode_bbox_height / 2.0f;
// Clamp all coordinates to [0, 1]. The guard for this block is not visible; presumably it
// is controlled by the "clip" parameter — confirm.
469 new_xmin = std::max(0.0f, std::min(1.0f, new_xmin));
470 new_ymin = std::max(0.0f, std::min(1.0f, new_ymin));
471 new_xmax = std::max(0.0f, std::min(1.0f, new_xmax));
472 new_ymax = std::max(0.0f, std::min(1.0f, new_ymax));
// Store the decoded box and its area (width * height).
475 decoded_bboxes[p*4 + 0] = new_xmin;
476 decoded_bboxes[p*4 + 1] = new_ymin;
477 decoded_bboxes[p*4 + 2] = new_xmax;
478 decoded_bboxes[p*4 + 3] = new_ymax;
480 decoded_bbox_sizes[p] = (new_xmax - new_xmin) * (new_ymax - new_ymin);
// Non-maximum suppression for one class of one image.
// Candidates are the priors whose confidence exceeds _confidence_threshold; the best
// num_output_scores of them are copied into `buffer` in ConfidenceComparator order, then
// each is compared against the boxes already kept and its index is written to
// indices[detections].
// NOTE(review): most of the parameter list, the declaration/update of `count`, the keep
// flag, and the increment of `detections` are not visible in this view.
484 void DetectionOutputImpl::nms(const float* conf_data,
490 int num_priors_actual) {
// Collect candidates above the confidence threshold (the statement that records them is
// not visible; `count` is later used as the number of entries in `indices`).
492 for (int i = 0; i < num_priors_actual; ++i) {
493 if (conf_data[i] > _confidence_threshold) {
// top_k == -1 means "no limit"; otherwise at most top_k candidates are considered.
499 int num_output_scores = (_top_k == -1 ? count : std::min<int>(_top_k, count));
// Copy the best num_output_scores candidate indices into `buffer`, ordered by the
// comparator; `indices` can then be reused for the kept detections.
501 std::partial_sort_copy(indices, indices + count,
502 buffer, buffer + num_output_scores,
503 ConfidenceComparator(conf_data));
// Visit candidates in sorted order.
505 for (int i = 0; i < num_output_scores; ++i) {
506 const int idx = buffer[i];
// Compare against every box kept so far; what happens when the overlap exceeds
// _nms_threshold is not visible here (presumably the candidate is rejected — confirm).
509 for (int k = 0; k < detections; ++k) {
510 const int kept_idx = indices[k];
511 float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx);
512 if (overlap > _nms_threshold) {
// Record the candidate at the current end of the kept list.
518 indices[detections] = idx;
// Registers an ImplFactory for DetectionOutputImpl under the name DetectionOutput.
// NOTE(review): the macro is defined outside this view; presumably it adds the factory to
// the extension's layer registry — confirm.
524 REG_FACTORY_FOR(ImplFactory<DetectionOutputImpl>, DetectionOutput);
527 } // namespace Extensions
528 } // namespace InferenceEngine