// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "ext_list.hpp"
#include "ext_base.hpp"

#include <algorithm>
#include <cmath>
#include <string>
#include <vector>
#if defined(HAVE_AVX2)
#include <immintrin.h>
#endif
#include "ie_parallel.hpp"
18 namespace InferenceEngine {
19 namespace Extensions {
// Fills `anchors` with one reference box per (ratio, scale) pair.
//
// Output layout is planar: with N = num_ratios * num_scales, the four planes
//   anchors[0*N .. 1*N)  x-min offsets
//   anchors[1*N .. 2*N)  y-min offsets
//   anchors[2*N .. 3*N)  x-max offsets
//   anchors[3*N .. 4*N)  y-max offsets
// are each indexed by `ratio * num_scales + scale`. The caller must provide
// room for 4 * N floats.
//
// base_size           side of the square base box
// ratios, scales      arrays of num_ratios / num_scales factors (read only here)
// coordinates_offset  subtracted from the box extent before halving
// shift_anchors       when true every coordinate is moved by -base_size / 2
// round_ratios        when true the per-ratio width and height are rounded
//                     to the nearest integer before scaling
void generate_anchors(int base_size, float* ratios,
                      float* scales, const int num_ratios,
                      const int num_scales, float* anchors,
                      float coordinates_offset, bool shift_anchors, bool round_ratios) {
    // base box's area & center location; the area is computed in float so a
    // large base_size cannot overflow the int multiplication
    const float base_side = static_cast<float>(base_size);
    const float base_area = base_side * base_side;
    const float half_base_size = base_size * 0.5f;
    const float center = 0.5f * (base_size - coordinates_offset);
    const int num_anchors = num_ratios * num_scales;

    // enumerate all transformed boxes
    for (int ratio = 0; ratio < num_ratios; ++ratio) {
        // transformed width & height for given ratio factors
        float ratio_w = std::sqrt(base_area / ratios[ratio]);
        if (round_ratios) {
            ratio_w = std::round(ratio_w);
        }
        // the height is derived from the (possibly rounded) width
        float ratio_h = ratio_w * ratios[ratio];
        if (round_ratios) {
            ratio_h = std::round(ratio_h);
        }

        float* const p_anchors_wm = anchors + 0 * num_anchors + ratio * num_scales;
        float* const p_anchors_hm = anchors + 1 * num_anchors + ratio * num_scales;
        float* const p_anchors_wp = anchors + 2 * num_anchors + ratio * num_scales;
        float* const p_anchors_hp = anchors + 3 * num_anchors + ratio * num_scales;

        for (int scale = 0; scale < num_scales; ++scale) {
            // transformed half-width & half-height for given scale factors
            const float scale_w = 0.5f * (ratio_w * scales[scale] - coordinates_offset);
            const float scale_h = 0.5f * (ratio_h * scales[scale] - coordinates_offset);

            // (x1, y1, x2, y2) for transformed box
            p_anchors_wm[scale] = center - scale_w;
            p_anchors_hm[scale] = center - scale_h;
            p_anchors_wp[scale] = center + scale_w;
            p_anchors_hp[scale] = center + scale_h;

            if (shift_anchors) {
                p_anchors_wm[scale] -= half_base_size;
                p_anchors_hm[scale] -= half_base_size;
                p_anchors_wp[scale] -= half_base_size;
                p_anchors_hp[scale] -= half_base_size;
            }
        }
    }
}
72 void enumerate_proposals_cpu(const float* bottom4d, const float* d_anchor4d, const float* anchors,
73 float* proposals, const int num_anchors, const int bottom_H,
74 const int bottom_W, const float img_H, const float img_W,
75 const float min_box_H, const float min_box_W, const int feat_stride,
76 const float box_coordinate_scale, const float box_size_scale,
77 float coordinates_offset, bool initial_clip, bool swap_xy, bool clip_before_nms) {
78 const int bottom_area = bottom_H * bottom_W;
80 const float* p_anchors_wm = anchors + 0 * num_anchors;
81 const float* p_anchors_hm = anchors + 1 * num_anchors;
82 const float* p_anchors_wp = anchors + 2 * num_anchors;
83 const float* p_anchors_hp = anchors + 3 * num_anchors;
85 parallel_for2d(bottom_H, bottom_W, [&](size_t h, size_t w) {
86 const float x = static_cast<float>((swap_xy ? h : w) * feat_stride);
87 const float y = static_cast<float>((swap_xy ? w : h) * feat_stride);
89 const float* p_box = d_anchor4d + h * bottom_W + w;
90 const float* p_score = bottom4d + h * bottom_W + w;
92 float* p_proposal = proposals + (h * bottom_W + w) * num_anchors * 5;
94 for (int anchor = 0; anchor < num_anchors; ++anchor) {
95 const float dx = p_box[(anchor * 4 + 0) * bottom_area] / box_coordinate_scale;
96 const float dy = p_box[(anchor * 4 + 1) * bottom_area] / box_coordinate_scale;
98 const float d_log_w = p_box[(anchor * 4 + 2) * bottom_area] / box_size_scale;
99 const float d_log_h = p_box[(anchor * 4 + 3) * bottom_area] / box_size_scale;
101 const float score = p_score[anchor * bottom_area];
103 float x0 = x + p_anchors_wm[anchor];
104 float y0 = y + p_anchors_hm[anchor];
105 float x1 = x + p_anchors_wp[anchor];
106 float y1 = y + p_anchors_hp[anchor];
109 // adjust new corner locations to be within the image region
110 x0 = std::max<float>(0.0f, std::min<float>(x0, img_W));
111 y0 = std::max<float>(0.0f, std::min<float>(y0, img_H));
112 x1 = std::max<float>(0.0f, std::min<float>(x1, img_W));
113 y1 = std::max<float>(0.0f, std::min<float>(y1, img_H));
116 // width & height of box
117 const float ww = x1 - x0 + coordinates_offset;
118 const float hh = y1 - y0 + coordinates_offset;
119 // center location of box
120 const float ctr_x = x0 + 0.5f * ww;
121 const float ctr_y = y0 + 0.5f * hh;
123 // new center location according to gradient (dx, dy)
124 const float pred_ctr_x = dx * ww + ctr_x;
125 const float pred_ctr_y = dy * hh + ctr_y;
126 // new width & height according to gradient d(log w), d(log h)
127 const float pred_w = std::exp(d_log_w) * ww;
128 const float pred_h = std::exp(d_log_h) * hh;
130 // update upper-left corner location
131 x0 = pred_ctr_x - 0.5f * pred_w;
132 y0 = pred_ctr_y - 0.5f * pred_h;
133 // update lower-right corner location
134 x1 = pred_ctr_x + 0.5f * pred_w;
135 y1 = pred_ctr_y + 0.5f * pred_h;
137 // adjust new corner locations to be within the image region,
138 if (clip_before_nms) {
139 x0 = std::max<float>(0.0f, std::min<float>(x0, img_W - coordinates_offset));
140 y0 = std::max<float>(0.0f, std::min<float>(y0, img_H - coordinates_offset));
141 x1 = std::max<float>(0.0f, std::min<float>(x1, img_W - coordinates_offset));
142 y1 = std::max<float>(0.0f, std::min<float>(y1, img_H - coordinates_offset));
145 // recompute new width & height
146 const float box_w = x1 - x0 + coordinates_offset;
147 const float box_h = y1 - y0 + coordinates_offset;
149 p_proposal[5*anchor + 0] = x0;
150 p_proposal[5*anchor + 1] = y0;
151 p_proposal[5*anchor + 2] = x1;
152 p_proposal[5*anchor + 3] = y1;
153 p_proposal[5*anchor + 4] = (min_box_W <= box_w) * (min_box_H <= box_h) * score;
158 static void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int pre_nms_topn) {
159 parallel_for(pre_nms_topn, [&](size_t i) {
160 unpacked_boxes[0*pre_nms_topn + i] = p_proposals[5*i + 0];
161 unpacked_boxes[1*pre_nms_topn + i] = p_proposals[5*i + 1];
162 unpacked_boxes[2*pre_nms_topn + i] = p_proposals[5*i + 2];
163 unpacked_boxes[3*pre_nms_topn + i] = p_proposals[5*i + 3];
// Greedy non-maximum suppression over boxes that the caller has already put
// in priority order (box 0 is considered first).
//
// num_boxes    number of boxes
// is_dead      scratch array of num_boxes ints; reset here, then set to 1 for
//              every box suppressed by an earlier kept box
// boxes        four planes of num_boxes floats: x0, y0, x1, y1
// index_out    receives base_index + i for every kept box i; must have room
//              for max_num_out entries
// num_out      receives the number of entries written to index_out
// nms_thresh   a later box is suppressed when its intersection-over-union
//              with a kept box is strictly greater than this value
// max_num_out  processing stops as soon as this many boxes were kept
// coordinates_offset  added to every width / height before areas are computed
void nms_cpu(const int num_boxes, int is_dead[],
             const float* boxes, int index_out[], int* const num_out,
             const int base_index, const float nms_thresh, const int max_num_out,
             float coordinates_offset) {
    const int num_proposals = num_boxes;
    int count = 0;

    const float* x0 = boxes + 0 * num_proposals;
    const float* y0 = boxes + 1 * num_proposals;
    const float* x1 = boxes + 2 * num_proposals;
    const float* y1 = boxes + 3 * num_proposals;

    // fill_n is a no-op for a non-positive count, unlike memset with a
    // negative size converted to size_t
    std::fill_n(is_dead, num_boxes, 0);

    // With no output capacity nothing may be stored in index_out at all;
    // entering the loop would write index_out[0] before the limit is tested.
    if (num_boxes <= 0 || max_num_out <= 0) {
        *num_out = 0;
        return;
    }

#if defined(HAVE_AVX2)
    __m256  vc_fone = _mm256_set1_ps(coordinates_offset);
    __m256i vc_ione = _mm256_set1_epi32(1);
    __m256  vc_zero = _mm256_set1_ps(0.0f);

    __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh);
#endif

    for (int box = 0; box < num_boxes; ++box) {
        if (is_dead[box])
            continue;

        index_out[count++] = base_index + box;
        if (count == max_num_out)
            break;

        // only boxes after the current one can still be suppressed by it
        int tail = box + 1;

#if defined(HAVE_AVX2)
        __m256 vx0i = _mm256_set1_ps(x0[box]);
        __m256 vy0i = _mm256_set1_ps(y0[box]);
        __m256 vx1i = _mm256_set1_ps(x1[box]);
        __m256 vy1i = _mm256_set1_ps(y1[box]);

        __m256 vA_width  = _mm256_sub_ps(vx1i, vx0i);
        __m256 vA_height = _mm256_sub_ps(vy1i, vy0i);
        __m256 vA_area   = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone));

        // eight candidate boxes per iteration; the remainder is handled by
        // the scalar loop below
        for (; tail <= num_boxes - 8; tail += 8) {
            __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail);
            __m256i  vdst = _mm256_loadu_si256(pdst);

            __m256 vx0j = _mm256_loadu_ps(x0 + tail);
            __m256 vy0j = _mm256_loadu_ps(y0 + tail);
            __m256 vx1j = _mm256_loadu_ps(x1 + tail);
            __m256 vy1j = _mm256_loadu_ps(y1 + tail);

            __m256 vx0 = _mm256_max_ps(vx0i, vx0j);
            __m256 vy0 = _mm256_max_ps(vy0i, vy0j);
            __m256 vx1 = _mm256_min_ps(vx1i, vx1j);
            __m256 vy1 = _mm256_min_ps(vy1i, vy1j);

            __m256 vwidth  = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone);
            __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone);
            __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight));

            __m256 vB_width  = _mm256_sub_ps(vx1j, vx0j);
            __m256 vB_height = _mm256_sub_ps(vy1j, vy0j);
            __m256 vB_area   = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone));

            __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea);
            __m256 vintersection_area = _mm256_div_ps(varea, vdivisor);

            __m256 vcmp_0 = _mm256_cmp_ps(vx0i, vx1j, _CMP_LE_OS);
            __m256 vcmp_1 = _mm256_cmp_ps(vy0i, vy1j, _CMP_LE_OS);
            __m256 vcmp_2 = _mm256_cmp_ps(vx0j, vx1i, _CMP_LE_OS);
            __m256 vcmp_3 = _mm256_cmp_ps(vy0j, vy1i, _CMP_LE_OS);
            __m256 vcmp_4 = _mm256_cmp_ps(vc_nms_thresh, vintersection_area, _CMP_LT_OS);

            vcmp_0 = _mm256_and_ps(vcmp_0, vcmp_1);
            vcmp_2 = _mm256_and_ps(vcmp_2, vcmp_3);
            vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_0);
            vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_2);

            // lanes whose mask is set get 1, the others keep their old flag
            _mm256_storeu_si256(pdst, _mm256_blendv_epi8(vdst, vc_ione, _mm256_castps_si256(vcmp_4)));
        }
#endif

        // the kept box is loop-invariant for the scalar pass
        const float x0i = x0[box];
        const float y0i = y0[box];
        const float x1i = x1[box];
        const float y1i = y1[box];
        const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset);

        for (; tail < num_boxes; ++tail) {
            float res = 0.0f;

            const float x0j = x0[tail];
            const float y0j = y0[tail];
            const float x1j = x1[tail];
            const float y1j = y1[tail];

            if (x0i <= x1j && y0i <= y1j && x0j <= x1i && y0j <= y1i) {
                // overlapped region (= box)
                const float ix0 = std::max<float>(x0i, x0j);
                const float iy0 = std::max<float>(y0i, y0j);
                const float ix1 = std::min<float>(x1i, x1j);
                const float iy1 = std::min<float>(y1i, y1j);

                // intersection area
                const float width  = std::max<float>(0.0f, ix1 - ix0 + coordinates_offset);
                const float height = std::max<float>(0.0f, iy1 - iy0 + coordinates_offset);
                const float area   = width * height;

                const float B_area = (x1j - x0j + coordinates_offset) * (y1j - y0j + coordinates_offset);

                // intersection area / union area
                res = area / (A_area + B_area - area);
            }

            if (nms_thresh < res)
                is_dead[tail] = 1;
        }
    }

    *num_out = count;
}
292 void retrieve_rois_cpu(const int num_rois, const int item_index,
293 const int num_proposals,
294 const float* proposals, const int roi_indices[],
295 float* rois, int post_nms_topn_,
296 bool normalize, float img_h, float img_w, bool clip_after_nms) {
297 const float *src_x0 = proposals + 0 * num_proposals;
298 const float *src_y0 = proposals + 1 * num_proposals;
299 const float *src_x1 = proposals + 2 * num_proposals;
300 const float *src_y1 = proposals + 3 * num_proposals;
302 parallel_for(num_rois, [&](size_t roi) {
303 int index = roi_indices[roi];
305 float x0 = src_x0[index];
306 float y0 = src_y0[index];
307 float x1 = src_x1[index];
308 float y1 = src_y1[index];
310 if (clip_after_nms) {
311 x0 = std::max<float>(0.0f, std::min<float>(x0, img_w));
312 y0 = std::max<float>(0.0f, std::min<float>(y0, img_h));
313 x1 = std::max<float>(0.0f, std::min<float>(x1, img_w));
314 y1 = std::max<float>(0.0f, std::min<float>(y1, img_h));
324 rois[roi * 5 + 0] = static_cast<float>(item_index);
325 rois[roi * 5 + 1] = x0;
326 rois[roi * 5 + 2] = y0;
327 rois[roi * 5 + 3] = x1;
328 rois[roi * 5 + 4] = y1;
331 if (num_rois < post_nms_topn_) {
332 for (int i = 5 * num_rois; i < 5 * post_nms_topn_; i++) {
336 // marker at end of boxes list
337 rois[num_rois * 5 + 0] = -1;
341 class ProposalImpl : public ExtLayerBase {
343 explicit ProposalImpl(const CNNLayer *layer) {
345 if (layer->insData.size() != 3 || layer->outData.size() != 1)
346 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
348 if (layer->insData[0].lock()->dims.size() != 4)
349 THROW_IE_EXCEPTION << "Proposal supports only 4D blobs!";
351 feat_stride_ = static_cast<size_t>(layer->GetParamAsInt("feat_stride"));
352 base_size_ = static_cast<size_t>(layer->GetParamAsInt("base_size"));
353 min_size_ = static_cast<size_t>(layer->GetParamAsInt("min_size"));
354 pre_nms_topn_ = layer->GetParamAsInt("pre_nms_topn");
355 post_nms_topn_ = layer->GetParamAsInt("post_nms_topn");
356 nms_thresh_ = layer->GetParamAsFloat("nms_thresh");
357 box_coordinate_scale_ = layer->GetParamAsFloat("box_coordinate_scale", 1.0);
358 box_size_scale_ = layer->GetParamAsFloat("box_size_scale", 1.0);
359 scales = layer->GetParamAsFloats("scale", {});
360 ratios = layer->GetParamAsFloats("ratio", {});
361 normalize_ = layer->GetParamsAsBool("normalize", false);
362 clip_before_nms = layer->GetParamsAsBool("clip_before_nms", true);
363 clip_after_nms = layer->GetParamsAsBool("clip_after_nms", false);
365 anchors_shape_0 = ratios.size() * scales.size();
366 anchors_.resize(anchors_shape_0 * 4);
368 std::string framework_ = layer->GetParamAsString("framework", "");
369 if (framework_ == "tensorflow") {
370 coordinates_offset = 0.0f;
372 shift_anchors = true;
373 round_ratios = false;
376 coordinates_offset = 1.0f;
377 initial_clip = false;
378 shift_anchors = false;
382 generate_anchors(base_size_, &ratios[0], &scales[0], ratios.size(), scales.size(), &anchors_[0],
383 coordinates_offset, shift_anchors, round_ratios);
385 roi_indices_.resize(post_nms_topn_);
386 addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
387 {DataConfigurator(ConfLayout::PLN)});
388 } catch (InferenceEngine::details::InferenceEngineException &ex) {
389 errorMsg = ex.what();
393 StatusCode execute(std::vector<Blob::Ptr> &inputs, std::vector<Blob::Ptr> &outputs,
394 ResponseDesc *resp) noexcept override {
395 if (inputs.size() != 3 || outputs.empty()) {
397 std::string errorMsg = "Incorrect number of input or output edges!";
398 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
400 return GENERAL_ERROR;
404 const float* p_bottom_item = inputs[0]->buffer();
405 const float* p_d_anchor_item = inputs[1]->buffer();
406 const float* p_img_info_cpu = inputs[2]->buffer();
407 float* p_roi_item = outputs[0]->buffer();
409 size_t img_info_size = inputs[2]->getTensorDesc().getDims()[1];
411 // No second output so ignoring this
412 // Dtype* p_score_item = (top.size() > 1) ? top[1]->mutable_cpu_data() : NULL;
414 // bottom shape: (2 x num_anchors) x H x W
415 const int bottom_H = inputs[0]->getTensorDesc().getDims()[2];
416 const int bottom_W = inputs[0]->getTensorDesc().getDims()[3];
418 // input image height & width
419 const float img_H = p_img_info_cpu[swap_xy ? 1 : 0];
420 const float img_W = p_img_info_cpu[swap_xy ? 0 : 1];
422 // scale factor for height & width
423 const float scale_H = p_img_info_cpu[2];
424 const float scale_W = img_info_size > 3 ? p_img_info_cpu[3] : scale_H;
426 // minimum box width & height
427 const float min_box_H = min_size_ * scale_H;
428 const float min_box_W = min_size_ * scale_W;
430 // number of all proposals = num_anchors * H * W
431 const int num_proposals = anchors_shape_0 * bottom_H * bottom_W;
433 // number of top-n proposals before NMS
434 const int pre_nms_topn = std::min<int>(num_proposals, pre_nms_topn_);
436 // number of final RoIs
439 // enumerate all proposals
440 // num_proposals = num_anchors * H * W
441 // (x1, y1, x2, y2, score) for each proposal
442 // NOTE: for bottom, only foreground scores are passed
450 std::vector<ProposalBox> proposals_(num_proposals);
451 std::vector<float> unpacked_boxes(4 * pre_nms_topn);
452 std::vector<int> is_dead(pre_nms_topn);
455 int nn = inputs[0]->getTensorDesc().getDims()[0];
456 for (int n = 0; n < nn; ++n) {
457 enumerate_proposals_cpu(p_bottom_item + num_proposals + n*num_proposals*2, p_d_anchor_item + n*num_proposals*4,
458 &anchors_[0], reinterpret_cast<float *>(&proposals_[0]),
459 anchors_shape_0, bottom_H, bottom_W, img_H, img_W,
460 min_box_H, min_box_W, feat_stride_,
461 box_coordinate_scale_, box_size_scale_,
462 coordinates_offset, initial_clip, swap_xy, clip_before_nms);
463 std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(),
464 [](const ProposalBox& struct1, const ProposalBox& struct2) {
465 return (struct1.score > struct2.score);
468 unpack_boxes(reinterpret_cast<float *>(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn);
469 nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, nms_thresh_, post_nms_topn_, coordinates_offset);
470 retrieve_rois_cpu(num_rois, n, pre_nms_topn, &unpacked_boxes[0], &roi_indices_[0], p_roi_item + n*post_nms_topn_*5,
471 post_nms_topn_, normalize_, img_H, img_W, clip_after_nms);
484 float box_coordinate_scale_;
485 float box_size_scale_;
486 std::vector<float> scales;
487 std::vector<float> ratios;
490 size_t anchors_shape_0;
491 std::vector<float> anchors_;
492 std::vector<int> roi_indices_;
494 // Framework specific parameters
495 float coordinates_offset;
497 bool initial_clip; // clip initial bounding boxes
498 bool clip_before_nms; // clip bounding boxes before nms step
499 bool clip_after_nms; // clip bounding boxes after nms step
500 bool round_ratios; // round ratios during anchors generation stage
501 bool shift_anchors; // shift anchors by half size of the box
504 class ProposalFactory : public ImplFactory<ProposalImpl> {
506 explicit ProposalFactory(const CNNLayer *layer): ImplFactory(layer) {}
507 // set output shapes by input shapes.
508 StatusCode getShapes(const std::vector<TensorDesc>& inShapes, std::vector<TensorDesc>& outShapes,
509 ResponseDesc *resp) noexcept override {
510 if (inShapes.size() != 1) {
512 std::string errorMsg = "Incorrect input shapes!";
513 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
515 return GENERAL_ERROR;
518 outShapes.emplace_back(cnnLayer.precision, inShapes[0].getDims(), inShapes[0].getLayout());
523 REG_FACTORY_FOR(ProposalFactory, Proposal);
526 } // namespace Extensions
527 } // namespace InferenceEngine