1 // Copyright (C) 2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
5 #include "ext_list.hpp"
6 #include "ext_base.hpp"
14 #if defined(HAVE_AVX2)
15 #include <immintrin.h>
17 #include "ie_parallel.hpp"
// Extent of each dimension, copied from the constructor argument; declared const, so it is
// fixed for the lifetime of the object.
22 const std::vector<int> dims_;
// Builds an indexer for a tensor with the given per-dimension extents.
// NOTE(review): this listing omits lines of the constructor (the original numbering jumps
// from 25 to 27 and the loop body is not shown). operator() asserts against a member named
// total_, so the loop presumably accumulates the product of the extents into it — confirm
// against the full file.
25 explicit Indexer(const std::vector<int>& dims) : dims_(dims) {
27 for (size_t i = 0; i < dims_.size(); ++i) {
// Maps a multi-dimensional index to a flat offset, with the last dimension varying fastest:
// for dims (d0, d1, d2) and idx (i0, i1, i2) the accumulated value is (i0 * d1 + i1) * d2 + i2.
// idx must hold one coordinate per dimension, each in [0, dims_[i]). These preconditions are
// checked with assert only, so they are not enforced when NDEBUG is defined.
// Callers in this file pass a braced list such as {h, w, anchor, 0}; since the parameter is a
// const std::vector<int>&, every such call builds a temporary vector.
// NOTE(review): the declaration of flat_idx and the return statement are on lines that this
// listing omits.
32 const int operator()(const std::vector<int>& idx) const {
34 assert(idx.size() == dims_.size());
35 for (size_t i = 0; i < dims_.size(); ++i) {
36 assert(0 <= idx[i] && idx[i] < dims_[i]);
// Scale what has been accumulated so far by this dimension's extent, then add this coordinate.
37 flat_idx = flat_idx * dims_[i] + idx[i];
// The final offset must stay below total_ (a member declared on a line not shown here).
39 assert(flat_idx < total_);
46 namespace InferenceEngine {
47 namespace Extensions {
// Applies the predicted box regression deltas to every anchor, clamps the resulting box to the
// image and writes one (x0, y0, x1, y1, score) record per anchor position into `proposals`.
//
// Layouts, as fixed by the Indexer objects below:
//   deltas    [anchors_num][4][bottom_H][bottom_W]   (dx, dy, d(log w), d(log h))
//   scores    [anchors_num][1][bottom_H][bottom_W]
//   anchors   [bottom_H][bottom_W][anchors_num][4]   (x0, y0, x1, y1)
//   proposals [bottom_H][bottom_W][anchors_num][5]   (x0, y0, x1, y1, score) — output
//
// img_H / img_W         image size used for clamping
// min_box_H / min_box_W boxes smaller than this in either dimension get score 0
// max_delta_log_wh      upper bound applied to d(log w) and d(log h) before exponentiation
// coordinates_offset    added when turning corner coordinates into a width/height and
//                       subtracted again when turning a size back into a lower-right corner
51 void refine_anchors(const float* deltas, const float* scores, const float* anchors,
52 float* proposals, const int anchors_num, const int bottom_H,
53 const int bottom_W, const float img_H, const float img_W,
54 const float min_box_H, const float min_box_W,
55 const float max_delta_log_wh,
56 float coordinates_offset) {
57 Indexer delta_idx({anchors_num, 4, bottom_H, bottom_W});
58 Indexer score_idx({anchors_num, 1, bottom_H, bottom_W});
59 Indexer proposal_idx({bottom_H, bottom_W, anchors_num, 5});
60 Indexer anchor_idx({bottom_H, bottom_W, anchors_num, 4});
// The (h, w) grid is handed to parallel_for2d; each invocation handles all anchors of one cell
// and writes only that cell's proposal records.
// NOTE(review): every *_idx({...}) lookup below builds a temporary std::vector<int> for its
// argument; there are 14 such lookups per anchor.
62 parallel_for2d(bottom_H, bottom_W, [&](int h, int w) {
63 for (int anchor = 0; anchor < anchors_num; ++anchor) {
64 float x0 = anchors[anchor_idx({h, w, anchor, 0})];
65 float y0 = anchors[anchor_idx({h, w, anchor, 1})];
66 float x1 = anchors[anchor_idx({h, w, anchor, 2})];
67 float y1 = anchors[anchor_idx({h, w, anchor, 3})];
69 const float dx = deltas[delta_idx({anchor, 0, h, w})];
70 const float dy = deltas[delta_idx({anchor, 1, h, w})];
71 const float d_log_w = deltas[delta_idx({anchor, 2, h, w})];
72 const float d_log_h = deltas[delta_idx({anchor, 3, h, w})];
74 const float score = scores[score_idx({anchor, 0, h, w})];
76 // width & height of box
77 const float ww = x1 - x0 + coordinates_offset;
78 const float hh = y1 - y0 + coordinates_offset;
79 // center location of box
80 const float ctr_x = x0 + 0.5f * ww;
81 const float ctr_y = y0 + 0.5f * hh;
83 // new center location according to deltas (dx, dy)
84 const float pred_ctr_x = dx * ww + ctr_x;
85 const float pred_ctr_y = dy * hh + ctr_y;
86 // new width & height according to deltas d(log w), d(log h)
// The log-space delta is capped at max_delta_log_wh before exp(), which bounds the growth factor.
87 const float pred_w = std::exp(std::min(d_log_w, max_delta_log_wh)) * ww;
88 const float pred_h = std::exp(std::min(d_log_h, max_delta_log_wh)) * hh;
90 // update upper-left corner location
91 x0 = pred_ctr_x - 0.5f * pred_w;
92 y0 = pred_ctr_y - 0.5f * pred_h;
93 // update lower-right corner location
94 x1 = pred_ctr_x + 0.5f * pred_w - coordinates_offset;
95 y1 = pred_ctr_y + 0.5f * pred_h - coordinates_offset;
97 // clamp the new corner locations to [0, image size - coordinates_offset]
98 x0 = std::max<float>(0.0f, std::min<float>(x0, img_W - coordinates_offset));
99 y0 = std::max<float>(0.0f, std::min<float>(y0, img_H - coordinates_offset));
100 x1 = std::max<float>(0.0f, std::min<float>(x1, img_W - coordinates_offset));
101 y1 = std::max<float>(0.0f, std::min<float>(y1, img_H - coordinates_offset));
103 // recompute new width & height
104 const float box_w = x1 - x0 + coordinates_offset;
105 const float box_h = y1 - y0 + coordinates_offset;
107 proposals[proposal_idx({h, w, anchor, 0})] = x0;
108 proposals[proposal_idx({h, w, anchor, 1})] = y0;
109 proposals[proposal_idx({h, w, anchor, 2})] = x1;
110 proposals[proposal_idx({h, w, anchor, 3})] = y1;
// Each comparison yields 0 or 1, so the score is kept only when the box meets both minimum
// sizes and is replaced by 0 otherwise.
111 proposals[proposal_idx({h, w, anchor, 4})] = (min_box_W <= box_w) * (min_box_H <= box_h) * score;
// Transposes the first pre_nms_topn proposals from interleaved records (five floats per box,
// read at p_proposals[5*i + k]) into five consecutive planes of pre_nms_topn floats each, so
// that component k of box i ends up at unpacked_boxes[k*pre_nms_topn + i].
// unpacked_boxes must therefore have room for 5 * pre_nms_topn floats.
// The per-box copies are dispatched through parallel_for; each iteration writes only its own
// column i.
116 static void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int pre_nms_topn) {
117 parallel_for(pre_nms_topn, [&](size_t i) {
118 unpacked_boxes[0*pre_nms_topn + i] = p_proposals[5*i + 0];
119 unpacked_boxes[1*pre_nms_topn + i] = p_proposals[5*i + 1];
120 unpacked_boxes[2*pre_nms_topn + i] = p_proposals[5*i + 2];
121 unpacked_boxes[3*pre_nms_topn + i] = p_proposals[5*i + 3];
122 unpacked_boxes[4*pre_nms_topn + i] = p_proposals[5*i + 4];
// Non-maximum suppression over num_boxes boxes, with an 8-wide AVX2 path when HAVE_AVX2 is
// defined and a scalar loop for the remaining candidates.
//
//   num_boxes           number of boxes
//   is_dead             one int flag per box; cleared to 0 on entry, set to 1 for suppressed boxes
//   boxes               four consecutive planes of num_boxes floats: x0, y0, x1, y1
//   index_out           receives base_index + box for every box that is emitted
//   num_out             NOTE(review): the store to *num_out is on a line this listing omits;
//                       presumably it receives the number of emitted indices — confirm
//   base_index          added to each emitted box index
//   nms_thresh          a candidate qualifies for suppression when nms_thresh < IoU
//   max_num_out         emission count at which the `count == max_num_out` check fires
//   coordinates_offset  added to every width and height before areas are computed
//
// NOTE(review): this listing omits several lines of the function (declarations of `count`,
// `tail` and `res`, whatever sits between the outer `for` and the index_out store, the
// statements controlled by the two trailing `if`s, and the #endif lines). Comments below
// describe only what is visible.
127 void nms_cpu(const int num_boxes, int is_dead[],
128 const float* boxes, int index_out[], int* const num_out,
129 const int base_index, const float nms_thresh, const int max_num_out,
130 float coordinates_offset) {
131 const int num_proposals = num_boxes;
// Split the planar box buffer into its four coordinate planes.
134 const float* x0 = boxes + 0 * num_proposals;
135 const float* y0 = boxes + 1 * num_proposals;
136 const float* x1 = boxes + 2 * num_proposals;
137 const float* y1 = boxes + 3 * num_proposals;
139 memset(is_dead, 0, num_boxes * sizeof(int));
141 #if defined(HAVE_AVX2)
// Loop-invariant broadcast constants. Despite its name, vc_fone holds coordinates_offset,
// not 1.0f.
142 __m256 vc_fone = _mm256_set1_ps(coordinates_offset);
143 __m256i vc_ione = _mm256_set1_epi32(1);
144 __m256 vc_zero = _mm256_set1_ps(0.0f);
146 __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh);
149 for (int box = 0; box < num_boxes; ++box) {
// Emit this box. NOTE(review): the original numbering skips lines 150-152 here; presumably
// boxes already flagged in is_dead are skipped before this point — confirm.
153 index_out[count++] = base_index + box;
154 if (count == max_num_out)
159 #if defined(HAVE_AVX2)
// Broadcast the current box (A) so it can be compared against eight candidates (B) at once.
160 __m256 vx0i = _mm256_set1_ps(x0[box]);
161 __m256 vy0i = _mm256_set1_ps(y0[box]);
162 __m256 vx1i = _mm256_set1_ps(x1[box]);
163 __m256 vy1i = _mm256_set1_ps(y1[box]);
165 __m256 vA_width = _mm256_sub_ps(vx1i, vx0i);
166 __m256 vA_height = _mm256_sub_ps(vy1i, vy0i);
167 __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone));
// Vector path: runs while at least eight candidates remain starting at `tail`.
169 for (; tail <= num_boxes - 8; tail += 8) {
// Current flags of the eight candidates; unaligned load because is_dead + tail has no
// alignment guarantee.
170 __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail);
171 __m256i vdst = _mm256_loadu_si256(pdst);
173 __m256 vx0j = _mm256_loadu_ps(x0 + tail);
174 __m256 vy0j = _mm256_loadu_ps(y0 + tail);
175 __m256 vx1j = _mm256_loadu_ps(x1 + tail);
176 __m256 vy1j = _mm256_loadu_ps(y1 + tail);
// Corners of the intersection rectangle of A with each candidate.
178 __m256 vx0 = _mm256_max_ps(vx0i, vx0j);
179 __m256 vy0 = _mm256_max_ps(vy0i, vy0j);
180 __m256 vx1 = _mm256_min_ps(vx1i, vx1j);
181 __m256 vy1 = _mm256_min_ps(vy1i, vy1j);
// Intersection area, with width and height clamped at zero.
183 __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone);
184 __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone);
185 __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight));
187 __m256 vB_width = _mm256_sub_ps(vx1j, vx0j);
188 __m256 vB_height = _mm256_sub_ps(vy1j, vy0j);
189 __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone));
// vdivisor is the union area (A + B - intersection). Despite its name, vintersection_area
// holds the ratio intersection / union, i.e. the IoU.
191 __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea);
192 __m256 vintersection_area = _mm256_div_ps(varea, vdivisor);
// Same overlap predicate as the scalar path (x0i <= x1j && y0i <= y1j && x0j <= x1i &&
// y0j <= y1i), plus the threshold test. Ordered compares yield false for NaN operands.
194 __m256 vcmp_0 = _mm256_cmp_ps(vx0i, vx1j, _CMP_LE_OS);
195 __m256 vcmp_1 = _mm256_cmp_ps(vy0i, vy1j, _CMP_LE_OS);
196 __m256 vcmp_2 = _mm256_cmp_ps(vx0j, vx1i, _CMP_LE_OS);
197 __m256 vcmp_3 = _mm256_cmp_ps(vy0j, vy1i, _CMP_LE_OS);
198 __m256 vcmp_4 = _mm256_cmp_ps(vc_nms_thresh, vintersection_area, _CMP_LT_OS);
// Combine all five conditions into a single per-lane mask in vcmp_4.
200 vcmp_0 = _mm256_and_ps(vcmp_0, vcmp_1);
201 vcmp_2 = _mm256_and_ps(vcmp_2, vcmp_3);
202 vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_0);
203 vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_2);
// Lanes whose mask is set are overwritten with 1; the other lanes keep their previous flag.
205 _mm256_storeu_si256(pdst, _mm256_blendv_epi8(vdst, vc_ione, _mm256_castps_si256(vcmp_4)));
// Scalar path: handles the candidates the vector loop did not cover (all of them when
// HAVE_AVX2 is not defined).
209 for (; tail < num_boxes; ++tail) {
212 const float x0i = x0[box];
213 const float y0i = y0[box];
214 const float x1i = x1[box];
215 const float y1i = y1[box];
217 const float x0j = x0[tail];
218 const float y0j = y0[tail];
219 const float x1j = x1[tail];
220 const float y1j = y1[tail];
// The IoU is computed only when the two boxes overlap; these locals shadow the plane
// pointers x0/y0/x1/y1 of the enclosing function.
222 if (x0i <= x1j && y0i <= y1j && x0j <= x1i && y0j <= y1i) {
223 // overlapped region (= box)
224 const float x0 = std::max<float>(x0i, x0j);
225 const float y0 = std::max<float>(y0i, y0j);
226 const float x1 = std::min<float>(x1i, x1j);
227 const float y1 = std::min<float>(y1i, y1j);
// Intersection area.
230 const float width = std::max<float>(0.0f, x1 - x0 + coordinates_offset);
231 const float height = std::max<float>(0.0f, y1 - y0 + coordinates_offset);
232 const float area = width * height;
// Areas of the two boxes.
235 const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset);
236 const float B_area = (x1j - x0j + coordinates_offset) * (y1j - y0j + coordinates_offset);
// IoU = intersection / union.
239 res = area / (A_area + B_area - area);
// NOTE(review): the statement controlled by this `if` is not visible in this listing;
// presumably it sets is_dead[tail], mirroring the vector path — confirm.
242 if (nms_thresh < res)
// Gathers the selected boxes into the two output buffers.
//
//   proposals      five consecutive planes of num_proposals floats: x0, y0, x1, y1, score
//   roi_indices    for each output slot i < num_rois, the index of the box to copy
//   rois           output, four floats per slot (x0, y0, x1, y1)
//   scores         output, one float per slot
//   num_proposals  plane length of `proposals`
//   num_rois       number of valid entries in roi_indices
//   post_nms_topn  total number of output slots; slots [num_rois, post_nms_topn) are handled
//                  by the two trailing loops
252 void fill_output_blobs(const float* proposals, const int* roi_indices,
253 float* rois, float* scores,
254 const int num_proposals, const int num_rois, const int post_nms_topn) {
255 const float *src_x0 = proposals + 0 * num_proposals;
256 const float *src_y0 = proposals + 1 * num_proposals;
257 const float *src_x1 = proposals + 2 * num_proposals;
258 const float *src_y1 = proposals + 3 * num_proposals;
259 const float *src_score = proposals + 4 * num_proposals;
// Planar-to-interleaved gather; dispatched through parallel_for, each iteration writes only
// its own output slot.
261 parallel_for(num_rois, [&](size_t i) {
262 int index = roi_indices[i];
263 rois[i * 4 + 0] = src_x0[index];
264 rois[i * 4 + 1] = src_y0[index];
265 rois[i * 4 + 2] = src_x1[index];
266 rois[i * 4 + 3] = src_y1[index];
267 scores[i] = src_score[index];
// Fewer boxes than output slots: walk the unused tail of both outputs.
// NOTE(review): the loop bodies are on lines this listing omits; presumably they zero-fill the
// remaining floats — confirm.
270 if (num_rois < post_nms_topn) {
271 for (int i = 4 * num_rois; i < 4 * post_nms_topn; i++) {
274 for (int i = num_rois; i < post_nms_topn; i++) {
// CPU implementation of a proposal-generation layer: refines anchors with predicted deltas,
// keeps the best-scoring candidates, runs NMS on them and writes RoIs plus their scores.
// NOTE(review): this listing omits a number of lines of the class (access specifiers, several
// member declarations, the ProposalBox definition, closing braces and return statements);
// comments below describe only what is visible.
281 class ONNXCustomProposalImpl : public ExtLayerBase {
// Positions of the individual tensors in the `inputs` / `outputs` vectors passed to execute().
283 const int INPUT_IM_INFO {0};
284 const int INPUT_ANCHORS {1};
285 const int INPUT_DELTAS {2};
286 const int INPUT_SCORES {3};
287 const int OUTPUT_ROIS {0};
288 const int OUTPUT_SCORES {1};
// Reads the layer parameters and sets up the layer configuration.
// Requires exactly 4 inputs and 2 outputs. An InferenceEngineException raised during setup is
// caught and its message stored in errorMsg instead of propagating.
291 explicit ONNXCustomProposalImpl(const CNNLayer *layer) {
293 if (layer->insData.size() != 4 || layer->outData.size() != 2)
294 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
296 min_size_ = layer->GetParamAsFloat("min_size");
297 nms_thresh_ = layer->GetParamAsFloat("nms_threshold");
298 pre_nms_topn_ = layer->GetParamAsInt("pre_nms_count");
299 post_nms_topn_ = layer->GetParamAsInt("post_nms_count");
// The coordinate offset is fixed to 0 for this layer; it is not read from the layer parameters.
301 coordinates_offset = 0.0f;
// Scratch buffer for the indices selected by NMS; nms_cpu is called with post_nms_topn_ as
// its max_num_out.
303 roi_indices_.resize(post_nms_topn_);
// Four planar-layout input configurators and two planar-layout output configurators.
// NOTE(review): the call these arguments belong to is on a line this listing omits.
305 {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
306 DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
307 {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)});
308 } catch (InferenceEngine::details::InferenceEngineException &ex) {
309 errorMsg = ex.what();
// Debug helper: prints the dimensions of a blob to stdout, each followed by ", ", then a
// newline. Not called from any line visible in this listing.
313 void print_shape(const Blob::Ptr& b) {
314 for (size_t i = 0; i < b->getTensorDesc().getDims().size(); ++i) {
315 std::cout << b->getTensorDesc().getDims()[i] << ", ";
317 std::cout << std::endl;
// Runs the layer on one set of inputs.
//   inputs   im_info, anchors, deltas, scores (see the INPUT_* indices)
//   outputs  rois, scores (see the OUTPUT_* indices)
//   resp     receives an error message on failure
// Returns GENERAL_ERROR when the number of inputs/outputs is wrong.
// NOTE(review): declared noexcept, yet the body constructs std::vector objects and no try
// block is visible in this listing; an allocation failure would terminate the process.
320 StatusCode execute(std::vector<Blob::Ptr> &inputs, std::vector<Blob::Ptr> &outputs,
321 ResponseDesc *resp) noexcept override {
322 if (inputs.size() != 4 || outputs.size() != 2) {
324 std::string errorMsg = "Incorrect number of input or output edges!";
// NOTE(review): std::string::copy does not append a terminating '\0'; the message is only
// null-terminated if resp->msg was zero-filled beforehand — confirm.
325 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
327 return GENERAL_ERROR;
// Raw float views of the input and output buffers.
331 const float* p_deltas_item = inputs[INPUT_DELTAS]->buffer();
332 const float* p_scores_item = inputs[INPUT_SCORES]->buffer();
333 const float* p_anchors_item = inputs[INPUT_ANCHORS]->buffer();
334 const float* p_img_info_cpu = inputs[INPUT_IM_INFO]->buffer();
336 float* p_roi_item = outputs[OUTPUT_ROIS]->buffer();
337 float* p_roi_score_item = outputs[OUTPUT_SCORES]->buffer();
// Total element count of the im_info tensor.
// NOTE(review): no use of img_info_size is visible in this listing.
340 size_t img_info_size = 1;
341 for (size_t i = 0; i < inputs[INPUT_IM_INFO]->getTensorDesc().getDims().size(); i++) {
342 img_info_size *= inputs[INPUT_IM_INFO]->getTensorDesc().getDims()[i];
// The anchor count is taken from the first dimension of the scores tensor.
345 const int anchors_num = inputs[INPUT_SCORES]->getTensorDesc().getDims()[0];
347 // bottom shape: (num_anchors) x H x W
// H and W are taken from dimensions 1 and 2 of the deltas tensor.
348 const int bottom_H = inputs[INPUT_DELTAS]->getTensorDesc().getDims()[1];
349 const int bottom_W = inputs[INPUT_DELTAS]->getTensorDesc().getDims()[2];
351 // input image height & width
352 const float img_H = p_img_info_cpu[0];
353 const float img_W = p_img_info_cpu[1];
355 // scale factor for height & width
357 // minimum box width & height
358 const float min_box_H = min_size_;
359 const float min_box_W = min_size_;
361 // number of all proposals = num_anchors * H * W
362 const int num_proposals = anchors_num * bottom_H * bottom_W;
364 // number of top-n proposals before NMS
365 const int pre_nms_topn = std::min<int>(num_proposals, pre_nms_topn_);
367 // number of final RoIs
370 // enumerate all proposals
371 // num_proposals = num_anchors * H * W
372 // (x1, y1, x2, y2, score) for each proposal
373 // NOTE: for bottom, only foreground scores are passed
// Work buffers: all refined proposals, the planar copy of the best pre_nms_topn of them, and
// the NMS suppression flags.
// NOTE(review): ProposalBox is declared on lines this listing omits. The reinterpret_casts
// below rely on it being exactly five consecutive floats (x0, y0, x1, y1, score) — confirm.
// NOTE(review): when num_proposals or pre_nms_topn is 0 these vectors are empty and the
// &v[0] expressions below index an empty vector.
381 std::vector<ProposalBox> proposals_(num_proposals);
382 std::vector<float> unpacked_boxes(5 * pre_nms_topn);
383 std::vector<int> is_dead(pre_nms_topn);
// Only a single image is processed; the batch dimension is not read.
386 int batch_size = 1; // inputs[INPUT_DELTAS]->getTensorDesc().getDims()[0];
387 for (int n = 0; n < batch_size; ++n) {
// Step 1: decode and clamp all anchors; log(1000 / 16) is passed as max_delta_log_wh.
388 refine_anchors(p_deltas_item, p_scores_item, p_anchors_item,
389 reinterpret_cast<float *>(&proposals_[0]), anchors_num, bottom_H,
390 bottom_W, img_H, img_W,
391 min_box_H, min_box_W,
392 static_cast<const float>(log(1000. / 16.)),
// Step 2: move the pre_nms_topn highest-scoring proposals to the front, in descending score
// order; the order of the remaining elements is unspecified.
394 std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(),
395 [](const ProposalBox& struct1, const ProposalBox& struct2) {
396 return (struct1.score > struct2.score);
// Step 3: convert those proposals to planar layout, run NMS (base index 0, at most
// post_nms_topn_ survivors) and gather the survivors into the output blobs.
// NOTE(review): the declaration of num_rois is on a line this listing omits.
399 unpack_boxes(reinterpret_cast<float *>(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn);
400 nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0,
401 nms_thresh_, post_nms_topn_, coordinates_offset);
402 fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], p_roi_item, p_roi_score_item,
403 pre_nms_topn, num_rois, post_nms_topn_);
// Offset passed on to refine_anchors / nms_cpu; set to 0.0f in the constructor.
414 float coordinates_offset;
// Indices selected by NMS, sized to post_nms_topn_ in the constructor and reused across calls.
416 std::vector<int> roi_indices_;
// Factory that creates ONNXCustomProposalImpl instances and reports output shapes.
419 class ONNXCustomProposalFactory : public ImplFactory<ONNXCustomProposalImpl> {
421 explicit ONNXCustomProposalFactory(const CNNLayer *layer): ImplFactory(layer) {}
422 // set output shapes by input shapes.
// Returns GENERAL_ERROR (with a message in resp) unless exactly one input shape is supplied;
// otherwise appends an output descriptor that reuses the layer precision and the dims and
// layout of input 0.
// NOTE(review): ONNXCustomProposalImpl requires 4 inputs and 2 outputs, which does not match
// the `inShapes.size() != 1` check or the single emplace_back visible here. Lines after the
// emplace_back are omitted from this listing — confirm against the full file.
// NOTE(review): std::string::copy does not append a terminating '\0'; the message is only
// null-terminated if resp->msg was zero-filled beforehand.
423 StatusCode getShapes(const std::vector<TensorDesc>& inShapes, std::vector<TensorDesc>& outShapes,
424 ResponseDesc *resp) noexcept override {
425 if (inShapes.size() != 1) {
427 std::string errorMsg = "Incorrect input shapes!";
428 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
430 return GENERAL_ERROR;
433 outShapes.emplace_back(cnnLayer.precision, inShapes[0].getDims(), inShapes[0].getLayout());
// Passes the factory to REG_FACTORY_FOR together with the layer type name
// ExperimentalDetectronGenerateProposalsSingleImage. The macro is defined outside this file;
// presumably it registers the factory for layers of that type — confirm.
438 REG_FACTORY_FOR(ONNXCustomProposalFactory, ExperimentalDetectronGenerateProposalsSingleImage);
441 } // namespace Extensions
442 } // namespace InferenceEngine