inference-engine/src/extension/ext_proposal_onnx.cpp

   1 // Copyright (C) 2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #include "ext_list.hpp"
   6 #include "ext_base.hpp"
   7
   8 #include <cassert>
   9 #include <cmath>
  10 #include <string>
  11 #include <vector>
  12 #include <utility>
  13 #include <algorithm>
  14 #if defined(HAVE_AVX2)
  15 #include <immintrin.h>
  16 #endif
  17 #include "ie_parallel.hpp"
  18
  19
  20 namespace {
  21 struct Indexer {
  22   const std::vector<int> dims_;
  23   int total_{1};
  24
  25   explicit Indexer(const std::vector<int>& dims) : dims_(dims) {
  26       total_ = 1;
  27       for (size_t i = 0; i < dims_.size(); ++i) {
  28           total_ *= dims_[i];
  29       }
  30   }
  31
  32   const int operator()(const std::vector<int>& idx) const {
  33       int flat_idx = 0;
  34       assert(idx.size() == dims_.size());
  35       for (size_t i = 0; i < dims_.size(); ++i) {
  36           assert(0 <= idx[i] && idx[i] < dims_[i]);
  37           flat_idx = flat_idx * dims_[i] + idx[i];
  38       }
  39       assert(flat_idx < total_);
  40       return flat_idx;
  41   }
  42 };
  43 }  // namespace
  44
  45
  46 namespace InferenceEngine {
  47 namespace Extensions {
  48 namespace Cpu {
  49
  50 static
  51 void refine_anchors(const float* deltas, const float* scores, const float* anchors,
  52                     float* proposals, const int anchors_num, const int bottom_H,
  53                     const int bottom_W, const float img_H, const float img_W,
  54                     const float min_box_H, const float min_box_W,
  55                     const float max_delta_log_wh,
  56                     float coordinates_offset) {
  57     Indexer delta_idx({anchors_num, 4, bottom_H, bottom_W});
  58     Indexer score_idx({anchors_num, 1, bottom_H, bottom_W});
  59     Indexer proposal_idx({bottom_H, bottom_W, anchors_num, 5});
  60     Indexer anchor_idx({bottom_H, bottom_W, anchors_num, 4});
  61
  62     parallel_for2d(bottom_H, bottom_W, [&](int h, int w) {
  63             for (int anchor = 0; anchor < anchors_num; ++anchor) {
  64                 float x0 = anchors[anchor_idx({h, w, anchor, 0})];
  65                 float y0 = anchors[anchor_idx({h, w, anchor, 1})];
  66                 float x1 = anchors[anchor_idx({h, w, anchor, 2})];
  67                 float y1 = anchors[anchor_idx({h, w, anchor, 3})];
  68
  69                 const float dx = deltas[delta_idx({anchor, 0, h, w})];
  70                 const float dy = deltas[delta_idx({anchor, 1, h, w})];
  71                 const float d_log_w = deltas[delta_idx({anchor, 2, h, w})];
  72                 const float d_log_h = deltas[delta_idx({anchor, 3, h, w})];
  73
  74                 const float score = scores[score_idx({anchor, 0, h, w})];
  75
  76                 // width & height of box
  77                 const float ww = x1 - x0 + coordinates_offset;
  78                 const float hh = y1 - y0 + coordinates_offset;
  79                 // center location of box
  80                 const float ctr_x = x0 + 0.5f * ww;
  81                 const float ctr_y = y0 + 0.5f * hh;
  82
  83                 // new center location according to deltas (dx, dy)
  84                 const float pred_ctr_x = dx * ww + ctr_x;
  85                 const float pred_ctr_y = dy * hh + ctr_y;
  86                 // new width & height according to deltas d(log w), d(log h)
  87                 const float pred_w = std::exp(std::min(d_log_w, max_delta_log_wh)) * ww;
  88                 const float pred_h = std::exp(std::min(d_log_h, max_delta_log_wh)) * hh;
  89
  90                 // update upper-left corner location
  91                 x0 = pred_ctr_x - 0.5f * pred_w;
  92                 y0 = pred_ctr_y - 0.5f * pred_h;
  93                 // update lower-right corner location
  94                 x1 = pred_ctr_x + 0.5f * pred_w - coordinates_offset;
  95                 y1 = pred_ctr_y + 0.5f * pred_h - coordinates_offset;
  96
  97                 // adjust new corner locations to be within the image region,
  98                 x0 = std::max<float>(0.0f, std::min<float>(x0, img_W - coordinates_offset));
  99                 y0 = std::max<float>(0.0f, std::min<float>(y0, img_H - coordinates_offset));
 100                 x1 = std::max<float>(0.0f, std::min<float>(x1, img_W - coordinates_offset));
 101                 y1 = std::max<float>(0.0f, std::min<float>(y1, img_H - coordinates_offset));
 102
 103                 // recompute new width & height
 104                 const float box_w = x1 - x0 + coordinates_offset;
 105                 const float box_h = y1 - y0 + coordinates_offset;
 106
 107                 proposals[proposal_idx({h, w, anchor, 0})] = x0;
 108                 proposals[proposal_idx({h, w, anchor, 1})] = y0;
 109                 proposals[proposal_idx({h, w, anchor, 2})] = x1;
 110                 proposals[proposal_idx({h, w, anchor, 3})] = y1;
 111                 proposals[proposal_idx({h, w, anchor, 4})] = (min_box_W <= box_w) * (min_box_H <= box_h) * score;
 112             }
 113     });
 114 }
 115
 116 static void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int pre_nms_topn) {
 117     parallel_for(pre_nms_topn, [&](size_t i) {
 118         unpacked_boxes[0*pre_nms_topn + i] = p_proposals[5*i + 0];
 119         unpacked_boxes[1*pre_nms_topn + i] = p_proposals[5*i + 1];
 120         unpacked_boxes[2*pre_nms_topn + i] = p_proposals[5*i + 2];
 121         unpacked_boxes[3*pre_nms_topn + i] = p_proposals[5*i + 3];
 122         unpacked_boxes[4*pre_nms_topn + i] = p_proposals[5*i + 4];
 123     });
 124 }
 125
 126 static
 127 void nms_cpu(const int num_boxes, int is_dead[],
 128              const float* boxes, int index_out[], int* const num_out,
 129              const int base_index, const float nms_thresh, const int max_num_out,
 130              float coordinates_offset) {
 131     const int num_proposals = num_boxes;
 132     int count = 0;
 133
 134     const float* x0 = boxes + 0 * num_proposals;
 135     const float* y0 = boxes + 1 * num_proposals;
 136     const float* x1 = boxes + 2 * num_proposals;
 137     const float* y1 = boxes + 3 * num_proposals;
 138
 139     memset(is_dead, 0, num_boxes * sizeof(int));
 140
 141 #if defined(HAVE_AVX2)
 142     __m256  vc_fone = _mm256_set1_ps(coordinates_offset);
 143     __m256i vc_ione = _mm256_set1_epi32(1);
 144     __m256  vc_zero = _mm256_set1_ps(0.0f);
 145
 146     __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh);
 147 #endif
 148
 149     for (int box = 0; box < num_boxes; ++box) {
 150         if (is_dead[box])
 151             continue;
 152
 153         index_out[count++] = base_index + box;
 154         if (count == max_num_out)
 155             break;
 156
 157         int tail = box + 1;
 158
 159 #if defined(HAVE_AVX2)
 160         __m256 vx0i = _mm256_set1_ps(x0[box]);
 161         __m256 vy0i = _mm256_set1_ps(y0[box]);
 162         __m256 vx1i = _mm256_set1_ps(x1[box]);
 163         __m256 vy1i = _mm256_set1_ps(y1[box]);
 164
 165         __m256 vA_width  = _mm256_sub_ps(vx1i, vx0i);
 166         __m256 vA_height = _mm256_sub_ps(vy1i, vy0i);
 167         __m256 vA_area   = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone));
 168
 169         for (; tail <= num_boxes - 8; tail += 8) {
 170             __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail);
 171             __m256i  vdst = _mm256_loadu_si256(pdst);
 172
 173             __m256 vx0j = _mm256_loadu_ps(x0 + tail);
 174             __m256 vy0j = _mm256_loadu_ps(y0 + tail);
 175             __m256 vx1j = _mm256_loadu_ps(x1 + tail);
 176             __m256 vy1j = _mm256_loadu_ps(y1 + tail);
 177
 178             __m256 vx0 = _mm256_max_ps(vx0i, vx0j);
 179             __m256 vy0 = _mm256_max_ps(vy0i, vy0j);
 180             __m256 vx1 = _mm256_min_ps(vx1i, vx1j);
 181             __m256 vy1 = _mm256_min_ps(vy1i, vy1j);
 182
 183             __m256 vwidth  = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone);
 184             __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone);
 185             __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight));
 186
 187             __m256 vB_width  = _mm256_sub_ps(vx1j, vx0j);
 188             __m256 vB_height = _mm256_sub_ps(vy1j, vy0j);
 189             __m256 vB_area   = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone));
 190
 191             __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea);
 192             __m256 vintersection_area = _mm256_div_ps(varea, vdivisor);
 193
 194             __m256 vcmp_0 = _mm256_cmp_ps(vx0i, vx1j, _CMP_LE_OS);
 195             __m256 vcmp_1 = _mm256_cmp_ps(vy0i, vy1j, _CMP_LE_OS);
 196             __m256 vcmp_2 = _mm256_cmp_ps(vx0j, vx1i, _CMP_LE_OS);
 197             __m256 vcmp_3 = _mm256_cmp_ps(vy0j, vy1i, _CMP_LE_OS);
 198             __m256 vcmp_4 = _mm256_cmp_ps(vc_nms_thresh, vintersection_area, _CMP_LT_OS);
 199
 200             vcmp_0 = _mm256_and_ps(vcmp_0, vcmp_1);
 201             vcmp_2 = _mm256_and_ps(vcmp_2, vcmp_3);
 202             vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_0);
 203             vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_2);
 204
 205             _mm256_storeu_si256(pdst, _mm256_blendv_epi8(vdst, vc_ione, _mm256_castps_si256(vcmp_4)));
 206         }
 207 #endif
 208
 209         for (; tail < num_boxes; ++tail) {
 210             float res = 0.0f;
 211
 212             const float x0i = x0[box];
 213             const float y0i = y0[box];
 214             const float x1i = x1[box];
 215             const float y1i = y1[box];
 216
 217             const float x0j = x0[tail];
 218             const float y0j = y0[tail];
 219             const float x1j = x1[tail];
 220             const float y1j = y1[tail];
 221
 222             if (x0i <= x1j && y0i <= y1j && x0j <= x1i && y0j <= y1i) {
 223                 // overlapped region (= box)
 224                 const float x0 = std::max<float>(x0i, x0j);
 225                 const float y0 = std::max<float>(y0i, y0j);
 226                 const float x1 = std::min<float>(x1i, x1j);
 227                 const float y1 = std::min<float>(y1i, y1j);
 228
 229                 // intersection area
 230                 const float width  = std::max<float>(0.0f,  x1 - x0 + coordinates_offset);
 231                 const float height = std::max<float>(0.0f,  y1 - y0 + coordinates_offset);
 232                 const float area   = width * height;
 233
 234                 // area of A, B
 235                 const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset);
 236                 const float B_area = (x1j - x0j + coordinates_offset) * (y1j - y0j + coordinates_offset);
 237
 238                 // IoU
 239                 res = area / (A_area + B_area - area);
 240             }
 241
 242             if (nms_thresh < res)
 243                 is_dead[tail] = 1;
 244         }
 245     }
 246
 247     *num_out = count;
 248 }
 249
 250
 251 static
 252 void fill_output_blobs(const float* proposals, const int* roi_indices,
 253                        float* rois, float* scores,
 254                        const int num_proposals, const int num_rois, const int post_nms_topn) {
 255     const float *src_x0 = proposals + 0 * num_proposals;
 256     const float *src_y0 = proposals + 1 * num_proposals;
 257     const float *src_x1 = proposals + 2 * num_proposals;
 258     const float *src_y1 = proposals + 3 * num_proposals;
 259     const float *src_score = proposals + 4 * num_proposals;
 260
 261     parallel_for(num_rois, [&](size_t i) {
 262         int index = roi_indices[i];
 263         rois[i * 4 + 0] = src_x0[index];
 264         rois[i * 4 + 1] = src_y0[index];
 265         rois[i * 4 + 2] = src_x1[index];
 266         rois[i * 4 + 3] = src_y1[index];
 267         scores[i] = src_score[index];
 268     });
 269
 270     if (num_rois < post_nms_topn) {
 271         for (int i = 4 * num_rois; i < 4 * post_nms_topn; i++) {
 272             rois[i] = 0.f;
 273         }
 274         for (int i = num_rois; i < post_nms_topn; i++) {
 275             scores[i] = 0.f;
 276         }
 277     }
 278 }
 279
 280
 281 class ONNXCustomProposalImpl : public ExtLayerBase {
 282 private:
 283     const int INPUT_IM_INFO {0};
 284     const int INPUT_ANCHORS {1};
 285     const int INPUT_DELTAS {2};
 286     const int INPUT_SCORES {3};
 287     const int OUTPUT_ROIS {0};
 288     const int OUTPUT_SCORES {1};
 289
 290 public:
 291     explicit ONNXCustomProposalImpl(const CNNLayer *layer) {
 292         try {
 293             if (layer->insData.size() != 4 || layer->outData.size() != 2)
 294                 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
 295
 296             min_size_ = layer->GetParamAsFloat("min_size");
 297             nms_thresh_ = layer->GetParamAsFloat("nms_threshold");
 298             pre_nms_topn_ = layer->GetParamAsInt("pre_nms_count");
 299             post_nms_topn_ = layer->GetParamAsInt("post_nms_count");
 300
 301             coordinates_offset = 0.0f;
 302
 303             roi_indices_.resize(post_nms_topn_);
 304             addConfig(layer,
 305                       {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
 306                        DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
 307                       {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)});
 308         } catch (InferenceEngine::details::InferenceEngineException &ex) {
 309             errorMsg = ex.what();
 310         }
 311     }
 312
 313     void print_shape(const Blob::Ptr& b) {
 314         for (size_t i = 0; i < b->getTensorDesc().getDims().size(); ++i) {
 315             std::cout << b->getTensorDesc().getDims()[i] << ", ";
 316         }
 317         std::cout << std::endl;
 318     }
 319
 320     StatusCode execute(std::vector<Blob::Ptr> &inputs, std::vector<Blob::Ptr> &outputs,
 321                        ResponseDesc *resp) noexcept override {
 322         if (inputs.size() != 4 || outputs.size() != 2) {
 323             if (resp) {
 324                 std::string errorMsg = "Incorrect number of input or output edges!";
 325                 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
 326             }
 327             return GENERAL_ERROR;
 328         }
 329
 330         // Prepare memory
 331         const float* p_deltas_item = inputs[INPUT_DELTAS]->buffer();
 332         const float* p_scores_item = inputs[INPUT_SCORES]->buffer();
 333         const float* p_anchors_item = inputs[INPUT_ANCHORS]->buffer();
 334         const float* p_img_info_cpu = inputs[INPUT_IM_INFO]->buffer();
 335
 336         float* p_roi_item = outputs[OUTPUT_ROIS]->buffer();
 337         float* p_roi_score_item = outputs[OUTPUT_SCORES]->buffer();
 338
 339
 340         size_t img_info_size = 1;
 341         for (size_t i = 0; i < inputs[INPUT_IM_INFO]->getTensorDesc().getDims().size(); i++) {
 342             img_info_size *= inputs[INPUT_IM_INFO]->getTensorDesc().getDims()[i];
 343         }
 344
 345         const int anchors_num = inputs[INPUT_SCORES]->getTensorDesc().getDims()[0];
 346
 347         // bottom shape: (num_anchors) x H x W
 348         const int bottom_H = inputs[INPUT_DELTAS]->getTensorDesc().getDims()[1];
 349         const int bottom_W = inputs[INPUT_DELTAS]->getTensorDesc().getDims()[2];
 350
 351         // input image height & width
 352         const float img_H = p_img_info_cpu[0];
 353         const float img_W = p_img_info_cpu[1];
 354
 355         // scale factor for height & width
 356
 357         // minimum box width & height
 358         const float min_box_H = min_size_;
 359         const float min_box_W = min_size_;
 360
 361         // number of all proposals = num_anchors * H * W
 362         const int num_proposals = anchors_num * bottom_H * bottom_W;
 363
 364         // number of top-n proposals before NMS
 365         const int pre_nms_topn = std::min<int>(num_proposals, pre_nms_topn_);
 366
 367         // number of final RoIs
 368         int num_rois = 0;
 369
 370         // enumerate all proposals
 371         //   num_proposals = num_anchors * H * W
 372         //   (x1, y1, x2, y2, score) for each proposal
 373         // NOTE: for bottom, only foreground scores are passed
 374         struct ProposalBox {
 375             float x0;
 376             float y0;
 377             float x1;
 378             float y1;
 379             float score;
 380         };
 381         std::vector<ProposalBox> proposals_(num_proposals);
 382         std::vector<float> unpacked_boxes(5 * pre_nms_topn);
 383         std::vector<int> is_dead(pre_nms_topn);
 384
 385         // Execute
 386         int batch_size = 1;  // inputs[INPUT_DELTAS]->getTensorDesc().getDims()[0];
 387         for (int n = 0; n < batch_size; ++n) {
 388             refine_anchors(p_deltas_item, p_scores_item, p_anchors_item,
 389                            reinterpret_cast<float *>(&proposals_[0]), anchors_num, bottom_H,
 390                            bottom_W, img_H, img_W,
 391                            min_box_H, min_box_W,
 392                            static_cast<const float>(log(1000. / 16.)),
 393                            1.0f);
 394             std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(),
 395                               [](const ProposalBox& struct1, const ProposalBox& struct2) {
 396                                   return (struct1.score > struct2.score);
 397                               });
 398
 399             unpack_boxes(reinterpret_cast<float *>(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn);
 400             nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0,
 401                     nms_thresh_, post_nms_topn_, coordinates_offset);
 402             fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], p_roi_item, p_roi_score_item,
 403                               pre_nms_topn, num_rois, post_nms_topn_);
 404         }
 405
 406         return OK;
 407     }
 408
 409 private:
 410     float min_size_;
 411     int pre_nms_topn_;
 412     int post_nms_topn_;
 413     float nms_thresh_;
 414     float coordinates_offset;
 415
 416     std::vector<int> roi_indices_;
 417 };
 418
 419 class ONNXCustomProposalFactory : public ImplFactory<ONNXCustomProposalImpl> {
 420 public:
 421     explicit ONNXCustomProposalFactory(const CNNLayer *layer): ImplFactory(layer) {}
 422     // set output shapes by input shapes.
 423     StatusCode getShapes(const std::vector<TensorDesc>& inShapes, std::vector<TensorDesc>& outShapes,
 424                          ResponseDesc *resp) noexcept override {
 425         if (inShapes.size() != 1) {
 426             if (resp) {
 427                 std::string errorMsg = "Incorrect input shapes!";
 428                 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
 429             }
 430             return GENERAL_ERROR;
 431         }
 432         outShapes.clear();
 433         outShapes.emplace_back(cnnLayer.precision, inShapes[0].getDims(), inShapes[0].getLayout());
 434         return OK;
 435     }
 436 };
 437
// Registers ONNXCustomProposalFactory under the layer type name
// ExperimentalDetectronGenerateProposalsSingleImage.
REG_FACTORY_FOR(ONNXCustomProposalFactory, ExperimentalDetectronGenerateProposalsSingleImage);
 439
 440 }  // namespace Cpu
 441 }  // namespace Extensions
 442 }  // namespace InferenceEngine