inference-engine/src/extension/ext_roifeatureextractor_onnx.cpp

   1 // Copyright (C) 2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 // There are some code snippets in this file.
// Original source file is available here (Copyright (c) 2018 Facebook, MIT License):
   7 // https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp
   8 //
   9
#include "ext_list.hpp"
#include "ext_base.hpp"
#include <cassert>
#include <cmath>
#include <cstring>
#include <vector>
#include <string>
#include <algorithm>
#include <numeric>
#include "ie_parallel.hpp"
  18
  19 namespace InferenceEngine {
  20 namespace Extensions {
  21 namespace Cpu {
  22
  23 // implementation taken from Caffe2
  24 template <typename T>
  25 struct PreCalc {
  26   int pos1;
  27   int pos2;
  28   int pos3;
  29   int pos4;
  30   T w1;
  31   T w2;
  32   T w3;
  33   T w4;
  34 };
  35
  36 template <typename T>
  37 void pre_calc_for_bilinear_interpolate(
  38     const int height,
  39     const int width,
  40     const int pooled_height,
  41     const int pooled_width,
  42     const int iy_upper,
  43     const int ix_upper,
  44     T roi_start_h,
  45     T roi_start_w,
  46     T bin_size_h,
  47     T bin_size_w,
  48     int roi_bin_grid_h,
  49     int roi_bin_grid_w,
  50     std::vector<PreCalc<T>>& pre_calc) {
  51   int pre_calc_index = 0;
  52   for (int ph = 0; ph < pooled_height; ph++) {
  53     for (int pw = 0; pw < pooled_width; pw++) {
  54       for (int iy = 0; iy < iy_upper; iy++) {
  55         const T yy = roi_start_h + ph * bin_size_h +
  56             static_cast<T>(iy + .5f) * bin_size_h /
  57                 static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
  58         for (int ix = 0; ix < ix_upper; ix++) {
  59           const T xx = roi_start_w + pw * bin_size_w +
  60               static_cast<T>(ix + .5f) * bin_size_w /
  61                   static_cast<T>(roi_bin_grid_w);
  62
  63           T x = xx;
  64           T y = yy;
  65           // deal with: inverse elements are out of feature map boundary
  66           if (y < -1.0 || y > height || x < -1.0 || x > width) {
  67             // empty
  68             PreCalc<T> pc;
  69             pc.pos1 = 0;
  70             pc.pos2 = 0;
  71             pc.pos3 = 0;
  72             pc.pos4 = 0;
  73             pc.w1 = 0;
  74             pc.w2 = 0;
  75             pc.w3 = 0;
  76             pc.w4 = 0;
  77             pre_calc.at(pre_calc_index) = pc;
  78             pre_calc_index += 1;
  79             continue;
  80           }
  81
  82           if (y <= 0) {
  83             y = 0;
  84           }
  85           if (x <= 0) {
  86             x = 0;
  87           }
  88
  89           int y_low = static_cast<int>(y);
  90           int x_low = static_cast<int>(x);
  91           int y_high = 0;
  92           int x_high = 0;
  93
  94           if (y_low >= height - 1) {
  95             y_high = y_low = height - 1;
  96             y = (T)y_low;
  97           } else {
  98             y_high = y_low + 1;
  99           }
 100
 101           if (x_low >= width - 1) {
 102             x_high = x_low = width - 1;
 103             x = (T)x_low;
 104           } else {
 105             x_high = x_low + 1;
 106           }
 107
 108           T ly = y - y_low;
 109           T lx = x - x_low;
 110           T hy = static_cast<T>(1) - ly, hx = static_cast<T>(1) - lx;
 111           T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
 112
 113           // save weights and indeces
 114           PreCalc<T> pc;
 115           pc.pos1 = y_low * width + x_low;
 116           pc.pos2 = y_low * width + x_high;
 117           pc.pos3 = y_high * width + x_low;
 118           pc.pos4 = y_high * width + x_high;
 119           pc.w1 = w1;
 120           pc.w2 = w2;
 121           pc.w3 = w3;
 122           pc.w4 = w4;
 123           pre_calc[pre_calc_index] = pc;
 124
 125           pre_calc_index += 1;
 126         }
 127       }
 128     }
 129   }
 130 }
 131
 132 template <typename T>
 133 void ROIAlignForward_cpu_kernel(
 134     const int nthreads,
 135     const T* bottom_data,
 136     const T& spatial_scale,
 137     const int channels,
 138     const int height,
 139     const int width,
 140     const int pooled_height,
 141     const int pooled_width,
 142     const int sampling_ratio,
 143     const T* bottom_rois,
 144     T* top_data) {
 145   int roi_cols = 4;
 146
 147   int n_rois = nthreads / channels / pooled_width / pooled_height;
 148   // (n, c, ph, pw) is an element in the pooled output
 149   parallel_for(n_rois, [&](size_t n) {
 150     int index_n = n * channels * pooled_width * pooled_height;
 151
 152     // roi could have 4 or 5 columns
 153     const T* offset_bottom_rois = bottom_rois + n * roi_cols;
 154     int roi_batch_ind = 0;
 155     if (roi_cols == 5) {
 156       roi_batch_ind = static_cast<int>(offset_bottom_rois[0]);
 157       offset_bottom_rois++;
 158     }
 159
 160     // Do not using rounding; this implementation detail is critical
 161     T roi_start_w = offset_bottom_rois[0] * spatial_scale;
 162     T roi_start_h = offset_bottom_rois[1] * spatial_scale;
 163     T roi_end_w = offset_bottom_rois[2] * spatial_scale;
 164     T roi_end_h = offset_bottom_rois[3] * spatial_scale;
 165
 166     // Force malformed ROIs to be 1x1
 167     T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
 168     T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
 169     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
 170     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 171
 172     // We use roi_bin_grid to sample the grid and mimic integral
 173     int roi_bin_grid_h = (sampling_ratio > 0)
 174         ? sampling_ratio
 175         : static_cast<int>(ceil(roi_height / pooled_height));  // e.g., = 2
 176     int roi_bin_grid_w =
 177         (sampling_ratio > 0) ? sampling_ratio : static_cast<int>(ceil(roi_width / pooled_width));
 178
 179     // We do average (integral) pooling inside a bin
 180     const T count = static_cast<T>(roi_bin_grid_h * roi_bin_grid_w);  // e.g. = 4
 181
 182     // we want to precalculate indeces and weights shared by all chanels,
 183     // this is the key point of optimiation
 184     std::vector<PreCalc<T>> pre_calc(
 185         roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
 186     pre_calc_for_bilinear_interpolate(
 187         height,
 188         width,
 189         pooled_height,
 190         pooled_width,
 191         roi_bin_grid_h,
 192         roi_bin_grid_w,
 193         roi_start_h,
 194         roi_start_w,
 195         bin_size_h,
 196         bin_size_w,
 197         roi_bin_grid_h,
 198         roi_bin_grid_w,
 199         pre_calc);
 200
 201     for (int c = 0; c < channels; c++) {
 202       int index_n_c = index_n + c * pooled_width * pooled_height;
 203       const T* offset_bottom_data =
 204           bottom_data + (roi_batch_ind * channels + c) * height * width;
 205       int pre_calc_index = 0;
 206
 207       for (int ph = 0; ph < pooled_height; ph++) {
 208         for (int pw = 0; pw < pooled_width; pw++) {
 209           int index = index_n_c + ph * pooled_width + pw;
 210
 211           T output_val = 0.;
 212           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
 213             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
 214               PreCalc<T> pc = pre_calc[pre_calc_index];
 215               output_val += pc.w1 * offset_bottom_data[pc.pos1] +
 216                   pc.w2 * offset_bottom_data[pc.pos2] +
 217                   pc.w3 * offset_bottom_data[pc.pos3] +
 218                   pc.w4 * offset_bottom_data[pc.pos4];
 219
 220               pre_calc_index += 1;
 221             }
 222           }
 223           output_val /= count;
 224
 225           top_data[index] = output_val;
 226         }  // for pw
 227       }  // for ph
 228     }  // for c
 229   });
 230 }
 231
 232
 233 void redistribute_rois(const float* rois, int* level_ids,
 234                        const int num_rois, const int levels_num) {
 235     const float canonical_scale = 224.0f;
 236     const int canonical_level = 2;
 237
 238     for (int i = 0; i < num_rois; ++i) {
 239         const float x0 = rois[4 * i + 0];
 240         const float y0 = rois[4 * i + 1];
 241         const float x1 = rois[4 * i + 2];
 242         const float y1 = rois[4 * i + 3];
 243
 244         int target_level = levels_num;
 245         float area = (x1 - x0) * (y1 - y0);
 246         if (area > 0) {
 247             area = std::sqrt(area) / canonical_scale;
 248             area = std::log2(area + 1e-6f);
 249             target_level = static_cast<int>(std::floor(area + canonical_level));
 250             target_level = std::max<int>(0, std::min<int>(levels_num - 1, target_level));
 251         }
 252
 253         level_ids[i] = target_level;
 254     }
 255 }
 256
 257
 258 void reorder(const float* src_data, const int* ranks, const int n, const int step, float* dst_data,
 259              int* dst_mapping) {
 260     std::iota(dst_mapping, dst_mapping + n, 0);
 261     std::sort(dst_mapping, dst_mapping + n, [&ranks](size_t i1, size_t i2) {return ranks[i1] < ranks[i2];});
 262     for (int i = 0; i < n; ++i) {
 263         const int j = dst_mapping[i];
 264         assert(0 <= j && j < n);
 265         std::memcpy(dst_data + i * step, src_data + j * step, sizeof(float) * step);
 266     }
 267 }
 268
 269 void split_points(const std::vector<int>& ids, std::vector<int>& rois_per_level, const int levels_num) {
 270     rois_per_level.clear();
 271     rois_per_level.resize(levels_num, 0);
 272     for (size_t i = 0; i < ids.size(); ++i) {
 273         assert(0 <= ids[i] && ids[i] < levels_num);
 274         rois_per_level[ids[i]]++;
 275     }
 276     for (int i = 1; i < levels_num; ++i) {
 277         rois_per_level[i] += rois_per_level[i - 1];
 278     }
 279     rois_per_level.insert(rois_per_level.begin(), 0);
 280 }
 281
 282
 283 void reorder_rois(const float *rois, const int* ids, int* mapping, const int rois_num,
 284                   float * reordered_rois, std::vector<int>& rois_per_level, const int levels_num) {
 285     rois_per_level.clear();
 286     rois_per_level.resize(levels_num, 0);
 287     for (int i = 0; i < rois_num; ++i) {
 288         assert(0 <= ids[i] && ids[i] < levels_num);
 289         rois_per_level[ids[i]]++;
 290     }
 291     for (int i = 1; i < levels_num; ++i) {
 292         rois_per_level[i] += rois_per_level[i - 1];
 293     }
 294     rois_per_level.insert(rois_per_level.begin(), 0);
 295
 296     std::vector<int> level_counter = rois_per_level;
 297
 298     for (int i = 0; i < rois_num; ++i) {
 299         const int level = ids[i];
 300         assert(level < levels_num);
 301         const int j = level_counter[level];
 302         assert(0 <= j && j < rois_num);
 303         reordered_rois[j * 4 + 0] = rois[i * 4 + 0];
 304         reordered_rois[j * 4 + 1] = rois[i * 4 + 1];
 305         reordered_rois[j * 4 + 2] = rois[i * 4 + 2];
 306         reordered_rois[j * 4 + 3] = rois[i * 4 + 3];
 307         level_counter[level]++;
 308     }
 309 }
 310
 311 class ExperimentalDetectronROIFeatureExtractorImpl: public ExtLayerBase {
 312 private:
 313     const int INPUT_ROIS {0};
 314     const int INPUT_FEATURES_START {1};
 315
 316     const int OUTPUT_ROI_FEATURES {0};
 317     const int OUTPUT_ROIS {1};
 318
 319 public:
 320     explicit ExperimentalDetectronROIFeatureExtractorImpl(const CNNLayer* layer) {
 321         try {
 322             output_dim_ = layer->GetParamAsInt("output_size");
 323             pyramid_scales_ = layer->GetParamAsInts("pyramid_scales");
 324             sampling_ratio_ = layer->GetParamAsInt("sampling_ratio");
 325             pooled_height_ = output_dim_;
 326             pooled_width_ = output_dim_;
 327
 328             std::vector<DataConfigurator> inputs_layouts(layer->insData.size(), DataConfigurator(ConfLayout::PLN));
 329             std::vector<DataConfigurator> outputs_layouts(layer->outData.size(), DataConfigurator(ConfLayout::PLN));
 330             addConfig(layer, inputs_layouts, outputs_layouts);
 331         } catch (InferenceEngine::details::InferenceEngineException &ex) {
 332             errorMsg = ex.what();
 333         }
 334     }
 335
 336     StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,
 337                        ResponseDesc *resp) noexcept override {
 338         const int levels_num = inputs.size() - INPUT_FEATURES_START;
 339         const int num_rois = inputs[INPUT_ROIS]->getTensorDesc().getDims()[0];
 340         const int channels_num = inputs[INPUT_FEATURES_START]->getTensorDesc().getDims()[1];
 341         const int feaxels_per_roi = pooled_height_ * pooled_width_ * channels_num;
 342
 343         auto *input_rois = inputs[INPUT_ROIS]->buffer().as<const float *>();
 344         auto *output_rois_features = outputs[OUTPUT_ROI_FEATURES]->buffer().as<float *>();
 345         float *output_rois = nullptr;
 346         if (OUTPUT_ROIS < static_cast<int>(outputs.size())) {
 347             output_rois = outputs[OUTPUT_ROIS]->buffer().as<float *>();
 348         }
 349
 350         std::vector<int> level_ids(num_rois, 0);
 351         redistribute_rois(input_rois, reinterpret_cast<int *>(&level_ids[0]), num_rois, levels_num);
 352
 353         std::vector<float> reordered_rois(4 * num_rois, 0);
 354         std::vector<int> original_rois_mapping(num_rois, 0);
 355         reorder(input_rois, &level_ids[0], num_rois, 4, &reordered_rois[0], &original_rois_mapping[0]);
 356
 357         std::vector<int> rois_per_level;
 358         split_points(level_ids, rois_per_level, levels_num + 1);
 359
 360         std::vector<float> output_rois_features_temp(feaxels_per_roi * num_rois, 0);
 361         for (int i = 0; i < levels_num; ++i) {
 362             const int level_rois_offset = rois_per_level[i];
 363             const int level_rois_num = rois_per_level[i + 1] - level_rois_offset;
 364             if (level_rois_num > 0) {
 365                 auto *featuremap = inputs[INPUT_FEATURES_START + i]->buffer().as<const float *>();
 366                 const int featuremap_height = inputs[INPUT_FEATURES_START + i]->getTensorDesc().getDims()[2];
 367                 const int featuremap_width = inputs[INPUT_FEATURES_START + i]->getTensorDesc().getDims()[3];
 368                 ROIAlignForward_cpu_kernel<float>(feaxels_per_roi * level_rois_num,
 369                     featuremap,
 370                     1.0f / pyramid_scales_[i],
 371                     channels_num,
 372                     featuremap_height,
 373                     featuremap_width,
 374                     pooled_height_,
 375                     pooled_width_,
 376                     sampling_ratio_,
 377                     &reordered_rois[4 * level_rois_offset],
 378                     &output_rois_features_temp[feaxels_per_roi * level_rois_offset]);
 379             }
 380         }
 381
 382         std::vector<int> dummy_mapping(num_rois, 0);
 383         reorder(&output_rois_features_temp[0], &original_rois_mapping[0], num_rois, feaxels_per_roi,
 384                 output_rois_features, &dummy_mapping[0]);
 385         if (output_rois != nullptr) {
 386             std::memcpy(output_rois, input_rois, 4 * num_rois * sizeof(float));
 387         }
 388
 389         return OK;
 390     }
 391
 392 private:
 393     int output_dim_ = 0;
 394     int pooled_height_ = 0;
 395     int pooled_width_ = 0;
 396     std::vector<int> pyramid_scales_;
 397     int sampling_ratio_ = 0;
 398
 399     int channels = 0;
 400     int height = 0;
 401     int width = 0;
 402
 403     int nn = 0;
 404     int nc = 0;
 405     int nh = 0;
 406     int nw = 0;
 407 };
 408
 409 REG_FACTORY_FOR(ImplFactory<ExperimentalDetectronROIFeatureExtractorImpl>, ExperimentalDetectronROIFeatureExtractor);
 410
 411 }  // namespace Cpu
 412 }  // namespace Extensions
 413 }  // namespace InferenceEngine