inference-engine/src/inference_engine/ie_preprocess_gapi.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #include <utility>
   6 #include <vector>
   7 #include <algorithm>
   8 #include <tuple>
   9 #include <string>
  10
  11 // Careful reader, don't worry -- it is not the whole OpenCV,
  12 // it is just a single stand-alone component of it
  13 #include <opencv2/gapi.hpp>
  14 #include <opencv2/gapi/util/util.hpp>
  15
  16 #include "ie_blob.h"
  17 #include "ie_input_info.hpp"
  18 #include "ie_preprocess_gapi.hpp"
  19 #include "ie_preprocess_gapi_kernels.hpp"
  20
  21 #include "ie_parallel.hpp"
  22
  23 #include <opencv2/gapi/fluid/gfluidkernel.hpp>  // GFluidOutputRois
  24
  25 namespace InferenceEngine {
  26 namespace {
  27 namespace G {
  28     struct Strides {int N; int C; int H; int W;};
  29     struct Dims    {int N; int C; int H; int W;};
  30     struct Desc    {Dims d; Strides s;};
  31
  32     void fix_strides_nhwc(const Dims &d, Strides &s) {
  33         if (s.W > d.C) {
  34             s.C = 1;
  35             s.W = s.C*d.C;
  36             s.H = s.W*d.W;
  37             s.N = s.H*d.H;
  38         }
  39     }
  40
  41     Desc decompose(Blob::Ptr &blob) {
  42         const auto& ie_desc     = blob->getTensorDesc();
  43         const auto& ie_blk_desc = ie_desc.getBlockingDesc();
  44         const auto& ie_dims     = ie_desc.getDims();
  45         const auto& ie_strides  = ie_blk_desc.getStrides();
  46
  47         Dims d = {
  48             static_cast<int>(ie_dims[0]),
  49             static_cast<int>(ie_dims[1]),
  50             static_cast<int>(ie_dims[2]),
  51             static_cast<int>(ie_dims[3])
  52         };
  53
  54         Strides s = {
  55             static_cast<int>(ie_strides[0]),
  56             static_cast<int>(blob->layout() == NHWC ? ie_strides[3] : ie_strides[1]),
  57             static_cast<int>(blob->layout() == NHWC ? ie_strides[1] : ie_strides[2]),
  58             static_cast<int>(blob->layout() == NHWC ? ie_strides[2] : ie_strides[3]),
  59         };
  60
  61         if (blob->layout() == NHWC) fix_strides_nhwc(d, s);
  62
  63         return Desc{d, s};
  64     }
  65 }  // namespace G
  66
  67 inline int get_cv_depth(const InferenceEngine::TensorDesc &ie_desc) {
  68     switch (ie_desc.getPrecision()) {
  69     case Precision::U8:   return CV_8U;
  70     case Precision::FP32: return CV_32F;
  71     default: THROW_IE_EXCEPTION << "Unsupported data type";
  72     }
  73 }
  74
  75 std::vector<std::vector<cv::gapi::own::Mat>> bind_to_blob(Blob::Ptr &blob, int batch_size) {
  76     if (batch_size <= 0) {
  77         return {};
  78     }
  79
  80     const auto& ie_desc     = blob->getTensorDesc();
  81     const auto& ie_desc_blk = ie_desc.getBlockingDesc();
  82     const auto     desc     = G::decompose(blob);
  83     const auto cv_depth     = get_cv_depth(ie_desc);
  84     const auto stride       = desc.s.H*blob->element_size();
  85     const auto planeSize    = cv::gapi::own::Size(desc.d.W, desc.d.H);
  86     // Note: operating with strides (desc.s) rather than dimensions (desc.d) which is vital for ROI
  87     //       blobs (data buffer is shared but dimensions are different due to ROI != original image)
  88     const auto batch_offset = desc.s.N * blob->element_size();
  89
  90     std::vector<std::vector<cv::gapi::own::Mat>> result(batch_size);
  91
  92     uint8_t* blob_ptr = static_cast<uint8_t*>(blob->buffer());
  93     blob_ptr += blob->element_size()*ie_desc_blk.getOffsetPadding();
  94
  95     for (int i = 0; i < batch_size; ++i) {
  96         uint8_t* curr_data_ptr = blob_ptr + i * batch_offset;
  97
  98         std::vector<cv::gapi::own::Mat> planes;
  99         if (blob->layout() == NHWC) {
 100             planes.emplace_back(planeSize.height, planeSize.width, CV_MAKETYPE(cv_depth, desc.d.C),
 101                 curr_data_ptr, stride);
 102         } else {  // NCHW
 103             const auto planeType = CV_MAKETYPE(cv_depth, 1);
 104             for (size_t ch = 0; ch < desc.d.C; ch++) {
 105                 cv::gapi::own::Mat plane(planeSize.height, planeSize.width, planeType,
 106                     curr_data_ptr + ch*desc.s.C*blob->element_size(), stride);
 107                 planes.emplace_back(plane);
 108             }
 109         }
 110
 111         result[i] = std::move(planes);
 112     }
 113     return result;
 114 }
 115
 116 template<typename... Ts, int... IIs>
 117 std::vector<cv::GMat> to_vec_impl(std::tuple<Ts...> &&gmats, cv::detail::Seq<IIs...>) {
 118     return { std::get<IIs>(gmats)... };
 119 }
 120
 121 template<typename... Ts>
 122 std::vector<cv::GMat> to_vec(std::tuple<Ts...> &&gmats) {
 123     return to_vec_impl(std::move(gmats), typename cv::detail::MkSeq<sizeof...(Ts)>::type());
 124 }
 125
 126 cv::GComputation buildGraph(const G::Desc &in_desc,
 127                             const G::Desc &out_desc,
 128                             InferenceEngine::Layout in_layout,
 129                             InferenceEngine::Layout out_layout,
 130                             InferenceEngine::ResizeAlgorithm algorithm,
 131                             int precision) {
 132     if ((in_layout == NHWC) && (in_desc.d.C == 3) && (precision == CV_8U) && (algorithm == RESIZE_BILINEAR)) {
 133         const auto input_sz = cv::gapi::own::Size(in_desc.d.W, in_desc.d.H);
 134         const auto scale_sz = cv::gapi::own::Size(out_desc.d.W, out_desc.d.H);
 135         std::vector<cv::GMat> inputs(1);
 136         std::vector<cv::GMat> outputs;
 137
 138         if (out_layout == NHWC) {
 139             outputs.resize(1);
 140             auto planes = to_vec(gapi::ScalePlanes::on(inputs[0], precision, input_sz, scale_sz, cv::INTER_LINEAR));
 141             outputs[0] = gapi::Merge3::on(planes[0], planes[1], planes[2]);
 142         } else {
 143             outputs = to_vec(gapi::ScalePlanes::on(inputs[0], precision, input_sz, scale_sz, cv::INTER_LINEAR));
 144         }
 145         return cv::GComputation(inputs, outputs);
 146     }
 147
 148     std::vector<cv::GMat> inputs;  // 1 element if NHWC, C elements if NCHW
 149     std::vector<cv::GMat> planes;
 150
 151     // Convert input blob to planar format, if it is not yet planar
 152     if (in_layout == NHWC) {
 153         // interleaved input blob needs to be decomposed into distinct planes
 154         inputs.resize(1);
 155         switch (in_desc.d.C) {
 156         case 1: planes = { inputs[0] };                       break;
 157         case 2: planes = to_vec(gapi::Split2::on(inputs[0])); break;
 158         case 3: planes = to_vec(gapi::Split3::on(inputs[0])); break;
 159         case 4: planes = to_vec(gapi::Split4::on(inputs[0])); break;
 160         default:
 161             for (int chan = 0; chan < in_desc.d.C; chan++)
 162                 planes.emplace_back(gapi::ChanToPlane::on(inputs[0], chan));
 163             break;
 164         }
 165     } else if (in_layout == NCHW) {
 166         // planar blob can be passed to resize as-is
 167         inputs.resize(in_desc.d.C);
 168         planes = inputs;
 169     }
 170
 171     // Resize every plane
 172     std::vector<cv::GMat> out_planes;
 173     const int interp_type = [](const ResizeAlgorithm &ar) {
 174         switch (ar) {
 175         case RESIZE_AREA:     return cv::INTER_AREA;
 176         case RESIZE_BILINEAR: return cv::INTER_LINEAR;
 177         default: THROW_IE_EXCEPTION << "Unsupported resize operation";
 178         }
 179     } (algorithm);
 180     const auto input_sz  = cv::gapi::own::Size(in_desc.d.W, in_desc.d.H);
 181     const auto scale_sz  = cv::gapi::own::Size(out_desc.d.W, out_desc.d.H);
 182     const auto scale_fcn = std::bind(&gapi::ScalePlane::on,
 183                                      std::placeholders::_1,
 184                                      precision,
 185                                      input_sz, scale_sz, interp_type);
 186     std::transform(planes.begin(), planes.end(), std::back_inserter(out_planes), scale_fcn);
 187
 188     // Convert to expected layout, if required
 189     std::vector<cv::GMat> outputs;  // 1 element if NHWC, C elements if NCHW
 190     if (out_layout == NHWC) {
 191         outputs.resize(1);
 192         if      (out_desc.d.C == 1) outputs[0] = out_planes[0];
 193         else if (out_desc.d.C == 2) outputs[0] = gapi::Merge2::on(out_planes[0], out_planes[1]);
 194         else if (out_desc.d.C == 3) outputs[0] = gapi::Merge3::on(out_planes[0], out_planes[1], out_planes[2]);
 195         else if (out_desc.d.C == 4) outputs[0] = gapi::Merge4::on(out_planes[0], out_planes[1], out_planes[2], out_planes[3]);
 196         else    THROW_IE_EXCEPTION << "Output channels >4 are not supported for HWC [by G-API]";
 197     } else {
 198         outputs = out_planes;
 199     }
 200
 201     return cv::GComputation(inputs, outputs);
 202 }
 203 }  // anonymous namespace
 204
 205 InferenceEngine::PreprocEngine::PreprocEngine() : _lastComp(parallel_get_max_threads()) {}
 206
 207 InferenceEngine::PreprocEngine::Update InferenceEngine::PreprocEngine::needUpdate(const CallDesc &newCallOrig) const {
 208     // Given our knowledge about Fluid, full graph rebuild is required
 209     // if and only if:
 210     // 0. This is the first call ever
 211     // 1. precision has changed (affects kernel versions)
 212     // 2. layout has changed (affects graph topology)
 213     // 3. algorithm has changed (affects kernel version)
 214     // 4. dimensions have changed from downscale to upscale or
 215     // vice-versa if interpolation is AREA.
 216     if (!_lastCall) {
 217         return Update::REBUILD;
 218     }
 219
 220     BlobDesc last_in;
 221     BlobDesc last_out;
 222     ResizeAlgorithm last_algo = ResizeAlgorithm::NO_RESIZE;
 223     std::tie(last_in, last_out, last_algo) = *_lastCall;
 224
 225     CallDesc newCall = newCallOrig;
 226     BlobDesc new_in;
 227     BlobDesc new_out;
 228     ResizeAlgorithm new_algo = ResizeAlgorithm::NO_RESIZE;
 229     std::tie(new_in, new_out, new_algo) = newCall;
 230
 231     // Declare two empty vectors per each call
 232     SizeVector last_in_size;
 233     SizeVector last_out_size;
 234     SizeVector new_in_size;
 235     SizeVector new_out_size;
 236
 237     // Now swap it with in/out descriptor vectors
 238     // Now last_in/last_out would contain everything but sizes
 239     last_in_size.swap(std::get<2>(last_in));
 240     last_out_size.swap(std::get<2>(last_out));
 241     new_in_size.swap(std::get<2>(new_in));
 242     new_out_size.swap(std::get<2>(new_out));
 243
 244     // If anything (except input sizes) changes, rebuild is required
 245     if (last_in != new_in || last_out != new_out || last_algo != new_algo) {
 246         return Update::REBUILD;
 247     }
 248
 249     // If output sizes change, graph should be regenerated (resize
 250     // ratio is taken from parameters)
 251     if (last_out_size != new_out_size) {
 252         return Update::REBUILD;
 253     }
 254
 255     // If interpolation is AREA and sizes change upscale/downscale
 256     // mode, rebuild is required
 257     if (last_algo == RESIZE_AREA) {
 258         // 0123 == NCHW
 259         const auto is_upscale = [](const SizeVector &in, const SizeVector &out) -> bool {
 260             return in[2] < out[2] || in[3] < out[3];
 261         };
 262         const bool old_upscale = is_upscale(last_in_size, last_out_size);
 263         const bool new_upscale = is_upscale(new_in_size, new_out_size);
 264         if (old_upscale != new_upscale) {
 265             return Update::REBUILD;
 266         }
 267     }
 268
 269     // If only sizes changes (considering the above exception),
 270     // reshape is enough
 271     if (last_in_size != new_in_size) {
 272         return Update::RESHAPE;
 273     }
 274
 275     return Update::NOTHING;
 276 }
 277
 278 bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob::Ptr &outBlob,
 279         const ResizeAlgorithm &algorithm, bool omp_serial, int batch_size) {
 280     static const bool NO_GAPI = [](const char *str) -> bool {
 281         std::string var(str ? str : "");
 282         return var == "N" || var == "NO" || var == "OFF" || var == "0";
 283     } (std::getenv("USE_GAPI"));
 284
 285     if (NO_GAPI)
 286         return false;
 287
 288     const auto &in_desc_ie = inBlob->getTensorDesc();
 289     const auto &out_desc_ie = outBlob->getTensorDesc();
 290     auto supports_layout = [](Layout l) { return l == Layout::NCHW || l == Layout::NHWC; };
 291     if (!supports_layout(inBlob->layout()) || !supports_layout(outBlob->layout())
 292         || in_desc_ie.getDims().size() != 4 || out_desc_ie.getDims().size() != 4) {
 293         THROW_IE_EXCEPTION << "Preprocess support NCHW/NHWC only";
 294     }
 295
 296     const G::Desc
 297         in_desc = G::decompose(inBlob),
 298         out_desc = G::decompose(outBlob);
 299
 300     // according to the IE's current design, input blob batch size _must_ match networks's expected
 301     // batch size, even if the actual processing batch size (set on infer request) is different.
 302     if (in_desc.d.N != out_desc.d.N) {
 303         THROW_IE_EXCEPTION  << "Input blob batch size is invalid: (input blob) "
 304                             << in_desc.d.N << " != " << out_desc.d.N << " (expected by network)";
 305     }
 306
 307     // sanity check batch_size
 308     if (batch_size > in_desc.d.N || batch_size > out_desc.d.N) {
 309         THROW_IE_EXCEPTION  << "Provided batch size is invaid: (provided)"
 310                             << batch_size << " > " << out_desc.d.N << " (expected by network)";
 311     }
 312
 313     // CallDesc doesn't change within batch
 314     CallDesc thisCall = CallDesc{ BlobDesc{ in_desc_ie.getPrecision(),
 315                                             inBlob->layout(),
 316                                             in_desc_ie.getDims() },
 317                                   BlobDesc{ out_desc_ie.getPrecision(),
 318                                             outBlob->layout(),
 319                                             out_desc_ie.getDims() },
 320                                   algorithm };
 321     const Update update = needUpdate(thisCall);
 322
 323     Opt<cv::GComputation> _lastComputation;
 324     if (Update::REBUILD == update || Update::RESHAPE == update) {
 325         _lastCall = cv::util::make_optional(std::move(thisCall));
 326
 327         if (Update::REBUILD == update) {
 328             //  rebuild the graph
 329             IE_PROFILING_AUTO_SCOPE_TASK(_perf_graph_building);
 330             _lastComputation = cv::util::make_optional(buildGraph(in_desc,
 331                                                                   out_desc,
 332                                                                   inBlob->layout(),
 333                                                                   outBlob->layout(),
 334                                                                   algorithm,
 335                                                                   get_cv_depth(in_desc_ie)));
 336         }
 337     }
 338     auto batched_input_plane_mats  = bind_to_blob(inBlob, batch_size);
 339     auto batched_output_plane_mats = bind_to_blob(outBlob, batch_size);
 340
 341     const int thread_num =
 342             #if IE_THREAD == IE_THREAD_OMP
 343                 omp_serial ? 1 :    // disable threading for OpenMP if was asked for
 344             #endif
 345                 0;                  // use all available threads
 346
 347     // to suppress unused warnings
 348     (void)(omp_serial);
 349
 350     // Split the whole graph into `total_slices` slices, where
 351     // `total_slices` is provided by the parallel runtime and assumed
 352     // to be number of threads used.  However it is not guaranteed
 353     // that an actual number of threads will be as assumed, so it
 354     // possible that all slices are processed by the same thread.
 355     //
 356     parallel_nt_static(thread_num , [&, this](int slice_n, const int total_slices) {
 357         IE_PROFILING_AUTO_SCOPE_TASK(_perf_exec_tile);
 358
 359         auto& compiled = _lastComp[slice_n];
 360         if (Update::REBUILD == update || Update::RESHAPE == update) {
 361             //  need to compile (or reshape) own object for a particular ROI
 362             IE_PROFILING_AUTO_SCOPE_TASK(_perf_graph_compiling);
 363
 364             using cv::gapi::own::Rect;
 365
 366             // current design implies all images in batch are equal
 367             const auto& input_plane_mats = batched_input_plane_mats[0];
 368             const auto& output_plane_mats = batched_output_plane_mats[0];
 369
 370             auto lines_per_thread = output_plane_mats[0].rows / total_slices;
 371             const auto remainder = output_plane_mats[0].rows - total_slices * lines_per_thread;
 372
 373             // remainder shows how many threads must calculate 1 additional row. now these additions
 374             // must also be addressed in rect's Y coordinate:
 375             int roi_y = 0;
 376             if (slice_n < remainder) {
 377                 lines_per_thread++;  // 1 additional row
 378                 roi_y = slice_n * lines_per_thread;  // all previous rois have lines+1 rows
 379             } else {
 380                 // remainder rois have lines+1 rows, the rest prior to slice_n have lines rows
 381                 roi_y =
 382                     remainder * (lines_per_thread + 1) + (slice_n - remainder) * lines_per_thread;
 383             }
 384
 385             auto roi = Rect{0, roi_y, output_plane_mats[0].cols, lines_per_thread};
 386             std::vector<Rect> rois(output_plane_mats.size(), roi);
 387
 388             // TODO: make a ROI a runtime argument to avoid
 389             // recompilations
 390             auto args = cv::compile_args(gapi::preprocKernels(), cv::GFluidOutputRois{std::move(rois)});
 391             if (Update::REBUILD == update) {
 392                 auto& computation = _lastComputation.value();
 393                 compiled = computation.compile(descr_of(input_plane_mats), std::move(args));
 394             } else {
 395                 IE_ASSERT(compiled);
 396                 compiled.reshape(descr_of(input_plane_mats), std::move(args));
 397             }
 398         }
 399
 400         for (int i = 0; i < batch_size; ++i) {
 401             const std::vector<cv::gapi::own::Mat>& input_plane_mats = batched_input_plane_mats[i];
 402             std::vector<cv::gapi::own::Mat>& output_plane_mats = batched_output_plane_mats[i];
 403
 404             cv::GRunArgs call_ins;
 405             cv::GRunArgsP call_outs;
 406             for (const auto & m : input_plane_mats) { call_ins.emplace_back(m);}
 407             for (auto & m : output_plane_mats) { call_outs.emplace_back(&m);}
 408
 409             IE_PROFILING_AUTO_SCOPE_TASK(_perf_exec_graph);
 410             compiled(std::move(call_ins), std::move(call_outs));
 411         }
 412     });
 413
 414     return true;
 415 }
 416 }  // namespace InferenceEngine