// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <algorithm>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

// Careful reader, don't worry -- it is not the whole OpenCV,
// it is just a single stand-alone component of it
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/util/util.hpp>
#include <opencv2/gapi/fluid/gfluidkernel.hpp>  // GFluidOutputRois

#include "ie_input_info.hpp"
#include "ie_preprocess_gapi.hpp"
#include "ie_preprocess_gapi_kernels.hpp"
#include "ie_parallel.hpp"

namespace InferenceEngine {
namespace {
namespace G {
    struct Strides {int N; int C; int H; int W;};
    struct Dims    {int N; int C; int H; int W;};
    struct Desc    {Dims d; Strides s;};
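
    // Both Dims and Strides are kept in N,C,H,W order regardless of the blob
    // layout; strides are expressed in elements, not bytes (callers multiply
    // by blob->element_size() where needed).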

    void fix_strides_nhwc(const Dims &d, Strides &s) {
        // If the reported W stride is larger than C, the strides do not
        // describe a dense NHWC buffer (where the W stride equals C);
        // recompute dense NHWC element strides from the dimensions
        if (s.W > d.C) {
            s.C = 1;
            s.W = s.C*d.C;
            s.H = s.W*d.W;
            s.N = s.H*d.H;
        }
    }

    Desc decompose(Blob::Ptr &blob) {
        const auto& ie_desc     = blob->getTensorDesc();
        const auto& ie_blk_desc = ie_desc.getBlockingDesc();
        const auto& ie_dims     = ie_desc.getDims();
        const auto& ie_strides  = ie_blk_desc.getStrides();

        // Dimensions are always reported in N,C,H,W order
        Dims d = {
            static_cast<int>(ie_dims[0]),
            static_cast<int>(ie_dims[1]),
            static_cast<int>(ie_dims[2]),
            static_cast<int>(ie_dims[3])
        };

        // Strides are reported in the blob's blocking order, so remap NHWC
        // strides into the N,C,H,W fields
        Strides s = {
            static_cast<int>(ie_strides[0]),
            static_cast<int>(blob->layout() == NHWC ? ie_strides[3] : ie_strides[1]),
            static_cast<int>(blob->layout() == NHWC ? ie_strides[1] : ie_strides[2]),
            static_cast<int>(blob->layout() == NHWC ? ie_strides[2] : ie_strides[3]),
        };

        if (blob->layout() == NHWC) fix_strides_nhwc(d, s);

        return Desc{d, s};
    }
}  // namespace G
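
// Illustrative example (not part of the original flow): a dense NHWC U8 blob
// with dims N=1, C=3, H=4, W=4 reports blocking strides {48, 12, 3, 1} in its
// blocked (N,H,W,C) order; G::decompose() remaps them to the N,C,H,W element
// strides {48, 1, 12, 3}, and fix_strides_nhwc() leaves them unchanged since
// the W stride already equals C.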

inline int get_cv_depth(const InferenceEngine::TensorDesc &ie_desc) {
    switch (ie_desc.getPrecision()) {
    case Precision::U8:   return CV_8U;
    case Precision::FP32: return CV_32F;
    default: THROW_IE_EXCEPTION << "Unsupported data type";
    }
}

std::vector<std::vector<cv::gapi::own::Mat>> bind_to_blob(Blob::Ptr &blob, int batch_size) {
    if (batch_size <= 0) {
        return {};
    }

    const auto& ie_desc     = blob->getTensorDesc();
    const auto& ie_desc_blk = ie_desc.getBlockingDesc();
    const auto  desc        = G::decompose(blob);
    const auto  cv_depth    = get_cv_depth(ie_desc);
    const auto  stride      = desc.s.H*blob->element_size();
    const auto  planeSize   = cv::gapi::own::Size(desc.d.W, desc.d.H);
    // Note: operating with strides (desc.s) rather than dimensions (desc.d) is vital for ROI
    // blobs (the data buffer is shared, but dimensions differ since ROI != original image)
    const auto batch_offset = desc.s.N * blob->element_size();

    std::vector<std::vector<cv::gapi::own::Mat>> result(batch_size);

    uint8_t* blob_ptr = static_cast<uint8_t*>(blob->buffer());
    blob_ptr += blob->element_size()*ie_desc_blk.getOffsetPadding();

    for (int i = 0; i < batch_size; ++i) {
        uint8_t* curr_data_ptr = blob_ptr + i * batch_offset;

        std::vector<cv::gapi::own::Mat> planes;
        if (blob->layout() == NHWC) {
            // interleaved image: a single multi-channel Mat covers all channels
            planes.emplace_back(planeSize.height, planeSize.width, CV_MAKETYPE(cv_depth, desc.d.C),
                curr_data_ptr, stride);
        } else {  // NCHW: one single-channel Mat per plane
            const auto planeType = CV_MAKETYPE(cv_depth, 1);
            for (int ch = 0; ch < desc.d.C; ch++) {
                cv::gapi::own::Mat plane(planeSize.height, planeSize.width, planeType,
                    curr_data_ptr + ch*desc.s.C*blob->element_size(), stride);
                planes.emplace_back(plane);
            }
        }

        result[i] = std::move(planes);
    }
    return result;
}
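
// Usage sketch (illustrative, not from the original file): for a dense planar
// NCHW U8 blob of dims {1, 3, 4, 4}, bind_to_blob(blob, 1) yields one batch
// entry with three 4x4 single-channel Mats aliasing the blob memory at byte
// offsets 0, 16 and 32 -- no pixel data is copied.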

template<typename... Ts, int... IIs>
std::vector<cv::GMat> to_vec_impl(std::tuple<Ts...> &&gmats, cv::detail::Seq<IIs...>) {
    return { std::get<IIs>(gmats)... };
}

template<typename... Ts>
std::vector<cv::GMat> to_vec(std::tuple<Ts...> &&gmats) {
    return to_vec_impl(std::move(gmats), typename cv::detail::MkSeq<sizeof...(Ts)>::type());
}
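
// Example (illustrative): gapi::Split3::on(m) returns a std::tuple of three
// GMats; to_vec(gapi::Split3::on(m)) flattens it into a std::vector<cv::GMat>
// of size 3, so all channel counts can be handled uniformly below.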

cv::GComputation buildGraph(const G::Desc &in_desc,
                            const G::Desc &out_desc,
                            InferenceEngine::Layout in_layout,
                            InferenceEngine::Layout out_layout,
                            InferenceEngine::ResizeAlgorithm algorithm,
                            int precision) {
    // Fast path for the most common case: interleaved 3-channel U8 input with
    // bilinear resize, handled by the dedicated ScalePlanes kernel
    if ((in_layout == NHWC) && (in_desc.d.C == 3) && (precision == CV_8U) && (algorithm == RESIZE_BILINEAR)) {
        const auto input_sz = cv::gapi::own::Size(in_desc.d.W, in_desc.d.H);
        const auto scale_sz = cv::gapi::own::Size(out_desc.d.W, out_desc.d.H);
        std::vector<cv::GMat> inputs(1);
        std::vector<cv::GMat> outputs;

        if (out_layout == NHWC) {
            outputs.resize(1);
            auto planes = to_vec(gapi::ScalePlanes::on(inputs[0], precision, input_sz, scale_sz, cv::INTER_LINEAR));
            outputs[0] = gapi::Merge3::on(planes[0], planes[1], planes[2]);
        } else {
            outputs = to_vec(gapi::ScalePlanes::on(inputs[0], precision, input_sz, scale_sz, cv::INTER_LINEAR));
        }
        return cv::GComputation(inputs, outputs);
    }

    std::vector<cv::GMat> inputs;  // 1 element if NHWC, C elements if NCHW
    std::vector<cv::GMat> planes;

    // Convert the input blob to planar format, if it is not planar already
    if (in_layout == NHWC) {
        // an interleaved input blob needs to be decomposed into distinct planes
        inputs.resize(1);
        switch (in_desc.d.C) {
        case 1: planes = { inputs[0] };                       break;
        case 2: planes = to_vec(gapi::Split2::on(inputs[0])); break;
        case 3: planes = to_vec(gapi::Split3::on(inputs[0])); break;
        case 4: planes = to_vec(gapi::Split4::on(inputs[0])); break;
        default:
            for (int chan = 0; chan < in_desc.d.C; chan++)
                planes.emplace_back(gapi::ChanToPlane::on(inputs[0], chan));
            break;
        }
    } else if (in_layout == NCHW) {
        // a planar blob can be passed to resize as-is
        inputs.resize(in_desc.d.C);
        planes = inputs;
    }

    // Resize every plane
    std::vector<cv::GMat> out_planes;
    const int interp_type = [](const ResizeAlgorithm &ar) {
        switch (ar) {
        case RESIZE_AREA:     return cv::INTER_AREA;
        case RESIZE_BILINEAR: return cv::INTER_LINEAR;
        default: THROW_IE_EXCEPTION << "Unsupported resize operation";
        }
    } (algorithm);
    const auto input_sz  = cv::gapi::own::Size(in_desc.d.W,  in_desc.d.H);
    const auto scale_sz  = cv::gapi::own::Size(out_desc.d.W, out_desc.d.H);
    // Partially apply ScalePlane::on so that only the input GMat varies per plane
    const auto scale_fcn = std::bind(&gapi::ScalePlane::on,
                                     std::placeholders::_1,
                                     precision,
                                     input_sz, scale_sz, interp_type);
    std::transform(planes.begin(), planes.end(), std::back_inserter(out_planes), scale_fcn);
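
    // Equivalently (illustrative only), the bind above behaves like
    //   [&](const cv::GMat &m) {
    //       return gapi::ScalePlane::on(m, precision, input_sz, scale_sz, interp_type);
    //   }
    // applied to every plane by std::transform.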

    // Convert to the expected layout, if required
    std::vector<cv::GMat> outputs;  // 1 element if NHWC, C elements if NCHW
    if (out_layout == NHWC) {
        outputs.resize(1);
        if      (out_desc.d.C == 1) outputs[0] = out_planes[0];
        else if (out_desc.d.C == 2) outputs[0] = gapi::Merge2::on(out_planes[0], out_planes[1]);
        else if (out_desc.d.C == 3) outputs[0] = gapi::Merge3::on(out_planes[0], out_planes[1], out_planes[2]);
        else if (out_desc.d.C == 4) outputs[0] = gapi::Merge4::on(out_planes[0], out_planes[1], out_planes[2], out_planes[3]);
        else THROW_IE_EXCEPTION << "Output channels >4 are not supported for HWC [by G-API]";
    } else {
        outputs = out_planes;
    }

    return cv::GComputation(inputs, outputs);
}
}  // anonymous namespace

InferenceEngine::PreprocEngine::PreprocEngine() : _lastComp(parallel_get_max_threads()) {}

InferenceEngine::PreprocEngine::Update InferenceEngine::PreprocEngine::needUpdate(const CallDesc &newCallOrig) const {
    // Given our knowledge about Fluid, a full graph rebuild is required
    // if and only if:
    // 0. this is the first call ever;
    // 1. precision has changed (affects kernel versions);
    // 2. layout has changed (affects graph topology);
    // 3. algorithm has changed (affects kernel versions);
    // 4. dimensions have changed from downscale to upscale or
    //    vice-versa, if interpolation is AREA.
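    // Otherwise a cheaper reshape is enough (e.g. the same precision, layout
    // and algorithm called with a different input resolution), or no update
    // at all.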
    if (!_lastCall) {
        return Update::REBUILD;
    }

    BlobDesc last_in;
    BlobDesc last_out;
    ResizeAlgorithm last_algo = ResizeAlgorithm::NO_RESIZE;
    std::tie(last_in, last_out, last_algo) = *_lastCall;

    CallDesc newCall = newCallOrig;
    BlobDesc new_in;
    BlobDesc new_out;
    ResizeAlgorithm new_algo = ResizeAlgorithm::NO_RESIZE;
    std::tie(new_in, new_out, new_algo) = newCall;

    // Declare two empty size vectors per call
    SizeVector last_in_size;
    SizeVector last_out_size;
    SizeVector new_in_size;
    SizeVector new_out_size;

    // Swap them with the descriptors' size vectors; after this, last_in/last_out
    // (and new_in/new_out) contain everything but the sizes
    last_in_size .swap(std::get<2>(last_in));
    last_out_size.swap(std::get<2>(last_out));
    new_in_size  .swap(std::get<2>(new_in));
    new_out_size .swap(std::get<2>(new_out));
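
    // For example (assuming BlobDesc is a (precision, layout, dims) tuple, as
    // the std::get<2> above suggests): BlobDesc{U8, NHWC, {1,3,224,224}} becomes
    // BlobDesc{U8, NHWC, {}}, with {1,3,224,224} moved into the matching *_size
    // vector, so the size-independent parts can be compared separately below.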

    // If anything (except input sizes) changed, a rebuild is required
    if (last_in != new_in || last_out != new_out || last_algo != new_algo) {
        return Update::REBUILD;
    }

    // If the output sizes changed, the graph should be regenerated (the resize
    // ratio is taken from the parameters)
    if (last_out_size != new_out_size) {
        return Update::REBUILD;
    }

    // If interpolation is AREA and the sizes switch between upscale and
    // downscale modes, a rebuild is required
    if (last_algo == RESIZE_AREA) {
        const auto is_upscale = [](const SizeVector &in, const SizeVector &out) -> bool {
            return in[2] < out[2] || in[3] < out[3];
        };
        const bool old_upscale = is_upscale(last_in_size, last_out_size);
        const bool new_upscale = is_upscale(new_in_size, new_out_size);
        if (old_upscale != new_upscale) {
            return Update::REBUILD;
        }
    }

    // If only the sizes changed (considering the above exception),
    // a reshape is enough
    if (last_in_size != new_in_size) {
        return Update::RESHAPE;
    }

    return Update::NOTHING;
}

bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob::Ptr &outBlob,
        const ResizeAlgorithm &algorithm, bool omp_serial, int batch_size) {
    // G-API preprocessing can be switched off via the USE_GAPI environment variable
    static const bool NO_GAPI = [](const char *str) -> bool {
        std::string var(str ? str : "");
        return var == "N" || var == "NO" || var == "OFF" || var == "0";
    } (std::getenv("USE_GAPI"));

    if (NO_GAPI)
        return false;
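
    // Usage note (illustrative): running with USE_GAPI=NO (or N/OFF/0) in the
    // environment makes this function return false, and the caller is then
    // expected to fall back to a non-G-API preprocessing path.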

    const auto &in_desc_ie  = inBlob->getTensorDesc();
    const auto &out_desc_ie = outBlob->getTensorDesc();
    auto supports_layout = [](Layout l) { return l == Layout::NCHW || l == Layout::NHWC; };
    if (!supports_layout(inBlob->layout()) || !supports_layout(outBlob->layout())
        || in_desc_ie.getDims().size() != 4 || out_desc_ie.getDims().size() != 4) {
        THROW_IE_EXCEPTION << "Preprocessing supports NCHW/NHWC only";
    }

    const auto
        in_desc  = G::decompose(inBlob),
        out_desc = G::decompose(outBlob);

    // According to IE's current design, the input blob batch size _must_ match the
    // network's expected batch size, even if the actual processing batch size
    // (set on the infer request) is different.
    if (in_desc.d.N != out_desc.d.N) {
        THROW_IE_EXCEPTION << "Input blob batch size is invalid: (input blob) "
                           << in_desc.d.N << " != " << out_desc.d.N << " (expected by network)";
    }

    // Sanity-check batch_size
    if (batch_size > in_desc.d.N || batch_size > out_desc.d.N) {
        THROW_IE_EXCEPTION << "Provided batch size is invalid: (provided) "
                           << batch_size << " > " << out_desc.d.N << " (expected by network)";
    }

    // CallDesc doesn't change within a batch
    CallDesc thisCall = CallDesc{ BlobDesc{ in_desc_ie.getPrecision(),
                                            inBlob->layout(),
                                            in_desc_ie.getDims() },
                                  BlobDesc{ out_desc_ie.getPrecision(),
                                            outBlob->layout(),
                                            out_desc_ie.getDims() },
                                  algorithm };
    const Update update = needUpdate(thisCall);

    Opt<cv::GComputation> _lastComputation;
    if (Update::REBUILD == update || Update::RESHAPE == update) {
        _lastCall = cv::util::make_optional(std::move(thisCall));

        if (Update::REBUILD == update) {
            // rebuild the whole graph
            IE_PROFILING_AUTO_SCOPE_TASK(_perf_graph_building);
            _lastComputation = cv::util::make_optional(buildGraph(in_desc,
                                                                  out_desc,
                                                                  inBlob->layout(),
                                                                  outBlob->layout(),
                                                                  algorithm,
                                                                  get_cv_depth(in_desc_ie)));
        }
    }

    auto batched_input_plane_mats  = bind_to_blob(inBlob,  batch_size);
    auto batched_output_plane_mats = bind_to_blob(outBlob, batch_size);

    const int thread_num =
#if IE_THREAD == IE_THREAD_OMP
        omp_serial ? 1 :    // disable threading for OpenMP if requested
#endif
        0;                  // use all available threads

    // to suppress unused-variable warnings
    (void)omp_serial;

    // Split the whole graph into `total_slices` slices, where `total_slices` is
    // provided by the parallel runtime and assumed to be the number of threads
    // used. However, the actual number of threads is not guaranteed to match,
    // so it is possible that all slices end up processed by the same thread.
    parallel_nt_static(thread_num, [&, this](int slice_n, const int total_slices) {
        IE_PROFILING_AUTO_SCOPE_TASK(_perf_exec_tile);

        auto& compiled = _lastComp[slice_n];
        if (Update::REBUILD == update || Update::RESHAPE == update) {
            // need to compile (or reshape) our own object for this particular ROI
            IE_PROFILING_AUTO_SCOPE_TASK(_perf_graph_compiling);

            using cv::gapi::own::Rect;

            // the current design implies all images in a batch are equal
            const auto& input_plane_mats  = batched_input_plane_mats[0];
            const auto& output_plane_mats = batched_output_plane_mats[0];

            auto lines_per_thread = output_plane_mats[0].rows / total_slices;
            const auto remainder  = output_plane_mats[0].rows - total_slices * lines_per_thread;

            // `remainder` is how many slices must process one additional row; these
            // extra rows must also be accounted for in the rect's Y coordinate:
            int roi_y = 0;
            if (slice_n < remainder) {
                lines_per_thread++;                  // 1 additional row
                roi_y = slice_n * lines_per_thread;  // all previous ROIs have lines+1 rows
            } else {
                // `remainder` ROIs have lines+1 rows; the rest prior to slice_n have `lines` rows
                roi_y = remainder * (lines_per_thread + 1) + (slice_n - remainder) * lines_per_thread;
            }
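
            // Worked example (illustrative): with 10 output rows and total_slices = 4,
            // lines_per_thread = 2 and remainder = 2, so the four slices cover
            // 3, 3, 2 and 2 rows starting at y = 0, 3, 6 and 8 respectively.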

            auto roi = Rect{0, roi_y, output_plane_mats[0].cols, lines_per_thread};
            std::vector<Rect> rois(output_plane_mats.size(), roi);

            // TODO: make the ROI a runtime argument to avoid recompilations
            auto args = cv::compile_args(gapi::preprocKernels(), cv::GFluidOutputRois{std::move(rois)});
            if (Update::REBUILD == update) {
                auto& computation = _lastComputation.value();
                compiled = computation.compile(descr_of(input_plane_mats), std::move(args));
            } else {
                compiled.reshape(descr_of(input_plane_mats), std::move(args));
            }
        }

        for (int i = 0; i < batch_size; ++i) {
            const std::vector<cv::gapi::own::Mat>& input_plane_mats  = batched_input_plane_mats[i];
            std::vector<cv::gapi::own::Mat>&       output_plane_mats = batched_output_plane_mats[i];

            cv::GRunArgs  call_ins;
            cv::GRunArgsP call_outs;
            for (const auto & m : input_plane_mats) { call_ins.emplace_back(m); }
            for (auto & m : output_plane_mats)      { call_outs.emplace_back(&m); }

            IE_PROFILING_AUTO_SCOPE_TASK(_perf_exec_graph);
            compiled(std::move(call_ins), std::move(call_outs));
        }
    });

    return true;
}
}  // namespace InferenceEngine