// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "cpu_detector.hpp"
#include "blob_transform.hpp"
#include "ie_preprocess_data.hpp"
#include "ie_preprocess_data_sse42.hpp"
#include "ie_preprocess_gapi.hpp"

#include <algorithm>   // std::min, std::max
#include <cmath>       // std::floor, std::ceil, std::round
#include <cstdint>
#include <cstdlib>     // malloc, free
#include <cstring>     // memcpy, memset
namespace InferenceEngine {

namespace Resize {

template<typename data_t> static inline data_t saturate_cast(float res);
template<> inline float saturate_cast(float res) {
    return res;
}

template<> inline uint8_t saturate_cast(float res) {
    int ires = static_cast<int>((std::round)(res));
    return static_cast<uint8_t>((std::max)(0, (std::min)(255, ires)));
}
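
// resize_bilinear implements a separable, two-pass bilinear resize for a
// single NCHW blob: for each output row it first interpolates two source rows
// vertically into a temporary row (tptr), then interpolates that row
// horizontally into the destination. Integer source offsets (xofs/yofs) and
// fractional weights (alpha/beta) are precomputed once into the caller's
// scratch buffer, so the per-pixel loops do no coordinate math.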
template<typename data_t = float>
void resize_bilinear(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
    Border border = {BORDER_REPLICATE, 0};

    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    auto dwidth = static_cast<const int>(dstDims[3]);
    auto dheight = static_cast<const int>(dstDims[2]);
    auto swidth = static_cast<const int>(srcDims[3]);
    auto channels = static_cast<const int>(srcDims[1]);

    auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto origSrcW = src_strides[2];
    auto origSrcH = src_strides[1] / src_strides[2];
    auto origDstW = dst_strides[2];
    auto origDstH = dst_strides[1] / dst_strides[2];

    const int src_go_x = 0;
    const int src_go_y = 0;
    const int dst_go_x = 0;
    const int dst_go_y = 0;
    auto src_full_width = static_cast<const int>(srcDims[3]);
    auto src_full_height = static_cast<const int>(srcDims[2]);
    auto dst_full_width = static_cast<const int>(dstDims[3]);
    auto dst_full_height = static_cast<const int>(dstDims[2]);

    auto *sptr = static_cast<data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto *dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
    auto scale_y = static_cast<float>(src_full_height) / dst_full_height;

    auto* xofs = reinterpret_cast<int32_t*>(buffer);
    auto* yofs = xofs + dwidth;
    auto* alpha = reinterpret_cast<float*>(yofs + dheight);
    auto* beta = alpha + dwidth;
    auto* tptr = beta + dheight;
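
    // The mapping below uses the usual half-pixel convention:
    //   fx = (dx + 0.5) * scale_x - 0.5
    // i.e. pixel centers are aligned between source and destination. As an
    // illustration (not from the original code): for a 2x upscale,
    // scale_x = 0.5, so dx = 0 maps to fx = -0.25 and sx = floor(fx) = -1;
    // "fx -= sx" leaves the fractional weight 0.75, and the BORDER_REPLICATE
    // branch then snaps sx to 0 with fx = 0, so the first output pixel simply
    // replicates the first source pixel.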
    for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) {
        auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
        int32_t sx = static_cast<int32_t>(std::floor(fx));
        fx -= sx;  // keep only the fractional part as the interpolation weight

        int32_t sx0 = sx;
        if (sx < 0 && border.type == BORDER_REPLICATE) {
            fx = 0;
            sx0 = 0;
        }

        if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) {
            fx = 1.f;
            sx0 = (std::max)(src_full_width - 2, 0);
        }

        xofs[dx - dst_go_x] = sx0 - src_go_x;
        alpha[dx - dst_go_x] = fx;
    }
    for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) {
        auto fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
        int32_t sy = static_cast<int32_t>(std::floor(fy));
        fy -= sy;  // fractional part = vertical interpolation weight

        int32_t sy0 = sy;
        if (sy < 0 && border.type == BORDER_REPLICATE) {
            fy = 0;
            sy0 = 0;
        }

        if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) {
            fy = 1.f;
            sy0 = (std::max)(src_full_height - 2, 0);
        }

        yofs[dy - dst_go_y] = sy0 - src_go_y;
        beta[dy - dst_go_y] = fy;
    }
    auto full_pass = [&](int c, int y) {
        auto sptr_ = sptr + c * origSrcW * origSrcH;
        auto dptr_ = dptr + c * origDstW * origDstH;
        auto tptr_ = tptr;

        // vertical pass: blend the two contributing source rows into tptr_
        for (int x = 0; x < swidth; x++) {
            bool use_constant0 = yofs[y] + 0 < 0 || yofs[y] + 0 >= src_full_height;
            bool use_constant1 = yofs[y] + 1 < 0 || yofs[y] + 1 >= src_full_height;
            float val0 = use_constant0 ? border.value : sptr_[(yofs[y] + 0) * sstep + x];
            float val1 = use_constant1 ? border.value : sptr_[(yofs[y] + 1) * sstep + x];

            float res = val0 + beta[y] * (val1 - val0);
            tptr_[x] = res;
        }

        // horizontal pass: blend column pairs of tptr_ into the output row
        for (int x = 0; x < dwidth; x++) {
            bool use_constant0 = xofs[x] + 0 < 0 || xofs[x] + 0 >= src_full_width;
            bool use_constant1 = xofs[x] + 1 < 0 || xofs[x] + 1 >= src_full_width;
            float val0 = use_constant0 ? border.value : tptr_[xofs[x] + 0];
            float val1 = use_constant1 ? border.value : tptr_[xofs[x] + 1];

            float res = val0 + alpha[x] * (val1 - val0);
            dptr_[y * dstep + x] = saturate_cast<data_t>(res);
        }
    };

    for (int c = 0; c < channels; c++) {
        for (int y = 0; y < dheight; y++) {
            full_pass(c, y);
        }
    }
}
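
// Area (box-filter) downscaling: output column `col` averages the source
// interval [col*scale, col*scale + scale). getResizeAreaTabSize returns the
// worst-case number of source pixels ("taps") any output pixel touches, which
// sizes the weight tables below. Illustrative example: with scale = 2.5, every
// interval spans two full pixels plus one partial pixel, so it returns 3.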
int getResizeAreaTabSize(int dst_go, int ssize, int dsize, float scale) {
    static const float threshold = 1e-3f;
    int max_count = 0;

    for (int col = dst_go; col < dst_go + dsize; col++) {
        int count = 0;

        float fsx1 = col * scale;
        float fsx2 = fsx1 + scale;

        int sx1 = static_cast<int>(std::ceil(fsx1));
        int sx2 = static_cast<int>(std::floor(fsx2));

        sx2 = (std::min)(sx2, ssize - 1);
        sx1 = (std::min)(sx1, sx2);

        if (sx1 - fsx1 > threshold) {
            count++;  // partial leftmost pixel
        }

        for (int sx = sx1; sx < sx2; sx++) {
            count++;  // fully covered pixels
        }

        if (fsx2 - sx2 > threshold) {
            count++;  // partial rightmost pixel
        }
        max_count = (std::max)(max_count, count);
    }

    return max_count;
}
void computeResizeAreaTab(int src_go, int dst_go, int ssize, int dsize, float scale,
                          uint16_t* si, uint16_t* alpha, int max_count) {
    static const float threshold = 1e-3f;
    int k = 0;

    for (int col = dst_go; col < dst_go + dsize; col++) {
        int count = 0;

        float fsx1 = col * scale;
        float fsx2 = fsx1 + scale;
        float cellWidth = (std::min)(scale, ssize - fsx1);

        int sx1 = static_cast<int>(std::ceil(fsx1));
        int sx2 = static_cast<int>(std::floor(fsx2));

        sx2 = (std::min)(sx2, ssize - 1);
        sx1 = (std::min)(sx1, sx2);

        si[col - dst_go] = (uint16_t)(sx1 - src_go);

        if (sx1 - fsx1 > threshold) {
            si[col - dst_go] = (uint16_t)(sx1 - src_go - 1);
            alpha[k++] = (uint16_t)((1 << 16) * ((sx1 - fsx1) / cellWidth));
            count++;
        }

        for (int sx = sx1; sx < sx2; sx++) {
            alpha[k++] = (uint16_t)((1 << 16) * (1.0f / cellWidth));
            count++;
        }

        if (fsx2 - sx2 > threshold) {
            alpha[k++] = (uint16_t)((1 << 16) * ((std::min)((std::min)(fsx2 - sx2, 1.f), cellWidth) / cellWidth));
            count++;
        }

        if (count != max_count) {
            // pad with zero weights so every column uses exactly max_count taps
            for (; count < max_count; count++) {
                alpha[k++] = 0;
            }
        }
    }
}
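
// The u8 path stores weights in Q0.16 fixed point: a weight w in (0, 1) is
// kept as (uint16_t)(w * 65536), e.g. w = 0.4 -> 26214. Padding each column to
// max_count taps lets downstream code assume a fixed number of weights per
// output pixel.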
void generate_alpha_and_id_arrays(int x_max_count, int dcols, const uint16_t* xalpha, uint16_t* xsi,
                                  uint16_t** alpha, uint16_t** sxid) {
    if (x_max_count <= 4) {
        for (int col = 0; col < dcols; col++) {
            for (int x = 0; x < x_max_count; x++) {
                alpha[x][col] = xalpha[col*x_max_count + x];
            }
        }
    }
    if (x_max_count <= 4) {
        for (int col = 0; col <= dcols - 8; col += 8) {
            for (int chunk_num_h = 0; chunk_num_h < x_max_count; chunk_num_h++) {
                for (int i = 0; i < 128 / 16; i++) {
                    int id_diff = xsi[col + i] - xsi[col];

                    for (int chunk_num_v = 0; chunk_num_v < x_max_count; chunk_num_v++) {
                        uint16_t* sxidp = sxid[chunk_num_v] + col * x_max_count + chunk_num_h * 8;

                        int id0 = (id_diff + chunk_num_v) * 2 + 0;
                        int id1 = (id_diff + chunk_num_v) * 2 + 1;

                        (reinterpret_cast<int8_t*>(sxidp + i))[0] = static_cast<int8_t>(id0 >= (chunk_num_h * 16) && id0 < (chunk_num_h + 1) * 16 ? id0 : -1);
                        (reinterpret_cast<int8_t*>(sxidp + i))[1] = static_cast<int8_t>(id1 >= (chunk_num_h * 16) && id1 < (chunk_num_h + 1) * 16 ? id1 : -1);
                    }
                }
            }
        }
    }
}
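
// generate_alpha_and_id_arrays repacks the per-column weights into x_max_count
// planar arrays and builds byte-level shuffle indices (sxid), with -1 marking
// lanes outside the current 16-byte chunk. This layout matches what a
// pshufb-style SSE gather expects; the actual SIMD consumer lives in the
// SSE4.2 implementation (ie_preprocess_data_sse42), so the exact contract is
// defined there.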
int computeResizeAreaTabFP32(int src_go, int dst_go, int ssize, int dsize, float scale, uint16_t* si, uint16_t* di, float* alpha) {
    static const float threshold = 1e-3f;
    int k = 0;

    for (int col = dst_go; col < dst_go + dsize; col++) {
        float fsx1 = col * scale;
        float fsx2 = fsx1 + scale;
        float cellWidth = (std::min)(scale, ssize - fsx1);

        int sx1 = static_cast<int>(std::ceil(fsx1));
        int sx2 = static_cast<int>(std::floor(fsx2));

        sx2 = (std::min)(sx2, ssize - 1);
        sx1 = (std::min)(sx1, sx2);

        if (sx1 - fsx1 > threshold) {
            di[k] = (uint16_t)(col - dst_go);
            si[k] = (uint16_t)(sx1 - src_go - 1);
            alpha[k++] = (sx1 - fsx1) / cellWidth;
        }

        for (int sx = sx1; sx < sx2; sx++) {
            di[k] = (uint16_t)(col - dst_go);
            si[k] = (uint16_t)(sx - src_go);
            alpha[k++] = 1.0f / cellWidth;
        }

        if (fsx2 - sx2 > threshold) {
            di[k] = (uint16_t)(col - dst_go);
            si[k] = (uint16_t)(sx2 - src_go);
            alpha[k++] = (std::min)((std::min)(fsx2 - sx2, 1.f), cellWidth) / cellWidth;
        }
    }

    return k;  // number of (si, di, alpha) entries written
}
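
// Unlike the fixed-point variant above, the FP32 table is a flat stream of
// (si, di, alpha) triplets: si is the contributing source index, di the
// destination index, and alpha the float weight. The return value is the
// number of triplets; consumers walk the stream while di stays equal to the
// current output index (see the horizontal loop in resize_area_downscale).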
template<typename data_t = float>
void resize_area_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto origSrcW = src_strides[2];
    auto origSrcH = src_strides[1] / src_strides[2];
    auto origDstW = dst_strides[2];
    auto origDstH = dst_strides[1] / dst_strides[2];

    auto dwidth = static_cast<const int>(dstDims[3]);
    auto dheight = static_cast<const int>(dstDims[2]);
    auto swidth = static_cast<const int>(srcDims[3]);
    auto sheight = static_cast<const int>(srcDims[2]);
    auto channels = static_cast<const int>(srcDims[1]);

    const int src_go_x = 0;
    const int src_go_y = 0;
    const int dst_go_x = 0;
    const int dst_go_y = 0;

    auto src_full_width = static_cast<const int>(srcDims[3]);
    auto src_full_height = static_cast<const int>(srcDims[2]);
    auto dst_full_width = static_cast<const int>(dstDims[3]);
    auto dst_full_height = static_cast<const int>(dstDims[2]);

    auto* sptr = static_cast<const data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto* dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();

    auto sstep = static_cast<const int>(src_strides[2]);
    auto dstep = static_cast<const int>(dst_strides[2]);

    float scale_x = static_cast<float>(src_full_width) / dst_full_width;
    float scale_y = static_cast<float>(src_full_height) / dst_full_height;

    int vert_sum_size = swidth;
    int tabofs_size = (std::max)(2*swidth, 2*dwidth);
    int xsi_size = (std::max)(2*swidth, 2*dwidth);
    int xdi_size = (std::max)(2*swidth, 2*dwidth);
    int ysi_size = (std::max)(2*sheight, 2*dheight);
    int ydi_size = (std::max)(2*sheight, 2*dheight);
    int xalpha_size = (std::max)(2*swidth, 2*dwidth);

    auto vert_sum = reinterpret_cast<float*>(buffer);
    auto tabofs = reinterpret_cast<int*>(vert_sum + vert_sum_size);
    auto xsi = reinterpret_cast<uint16_t*>(tabofs + tabofs_size + 1);
    auto xdi = xsi + xsi_size;
    auto ysi = xdi + xdi_size;
    auto ydi = ysi + ysi_size;
    auto xalpha = reinterpret_cast<float*>(ydi + ydi_size);
    auto yalpha = xalpha + xalpha_size;

    int ytab_size = computeResizeAreaTabFP32(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, ydi, yalpha);
    int xtab_size = computeResizeAreaTabFP32(src_go_x, dst_go_x, src_full_width, dwidth, scale_x, xsi, xdi, xalpha);
    // map each output row to its first entry in the vertical tab
    int dy_ = 0;
    for (int i = 0; i < ytab_size && dy_ < dwidth*2; i++) {
        if (i == 0 || ydi[i] != ydi[i-1]) {
            tabofs[dy_++] = i;
        }
    }
    tabofs[dy_] = ytab_size;
    auto full_pass = [&](const data_t* sptr_, data_t* dptr_, int y) {
        auto vert_sum_ = vert_sum;

        memset(vert_sum_, 0, swidth * sizeof(float));

        data_t *pdst = dptr_ + y * dstep;

        // vertical pass: weighted sum of all source rows contributing to row y
        for (int dy = tabofs[y]; dy < tabofs[y + 1] && dy < ytab_size; dy++) {
            float beta = yalpha[dy];
            int sy = ysi[dy];

            const data_t *psrc = sptr_ + sy * sstep;
            for (int x = 0; x < swidth; x++) {
                vert_sum_[x] += beta * psrc[x];
            }
        }

        // horizontal pass: weighted sum over the accumulated row
        int xtab_ind = 0;
        for (int x = 0; x < dwidth; x++) {
            float res = 0.f;

            int dx = 0;
            for (; x == xdi[xtab_ind + dx] && xtab_ind + dx < xtab_size; dx++) {
                float alpha = xalpha[xtab_ind + dx];
                int sx = xsi[xtab_ind + dx];

                res += alpha * vert_sum_[sx];
            }
            xtab_ind += dx;

            pdst[x] = saturate_cast<data_t>(res);
        }
    };
    for (int ch = 0; ch < channels; ch++) {
        for (int y = 0; y < dheight; y++) {
            auto sptr_ = sptr + ch * origSrcH * origSrcW;
            auto dptr_ = dptr + ch * origDstH * origDstW;

            full_pass(sptr_, dptr_, y);
        }
    }
}
// clamp x to [a, b-1]; used to replicate border rows when a kernel tap
// falls outside the image
inline int clip(int x, int a, int b) {
    return x >= a ? (x < b ? x : b - 1) : a;
}

const int MAX_ESIZE = 16;
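
// HResizeLinear/VResizeLinear follow the structure of OpenCV's separable
// resize kernels: the horizontal pass widens data_t to float and applies the
// 2-tap weights alpha[2*dx] and alpha[2*dx+1], i.e.
//   D[dx] = S[xofs[dx]]*alpha[2*dx] + S[xofs[dx]+cn]*alpha[2*dx+1];
// the vertical pass then blends two such float rows with beta[0]/beta[1].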
template<typename data_t>
void HResizeLinear(const data_t** src, float** dst, int count, const int* xofs, const float* alpha,
                   int swidth, int dwidth, int cn, int xmin, int xmax) {
    int dx, k;
    int dx0 = 0;

    // process two rows at a time to expose more instruction-level parallelism
    for (k = 0; k <= count - 2; k++) {
        const data_t *S0 = src[k], *S1 = src[k+1];
        float *D0 = dst[k], *D1 = dst[k+1];
        for (dx = dx0; dx < xmax; dx++) {
            int sx = xofs[dx];
            float a0 = alpha[dx*2], a1 = alpha[dx*2+1];
            float t0 = static_cast<float>(S0[sx])*a0 + static_cast<float>(S0[sx + cn])*a1;
            float t1 = static_cast<float>(S1[sx])*a0 + static_cast<float>(S1[sx + cn])*a1;
            D0[dx] = t0; D1[dx] = t1;
        }

        // border columns: replicate the clamped source pixel
        for (; dx < dwidth; dx++) {
            int sx = xofs[dx];
            D0[dx] = static_cast<float>(S0[sx]); D1[dx] = static_cast<float>(S1[sx]);
        }
    }

    // remaining odd row, if any
    for (; k < count; k++) {
        const data_t *S = src[k];
        float *D = dst[k];
        for (dx = 0; dx < xmax; dx++) {
            int sx = xofs[dx];
            D[dx] = static_cast<float>(S[sx])*alpha[dx*2] + static_cast<float>(S[sx+cn])*alpha[dx*2+1];
        }

        for (; dx < dwidth; dx++)
            D[dx] = static_cast<float>(S[xofs[dx]]);
    }
}
template<typename data_t>
void VResizeLinear(float** src, data_t* dst, const float* beta, int width) {
    float b0 = beta[0], b1 = beta[1];
    const float *S0 = src[0], *S1 = src[1];

    if (sizeof(data_t) == 4) {
        // float output: no saturation needed
        for (int x = 0; x < width; x++)
            dst[x] = (S0[x] * b0 + S1[x] * b1);
    } else {
        // u8 output: clamp to [0, 255]
        for (int x = 0; x < width; x++)
            dst[x] = saturateU32toU8(static_cast<uint32_t>(S0[x] * b0 + S1[x] * b1));
    }
}
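
// For upscaling, area interpolation degenerates to bilinear, so
// resize_area_upscale builds 2-tap coordinate/weight tables and runs the H/V
// kernels above, caching the last ksize horizontally-resized rows so a source
// row shared by consecutive output rows is resized only once.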
template<typename data_t>
static void resize_area_upscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto origSrcW = src_strides[2];
    auto origSrcH = src_strides[1] / src_strides[2];
    auto origDstW = dst_strides[2];
    auto origDstH = dst_strides[1] / dst_strides[2];

    auto dwidth = static_cast<const int>(dstDims[3]);
    auto dheight = static_cast<const int>(dstDims[2]);
    auto swidth = static_cast<const int>(srcDims[3]);
    auto sheight = static_cast<const int>(srcDims[2]);
    auto channels = static_cast<const int>(srcDims[1]);

    auto src_full_width = static_cast<const int>(srcDims[3]);
    auto src_full_height = static_cast<const int>(srcDims[2]);
    auto dst_full_width = static_cast<const int>(dstDims[3]);
    auto dst_full_height = static_cast<const int>(dstDims[2]);

    auto sptr = static_cast<const data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();

    auto sstep = static_cast<const int>(src_strides[2]);
    auto dstep = static_cast<const int>(dst_strides[2]);

    float scale_x = static_cast<float>(src_full_width) / dst_full_width;
    float scale_y = static_cast<float>(src_full_height) / dst_full_height;
    float inv_scale_x = static_cast<float>(dst_full_width) / src_full_width;
    float inv_scale_y = static_cast<float>(dst_full_height) / src_full_height;
    int xmin = 0, xmax = dwidth, width = dwidth;
    int ksize = 2;  // linear interpolation: 2 taps per axis
    int ksize2 = ksize/2;

    auto xofs = reinterpret_cast<int*>(buffer);
    auto yofs = xofs + width;
    auto alpha = reinterpret_cast<float*>(yofs + dheight);
    auto beta = alpha + width*ksize;

    float cbuf[MAX_ESIZE] = {0};
    for (int dx = 0; dx < dwidth; dx++) {
        int sx = static_cast<int>(std::floor(dx*scale_x));
        float fx = (dx+1) - (sx+1)*inv_scale_x;
        fx = fx <= 0 ? 0.f : fx - std::floor(fx);

        if (sx < ksize2 - 1) {
            xmin = dx + 1;
            if (sx < 0)
                fx = 0, sx = 0;
        }

        if (sx + ksize2 >= swidth) {
            xmax = (std::min)(xmax, dx);
            if (sx >= swidth - 1)
                fx = 0, sx = swidth-1;
        }

        xofs[dx] = sx;

        cbuf[0] = 1.f - fx;
        cbuf[1] = fx;

        for (int k = 0; k < ksize; k++)
            alpha[dx*ksize + k] = cbuf[k];
    }
    for (int dy = 0; dy < dheight; dy++) {
        int sy = static_cast<int>(std::floor(dy*scale_y));
        float fy = (dy+1) - (sy+1)*inv_scale_y;
        fy = fy <= 0 ? 0.f : fy - std::floor(fy);

        yofs[dy] = sy;

        cbuf[0] = 1.f - fy;
        cbuf[1] = fy;

        for (int k = 0; k < ksize; k++)
            beta[dy*ksize + k] = cbuf[k];
    }
    auto full_pass = [&](const data_t* sptr_, data_t* dptr_, int dy) {
        int bufstep = dwidth;
        const data_t* srows[MAX_ESIZE] = {0};
        float* rows[MAX_ESIZE] = {0};
        int prev_sy[MAX_ESIZE];

        for (int k = 0; k < ksize; k++) {
            prev_sy[k] = -1;
            rows[k] = reinterpret_cast<float*>(buffer + (width + dheight)*(sizeof(int) + sizeof(float)*ksize))
                      + bufstep*k;
        }

        int sy0 = yofs[dy], k0 = ksize, k1 = 0;

        for (int k = 0; k < ksize; k++) {
            int sy = clip(sy0 - ksize2 + 1 + k, 0, sheight);
            for (k1 = (std::max)(k1, k); k1 < ksize; k1++) {
                if (k1 < MAX_ESIZE && sy == prev_sy[k1]) {
                    // this source row was already resized; reuse it
                    if (k1 > k)
                        memcpy(rows[k], rows[k1], bufstep*sizeof(rows[0][0]));
                    break;
                }
            }
            if (k1 == ksize)
                k0 = (std::min)(k0, k);  // first row that still needs resizing
            srows[k] = sptr_ + sy * sstep;
            prev_sy[k] = sy;
        }

        // horizontal pass for the rows not found in the cache
        if (k0 < ksize)
            HResizeLinear<data_t>(srows + k0, reinterpret_cast<float**>(rows + k0), ksize - k0, xofs,
                                  reinterpret_cast<const float*>(alpha), swidth, dwidth, 1, xmin, xmax);

        VResizeLinear<data_t>(reinterpret_cast<float**>(rows), dptr_ + dstep*dy, beta + dy*ksize, dwidth);
    };
    for (int ch = 0; ch < channels; ch++) {
        for (int dy = 0; dy < dheight; dy++) {
            auto sptr_ = sptr + ch * origSrcH * origSrcW;
            auto dptr_ = dptr + ch * origDstH * origDstW;

            full_pass(sptr_, dptr_, dy);
        }
    }
}
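
// resize_get_buffer_size returns a worst-case scratch size for the chosen
// algorithm/precision pair, so resize() below can serve any kernel with a
// single allocation. Note the scale convention flips here: these lambdas see
// scale = dst/src, while the kernels above compute scale = src/dst.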
size_t resize_get_buffer_size(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorithm) {
    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    SizeVector strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    size_t origW = strides[2];
    size_t origH = strides[1] / strides[2];

    const int src_full_width = origW;
    const int src_full_height = origH;
    const int dst_full_width = dstDims[3];
    const int dst_full_height = dstDims[2];

    float scale_x = static_cast<float>(dstDims[3]) / srcDims[3];
    float scale_y = static_cast<float>(dstDims[2]) / srcDims[2];
    auto resize_bilinear_u8_buffer_size = [&]() {
        size_t buffer_size = (sizeof(int16_t) * 4 + sizeof(uint8_t *)) * dstDims[3] +
                             (sizeof(int32_t) + sizeof(int16_t)) * dstDims[2] +
                             sizeof(uint32_t) * dstDims[3] +
                             (((srcDims[3] + 7) / 8) * 8 * 8) +
                             sizeof(uint8_t) * 12;

        return buffer_size;
    };
    auto resize_bilinear_fp32_buffer_size = [&]() {
        size_t buffer_size = (sizeof(float) + sizeof(float *)) * dstDims[3] +
                             (sizeof(int32_t) + sizeof(float)) * dstDims[2] +
                             (((srcDims[3] + 1) / 2) * 2 * 2) * sizeof(float);

        return buffer_size;
    };
    auto resize_area_u8_downscale_sse_buffer_size = [&]() {
        const int dwidth = dstDims[3];
        const int dheight = dstDims[2];
        const int swidth = srcDims[3];

        const int dst_go_x = 0;
        const int dst_go_y = 0;

        int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, static_cast<float>(src_full_width) / dst_full_width) + 1;
        int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, static_cast<float>(src_full_height) / dst_full_height) + 1;

        size_t si_buf_size = sizeof(uint16_t) * dwidth + sizeof(uint16_t) * dheight;
        size_t alpha_buf_size =
                sizeof(uint16_t) * (dwidth * x_max_count + 8 * 16) + sizeof(uint16_t) * dheight * y_max_count;
        size_t vert_sum_buf_size = sizeof(uint16_t) * (swidth * 2);
        size_t alpha_array_buf_size = sizeof(uint16_t) * 4 * dwidth;
        size_t sxid_array_buf_size = sizeof(uint16_t) * 4 * 4 * dwidth;

        size_t buffer_size = si_buf_size +
                             alpha_buf_size +
                             vert_sum_buf_size +
                             alpha_array_buf_size +
                             sxid_array_buf_size;

        return buffer_size;
    };
    auto resize_area_downscale_buffer_size = [&]() {
        size_t buffer_size = sizeof(float) * (srcDims[3]) +
                             sizeof(uint32_t) * (dstDims[3] * 2 + 1) +
                             sizeof(float) * ((srcDims[3] + srcDims[2]) * 4) +
                             sizeof(float) * ((srcDims[3] + srcDims[2]) * 2);

        return buffer_size;
    };
    auto resize_area_upscale_buffer_size = [&]() {
        size_t buffer_size = (dstDims[3] + dstDims[2])*(sizeof(int) + sizeof(float)*2) + 2*dstDims[3] * sizeof(float);

        return buffer_size;
    };
    // scale_x/scale_y here are dst/src, so values <= 1 mean downscaling
    if (algorithm == RESIZE_BILINEAR) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            return resize_bilinear_u8_buffer_size();
        }
        return resize_bilinear_fp32_buffer_size();
    } else if (algorithm == RESIZE_AREA) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            if (scale_x <= 1 && scale_y <= 1) {
                if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
                    return resize_area_u8_downscale_sse_buffer_size();
                else
                    return resize_area_downscale_buffer_size();
            } else {
                return resize_area_upscale_buffer_size();
            }
        } else {
            if (scale_x <= 1 && scale_y <= 1)
                return resize_area_downscale_buffer_size();
            else
                return resize_area_upscale_buffer_size();
        }
    }

    return 0;
}
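
// resize() dispatches on layout, precision, and scale: bilinear vs. area,
// U8 vs. FP32, and SSE4.2 kernels when the CPU supports them. A minimal usage
// sketch (illustrative only; blob creation follows the deprecated
// make_shared_blob(Precision, Layout, dims) style used elsewhere in this
// file, and dims ordering follows that old API):
//
//   auto in  = make_shared_blob<uint8_t>(Precision::U8, NCHW, in_dims);
//   auto out = make_shared_blob<uint8_t>(Precision::U8, NCHW, out_dims);
//   in->allocate();  out->allocate();
//   // ... fill `in` ...
//   Resize::resize(in, out, RESIZE_BILINEAR);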
void resize(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorithm) {
    if (inBlob->getTensorDesc().getLayout() != NCHW || outBlob->getTensorDesc().getLayout() != NCHW)
        THROW_IE_EXCEPTION << "Resize supports only NCHW layout";

    if (!((inBlob->getTensorDesc().getPrecision() == Precision::U8 && outBlob->getTensorDesc().getPrecision() == Precision::U8) ||
          (inBlob->getTensorDesc().getPrecision() == Precision::FP32 && outBlob->getTensorDesc().getPrecision() == Precision::FP32)))
        THROW_IE_EXCEPTION << "Resize supports only U8 and FP32 precisions";

    if (algorithm != RESIZE_BILINEAR && algorithm != RESIZE_AREA)
        THROW_IE_EXCEPTION << "Unsupported resize algorithm type";

    size_t buffer_size = resize_get_buffer_size(inBlob, outBlob, algorithm);
    auto* buffer = static_cast<uint8_t *>(malloc(buffer_size));
    if (buffer == nullptr) {
        THROW_IE_EXCEPTION << "Could not allocate memory for blob";
    }

    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();
    float scale_x = static_cast<float>(dstDims[3]) / srcDims[3];
    float scale_y = static_cast<float>(dstDims[2]) / srcDims[2];

    if (algorithm == RESIZE_BILINEAR) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            if (with_cpu_x86_sse42())
                Resize::resize_bilinear_u8(inBlob, outBlob, buffer);
            else
                resize_bilinear<uint8_t>(inBlob, outBlob, buffer);
        } else {
            resize_bilinear<float>(inBlob, outBlob, buffer);
        }
    } else if (algorithm == RESIZE_AREA) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            if (scale_x <= 1 && scale_y <= 1) {
                if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
                    Resize::resize_area_u8_downscale(inBlob, outBlob, buffer);
                else
                    resize_area_downscale<uint8_t>(inBlob, outBlob, buffer);
            } else {
                resize_area_upscale<uint8_t>(inBlob, outBlob, buffer);
            }
        } else {
            if (scale_x <= 1 && scale_y <= 1)
                resize_area_downscale<float>(inBlob, outBlob, buffer);
            else
                resize_area_upscale<float>(inBlob, outBlob, buffer);
        }
    }

    free(buffer);  // scratch is owned by this function
}
}  // namespace Resize

//----------------------------------------------------------------------

using namespace Resize;
void PreProcessData::setRoiBlob(const Blob::Ptr &blob) {
    _roiBlob = blob;
}

Blob::Ptr PreProcessData::getRoiBlob() const {
    return _roiBlob;
}
void PreProcessData::execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool serial,
                             int batchSize) {
    IE_PROFILING_AUTO_SCOPE_TASK(perf_preprocessing)

    if (algorithm == NO_RESIZE) {
        THROW_IE_EXCEPTION << "Input pre-processing is called without resize algorithm set";
    }

    if (_roiBlob == nullptr) {
        THROW_IE_EXCEPTION << "Input pre-processing is called without ROI blob set";
    }

    if (batchSize == 0) {
        THROW_IE_EXCEPTION << "Input pre-processing is called with invalid batch size "
                           << batchSize;
    }

    if (batchSize < 0) {
        // if batch_size is unspecified, process the whole input blob
        batchSize = static_cast<int>(_roiBlob->getTensorDesc().getDims()[0]);
    }

    if (!_preproc) {
        _preproc.reset(new PreprocEngine);
    }
    if (_preproc->preprocessWithGAPI(_roiBlob, outBlob, algorithm, serial, batchSize)) {
        return;  // the G-API engine handled this pre-processing request
    }

    if (batchSize > 1) {
        THROW_IE_EXCEPTION << "Batch pre-processing is unsupported in this mode. "
                              "Use default pre-processing instead to process batches.";
    }
    Blob::Ptr res_in, res_out;
    if (_roiBlob->getTensorDesc().getLayout() == NHWC) {
        if (!_tmp1 || _tmp1->size() != _roiBlob->size()) {
            if (_roiBlob->getTensorDesc().getPrecision() == Precision::FP32) {
                _tmp1 = make_shared_blob<float>(Precision::FP32, NCHW, _roiBlob->dims());
            } else {
                _tmp1 = make_shared_blob<uint8_t>(Precision::U8, NCHW, _roiBlob->dims());
            }
            _tmp1->allocate();
        }

        {
            IE_PROFILING_AUTO_SCOPE_TASK(perf_reorder_before)
            blob_copy(_roiBlob, _tmp1);
        }
        res_in = _tmp1;
    } else {
        res_in = _roiBlob;
    }

    if (outBlob->getTensorDesc().getLayout() == NHWC) {
        if (!_tmp2 || _tmp2->size() != outBlob->size()) {
            if (outBlob->getTensorDesc().getPrecision() == Precision::FP32) {
                _tmp2 = make_shared_blob<float>(Precision::FP32, NCHW, outBlob->dims());
            } else {
                _tmp2 = make_shared_blob<uint8_t>(Precision::U8, NCHW, outBlob->dims());
            }
            _tmp2->allocate();
        }
        res_out = _tmp2;
    } else {
        res_out = outBlob;
    }
    {
        IE_PROFILING_AUTO_SCOPE_TASK(perf_resize)
        resize(res_in, res_out, algorithm);
    }

    if (res_out == _tmp2) {
        IE_PROFILING_AUTO_SCOPE_TASK(perf_reorder_after)
        blob_copy(_tmp2, outBlob);
    }
}
void PreProcessData::isApplicable(const Blob::Ptr &src, const Blob::Ptr &dst) {
    auto &src_dims = src->getTensorDesc().getDims();
    auto &dst_dims = dst->getTensorDesc().getDims();

    if (src_dims.size() != dst_dims.size())
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Source and destination blobs have different "
                              "number of dimensions";

    if (src_dims.size() != 4)
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Only 4D tensors are supported.";

    if (src_dims[0] != dst_dims[0] || src_dims[1] != dst_dims[1])
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Wrong shape. Network expected 4D input tensor with "
                              "shape [" << dst_dims[0] << "," << dst_dims[1] << ",H,W] but provided tensor has "
                              "shape " << details::dumpVec(src_dims) << ".";
}
}  // namespace InferenceEngine