inference-engine/src/inference_engine/ie_preprocess_data.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #include "cpu_detector.hpp"
   6 #include "blob_transform.hpp"
   7 #include "ie_preprocess_data.hpp"
   8 #ifdef HAVE_SSE
   9 #include "ie_preprocess_data_sse42.hpp"
  10 #endif
  11 #include "ie_preprocess_gapi.hpp"
  12 #include "debug.h"
  13
  14 #include <algorithm>
  15
  16 namespace InferenceEngine {
  17
  18 namespace Resize {
  19
  20 template<typename data_t> static inline data_t saturate_cast(float res);
  21
  22 template<> inline float saturate_cast(float res) {
  23     return res;
  24 }
  25
  26 template<> inline uint8_t saturate_cast(float res) {
  27     int ires = static_cast<int>((std::round)(res));
  28     return static_cast<uint8_t>((std::max)(0, (std::min)(255, ires)));
  29 }
  30
  31 template<typename data_t = float>
  32 void resize_bilinear(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
  33     Border border = {BORDER_REPLICATE, 0};
  34
  35     auto dstDims = outBlob->getTensorDesc().getDims();
  36     auto srcDims = inBlob->getTensorDesc().getDims();
  37
  38     auto dwidth = static_cast<const int>(dstDims[3]);
  39     auto dheight = static_cast<const int>(dstDims[2]);
  40     auto swidth = static_cast<const int>(srcDims[3]);
  41     auto channels = static_cast<const int>(srcDims[1]);
  42
  43     auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
  44     auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
  45     auto origSrcW = src_strides[2];
  46     auto origSrcH = src_strides[1] / src_strides[2];
  47     auto origDstW = dst_strides[2];
  48     auto origDstH = dst_strides[1] / dst_strides[2];
  49
  50     const int src_go_x = 0;
  51     const int src_go_y = 0;
  52     const int dst_go_x = 0;
  53     const int dst_go_y = 0;
  54     auto src_full_width = static_cast<const int>(srcDims[3]);
  55     auto src_full_height = static_cast<const int>(srcDims[2]);
  56     auto dst_full_width = static_cast<const int>(dstDims[3]);
  57     auto dst_full_height = static_cast<const int>(dstDims[2]);
  58
  59     auto *sptr = static_cast<data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
  60     auto *dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
  61     auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
  62     auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
  63     auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
  64     auto scale_y = static_cast<float>(src_full_height) / dst_full_height;
  65
  66     auto* xofs = reinterpret_cast<int32_t*>(buffer);
  67     auto* yofs = xofs + dwidth;
  68     auto* alpha = reinterpret_cast<float*>(yofs + dheight);
  69     auto* beta = alpha + dwidth;
  70     auto* tptr = beta + dheight;
  71
  72     for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) {
  73         auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
  74         int32_t sx = floor(fx);
  75         fx -= sx;
  76
  77         int32_t sx0 = sx;
  78         if (sx < 0 && border.type == BORDER_REPLICATE) {
  79             fx = 0;
  80             sx0 = 0;
  81         }
  82
  83         if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) {
  84             fx = 1.f;
  85             sx0 = (std::max)(src_full_width - 2, 0);
  86         }
  87
  88         xofs[dx - dst_go_x] = sx0 - src_go_x;
  89         alpha[dx - dst_go_x] = fx;
  90     }
  91
  92     for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) {
  93         auto fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
  94         int32_t sy = floor(fy);
  95         fy -= sy;
  96
  97         int32_t sy0 = sy;
  98         if (sy < 0 && border.type == BORDER_REPLICATE) {
  99             fy = 0;
 100             sy0 = 0;
 101         }
 102
 103         if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) {
 104             fy = 1.f;
 105             sy0 = (std::max)(src_full_height - 2, 0);
 106         }
 107
 108         yofs[dy - dst_go_y] = sy0 - src_go_y;
 109         beta[dy - dst_go_y] = fy;
 110     }
 111
 112     auto full_pass = [&](int c, int y) {
 113         auto sptr_ = sptr + c * origSrcW * origSrcH;
 114         auto dptr_ = dptr + c * origDstW * origDstH;
 115         auto tptr_ = tptr;
 116
 117         for (int x = 0; x < swidth; x++) {
 118             bool use_constant0 = yofs[y] + 0 < 0 || yofs[y] + 0 >= src_full_height;
 119             bool use_constant1 = yofs[y] + 1 < 0 || yofs[y] + 1 >= src_full_height;
 120             float val0 = use_constant0 ? border.value : sptr_[(yofs[y] + 0) * sstep + x];
 121             float val1 = use_constant1 ? border.value : sptr_[(yofs[y] + 1) * sstep + x];
 122
 123             float res = val0 + beta[y] * (val1 - val0);
 124             tptr_[x] = res;
 125         }
 126
 127         for (int x = 0; x < dwidth; x++) {
 128             bool use_constant0 = xofs[x] + 0 < 0 || xofs[x] + 0 >= src_full_width;
 129             bool use_constant1 = xofs[x] + 1 < 0 || xofs[x] + 1 >= src_full_width;
 130             float val0 = use_constant0 ? border.value : tptr_[xofs[x] + 0];
 131             float val1 = use_constant1 ? border.value : tptr_[xofs[x] + 1];
 132
 133             float res = val0 + alpha[x] * (val1 - val0);
 134             dptr_[y * dstep + x] = saturate_cast<data_t>(res);
 135         }
 136     };
 137
 138     for (int c = 0; c < channels; c++) {
 139         for (int y = 0; y < dheight; y++) {
 140             full_pass(c, y);
 141         }
 142     }
 143 }
 144
 145 int getResizeAreaTabSize(int dst_go, int ssize, int dsize, float scale) {
 146     static const float threshold = 1e-3f;
 147     int max_count = 0;
 148
 149     for (int col = dst_go; col < dst_go + dsize; col++) {
 150         int count = 0;
 151
 152         float fsx1 = col * scale;
 153         float fsx2 = fsx1 + scale;
 154
 155         int sx1 = ceil(fsx1);
 156         int sx2 = floor(fsx2);
 157
 158         sx2 = (std::min)(sx2, ssize - 1);
 159         sx1 = (std::min)(sx1, sx2);
 160
 161         if (sx1 - fsx1 > threshold) {
 162             count++;
 163         }
 164
 165         for (int sx = sx1; sx < sx2; sx++) {
 166             count++;
 167         }
 168
 169         if (fsx2 - sx2 > threshold) {
 170             count++;
 171         }
 172         max_count = (std::max)(max_count, count);
 173     }
 174
 175     return max_count;
 176 }
 177
 178 void computeResizeAreaTab(int src_go, int dst_go, int ssize, int dsize, float scale,
 179                           uint16_t* si, uint16_t* alpha, int max_count) {
 180     static const float threshold = 1e-3f;
 181     int k = 0;
 182
 183     for (int col = dst_go; col < dst_go + dsize; col++) {
 184         int count = 0;
 185
 186         float fsx1 = col * scale;
 187         float fsx2 = fsx1 + scale;
 188         float cellWidth = (std::min)(scale, ssize - fsx1);
 189
 190         int sx1 = ceil(fsx1);
 191         int sx2 = floor(fsx2);
 192
 193         sx2 = (std::min)(sx2, ssize - 1);
 194         sx1 = (std::min)(sx1, sx2);
 195
 196         si[col - dst_go] = (uint16_t)(sx1 - src_go);
 197
 198         if (sx1 - fsx1 > threshold) {
 199             si[col - dst_go] = (uint16_t)(sx1 - src_go - 1);
 200             alpha[k++] = (uint16_t)((1 << 16) * ((sx1 - fsx1) / cellWidth));
 201             count++;
 202         }
 203
 204         for (int sx = sx1; sx < sx2; sx++) {
 205             alpha[k++] = (uint16_t)((1 << 16) * (1.0f / cellWidth));
 206             count++;
 207         }
 208
 209         if (fsx2 - sx2 > threshold) {
 210             alpha[k++] = (uint16_t)((1 << 16) * ((std::min)((std::min)(fsx2 - sx2, 1.f), cellWidth) / cellWidth));
 211             count++;
 212         }
 213
 214         if (count != max_count) {
 215             alpha[k++] = 0;
 216         }
 217     }
 218 }
 219
 220 void generate_alpha_and_id_arrays(int x_max_count, int dcols, const uint16_t* xalpha, uint16_t* xsi,
 221                                   uint16_t** alpha, uint16_t** sxid) {
 222     if (x_max_count <= 4) {
 223         for (int col = 0; col < dcols; col++) {
 224             for (int x = 0; x < x_max_count; x++) {
 225                 alpha[x][col] = xalpha[col*x_max_count + x];
 226             }
 227         }
 228     }
 229     if (x_max_count <= 4) {
 230         for (int col = 0; col <= dcols - 8; col += 8) {
 231             for (int chunk_num_h = 0; chunk_num_h < x_max_count; chunk_num_h++) {
 232                 for (int i = 0; i < 128 / 16; i++) {
 233                     int id_diff = xsi[col + i] - xsi[col];
 234
 235                     for (int chunk_num_v = 0; chunk_num_v < x_max_count; chunk_num_v++) {
 236                         uint16_t* sxidp = sxid[chunk_num_v] + col * x_max_count + chunk_num_h * 8;
 237
 238                         int id0 = (id_diff + chunk_num_v) * 2 + 0;
 239                         int id1 = (id_diff + chunk_num_v) * 2 + 1;
 240
 241                         (reinterpret_cast<int8_t*>(sxidp + i))[0] = static_cast<int8_t>(id0 >= (chunk_num_h * 16) && id0 < (chunk_num_h + 1) * 16 ? id0 : -1);
 242                         (reinterpret_cast<int8_t*>(sxidp + i))[1] = static_cast<int8_t>(id1 >= (chunk_num_h * 16) && id1 < (chunk_num_h + 1) * 16 ? id1 : -1);
 243                     }
 244                 }
 245             }
 246         }
 247     }
 248 }
 249
 250 int computeResizeAreaTabFP32(int src_go, int dst_go, int ssize, int dsize, float scale, uint16_t* si, uint16_t* di, float* alpha) {
 251     static const float threshold = 1e-3f;
 252     int k = 0;
 253
 254     for (int col = dst_go; col < dst_go + dsize; col++) {
 255         float fsx1 = col * scale;
 256         float fsx2 = fsx1 + scale;
 257         float cellWidth = (std::min)(scale, ssize - fsx1);
 258
 259         int sx1 = ceil(fsx1);
 260         int sx2 = floor(fsx2);
 261
 262         sx2 = (std::min)(sx2, ssize - 1);
 263         sx1 = (std::min)(sx1, sx2);
 264
 265         if (sx1 - fsx1 > threshold) {
 266             di[k] = (uint16_t)(col - dst_go);
 267             si[k] = (uint16_t)(sx1 - src_go - 1);
 268             alpha[k++] = (sx1 - fsx1) / cellWidth;
 269         }
 270
 271         for (int sx = sx1; sx < sx2; sx++) {
 272             di[k] = (uint16_t)(col - dst_go);
 273             si[k] = (uint16_t)(sx  - src_go);
 274             alpha[k++] = 1.0f / cellWidth;
 275         }
 276
 277         if (fsx2 - sx2 > threshold) {
 278             di[k] = (uint16_t)(col - dst_go);
 279             si[k] = (uint16_t)(sx2 - src_go);
 280             alpha[k++] = (std::min)((std::min)(fsx2 - sx2, 1.f), cellWidth) / cellWidth;
 281         }
 282     }
 283     return k;
 284 }
 285
 286 template<typename data_t = float>
 287 void resize_area_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
 288     auto dstDims = outBlob->getTensorDesc().getDims();
 289     auto srcDims = inBlob->getTensorDesc().getDims();
 290
 291     auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
 292     auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
 293     auto origSrcW = src_strides[2];
 294     auto origSrcH = src_strides[1] / src_strides[2];
 295     auto origDstW = dst_strides[2];
 296     auto origDstH = dst_strides[1] / dst_strides[2];
 297
 298     auto dwidth = static_cast<const int>(dstDims[3]);
 299     auto dheight = static_cast<const int>(dstDims[2]);
 300     auto swidth = static_cast<const int>(srcDims[3]);
 301     auto sheight = static_cast<const int>(srcDims[2]);
 302     auto channels = static_cast<const int>(srcDims[1]);
 303
 304     const int src_go_x = 0;
 305     const int src_go_y = 0;
 306     const int dst_go_x = 0;
 307     const int dst_go_y = 0;
 308
 309     auto src_full_width = static_cast<const int>(srcDims[3]);
 310     auto src_full_height = static_cast<const int>(srcDims[2]);
 311     auto dst_full_width = static_cast<const int>(dstDims[3]);
 312     auto dst_full_height = static_cast<const int>(dstDims[2]);
 313
 314     auto* sptr = static_cast<const data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
 315     auto* dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
 316
 317     auto sstep = static_cast<const int>(src_strides[2]);
 318     auto dstep = static_cast<const int>(dst_strides[2]);
 319
 320     float scale_x = static_cast<float>(src_full_width) / dst_full_width;
 321     float scale_y = static_cast<float>(src_full_height) / dst_full_height;
 322
 323     int vert_sum_size = swidth;
 324     int tabofs_size = (std::max)(2*swidth, 2*dwidth);
 325     int xsi_size = (std::max)(2*swidth, 2*dwidth);
 326     int xdi_size = (std::max)(2*swidth, 2*dwidth);
 327     int ysi_size = (std::max)(2*sheight, 2*dheight);
 328     int ydi_size = (std::max)(2*sheight, 2*dheight);
 329     int xalpha_size = (std::max)(2*swidth, 2*dwidth);
 330
 331     auto vert_sum = reinterpret_cast<float*>(buffer);
 332     auto tabofs = reinterpret_cast<int*>(vert_sum + vert_sum_size);
 333     auto xsi = reinterpret_cast<uint16_t*>(tabofs + tabofs_size + 1);
 334     auto xdi = xsi + xsi_size;
 335     auto ysi = xdi + xdi_size;
 336     auto ydi = ysi + ysi_size;
 337     auto xalpha = reinterpret_cast<float*>(ydi + ydi_size);
 338     auto yalpha = xalpha + xalpha_size;
 339
 340     int ytab_size = computeResizeAreaTabFP32(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, ydi, yalpha);
 341     int xtab_size = computeResizeAreaTabFP32(src_go_x, dst_go_x, src_full_width,  dwidth,  scale_x, xsi, xdi, xalpha);
 342
 343     int dy_ = 0;
 344     for (int i = 0; i < ytab_size && dy_ < dwidth*2; i++) {
 345         if (i == 0 || ydi[i] != ydi[i-1]) {
 346             tabofs[dy_++] = i;
 347         }
 348     }
 349     tabofs[dy_] = ytab_size;
 350
 351     auto full_pass = [&](const data_t* sptr_, data_t* dptr_, int y) {
 352         auto vert_sum_ = vert_sum;
 353
 354         memset(vert_sum_, 0, swidth * sizeof(float));
 355
 356         data_t *pdst = dptr_ + y * dstep;
 357
 358         for (int dy = tabofs[y]; dy < tabofs[y + 1] && dy < ytab_size; dy++) {
 359             float beta = yalpha[dy];
 360             int sy = ysi[dy];
 361
 362             const data_t *psrc = sptr_ + sy * sstep;
 363             for (int x = 0; x < swidth; x++) {
 364                 vert_sum_[x] += beta * psrc[x];
 365             }
 366         }
 367
 368         int xtab_ind = 0;
 369         for (int x = 0; x < dwidth; x++) {
 370             float res = 0.f;
 371             int dx = 0;
 372             for (; x == xdi[xtab_ind + dx] && xtab_ind + dx < xtab_size; dx++) {
 373                 float alpha = xalpha[xtab_ind + dx];
 374                 int sx = xsi[xtab_ind + dx];
 375
 376                 res += alpha * vert_sum_[sx];
 377             }
 378
 379             pdst[x] = saturate_cast<data_t>(res);
 380             xtab_ind += dx;
 381         }
 382     };
 383
 384     for (int ch = 0; ch < channels; ch++) {
 385         for (int y = 0; y < dheight; y++) {
 386             auto sptr_ = sptr + ch * origSrcH * origSrcW;
 387             auto dptr_ = dptr + ch * origDstH * origDstW;
 388
 389             full_pass(sptr_, dptr_, y);
 390         }
 391     }
 392 }
 393
 394 inline int clip(int x, int a, int b) {
 395     return x >= a ? (x < b ? x : b-1) : a;
 396 }
 397
 398 const int MAX_ESIZE = 16;
 399
 400 template<typename data_t>
 401 void HResizeLinear(const data_t** src, float** dst, int count, const int* xofs, const float* alpha,
 402                  int swidth, int dwidth, int cn, int xmin, int xmax ) {
 403     int dx, k;
 404     int dx0 = 0;
 405
 406     for (k = 0; k <= count - 2; k++) {
 407         const data_t *S0 = src[k], *S1 = src[k+1];
 408         float *D0 = dst[k], *D1 = dst[k+1];
 409         for (dx = dx0; dx < xmax; dx++) {
 410             int sx = xofs[dx];
 411             float a0 = alpha[dx*2], a1 = alpha[dx*2+1];
 412             float t0 = static_cast<float>(S0[sx])*a0 + static_cast<float>(S0[sx + cn])*a1;
 413             float t1 = static_cast<float>(S1[sx])*a0 + static_cast<float>(S1[sx + cn])*a1;
 414             D0[dx] = t0; D1[dx] = t1;
 415         }
 416
 417         for (; dx < dwidth; dx++) {
 418             int sx = xofs[dx];
 419             D0[dx] = static_cast<float>(S0[sx]); D1[dx] = static_cast<float>(S1[sx]);
 420         }
 421     }
 422
 423     for (; k < count; k++) {
 424         const data_t *S = src[k];
 425         float *D = dst[k];
 426         for (dx = 0; dx < xmax; dx++) {
 427             int sx = xofs[dx];
 428             D[dx] = static_cast<float>(S[sx])*alpha[dx*2] + static_cast<float>(S[sx+cn])*alpha[dx*2+1];
 429         }
 430
 431         for (; dx < dwidth; dx++)
 432             D[dx] = static_cast<float>(S[xofs[dx]]);
 433     }
 434 }
 435
 436 template<typename data_t>
 437 void VResizeLinear(float** src, data_t* dst, const float* beta, int width) {
 438     float b0 = beta[0], b1 = beta[1];
 439     const float *S0 = src[0], *S1 = src[1];
 440
 441     if (sizeof(data_t) == 4) {
 442         for (int x = 0; x < width; x++)
 443             dst[x] = (S0[x] * b0 + S1[x] * b1);
 444     } else {
 445         for (int x = 0; x < width; x++)
 446             dst[x] = saturateU32toU8(static_cast<uint32_t>(S0[x] * b0 + S1[x] * b1));
 447     }
 448 }
 449
 450 template<typename data_t>
 451 static void resize_area_upscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
 452     auto dstDims = outBlob->getTensorDesc().getDims();
 453     auto srcDims = inBlob->getTensorDesc().getDims();
 454
 455     auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
 456     auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
 457     auto origSrcW = src_strides[2];
 458     auto origSrcH = src_strides[1] / src_strides[2];
 459     auto origDstW = dst_strides[2];
 460     auto origDstH = dst_strides[1] / dst_strides[2];
 461
 462     auto dwidth = static_cast<const int>(dstDims[3]);
 463     auto dheight = static_cast<const int>(dstDims[2]);
 464     auto swidth = static_cast<const int>(srcDims[3]);
 465     auto sheight = static_cast<const int>(srcDims[2]);
 466     auto channels = static_cast<const int>(srcDims[1]);
 467
 468     auto src_full_width = static_cast<const int>(srcDims[3]);
 469     auto src_full_height = static_cast<const int>(srcDims[2]);
 470     auto dst_full_width = static_cast<const int>(dstDims[3]);
 471     auto dst_full_height = static_cast<const int>(dstDims[2]);
 472
 473     auto sptr = static_cast<const data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
 474     auto dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
 475
 476     auto sstep = static_cast<const int>(src_strides[2]);
 477     auto dstep = static_cast<const int>(dst_strides[2]);
 478
 479     float scale_x = static_cast<float>(src_full_width)  / dst_full_width;
 480     float scale_y = static_cast<float>(src_full_height) / dst_full_height;
 481     float inv_scale_x = static_cast<float>(dst_full_width) / src_full_width;
 482     float inv_scale_y = static_cast<float>(dst_full_height) / src_full_height;
 483
 484     int xmin = 0, xmax = dwidth, width = dwidth;
 485     int ksize = 2;
 486     int ksize2 = ksize/2;
 487
 488     auto xofs = reinterpret_cast<int*>(buffer);
 489     auto yofs = xofs + width;
 490     auto alpha = reinterpret_cast<float*>(yofs + dheight);
 491     auto beta = alpha + width*ksize;
 492     float cbuf[2] = {0};
 493
 494     for (int dx = 0; dx < dwidth; dx++) {
 495         int sx = floor(dx*scale_x);
 496         float fx = (dx+1) - (sx+1)*inv_scale_x;
 497         fx = fx <= 0 ? 0.f : fx - floor(fx);
 498
 499         if (sx < ksize2-1) {
 500             xmin = dx+1;
 501             if (sx < 0)
 502                 fx = 0, sx = 0;
 503         }
 504
 505         if (sx + ksize2 >= swidth) {
 506             xmax = (std::min)(xmax, dx);
 507             if (sx >= swidth-1)
 508                 fx = 0, sx = swidth-1;
 509         }
 510
 511         xofs[dx] = sx;
 512
 513         cbuf[0] = 1.f - fx;
 514         cbuf[1] = fx;
 515
 516         for (int k = 0; k < ksize; k++)
 517             alpha[dx*ksize + k] = cbuf[k];
 518     }
 519
 520     for (int dy = 0; dy < dheight; dy++) {
 521         int sy = floor(dy*scale_y);
 522         float fy = (dy+1) - (sy+1)*inv_scale_y;
 523         fy = fy <= 0 ? 0.f : fy - floor(fy);
 524
 525         yofs[dy] = sy;
 526         cbuf[0] = 1.f - fy;
 527         cbuf[1] = fy;
 528
 529         for (int k = 0; k < ksize; k++)
 530             beta[dy*ksize + k] = cbuf[k];
 531     }
 532
 533     auto full_pass = [&](const data_t* sptr_, data_t* dptr_, int dy) {
 534         int bufstep = dwidth;
 535         const data_t* srows[MAX_ESIZE]={0};
 536         float* rows[MAX_ESIZE]={0};
 537         int prev_sy[MAX_ESIZE];
 538
 539         for (int k = 0; k < ksize; k++) {
 540             prev_sy[k] = -1;
 541             rows[k] = reinterpret_cast<float*>(buffer + (width + dheight)*(sizeof(int) + sizeof(float)*ksize))
 542                       + k*bufstep;
 543         }
 544
 545         int sy0 = yofs[dy], k0 = ksize, k1 = 0;
 546
 547         for (int k = 0; k < ksize; k++) {
 548             int sy = clip(sy0 - ksize2 + 1 + k, 0, sheight);
 549             for (k1 = (std::max)(k1, k); k1 < ksize; k1++) {
 550                 if (k1 < MAX_ESIZE && sy == prev_sy[k1]) {
 551                     if (k1 > k)
 552                         memcpy(rows[k], rows[k1], bufstep*sizeof(rows[0][0]));
 553                     break;
 554                 }
 555             }
 556
 557             if (k1 == ksize)
 558                 k0 = (std::min)(k0, k);
 559             srows[k] = sptr_ + sy * sstep;
 560             prev_sy[k] = sy;
 561         }
 562
 563         if (k0 < ksize)
 564             HResizeLinear<data_t>(srows + k0, reinterpret_cast<float**>(rows + k0), ksize - k0, xofs,
 565                                   reinterpret_cast<const float*>(alpha), swidth, dwidth, 1, xmin, xmax);
 566
 567         VResizeLinear<data_t>(reinterpret_cast<float**>(rows), dptr_ + dstep*dy, beta + dy*ksize, dwidth);
 568     };
 569
 570     for (int ch = 0; ch < channels; ch++) {
 571         for (int dy = 0; dy < dheight; dy++) {
 572             auto sptr_ = sptr + ch * origSrcH * origSrcW;
 573             auto dptr_ = dptr + ch * origDstH * origDstW;
 574
 575             full_pass(sptr_, dptr_, dy);
 576         }
 577     }
 578 }
 579
 580 size_t resize_get_buffer_size(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorithm) {
 581     auto dstDims = outBlob->getTensorDesc().getDims();
 582     auto srcDims = inBlob->getTensorDesc().getDims();
 583
 584     SizeVector strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
 585     size_t origW = strides[2];
 586     size_t origH = strides[1] / strides[2];
 587
 588     const int src_full_width = origW;
 589     const int src_full_height = origH;
 590     const int dst_full_width = dstDims[3];
 591     const int dst_full_height = dstDims[2];
 592
 593     float scale_x = static_cast<float>(dstDims[3]) / srcDims[3];
 594     float scale_y = static_cast<float>(dstDims[2]) / srcDims[2];
 595
 596     auto resize_bilinear_u8_buffer_size = [&]() {
 597         size_t buffer_size = (sizeof(int16_t) * 4 + sizeof(uint8_t *)) * dstDims[3] +
 598                              (sizeof(int32_t) + sizeof(int16_t)) * dstDims[2] +
 599                              sizeof(uint32_t) * dstDims[3] +
 600                              (((srcDims[3] + 7) / 8) * 8 * 8) +
 601                              sizeof(uint8_t) * 12;
 602
 603         return buffer_size;
 604     };
 605
 606     auto resize_bilinear_fp32_buffer_size = [&]() {
 607         size_t buffer_size = (sizeof(float) + sizeof(float *)) * dstDims[3] +
 608                              (sizeof(int32_t) + sizeof(float)) * dstDims[2] +
 609                              (((srcDims[3] + 1) / 2) * 2 * 2) * sizeof(float);
 610
 611         return buffer_size;
 612     };
 613
 614     auto resize_area_u8_downscale_sse_buffer_size = [&]() {
 615         const int dwidth = dstDims[3];
 616         const int dheight = dstDims[2];
 617         const int swidth = srcDims[3];
 618
 619         const int dst_go_x = 0;
 620         const int dst_go_y = 0;
 621
 622         int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, static_cast<float>(src_full_width) / dst_full_width) + 1;
 623         int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, static_cast<float>(src_full_height) / dst_full_height) + 1;
 624
 625         size_t si_buf_size = sizeof(uint16_t) * dwidth + sizeof(uint16_t) * dheight;
 626         size_t alpha_buf_size =
 627                 sizeof(uint16_t) * (dwidth * x_max_count + 8 * 16) + sizeof(uint16_t) * dheight * y_max_count;
 628         size_t vert_sum_buf_size = sizeof(uint16_t) * (swidth * 2);
 629         size_t alpha_array_buf_size = sizeof(uint16_t) * 4 * dwidth;
 630         size_t sxid_array_buf_size = sizeof(uint16_t) * 4 * 4 * dwidth;
 631
 632         size_t buffer_size = si_buf_size +
 633                              alpha_buf_size +
 634                              vert_sum_buf_size +
 635                              alpha_array_buf_size +
 636                              sxid_array_buf_size;
 637
 638         return buffer_size;
 639     };
 640
 641     auto resize_area_downscale_buffer_size = [&]() {
 642         size_t buffer_size = sizeof(float) * (srcDims[3]) +
 643                              sizeof(uint32_t) * (dstDims[3] * 2 + 1) +
 644                              sizeof(float) * ((srcDims[3] + srcDims[2]) * 4) +
 645                              sizeof(float) * ((srcDims[3] + srcDims[2]) * 2);
 646
 647         return buffer_size;
 648     };
 649
 650     auto resize_area_upscale_buffer_size = [&]() {
 651         size_t buffer_size = (dstDims[3] + dstDims[2])*(sizeof(int) + sizeof(float)*2) + 2*dstDims[3] * sizeof(float);
 652
 653         return buffer_size;
 654     };
 655
 656     if (algorithm == RESIZE_BILINEAR) {
 657         if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
 658             return resize_bilinear_u8_buffer_size();
 659         } else {
 660             return resize_bilinear_fp32_buffer_size();
 661         }
 662     } else if (algorithm == RESIZE_AREA) {
 663         if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
 664             if (scale_x <= 1 && scale_y <= 1) {
 665 #ifdef HAVE_SSE
 666                 if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
 667                     return resize_area_u8_downscale_sse_buffer_size();
 668                 else
 669 #endif
 670                     return resize_area_downscale_buffer_size();
 671             } else {
 672                 return resize_area_upscale_buffer_size();
 673             }
 674         } else {
 675             if (scale_x <= 1 && scale_y <= 1)
 676                 return resize_area_downscale_buffer_size();
 677             else
 678                 return resize_area_upscale_buffer_size();
 679         }
 680     }
 681
 682     return 0;
 683 }
 684
 685 void resize(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorithm) {
 686     if (inBlob->getTensorDesc().getLayout() != NCHW || outBlob->getTensorDesc().getLayout() != NCHW)
 687         THROW_IE_EXCEPTION << "Resize supports only NCHW layout";
 688
 689     if (!((inBlob->getTensorDesc().getPrecision() == Precision::U8 && outBlob->getTensorDesc().getPrecision() == Precision::U8) ||
 690           (inBlob->getTensorDesc().getPrecision() == Precision::FP32 && outBlob->getTensorDesc().getPrecision() == Precision::FP32)))
 691         THROW_IE_EXCEPTION << "Resize supports only U8 and FP32 precisions";
 692
 693     if (algorithm != RESIZE_BILINEAR && algorithm != RESIZE_AREA)
 694         THROW_IE_EXCEPTION << "Unsupported resize algorithm type";
 695
 696     size_t buffer_size = resize_get_buffer_size(inBlob, outBlob, algorithm);
 697     auto* buffer = static_cast<uint8_t *>(malloc(buffer_size));
 698     if (buffer == nullptr) {
 699         THROW_IE_EXCEPTION << "Could not allocate memory for blob";
 700     }
 701
 702     auto dstDims = outBlob->getTensorDesc().getDims();
 703     auto srcDims = inBlob->getTensorDesc().getDims();
 704     float scale_x = static_cast<float>(dstDims[3]) / srcDims[3];
 705     float scale_y = static_cast<float>(dstDims[2]) / srcDims[2];
 706
 707     if (algorithm == RESIZE_BILINEAR) {
 708         if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
 709 #ifdef HAVE_SSE
 710             if (with_cpu_x86_sse42())
 711                 Resize::resize_bilinear_u8(inBlob, outBlob, buffer);
 712             else
 713 #endif
 714                 resize_bilinear<uint8_t>(inBlob, outBlob, buffer);
 715         } else {
 716             resize_bilinear<float>(inBlob, outBlob, buffer);
 717         }
 718     } else if (algorithm == RESIZE_AREA) {
 719         if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
 720             if (scale_x <= 1 && scale_y <= 1) {
 721 #ifdef HAVE_SSE
 722                 if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
 723                     Resize::resize_area_u8_downscale(inBlob, outBlob, buffer);
 724                 else
 725 #endif
 726                     resize_area_downscale<uint8_t>(inBlob, outBlob, buffer);
 727             } else {
 728                 resize_area_upscale<uint8_t>(inBlob, outBlob, buffer);
 729             }
 730         } else {
 731             if (scale_x <= 1 && scale_y <= 1)
 732                 resize_area_downscale<float>(inBlob, outBlob, buffer);
 733             else
 734                 resize_area_upscale<float>(inBlob, outBlob, buffer);
 735         }
 736     }
 737
 738     free(buffer);
 739 }
 740
 741 }  // namespace Resize
 742
 743 //----------------------------------------------------------------------
 744
 745 using namespace Resize;
 746
 747 void PreProcessData::setRoiBlob(const Blob::Ptr &blob) {
 748     _roiBlob = blob;
 749 }
 750
 751 Blob::Ptr PreProcessData::getRoiBlob() const {
 752     return _roiBlob;
 753 }
 754
 755 void PreProcessData::execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool serial,
 756         int batchSize) {
 757     IE_PROFILING_AUTO_SCOPE_TASK(perf_preprocessing)
 758
 759     if (algorithm == NO_RESIZE) {
 760         THROW_IE_EXCEPTION << "Input pre-processing is called without resize algorithm set";
 761     }
 762
 763     if (_roiBlob == nullptr) {
 764         THROW_IE_EXCEPTION << "Input pre-processing is called without ROI blob set";
 765     }
 766
 767     if (batchSize == 0) {
 768         THROW_IE_EXCEPTION << "Input pre-processing is called with invalid batch size "
 769                            << batchSize;
 770     }
 771
 772     if (batchSize < 0) {
 773         // if batch_size is unspecified, process the whole input blob
 774         batchSize = static_cast<int>(_roiBlob->getTensorDesc().getDims()[0]);
 775     }
 776
 777     if (!_preproc) {
 778         _preproc.reset(new PreprocEngine);
 779     }
 780     if (_preproc->preprocessWithGAPI(_roiBlob, outBlob, algorithm, serial, batchSize)) {
 781         return;
 782     }
 783
 784     if (batchSize > 1) {
 785         THROW_IE_EXCEPTION <<   "Batch pre-processing is unsupported in this mode. "
 786                                 "Use default pre-processing instead to process batches.";
 787     }
 788
 789     Blob::Ptr res_in, res_out;
 790     if (_roiBlob->getTensorDesc().getLayout() == NHWC) {
 791         if (!_tmp1 || _tmp1->size() != _roiBlob->size()) {
 792             if (_roiBlob->getTensorDesc().getPrecision() == Precision::FP32) {
 793                 _tmp1 = make_shared_blob<float>(Precision::FP32, NCHW, _roiBlob->dims());
 794             } else {
 795                 _tmp1 = make_shared_blob<uint8_t>(Precision::U8, NCHW, _roiBlob->dims());
 796             }
 797             _tmp1->allocate();
 798         }
 799
 800         {
 801             IE_PROFILING_AUTO_SCOPE_TASK(perf_reorder_before)
 802             blob_copy(_roiBlob, _tmp1);
 803         }
 804         res_in = _tmp1;
 805     } else {
 806         res_in = _roiBlob;
 807     }
 808
 809     if (outBlob->getTensorDesc().getLayout() == NHWC) {
 810         if (!_tmp2 || _tmp2->size() != outBlob->size()) {
 811             if (outBlob->getTensorDesc().getPrecision() == Precision::FP32) {
 812                 _tmp2 = make_shared_blob<float>(Precision::FP32, NCHW, outBlob->dims());
 813             } else {
 814                 _tmp2 = make_shared_blob<uint8_t>(Precision::U8, NCHW, outBlob->dims());
 815             }
 816             _tmp2->allocate();
 817         }
 818         res_out = _tmp2;
 819     } else {
 820         res_out = outBlob;
 821     }
 822
 823     {
 824         IE_PROFILING_AUTO_SCOPE_TASK(perf_resize)
 825         resize(res_in, res_out, algorithm);
 826     }
 827
 828     if (res_out == _tmp2) {
 829         IE_PROFILING_AUTO_SCOPE_TASK(perf_reorder_after)
 830         blob_copy(_tmp2, outBlob);
 831     }
 832 }
 833
 834 void PreProcessData::isApplicable(const Blob::Ptr &src, const Blob::Ptr &dst) {
 835     auto &src_dims = src->getTensorDesc().getDims();
 836     auto &dst_dims = dst->getTensorDesc().getDims();
 837
 838     if (src_dims.size() != dst_dims.size())
 839         THROW_IE_EXCEPTION << "Preprocessing is not applicable. Source and destination blobs have different "
 840                               "number of dimensions";
 841
 842     if (src_dims.size() != 4)
 843         THROW_IE_EXCEPTION << "Preprocessing is not applicable. Only 4D tensors are supported.";
 844
 845     if (src_dims[0] != dst_dims[0] || src_dims[1] != dst_dims[1])
 846         THROW_IE_EXCEPTION << "Preprocessing is not applicable. Wrong shape. Network expected 4D input tensor with "
 847                               "shape [" << dst_dims[0] << "," << dst_dims[1] <<",H,W] but provided tensor has "
 848                               "shape "  << details::dumpVec(src_dims) << ".";
 849 }
 850
 851 }  // namespace InferenceEngine