inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #include "ie_preprocess_gapi_kernels.hpp"
   6 #include "ie_preprocess_gapi_kernels_impl.hpp"
   7
   8 // AFTER "ie_preprocess_gapi_kernels_impl.hpp"
   9 // (MANUAL_SIMD is defined there)
  10 #if MANUAL_SIMD
  11   #include "cpu_detector.hpp"
  12   #include "ie_preprocess_gapi_kernels_sse42.hpp"
  13 #endif
  14
  15 #include <opencv2/gapi/opencv_includes.hpp>
  16 #include <opencv2/gapi/fluid/gfluidkernel.hpp>
  17 #include <opencv2/gapi/gcompoundkernel.hpp>
  18
  19 #include <algorithm>
  20 #include <type_traits>
  21 #include <utility>
  22 #include <vector>
  23
  24 namespace InferenceEngine {
  25 namespace gapi {
  26
  27 namespace kernels {
  28
  29 template<typename T, int chs> static
  30 void mergeRow(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int length) {
  31 #if MANUAL_SIMD
  32     if (with_cpu_x86_sse42()) {
  33         if (std::is_same<T, uint8_t>::value && chs == 2) {
  34             mergeRow_8UC2(ins[0], ins[1], out, length);
  35             return;
  36         }
  37
  38         if (std::is_same<T, uint8_t>::value && chs == 3) {
  39             mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
  40             return;
  41         }
  42
  43         if (std::is_same<T, uint8_t>::value && chs == 4) {
  44             mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
  45             return;
  46         }
  47
  48         if (std::is_same<T, float>::value && chs == 2) {
  49             mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
  50                            reinterpret_cast<const float*>(ins[1]),
  51                            reinterpret_cast<float*>(out), length);
  52             return;
  53         }
  54
  55         if (std::is_same<T, float>::value && chs == 3) {
  56             mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
  57                            reinterpret_cast<const float*>(ins[1]),
  58                            reinterpret_cast<const float*>(ins[2]),
  59                            reinterpret_cast<float*>(out), length);
  60             return;
  61         }
  62
  63         if (std::is_same<T, float>::value && chs == 4) {
  64             mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
  65                            reinterpret_cast<const float*>(ins[1]),
  66                            reinterpret_cast<const float*>(ins[2]),
  67                            reinterpret_cast<const float*>(ins[3]),
  68                            reinterpret_cast<float*>(out), length);
  69             return;
  70         }
  71     }
  72 #endif
  73
  74     const T* insT[chs];
  75     for (int c = 0; c < chs; c++) {
  76         insT[c] = reinterpret_cast<const T*>(ins[c]);
  77     }
  78     auto outT = reinterpret_cast<T*>(out);
  79
  80     for (int x = 0; x < length; x++) {
  81         for (int c = 0; c < chs; c++) {
  82             outT[chs*x + c] = insT[c][x];
  83         }
  84     }
  85 }
  86
  87 template<typename T, int chs> static
  88 void splitRow(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length) {
  89 #if MANUAL_SIMD
  90     if (with_cpu_x86_sse42()) {
  91         if (std::is_same<T, uint8_t>::value && chs == 2) {
  92             splitRow_8UC2(in, outs[0], outs[1], length);
  93             return;
  94         }
  95
  96         if (std::is_same<T, uint8_t>::value && chs == 3) {
  97             splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
  98             return;
  99         }
 100
 101         if (std::is_same<T, uint8_t>::value && chs == 4) {
 102             splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
 103             return;
 104         }
 105
 106         if (std::is_same<T, float>::value && chs == 2) {
 107             splitRow_32FC2(reinterpret_cast<const float*>(in),
 108                            reinterpret_cast<float*>(outs[0]),
 109                            reinterpret_cast<float*>(outs[1]),
 110                            length);
 111             return;
 112         }
 113
 114         if (std::is_same<T, float>::value && chs == 3) {
 115             splitRow_32FC3(reinterpret_cast<const float*>(in),
 116                            reinterpret_cast<float*>(outs[0]),
 117                            reinterpret_cast<float*>(outs[1]),
 118                            reinterpret_cast<float*>(outs[2]),
 119                            length);
 120             return;
 121         }
 122
 123         if (std::is_same<T, float>::value && chs == 4) {
 124             splitRow_32FC4(reinterpret_cast<const float*>(in),
 125                            reinterpret_cast<float*>(outs[0]),
 126                            reinterpret_cast<float*>(outs[1]),
 127                            reinterpret_cast<float*>(outs[2]),
 128                            reinterpret_cast<float*>(outs[3]),
 129                            length);
 130             return;
 131         }
 132     }
 133 #endif
 134
 135     auto inT = reinterpret_cast<const T*>(in);
 136
 137     T* outsT[chs];
 138     for (int c = 0; c < chs; c++) {
 139         outsT[c] = reinterpret_cast<T*>(outs[c]);
 140     }
 141
 142     for (int x = 0; x < length; x++) {
 143         for (int c = 0; c < chs; c++) {
 144             outsT[c][x] = inT[chs*x + c];
 145         }
 146     }
 147 }
 148
 149 GAPI_FLUID_KERNEL(FMerge2, Merge2, false) {
 150     static const int LPI = 4;
 151     static const int Window = 1;
 152     static void run(const cv::gapi::fluid::View& a,
 153                     const cv::gapi::fluid::View& b,
 154                           cv::gapi::fluid::Buffer& out) {
 155         const auto rowFunc = (a.meta().depth == CV_8U) ? &mergeRow<uint8_t, 2> : &mergeRow<float, 2>;
 156         for (int l = 0; l < out.lpi(); l++) {
 157             rowFunc({a.InLineB(l), b.InLineB(l)}, out.OutLineB(l), a.length());
 158         }
 159     }
 160 };
 161
 162 GAPI_FLUID_KERNEL(FMerge3, Merge3, false) {
 163     static const int LPI = 4;
 164     static const int Window = 1;
 165     static void run(const cv::gapi::fluid::View& a,
 166                     const cv::gapi::fluid::View& b,
 167                     const cv::gapi::fluid::View& c,
 168                           cv::gapi::fluid::Buffer& out) {
 169         const auto rowFunc = (a.meta().depth == CV_8U) ? &mergeRow<uint8_t, 3> : &mergeRow<float, 3>;
 170         for (int l = 0; l < out.lpi(); l++) {
 171             rowFunc({a.InLineB(l), b.InLineB(l), c.InLineB(l)}, out.OutLineB(l), a.length());
 172         }
 173     }
 174 };
 175
 176 GAPI_FLUID_KERNEL(FMerge4, Merge4, false) {
 177     static const int LPI = 4;
 178     static const int Window = 1;
 179     static void run(const cv::gapi::fluid::View& a,
 180                     const cv::gapi::fluid::View& b,
 181                     const cv::gapi::fluid::View& c,
 182                     const cv::gapi::fluid::View& d,
 183                           cv::gapi::fluid::Buffer& out) {
 184         const auto rowFunc = (a.meta().depth == CV_8U) ? &mergeRow<uint8_t, 4> : &mergeRow<float, 4>;
 185         for (int l = 0; l < out.lpi(); l++) {
 186             rowFunc({a.InLineB(l), b.InLineB(l), c.InLineB(l), d.InLineB(l)}, out.OutLineB(l), a.length());
 187         }
 188     }
 189 };
 190
 191 GAPI_FLUID_KERNEL(FSplit2, Split2, false) {
 192     static const int LPI = 4;
 193     static const int Window = 1;
 194     static void run(const cv::gapi::fluid::View  & in,
 195                           cv::gapi::fluid::Buffer& out1,
 196                           cv::gapi::fluid::Buffer& out2) {
 197         GAPI_DbgAssert(2 == in.meta().chan);
 198         GAPI_DbgAssert(1 == out1.meta().chan);
 199         GAPI_DbgAssert(1 == out2.meta().chan);
 200         GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
 201         GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
 202         GAPI_DbgAssert(CV_8U == in.meta().depth || CV_32F == in.meta().depth);
 203         const auto rowFunc = (in.meta().depth == CV_8U) ?
 204                              &splitRow<uint8_t, 2> :
 205                              &splitRow<float  , 2>;
 206         for (int i = 0, lpi = out1.lpi(); i < lpi; i++) {
 207             std::array<uint8_t*, 2> outs = {out1.OutLineB(i), out2.OutLineB(i)};
 208             rowFunc(in.InLineB(i), outs, in.length());
 209         }
 210     }
 211 };
 212
 213 GAPI_FLUID_KERNEL(FSplit3, Split3, false) {
 214     static const int LPI = 4;
 215     static const int Window = 1;
 216     static void run(const cv::gapi::fluid::View  & in,
 217                           cv::gapi::fluid::Buffer& out1,
 218                           cv::gapi::fluid::Buffer& out2,
 219                           cv::gapi::fluid::Buffer& out3) {
 220         GAPI_DbgAssert(3 == in.meta().chan);
 221         GAPI_DbgAssert(1 == out1.meta().chan);
 222         GAPI_DbgAssert(1 == out2.meta().chan);
 223         GAPI_DbgAssert(1 == out3.meta().chan);
 224         GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
 225         GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
 226         GAPI_DbgAssert(in.meta().depth == out3.meta().depth);
 227         GAPI_DbgAssert(CV_8U == in.meta().depth || CV_32F == in.meta().depth);
 228         const auto rowFunc = (in.meta().depth == CV_8U) ?
 229                              &splitRow<uint8_t, 3> :
 230                              &splitRow<float  , 3>;
 231         for (int i = 0, lpi = out1.lpi(); i < lpi; i++) {
 232             std::array<uint8_t*, 3> outs = {out1.OutLineB(i), out2.OutLineB(i),
 233                                             out3.OutLineB(i)};
 234             rowFunc(in.InLineB(i), outs, in.length());
 235         }
 236     }
 237 };
 238
 239 GAPI_FLUID_KERNEL(FSplit4, Split4, false) {
 240     static const int LPI = 4;
 241     static const int Window = 1;
 242     static void run(const cv::gapi::fluid::View  & in,
 243                           cv::gapi::fluid::Buffer& out1,
 244                           cv::gapi::fluid::Buffer& out2,
 245                           cv::gapi::fluid::Buffer& out3,
 246                           cv::gapi::fluid::Buffer& out4) {
 247         GAPI_DbgAssert(4 == in.meta().chan);
 248         GAPI_DbgAssert(1 == out1.meta().chan);
 249         GAPI_DbgAssert(1 == out2.meta().chan);
 250         GAPI_DbgAssert(1 == out3.meta().chan);
 251         GAPI_DbgAssert(1 == out4.meta().chan);
 252         GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
 253         GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
 254         GAPI_DbgAssert(in.meta().depth == out3.meta().depth);
 255         GAPI_DbgAssert(in.meta().depth == out4.meta().depth);
 256         GAPI_DbgAssert(CV_8U == in.meta().depth || CV_32F == in.meta().depth);
 257         const auto rowFunc = (in.meta().depth == CV_8U) ?
 258                              &splitRow<uint8_t, 4> :
 259                              &splitRow<float  , 4>;
 260         for (int i = 0, lpi = out1.lpi(); i < lpi; i++) {
 261             std::array<uint8_t*, 4> outs = {out1.OutLineB(i), out2.OutLineB(i),
 262                                             out3.OutLineB(i), out4.OutLineB(i)};
 263             rowFunc(in.InLineB(i), outs, in.length());
 264         }
 265     }
 266 };
 267
 268 //----------------------------------------------------------------------
 269
 270 template<typename T>
 271 static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, int length) {
 272 #if MANUAL_SIMD
 273     if (with_cpu_x86_sse42()) {
 274         if (std::is_same<T, uint8_t>::value && chs == 1) {
 275             copyRow_8U(in, out, length);
 276             return;
 277         }
 278
 279         if (std::is_same<T, float>::value && chs == 1) {
 280             copyRow_32F(reinterpret_cast<const float*>(in),
 281                         reinterpret_cast<float*>(out),
 282                         length);
 283             return;
 284         }
 285     }
 286 #endif
 287
 288     const auto inT  = reinterpret_cast<const T*>(in);
 289           auto outT = reinterpret_cast<      T*>(out);
 290
 291     for (int x = 0; x < length; x++) {
 292         outT[x] = inT[x*chs + chan];
 293     }
 294 }
 295
 296 //    GAPI_OCV_KERNEL(OCVChanToPlane, ChanToPlane) {
 297 //        static void run(const cv::Mat &in, int chan, cv::Mat &out) {
 298 //            out.create(in.rows, in.cols, in.depth());
 299 //            const auto rowFunc = (in.depth() == CV_8U) ? &chanToPlaneRow<uint8_t> : &chanToPlaneRow<float>;
 300
 301 //            for (int y = 0; y < out.rows; y++)
 302 //            {
 303 //                rowFunc(in.data + y*in.step, chan, in.channels(), out.data + y*out.step, in.cols);
 304 //            }
 305 //        }
 306 //    };
 307
 308 //    GAPI_OCV_KERNEL(OCVScalePlane, ScalePlane) {
 309 //        static void run(const cv::Mat &in, int /*type*/, const Size &sz, int interp, cv::Mat &out) {
 310 //            cv::resize(in, out, sz, 0, 0, interp);
 311 //        }
 312 //    };
 313
 314 //    GAPI_OCV_KERNEL(OCVMerge2, Merge2) {
 315 //        static void run(const cv::Mat &a, const cv::Mat &b, cv::Mat out) {
 316 //            out.create(a.rows, a.cols, CV_MAKETYPE(a.depth(), 2));
 317 //            const auto rowFunc = (a.depth() == CV_8U) ? &mergeRow<uint8_t, 2> : &mergeRow<float, 2>;
 318
 319 //            for (int y = 0; y < out.rows; y++)
 320 //            {
 321 //                rowFunc({a.data + y*a.step, b.data + y*b.step}, out.data + out.step, a.cols);
 322 //            }
 323 //        }
 324 //    };
 325
 326 GAPI_FLUID_KERNEL(FChanToPlane, ChanToPlane, false) {
 327     static const int Window = 1;
 328     static void run(const cv::gapi::fluid::View& in, int chan,
 329                     cv::gapi::fluid::Buffer& out) {
 330         const auto rowFunc = (in.meta().depth == CV_8U) ? &chanToPlaneRow<uint8_t> : &chanToPlaneRow<float>;
 331         rowFunc(in.InLineB(0), chan, in.meta().chan, out.OutLineB(), in.length());
 332     }
 333 };
 334
 335 //----------------------------------------------------------------------
 336
 337 G_TYPED_KERNEL(ScalePlane8u, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.scale_plane_8u") {
 338     static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
 339         GAPI_DbgAssert(in.depth == CV_8U && in.chan == 1);
 340         return in.withSize(sz);
 341     }
 342 };
 343
 344 G_TYPED_KERNEL(ScalePlane32f, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.scale_plane_32f") {
 345     static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
 346         GAPI_DbgAssert(in.depth == CV_32F && in.chan == 1);
 347         return in.withSize(sz);
 348     }
 349 };
 350
 351 G_TYPED_KERNEL(UpscalePlaneArea8u, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.upscale_plane_area_8u") {
 352     static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
 353         GAPI_DbgAssert(in.depth == CV_8U && in.chan == 1);
 354         GAPI_DbgAssert(in.size.width < sz.width || in.size.height < sz.height);
 355         return in.withSize(sz);
 356     }
 357 };
 358
 359 G_TYPED_KERNEL(UpscalePlaneArea32f, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.upscale_plane_area_32f") {
 360     static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
 361         GAPI_DbgAssert(in.depth == CV_32F && in.chan == 1);
 362         GAPI_DbgAssert(in.size.width < sz.width || in.size.height < sz.height);
 363         return in.withSize(sz);
 364     }
 365 };
 366
 367 G_TYPED_KERNEL(ScalePlaneArea8u, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.scale_plane_area_8u") {
 368     static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
 369         GAPI_DbgAssert(in.depth == CV_8U && in.chan == 1);
 370         GAPI_DbgAssert(in.size.width >= sz.width && in.size.height >= sz.height);
 371         return in.withSize(sz);
 372     }
 373 };
 374
 375 G_TYPED_KERNEL(ScalePlaneArea32f, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.scale_plane_area_32f") {
 376     static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
 377         GAPI_DbgAssert(in.depth == CV_32F && in.chan == 1);
 378         GAPI_DbgAssert(in.size.width >= sz.width && in.size.height >= sz.height);
 379         return in.withSize(sz);
 380     }
 381 };
 382
 383 GAPI_COMPOUND_KERNEL(FScalePlane, ScalePlane) {
 384     static cv::GMat expand(cv::GMat in, int type, const Size& szIn, const Size& szOut, int interp) {
 385         GAPI_DbgAssert(CV_8UC1 == type || CV_32FC1 == type);
 386         GAPI_DbgAssert(cv::INTER_AREA == interp || cv::INTER_LINEAR == interp);
 387
 388         if (cv::INTER_AREA == interp) {
 389             bool upscale = szIn.width < szOut.width || szIn.height < szOut.height;
 390             if (CV_8UC1 == type) {
 391                 if (upscale)
 392                     return UpscalePlaneArea8u::on(in, szOut, interp);
 393                 else
 394                     return   ScalePlaneArea8u::on(in, szOut, interp);
 395             }
 396             if (CV_32FC1 == type) {
 397                 if (upscale)
 398                     return UpscalePlaneArea32f::on(in, szOut, interp);
 399                 else
 400                     return   ScalePlaneArea32f::on(in, szOut, interp);
 401             }
 402         }
 403
 404         if (cv::INTER_LINEAR == interp) {
 405             if (CV_8UC1 == type) {
 406                 return ScalePlane8u::on(in, szOut, interp);
 407             }
 408             if (CV_32FC1 == type) {
 409                 return ScalePlane32f::on(in, szOut, interp);
 410             }
 411         }
 412
 413         GAPI_Assert(!"unsupported parameters");
 414         return {};
 415     }
 416 };
 417
 418 static inline double invRatio(int inSz, int outSz) {
 419     return static_cast<double>(outSz) / inSz;
 420 }
 421
 422 static inline double ratio(int inSz, int outSz) {
 423     return 1 / invRatio(inSz, outSz);
 424 }
 425
 426 template<typename T, typename Mapper, int chanNum>
 427 struct linearScratchDesc {
 428     using alpha_t = typename Mapper::alpha_type;
 429     using index_t = typename Mapper::index_type;
 430
 431     alpha_t* alpha;
 432     alpha_t* clone;
 433     index_t* mapsx;
 434     alpha_t* beta;
 435     index_t* mapsy;
 436     T*       tmp;
 437
 438     linearScratchDesc(int /*inW*/, int /*inH*/, int outW, int outH,  void* data) {
 439         alpha = reinterpret_cast<alpha_t*>(data);
 440         clone = reinterpret_cast<alpha_t*>(alpha + outW);
 441         mapsx = reinterpret_cast<index_t*>(clone + outW*4);
 442         beta  = reinterpret_cast<alpha_t*>(mapsx + outW);
 443         mapsy = reinterpret_cast<index_t*>(beta  + outH);
 444         tmp   = reinterpret_cast<T*>      (mapsy + outH*2);
 445     }
 446
 447     static int bufSize(int inW, int inH, int outW, int outH, int lpi) {
 448         auto size = outW * sizeof(alpha_t)     +
 449                     outW * sizeof(alpha_t) * 4 +  // alpha clones // previous alpha is redundant?
 450                     outW * sizeof(index_t)     +
 451                     outH * sizeof(alpha_t)     +
 452                     outH * sizeof(index_t) * 2 +
 453                      inW * sizeof(T) * lpi * chanNum;
 454
 455         return static_cast<int>(size);
 456     }
 457 };
 458
 459 template<typename T, typename Mapper, int chanNum = 1>
 460 static void initScratchLinear(const cv::GMatDesc& in,
 461                               const         Size& outSz,
 462                          cv::gapi::fluid::Buffer& scratch,
 463                                              int  lpi) {
 464     using alpha_type = typename Mapper::alpha_type;
 465     static const auto unity = Mapper::unity;
 466
 467     auto inSz = in.size;
 468     auto sbufsize = linearScratchDesc<T, Mapper, chanNum>::bufSize(inSz.width, inSz.height, outSz.width, outSz.height, lpi);
 469
 470     Size scratch_size{sbufsize, 1};
 471
 472     cv::GMatDesc desc;
 473     desc.chan = 1;
 474     desc.depth = CV_8UC1;
 475     desc.size = scratch_size;
 476
 477     cv::gapi::fluid::Buffer buffer(desc);
 478     scratch = std::move(buffer);
 479
 480     double hRatio = ratio(in.size.width, outSz.width);
 481     double vRatio = ratio(in.size.height, outSz.height);
 482
 483     linearScratchDesc<T, Mapper, chanNum> scr(inSz.width, inSz.height, outSz.width, outSz.height, scratch.OutLineB());
 484
 485     auto *alpha = scr.alpha;
 486     auto *clone = scr.clone;
 487     auto *index = scr.mapsx;
 488
 489     for (int x = 0; x < outSz.width; x++) {
 490         auto map = Mapper::map(hRatio, 0, in.size.width, x);
 491         auto alpha0 = map.alpha0;
 492         auto index0 = map.index0;
 493
 494         // TRICK:
 495         // Algorithm takes pair of input pixels, sx0'th and sx1'th,
 496         // and compute result as alpha0*src[sx0] + alpha1*src[sx1].
 497         // By definition: sx1 == sx0 + 1 either sx1 == sx0, and
 498         // alpha0 + alpha1 == unity (scaled appropriately).
 499         // Here we modify formulas for alpha0 and sx1: by assuming
 500         // that sx1 == sx0 + 1 always, and patching alpha0 so that
 501         // result remains intact.
 502         // Note that we need in.size.width >= 2, for both sx0 and
 503         // sx0+1 were indexing pixels inside the input's width.
 504         if (map.index1 != map.index0 + 1) {
 505             GAPI_DbgAssert(map.index1 == map.index0);
 506             GAPI_DbgAssert(in.size.width >= 2);
 507             if (map.index0 < in.size.width-1) {
 508                 // sx1=sx0+1 fits inside row,
 509                 // make sure alpha0=unity and alpha1=0,
 510                 // so that result equals src[sx0]*unity
 511                 alpha0 = saturate_cast<alpha_type>(unity);
 512             } else {
 513                 // shift sx0 to left by 1 pixel,
 514                 // and make sure that alpha0=0 and alpha1==1,
 515                 // so that result equals to src[sx0+1]*unity
 516                 alpha0 = 0;
 517                 index0--;
 518             }
 519         }
 520
 521         alpha[x] = alpha0;
 522         index[x] = index0;
 523
 524         for (int l = 0; l < 4; l++) {
 525             clone[4*x + l] = alpha0;
 526         }
 527     }
 528
 529     auto *beta    = scr.beta;
 530     auto *index_y = scr.mapsy;
 531
 532     for (int y = 0; y < outSz.height; y++) {
 533         auto mapY = Mapper::map(vRatio, 0, in.size.height, y);
 534         beta[y] = mapY.alpha0;
 535         index_y[y] = mapY.index0;
 536         index_y[outSz.height + y] = mapY.index1;
 537     }
 538 }
 539
 540 template<typename T, class Mapper>
 541 static void calcRowLinear(const cv::gapi::fluid::View  & in,
 542                                 cv::gapi::fluid::Buffer& out,
 543                                 cv::gapi::fluid::Buffer& scratch) {
 544     using alpha_type = typename Mapper::alpha_type;
 545
 546     auto  inSz =  in.meta().size;
 547     auto outSz = out.meta().size;
 548
 549     auto inY = in.y();
 550     int length = out.length();
 551     int outY = out.y();
 552     int lpi = out.lpi();
 553     GAPI_DbgAssert(outY + lpi <= outSz.height);
 554
 555     GAPI_DbgAssert(lpi <= 4);
 556
 557     linearScratchDesc<T, Mapper, 1> scr(inSz.width, inSz.height, outSz.width, outSz.height, scratch.OutLineB());
 558
 559     const auto *alpha = scr.alpha;
 560     const auto *clone = scr.clone;
 561     const auto *mapsx = scr.mapsx;
 562     const auto *beta0 = scr.beta;
 563     const auto *mapsy = scr.mapsy;
 564     auto *tmp         = scr.tmp;
 565
 566     const auto *beta = beta0 + outY;
 567     const T *src0[4];
 568     const T *src1[4];
 569     T *dst[4];
 570
 571     for (int l = 0; l < lpi; l++) {
 572         auto index0 = mapsy[outY + l] - inY;
 573         auto index1 = mapsy[outSz.height + outY + l] - inY;
 574         src0[l] = in.InLine<const T>(index0);
 575         src1[l] = in.InLine<const T>(index1);
 576         dst[l] = out.OutLine<T>(l);
 577     }
 578
 579 #if MANUAL_SIMD
 580     if (with_cpu_x86_sse42()) {
 581         if (std::is_same<T, uint8_t>::value) {
 582             if (inSz.width >= 16 && outSz.width >= 8) {
 583                 calcRowLinear_8U(reinterpret_cast<uint8_t**>(dst),
 584                                  reinterpret_cast<const uint8_t**>(src0),
 585                                  reinterpret_cast<const uint8_t**>(src1),
 586                                  reinterpret_cast<const short*>(alpha),
 587                                  reinterpret_cast<const short*>(clone),
 588                                  reinterpret_cast<const short*>(mapsx),
 589                                  reinterpret_cast<const short*>(beta),
 590                                  reinterpret_cast<uint8_t*>(tmp),
 591                                  inSz, outSz, lpi);
 592                 return;
 593             }
 594         }
 595
 596         if (std::is_same<T, float>::value) {
 597             calcRowLinear_32F(reinterpret_cast<float**>(dst),
 598                               reinterpret_cast<const float**>(src0),
 599                               reinterpret_cast<const float**>(src1),
 600                               reinterpret_cast<const float*>(alpha),
 601                               reinterpret_cast<const int*>(mapsx),
 602                               reinterpret_cast<const float*>(beta),
 603                               inSz, outSz, lpi);
 604             return;
 605         }
 606     }
 607 #endif
 608
 609     for (int l = 0; l < lpi; l++) {
 610         constexpr static const auto unity = Mapper::unity;
 611
 612         auto beta0 =                                   beta[l];
 613         auto beta1 = saturate_cast<alpha_type>(unity - beta[l]);
 614
 615         for (int x = 0; x < length; x++) {
 616             auto alpha0 =                                   alpha[x];
 617             auto alpha1 = saturate_cast<alpha_type>(unity - alpha[x]);
 618             auto sx0 = mapsx[x];
 619             auto sx1 = sx0 + 1;
 620             T tmp0 = calc(beta0, src0[l][sx0], beta1, src1[l][sx0]);
 621             T tmp1 = calc(beta0, src0[l][sx1], beta1, src1[l][sx1]);
 622             dst[l][x] = calc(alpha0, tmp0, alpha1, tmp1);
 623         }
 624     }
 625 }
 626
 627 template<typename T, class Mapper>
 628 static void calcRowLinearC3(const cv::gapi::fluid::View  & in,
 629                                   cv::gapi::fluid::Buffer& out0,
 630                                   cv::gapi::fluid::Buffer& out1,
 631                                   cv::gapi::fluid::Buffer& out2,
 632                                   cv::gapi::fluid::Buffer& scratch) {
 633     using alpha_type = typename Mapper::alpha_type;
 634
 635     auto  inSz =  in.meta().size;
 636     auto outSz = out0.meta().size;
 637
 638     auto inY  = in.y();
 639     auto outY = out0.y();
 640     auto lpi  = out0.lpi();
 641
 642     GAPI_DbgAssert(outY + lpi <= outSz.height);
 643     GAPI_DbgAssert(lpi <= 4);
 644
 645     linearScratchDesc<T, Mapper, 3> scr(inSz.width, inSz.height, outSz.width, outSz.height, scratch.OutLineB());
 646
 647     const auto *alpha = scr.alpha;
 648     const auto *clone = scr.clone;
 649     const auto *mapsx = scr.mapsx;
 650     const auto *beta0 = scr.beta;
 651     const auto *mapsy = scr.mapsy;
 652     auto *tmp         = scr.tmp;
 653
 654     const auto *beta = beta0 + outY;
 655     const T *src0[4];
 656     const T *src1[4];
 657     std::array<std::array<T*, 4>, 3> dst;
 658
 659     for (int l = 0; l < lpi; l++) {
 660         auto index0 = mapsy[outY + l] - inY;
 661         auto index1 = mapsy[outSz.height + outY + l] - inY;
 662         src0[l] = in.InLine<const T>(index0);
 663         src1[l] = in.InLine<const T>(index1);
 664         dst[0][l] = out0.OutLine<T>(l);
 665         dst[1][l] = out1.OutLine<T>(l);
 666         dst[2][l] = out2.OutLine<T>(l);
 667     }
 668
 669 #if MANUAL_SIMD
 670     if (with_cpu_x86_sse42()) {
 671         if (inSz.width >= 16 && outSz.width >= 8) {
 672             calcRowLinear_8UC3(dst,
 673                                reinterpret_cast<const uint8_t**>(src0),
 674                                reinterpret_cast<const uint8_t**>(src1),
 675                                reinterpret_cast<const short*>(alpha),
 676                                reinterpret_cast<const short*>(clone),
 677                                reinterpret_cast<const short*>(mapsx),
 678                                reinterpret_cast<const short*>(beta),
 679                                reinterpret_cast<uint8_t*>(tmp),
 680                                inSz, outSz, lpi);
 681             return;
 682         }
 683     }
 684 #endif
 685
 686     auto length = out0.length();
 687
 688     for (int l = 0; l < lpi; l++) {
 689         constexpr static const auto unity = Mapper::unity;
 690
 691         auto beta0 =                                   beta[l];
 692         auto beta1 = saturate_cast<alpha_type>(unity - beta[l]);
 693
 694         for (int x = 0; x < length; x++) {
 695             auto alpha0 =                                   alpha[x];
 696             auto alpha1 = saturate_cast<alpha_type>(unity - alpha[x]);
 697             auto sx0 = mapsx[x];
 698             auto sx1 = sx0 + 1;
 699
 700             for (int c = 0; c < 3; c++) {
 701                 auto idx0 = 3*sx0 + c;
 702                 auto idx1 = 3*sx1 + c;
 703                 T tmp0 = calc(beta0, src0[l][idx0], beta1, src1[l][idx0]);
 704                 T tmp1 = calc(beta0, src0[l][idx1], beta1, src1[l][idx1]);
 705                 dst[c][l][x] = calc(alpha0, tmp0, alpha1, tmp1);
 706             }
 707         }
 708     }
 709 }
 710
 711
 712 //------------------------------------------------------------------------------
 713
 714 namespace linear {
 715 struct Mapper {
 716     typedef short alpha_type;
 717     typedef short index_type;
 718     constexpr static const int unity = ONE;
 719
 720     typedef MapperUnit<short, short> Unit;
 721
 722     static inline Unit map(double ratio, int start, int max, int outCoord) {
 723         float f = ((outCoord + 0.5f) * ratio - 0.5f);
 724         int s = cvFloor(f);
 725         f -= s;
 726
 727         Unit u;
 728
 729         u.index0 = std::max(s - start, 0);
 730         u.index1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
 731
 732         u.alpha0 = saturate_cast<short>(ONE * (1.0f - f));
 733         u.alpha1 = saturate_cast<short>(ONE *         f);
 734
 735         return u;
 736     }
 737 };
 738 }  // namespace linear
 739
 740 namespace linear32f {
 741 struct Mapper {
 742     typedef float alpha_type;
 743     typedef int   index_type;
 744     constexpr static const float unity = 1;
 745
 746     typedef MapperUnit<float, int> Unit;
 747
 748     static inline Unit map(double ratio, int start, int max, int outCoord) {
 749         float f = ((outCoord + 0.5f) * ratio - 0.5f);
 750         int s = cvFloor(f);
 751         f -= s;
 752
 753         Unit u;
 754
 755         u.index0 = std::max(s - start, 0);
 756         u.index1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
 757
 758         u.alpha0 = 1.f - f;
 759         u.alpha1 =       f;
 760
 761         return u;
 762     }
 763 };
 764 }  // namespace linear32f
 765
 766 namespace areaUpscale {
 767 struct Mapper {
 768     typedef short alpha_type;
 769     typedef short index_type;
 770     constexpr static const int unity = ONE;
 771
 772     typedef MapperUnit<short, short> Unit;
 773
 774     static inline Unit map(double ratio, int start, int max, int outCoord) {
 775         int s = cvFloor(outCoord*ratio);
 776         float f = static_cast<float>((outCoord+1) - (s+1)/ratio);
 777         f = f <= 0 ? 0.f : f - cvFloor(f);
 778
 779         Unit u;
 780
 781         u.index0 = std::max(s - start, 0);
 782         u.index1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
 783
 784         u.alpha0 = saturate_cast<short>(ONE * (1.0f - f));
 785         u.alpha1 = saturate_cast<short>(ONE *         f);
 786
 787         return u;
 788     }
 789 };
 790 }  // namespace areaUpscale
 791
 792 namespace areaUpscale32f {
 793 struct Mapper {
 794     typedef float alpha_type;
 795     typedef int   index_type;
 796     constexpr static const float unity = 1;
 797
 798     typedef MapperUnit<float, int> Unit;
 799
 800     static inline Unit map(double ratio, int start, int max, int outCoord) {
 801         int s = cvFloor(outCoord*ratio);
 802         float f = static_cast<float>((outCoord+1) - (s+1)/ratio);
 803         f = f <= 0 ? 0.f : f - cvFloor(f);
 804
 805         Unit u;
 806
 807         u.index0 = std::max(s - start, 0);
 808         u.index1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
 809
 810         u.alpha0 = 1.0f - f;
 811         u.alpha1 =        f;
 812
 813         return u;
 814     }
 815 };
 816 }  // namespace areaUpscale32f
 817
 818 //------------------------------------------------------------------------------
 819
 820 template<typename A, typename I, typename W>
 821 struct AreaDownMapper {
 822     typedef A alpha_type;
 823     typedef I index_type;
 824     typedef W  work_type;
 825
 826     typedef MapperUnit<alpha_type, index_type> Unit;
 827
 828     inline Unit map(int outCoord) {
 829         double inCoord0 =  outCoord      * ratio;
 830         double inCoord1 = (outCoord + 1) * ratio;
 831
 832         double index0 = std::floor(inCoord0 + 0.001);
 833         double index1 =  std::ceil(inCoord1 - 0.001);
 834
 835         double alpha0 =   (index0 + 1 - inCoord0) * inv_ratio;
 836         double alpha1 = - (index1 - 1 - inCoord1) * inv_ratio;
 837
 838         GAPI_Assert(0 <= outCoord && outCoord <= outSz-1);
 839         GAPI_Assert(0 <= index0 && index0 < index1 && index1 <= inSz);
 840
 841         Unit unit;
 842
 843         unit.index0 = checked_cast<index_type>(index0);
 844         unit.index1 = checked_cast<index_type>(index1);
 845
 846         unit.alpha0 = convert_cast<alpha_type>(alpha0);
 847         unit.alpha1 = convert_cast<alpha_type>(alpha1);
 848
 849         return unit;
 850     }
 851
 852     int    inSz, outSz;
 853     double ratio, inv_ratio;
 854
 855     alpha_type  alpha;  // == inv_ratio, rounded
 856
 857     void init(int _inSz, int _outSz) {
 858         inSz  = _inSz;
 859         outSz = _outSz;
 860
 861         inv_ratio = invRatio(inSz, outSz);
 862         ratio     = 1.0 / inv_ratio;
 863
 864         alpha = convert_cast<alpha_type>(inv_ratio);
 865     }
 866 };
 867
 868 namespace areaDownscale32f {
 869 struct Mapper: public AreaDownMapper<float, int, float> {
 870     Mapper(int _inSz, int _outSz) {
 871         init(_inSz, _outSz);
 872     }
 873 };
 874 }
 875
 876 namespace areaDownscale8u {
 877 struct Mapper: public AreaDownMapper<Q0_16, short, Q8_8> {
 878     Mapper(int _inSz, int _outSz) {
 879         init(_inSz, _outSz);
 880     }
 881 };
 882 }
 883
 884 template<typename Mapper>
 885 static void initScratchArea(const cv::GMatDesc& in, const Size& outSz,
 886                             cv::gapi::fluid::Buffer &scratch) {
 887     using Unit = typename Mapper::Unit;
 888     using alpha_type = typename Mapper::alpha_type;
 889     using index_type = typename Mapper::index_type;
 890
 891     // compute the chunk of input pixels for each output pixel,
 892     // along with the coefficients for taking the weigthed sum
 893
 894     Size inSz = in.size;
 895     Mapper mapper(inSz.width, outSz.width);
 896
 897     std::vector<Unit> xmaps(outSz.width);
 898     int  maxdif = 0;
 899
 900     for (int w = 0; w < outSz.width; w++) {
 901         Unit map = mapper.map(w);
 902         xmaps[w] = map;
 903
 904         int dif = map.index1 - map.index0;
 905         if (dif > maxdif)
 906             maxdif = dif;
 907     }
 908
 909     // This assertion is critical for our trick with chunk sizes:
 910     // we would expand a chunk it is is smaller than maximal size
 911     GAPI_Assert(inSz.width >= maxdif);
 912
 913     // pack the input chunks positions and coefficients into scratch-buffer,
 914     // along with the maximal size of chunk (note that chunk size may vary)
 915
 916     size_t scratch_bytes =               sizeof(int)
 917                          + outSz.width * sizeof(index_type)
 918                          + outSz.width * sizeof(alpha_type) * maxdif
 919                          +  inSz.width * sizeof(alpha_type);
 920     Size scratch_size{static_cast<int>(scratch_bytes), 1};
 921
 922     cv::GMatDesc desc;
 923     desc.chan = 1;
 924     desc.depth = CV_8UC1;
 925     desc.size = scratch_size;
 926
 927     cv::gapi::fluid::Buffer buffer(desc);
 928     scratch = std::move(buffer);
 929
 930     auto *maxdf =  scratch.OutLine<int>();
 931     auto *index = reinterpret_cast<index_type*>(maxdf + 1);
 932     auto *alpha = reinterpret_cast<alpha_type*>(index + outSz.width);
 933 //  auto *vbuf  = reinterpret_cast<work_type *>(alpha + outSz.width * maxdif);
 934
 935     for (int w = 0; w < outSz.width; w++) {
 936         // adjust input indices so that:
 937         // - data chunk is exactly maxdif pixels
 938         // - data chunk fits inside input width
 939         int index0 = xmaps[w].index0;
 940         int index1 = xmaps[w].index1;
 941         int i0 = index0, i1 = index1;
 942         i1 = (std::min)(i0 + maxdif, in.size.width);
 943         i0 =            i1 - maxdif;
 944         GAPI_DbgAssert(i0 >= 0);
 945
 946         // fulfill coefficients for the data chunk,
 947         // extending with zeros if any extra pixels
 948         alpha_type *alphaw = &alpha[w * maxdif];
 949         for (int i = 0; i < maxdif; i++) {
 950             if (i + i0 == index0) {
 951                 alphaw[i] = xmaps[w].alpha0;
 952
 953             } else if (i + i0 == index1 - 1) {
 954                 alphaw[i] = xmaps[w].alpha1;
 955
 956             } else if (i + i0 > index0 && i + i0 < index1 - 1) {
 957                 alphaw[i] = mapper.alpha;
 958
 959             } else {
 960                 alphaw[i] = 0;
 961             }
 962         }
 963
 964         // start input chunk with adjusted position
 965         index[w] = i0;
 966     }
 967
 968     *maxdf = maxdif;
 969 }
 970
// Computes up to out.lpi() output rows of an "area" down-scale.
//
// For every output row the covered input rows are first combined into a
// single row (vertical pass, stored in the scratch), which is then reduced
// horizontally using the chunk positions and coefficients stored in the
// scratch buffer.
//
// T       - pixel type of input and output lines
// Mapper  - provides Unit/alpha_type/index_type/work_type and the row mapping
// in      - view onto the input lines
// out     - buffer receiving the output lines
// scratch - buffer whose layout is read here as: int maxdif, index table,
//           coefficient table, vertical-pass row
//           (presumably produced by initScratchArea<Mapper> - TODO confirm)
template<typename T, typename Mapper>
static void calcAreaRow(const cv::gapi::fluid::View& in, cv::gapi::fluid::Buffer& out,
                              cv::gapi::fluid::Buffer& scratch) {
    using Unit = typename Mapper::Unit;
    using alpha_type = typename Mapper::alpha_type;
    using index_type = typename Mapper::index_type;
    using  work_type = typename Mapper::work_type;

    Size inSz  =  in.meta().size;
    Size outSz = out.meta().size;

    // this method is valid only for down-scale
    GAPI_DbgAssert(inSz.width  >= outSz.width);
    GAPI_DbgAssert(inSz.height >= outSz.height);

//  Mapper xmapper(inSz.width,  outSz.width);
    // the horizontal mapping comes pre-computed from the scratch;
    // only the vertical mapper is built here
    Mapper ymapper(inSz.height, outSz.height);

    // unpack the scratch: [maxdif][index table][coefficient table][vertical row]
    auto *xmaxdf = scratch.OutLine<const int>();
    auto  maxdif = xmaxdf[0];

    auto *xindex = reinterpret_cast<const index_type*>(xmaxdf + 1);
    auto *xalpha = reinterpret_cast<const alpha_type*>(xindex + outSz.width);
    auto *vbuf_c = reinterpret_cast<const  work_type*>(xalpha + outSz.width * maxdif);

    // the vertical-pass row is the only writable part of the scratch
    auto *vbuf = const_cast<work_type*>(vbuf_c);

    int iny = in.y();
    int y = out.y();

    int lpi = out.lpi();
    GAPI_DbgAssert(y + lpi <= outSz.height);

    for (int l = 0; l < lpi; l++) {
        // input rows [index0, index1) contribute to output row y + l
        Unit ymap = ymapper.map(y + l);

        // src[] below is a fixed array of 32 line pointers
        GAPI_Assert(ymap.index1 - ymap.index0 <= 32);
        GAPI_Assert(ymap.index1 - ymap.index0 > 0);
        const T *src[32] = {};

        // collect the input lines; InLine() is indexed relative to in.y()
        for (int yin = ymap.index0; yin < ymap.index1; yin++) {
            src[yin - ymap.index0] = in.InLine<const T>(yin - iny);
        }

        auto dst = out.OutLine<T>(l);

#if MANUAL_SIMD
        // Both branches are compiled for every T; the reinterpret_casts only
        // keep the branch that does not match T compilable.
        if (with_cpu_x86_sse42()) {
            if (std::is_same<T, uchar>::value) {
                calcRowArea_8U(reinterpret_cast<uchar*>(dst),
                               reinterpret_cast<const uchar**>(src),
                               inSz, outSz,
                               static_cast<Q0_16>(ymapper.alpha),
                               reinterpret_cast<const MapperUnit8U&>(ymap),
                               xmaxdf[0],
                               reinterpret_cast<const short*>(xindex),
                               reinterpret_cast<const Q0_16*>(xalpha),
                               reinterpret_cast<Q8_8*>(vbuf));
                continue;  // next l = 0, ..., lpi-1
            }

            if (std::is_same<T, float>::value) {
                calcRowArea_32F(reinterpret_cast<float*>(dst),
                                reinterpret_cast<const float**>(src),
                                inSz, outSz,
                                static_cast<float>(ymapper.alpha),
                                reinterpret_cast<const MapperUnit32F&>(ymap),
                                xmaxdf[0],
                                reinterpret_cast<const int*>(xindex),
                                reinterpret_cast<const float*>(xalpha),
                                reinterpret_cast<float*>(vbuf));
                continue;
            }
        }
#endif

        // vertical pass
        int y_1st = ymap.index0;
        int ylast = ymap.index1 - 1;
        if (y_1st < ylast) {
            // first and last rows use their own border weights ...
            for (int w = 0; w < inSz.width; w++) {
                vbuf[w] = mulas(ymap.alpha0, src[0][w])        // Q8_8 = Q0_16 * U8
                        + mulas(ymap.alpha1, src[ylast - y_1st][w]);
            }

            // ... all rows in between share the common weight ymapper.alpha
            for (int i = 1; i < ylast - y_1st; i++) {
                for (int w = 0; w < inSz.width; w++) {
                    vbuf[w] += mulas(ymapper.alpha, src[i][w]);
                }
            }
        } else {
            // a single input row: just convert it to the work type
            for (int w = 0; w < inSz.width; w++) {
                vbuf[w] = convert_cast<work_type>(src[0][w]);  // Q8_8 = U8
            }
        }

        // horizontal pass
        for (int x = 0; x < outSz.width; x++) {
            work_type sum = 0;

            // every output pixel reads exactly maxdif values starting at index
            auto        index =  xindex[x];
            const auto *alpha = &xalpha[x * maxdif];

            for (int i = 0; i < maxdif; i++) {
                sum +=  mulaw(alpha[i], vbuf[index + i]);      // Q8_8 = Q0_16 * Q8_8
            }

            dst[x] = convert_cast<T>(sum);                     // U8 = Q8_8
        }
    }
}
1082
1083 //----------------------------------------------------------------------
1084 #if USE_CVKL
1085
1086 // taken from: ie_preprocess_data.cpp
// Returns the largest number of coefficients any destination column in
// [dst_go, dst_go + dsize) needs: one for a partially covered source pixel on
// the left, one per source pixel in between, one for a partial pixel on the right.
//   ssize - source size, used to clamp the right border to ssize - 1
//   scale - source-to-destination ratio
static int getResizeAreaTabSize(int dst_go, int ssize, int dsize, float scale) {
    static const float threshold = 1e-3f;
    int widest = 0;

    const int col_end = dst_go + dsize;
    for (int col = dst_go; col < col_end; ++col) {
        // exact borders of the destination pixel in source coordinates
        const float left  = col * scale;
        const float right = left + scale;

        // integer borders, clamped to the source and to each other
        int first = ceil(left);
        int last  = floor(right);
        last  = (std::min)(last,  ssize - 1);
        first = (std::min)(first, last);

        // first <= last holds here, so this is the number of whole pixels
        int taps = last - first;

        if (first - left > threshold) {
            ++taps;  // partially covered pixel on the left
        }
        if (right - last > threshold) {
            ++taps;  // partially covered pixel on the right
        }

        widest = (std::max)(widest, taps);
    }

    return widest;
}
1119
1120 // taken from: ie_preprocess_data.cpp
// Fills the per-column tables of an area down-scale:
//   si[n]   - first source index (relative to src_go) used by column dst_go + n
//   alpha[] - consecutive 16-bit coefficients of every column; a column that
//             produced a count different from max_count gets one extra zero
static void computeResizeAreaTab(int src_go, int dst_go, int ssize, int dsize, float scale,
                                 uint16_t* si, uint16_t* alpha, int max_count) {
    static const float threshold = 1e-3f;
    int written = 0;  // next free slot in alpha[]

    const int col_end = dst_go + dsize;
    for (int col = dst_go; col < col_end; ++col) {
        const int n = col - dst_go;
        int taps = 0;

        // exact borders of the destination pixel in source coordinates
        const float left  = col * scale;
        const float right = left + scale;
        const float cell  = (std::min)(scale, ssize - left);

        // integer borders, clamped to the source and to each other
        int first = ceil(left);
        int last  = floor(right);
        last  = (std::min)(last,  ssize - 1);
        first = (std::min)(first, last);

        // a partial pixel on the left moves the start one pixel back
        const bool partial_left = (first - left > threshold);
        si[n] = static_cast<uint16_t>(first - src_go - (partial_left ? 1 : 0));

        if (partial_left) {
            alpha[written++] = static_cast<uint16_t>((1 << 16) * ((first - left) / cell));
            ++taps;
        }

        // fully covered pixels all share the same weight
        for (int sx = first; sx < last; ++sx) {
            alpha[written++] = static_cast<uint16_t>((1 << 16) * (1.0f / cell));
            ++taps;
        }

        if (right - last > threshold) {
            const float part = (std::min)((std::min)(right - last, 1.f), cell);
            alpha[written++] = static_cast<uint16_t>((1 << 16) * (part / cell));
            ++taps;
        }

        if (taps != max_count) {
            alpha[written++] = 0;
        }
    }
}
1162
// taken from: ie_preprocess_data.cpp
// Re-packs the horizontal tables; does nothing when x_max_count > 4.
//   alpha[x][col] - coefficient number x of column col, copied from the
//                   interleaved xalpha[col*x_max_count + x]
//   sxid[v]       - for every full group of 8 columns, pairs of signed bytes
//                   holding the ids (id_diff + v)*2 and (id_diff + v)*2 + 1
//                   when they fall into the 16-wide window of chunk
//                   chunk_num_h, and -1 otherwise
static void generate_alpha_and_id_arrays(int x_max_count, int dcols, const uint16_t* xalpha, uint16_t* xsi,
                                         uint16_t** alpha, uint16_t** sxid) {
    if (x_max_count > 4) {
        return;
    }

    // de-interleave the coefficients: one array per coefficient number
    for (int col = 0; col < dcols; col++) {
        for (int x = 0; x < x_max_count; x++) {
            alpha[x][col] = xalpha[col*x_max_count + x];
        }
    }

    const int lanes = 128 / 16;  // 16-bit lanes handled per group of columns

    for (int col = 0; col <= dcols - 8; col += 8) {
        for (int chunk_num_h = 0; chunk_num_h < x_max_count; chunk_num_h++) {
            // window of ids served by this horizontal chunk
            const int lo = chunk_num_h * 16;
            const int hi = (chunk_num_h + 1) * 16;

            for (int i = 0; i < lanes; i++) {
                // distance of this column's start from the group's first start
                const int id_diff = xsi[col + i] - xsi[col];

                for (int chunk_num_v = 0; chunk_num_v < x_max_count; chunk_num_v++) {
                    uint16_t* sxidp = sxid[chunk_num_v] + col * x_max_count + chunk_num_h * 8;
                    int8_t* bytes = reinterpret_cast<int8_t*>(sxidp + i);

                    const int id0 = (id_diff + chunk_num_v) * 2 + 0;
                    const int id1 = (id_diff + chunk_num_v) * 2 + 1;

                    const bool in0 = id0 >= lo && id0 < hi;
                    const bool in1 = id1 >= lo && id1 < hi;

                    bytes[0] = static_cast<int8_t>(in0 ? id0 : -1);
                    bytes[1] = static_cast<int8_t>(in1 ? id1 : -1);
                }
            }
        }
    }
}
1193
1194 // taken from: ie_preprocess_data.cpp
1195 // (and simplified for specifically downscale area 8u)
1196 static size_t resize_get_buffer_size(const Size& inSz, const Size& outSz) {
1197     int dst_full_width  = outSz.width;
1198     int dst_full_height = outSz.height;
1199     int src_full_width  =  inSz.width;
1200     int src_full_height =  inSz.height;
1201
1202     auto resize_area_u8_downscale_sse_buffer_size = [&]() {
1203         const int dwidth  = outSz.width;
1204         const int dheight = outSz.height;
1205         const int swidth  =  inSz.width;
1206
1207         const int dst_go_x = 0;
1208         const int dst_go_y = 0;
1209
1210         int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width,  dwidth,  static_cast<float>(src_full_width)  / dst_full_width)  + 1;
1211         int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, static_cast<float>(src_full_height) / dst_full_height) + 1;
1212
1213         size_t si_buf_size = sizeof(uint16_t) * dwidth + sizeof(uint16_t) * dheight;
1214         size_t alpha_buf_size =
1215                 sizeof(uint16_t) * (dwidth * x_max_count + 8 * 16) + sizeof(uint16_t) * dheight * y_max_count;
1216         size_t vert_sum_buf_size = sizeof(uint16_t) * (swidth * 2);
1217         size_t alpha_array_buf_size = sizeof(uint16_t) * 4 * dwidth;
1218         size_t sxid_array_buf_size = sizeof(uint16_t) * 4 * 4 * dwidth;
1219
1220         size_t buffer_size = si_buf_size +
1221                              alpha_buf_size +
1222                              vert_sum_buf_size +
1223                              alpha_array_buf_size +
1224                              sxid_array_buf_size;
1225
1226         return buffer_size;
1227     };
1228
1229     return resize_area_u8_downscale_sse_buffer_size();
1230 }
1231
1232 // buffer-fulfill is taken from: ie_preprocess_data_sse42.cpp
1233 static void initScratchArea_CVKL_U8(const cv::GMatDesc & in,
1234                                     const       Size   & outSz,
1235                                cv::gapi::fluid::Buffer & scratch) {
1236     const Size& inSz = in.size;
1237
1238     // estimate buffer size
1239     size_t scratch_bytes = resize_get_buffer_size(inSz, outSz);
1240
1241     // allocate buffer
1242
1243     Size scratch_size{static_cast<int>(scratch_bytes), 1};
1244
1245     cv::GMatDesc desc;
1246     desc.chan = 1;
1247     desc.depth = CV_8UC1;
1248     desc.size = scratch_size;
1249
1250     cv::gapi::fluid::Buffer buffer(desc);
1251     scratch = std::move(buffer);
1252
1253     // fulfil buffer
1254     {
1255         // this code is taken from: ie_preprocess_data_sse42.cpp
1256         // (and simplified for 1-channel cv::Mat instead of blob)
1257
1258         auto dwidth  = outSz.width;
1259         auto dheight = outSz.height;
1260         auto swidth  =  inSz.width;
1261         auto sheight =  inSz.height;
1262
1263         const int src_go_x = 0;
1264         const int src_go_y = 0;
1265         const int dst_go_x = 0;
1266         const int dst_go_y = 0;
1267
1268         auto src_full_width  = swidth;
1269         auto src_full_height = sheight;
1270         auto dst_full_width  = dwidth;
1271         auto dst_full_height = dheight;
1272
1273         float scale_x = static_cast<float>(src_full_width)  / dst_full_width;
1274         float scale_y = static_cast<float>(src_full_height) / dst_full_height;
1275
1276         int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width,  dwidth,  scale_x);
1277         int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, scale_y);
1278
1279         auto* maxdif = scratch.OutLine<int>();
1280         auto* xsi = reinterpret_cast<uint16_t*>(maxdif + 2);
1281         auto* ysi = xsi + dwidth;
1282         auto* xalpha = ysi + dheight;
1283         auto* yalpha = xalpha + dwidth*x_max_count + 8*16;
1284     //  auto* vert_sum = yalpha + dheight*y_max_count;
1285
1286         maxdif[0] = x_max_count;
1287         maxdif[1] = y_max_count;
1288
1289         computeResizeAreaTab(src_go_x, dst_go_x, src_full_width,   dwidth, scale_x, xsi, xalpha, x_max_count);
1290         computeResizeAreaTab(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, yalpha, y_max_count);
1291
1292         int vest_sum_size = 2*swidth;
1293         uint16_t* vert_sum = yalpha + dheight*y_max_count;
1294         uint16_t* alpha0 = vert_sum + vest_sum_size;
1295         uint16_t* alpha1 = alpha0 + dwidth;
1296         uint16_t* alpha2 = alpha1 + dwidth;
1297         uint16_t* alpha3 = alpha2 + dwidth;
1298         uint16_t* sxid0 = alpha3 + dwidth;
1299         uint16_t* sxid1 = sxid0 + 4*dwidth;
1300         uint16_t* sxid2 = sxid1 + 4*dwidth;
1301         uint16_t* sxid3 = sxid2 + 4*dwidth;
1302
1303         uint16_t* alpha[] = {alpha0, alpha1, alpha2, alpha3};
1304         uint16_t* sxid[] = {sxid0, sxid1, sxid2, sxid3};
1305         generate_alpha_and_id_arrays(x_max_count, dwidth, xalpha, xsi, alpha, sxid);
1306     }
1307 }
1308
1309 static void calcAreaRow_CVKL_U8(const cv::gapi::fluid::View   & in,
1310                                       cv::gapi::fluid::Buffer & out,
1311                                       cv::gapi::fluid::Buffer & scratch) {
1312     Size inSz  =  in.meta().size;
1313     Size outSz = out.meta().size;
1314
1315     // this method is valid only for down-scale
1316     GAPI_DbgAssert(inSz.width  >= outSz.width);
1317     GAPI_DbgAssert(inSz.height >= outSz.height);
1318
1319     int dwidth  = outSz.width;
1320     int dheight = outSz.height;
1321
1322     auto* maxdif = scratch.OutLine<int>();
1323     int x_max_count = maxdif[0];
1324     int y_max_count = maxdif[1];
1325
1326     auto* xsi = reinterpret_cast<uint16_t*>(maxdif + 2);
1327     auto* ysi    = xsi + dwidth;
1328     auto* xalpha = ysi + dheight;
1329     auto* yalpha = xalpha + dwidth*x_max_count + 8*16;
1330     auto* vert_sum = yalpha + dheight*y_max_count;
1331
1332     int iny =  in.y();
1333     int   y = out.y();
1334
1335     int lpi = out.lpi();
1336     GAPI_DbgAssert(y + lpi <= outSz.height);
1337
1338     for (int l = 0; l < lpi; l++) {
1339         int yin0 = ysi[y + l];
1340         int yin1 = yin0 + y_max_count;
1341
1342         GAPI_Assert(yin1 - yin0 <= 32);
1343         const uint8_t *src[32] = {};
1344
1345         for (int yin = yin0; yin < yin1 && yin < inSz.height; yin++) {
1346             if (yalpha[(y+l)*y_max_count + yin - yin0] == 0) {
1347                 src[yin - yin0] = in.InLine<const uint8_t>(yin - iny - 1);
1348             } else {
1349                 src[yin - yin0] = in.InLine<const uint8_t>(yin - iny);
1350             }
1351         }
1352
1353         uint8_t *dst = out.OutLine<uint8_t>(l);
1354
1355         calcRowArea_CVKL_U8_SSE42(src, dst, inSz, outSz, y + l, xsi, ysi,
1356                       xalpha, yalpha, x_max_count, y_max_count, vert_sum);
1357     }
1358 }
1359
1360 #endif  // CVKL
1361 //----------------------------------------------------------------------
1362
// Fluid kernel FScalePlane8u for the ScalePlane8u operation: prepares its
// scratch with initScratchLinear<uchar, linear::Mapper> and produces rows
// with calcRowLinear<uint8_t, linear::Mapper>.
// NOTE(review): the third macro argument presumably tells the framework that
// the kernel uses a scratch buffer - confirm against the G-API docs.
GAPI_FLUID_KERNEL(FScalePlane8u, ScalePlane8u, true) {
    static const int Window = 1;
    static const int LPI = 4;
    static const auto Kind = cv::GFluidKernel::Kind::Resize;

    // Builds the scratch for the given input description and output size;
    // the interpolation argument is ignored.
    static void initScratch(const cv::GMatDesc& in,
                            Size outSz, int /*interp*/,
                            cv::gapi::fluid::Buffer &scratch) {
        initScratchLinear<uchar, linear::Mapper>(in, outSz, scratch, LPI);
    }

    // Nothing to reset.
    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
    }

    // Produces output lines from the input view; size and interpolation
    // arguments are ignored here.
    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
        calcRowLinear<uint8_t, linear::Mapper>(in, out, scratch);
    }
};
1382
// Fluid kernel FScalePlanes for the ScalePlanes operation: one input view,
// three output buffers. Prepares its scratch with
// initScratchLinear<uchar, linear::Mapper, 3> and produces rows with
// calcRowLinearC3<uint8_t, linear::Mapper>.
// NOTE(review): the third macro argument presumably tells the framework that
// the kernel uses a scratch buffer - confirm against the G-API docs.
GAPI_FLUID_KERNEL(FScalePlanes, ScalePlanes, true) {
    static const int Window = 1;
    static const int LPI = 4;
    static const auto Kind = cv::GFluidKernel::Kind::Resize;

    // Builds the scratch for the given input description and output size;
    // the unnamed int/Size arguments and the interpolation argument are ignored.
    static void initScratch(const cv::GMatDesc& in, int, Size,
                            Size outSz, int /*interp*/,
                            cv::gapi::fluid::Buffer &scratch) {
        initScratchLinear<uchar, linear::Mapper, 3>(in, outSz, scratch, LPI);
    }

    // Nothing to reset.
    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
    }

    // Produces lines of the three output buffers from the single input view.
    static void run(const cv::gapi::fluid::View& in, int, Size, Size/*sz*/, int /*interp*/,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2,
                    cv::gapi::fluid::Buffer& out3,
                    cv::gapi::fluid::Buffer& scratch) {
        calcRowLinearC3<uint8_t, linear::Mapper>(in, out1, out2, out3, scratch);
    }
};
1405
// Fluid kernel FUpscalePlaneArea8u for the UpscalePlaneArea8u operation:
// reuses the linear row routines (initScratchLinear / calcRowLinear) with
// areaUpscale::Mapper supplying indices and coefficients.
// NOTE(review): the third macro argument presumably tells the framework that
// the kernel uses a scratch buffer - confirm against the G-API docs.
GAPI_FLUID_KERNEL(FUpscalePlaneArea8u, UpscalePlaneArea8u, true) {
    static const int Window = 1;
    static const int LPI = 4;
    static const auto Kind = cv::GFluidKernel::Kind::Resize;

    // Builds the scratch for the given input description and output size;
    // the interpolation argument is ignored.
    static void initScratch(const cv::GMatDesc& in,
                            Size outSz, int /*interp*/,
                            cv::gapi::fluid::Buffer &scratch) {
        initScratchLinear<uchar, areaUpscale::Mapper>(in, outSz, scratch, LPI);
    }

    // Nothing to reset.
    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
    }

    // Produces output lines from the input view; size and interpolation
    // arguments are ignored here.
    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
        calcRowLinear<uint8_t, areaUpscale::Mapper>(in, out, scratch);
    }
};
1425
// Fluid kernel FUpscalePlaneArea32f for the UpscalePlaneArea32f operation:
// reuses the linear row routines (initScratchLinear / calcRowLinear) with
// areaUpscale32f::Mapper supplying indices and coefficients.
// NOTE(review): the third macro argument presumably tells the framework that
// the kernel uses a scratch buffer - confirm against the G-API docs.
GAPI_FLUID_KERNEL(FUpscalePlaneArea32f, UpscalePlaneArea32f, true) {
    static const int Window = 1;
    static const int LPI = 4;
    static const auto Kind = cv::GFluidKernel::Kind::Resize;

    // Builds the scratch for the given input description and output size;
    // the interpolation argument is ignored.
    // NOTE(review): passes 0 where the 8-bit kernels pass LPI - the meaning of
    // that argument is defined by initScratchLinear; confirm this is intended.
    static void initScratch(const cv::GMatDesc& in,
                            Size outSz, int /*interp*/,
                            cv::gapi::fluid::Buffer &scratch) {
        initScratchLinear<float, areaUpscale32f::Mapper>(in, outSz, scratch, 0);
    }

    // Nothing to reset.
    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
    }

    // Produces output lines from the input view; size and interpolation
    // arguments are ignored here.
    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
        calcRowLinear<float, areaUpscale32f::Mapper>(in, out, scratch);
    }
};
1445
// Fluid kernel FScalePlane32f for the ScalePlane32f operation: prepares its
// scratch with initScratchLinear<float, linear32f::Mapper> and produces rows
// with calcRowLinear<float, linear32f::Mapper>.
// NOTE(review): the third macro argument presumably tells the framework that
// the kernel uses a scratch buffer - confirm against the G-API docs.
GAPI_FLUID_KERNEL(FScalePlane32f, ScalePlane32f, true) {
    static const int Window = 1;
    static const int LPI = 4;
    static const auto Kind = cv::GFluidKernel::Kind::Resize;

    // Builds the scratch for the given input description and output size;
    // the interpolation argument is ignored.
    static void initScratch(const cv::GMatDesc& in,
                            Size outSz, int /*interp*/,
                            cv::gapi::fluid::Buffer &scratch) {
        // only single-channel 32-bit float input is expected (debug check)
        GAPI_DbgAssert(in.depth == CV_32F && in.chan == 1);

        // NOTE(review): passes 0 where the 8-bit kernels pass LPI - the meaning
        // of that argument is defined by initScratchLinear; confirm it is intended.
        initScratchLinear<float, linear32f::Mapper>(in, outSz, scratch, 0);
    }

    // Nothing to reset.
    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
    }

    // Produces output lines from the input view; size and interpolation
    // arguments are ignored here.
    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
        calcRowLinear<float, linear32f::Mapper>(in, out, scratch);
    }
};
1467
1468 //----------------------------------------------------------------------
1469
// Fluid kernel FScalePlaneArea32f for the ScalePlaneArea32f operation:
// prepares its scratch with initScratchArea<areaDownscale32f::Mapper> and
// produces rows with calcAreaRow<float, areaDownscale32f::Mapper>.
// NOTE(review): the third macro argument presumably tells the framework that
// the kernel uses a scratch buffer - confirm against the G-API docs.
GAPI_FLUID_KERNEL(FScalePlaneArea32f, ScalePlaneArea32f, true) {
    static const int Window = 1;
    static const int LPI = 4;
    static const auto Kind = cv::GFluidKernel::Kind::Resize;

    // Builds the scratch for the given input description and output size;
    // the interpolation argument is ignored.
    static void initScratch(const cv::GMatDesc& in,
                            Size outSz, int /*interp*/,
                            cv::gapi::fluid::Buffer &scratch) {
        initScratchArea<areaDownscale32f::Mapper>(in, outSz, scratch);
    }

    // Nothing to reset.
    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
    }

    // Produces output lines from the input view; size and interpolation
    // arguments are ignored here.
    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
        calcAreaRow<float, areaDownscale32f::Mapper>(in, out, scratch);
    }
};
1489
// Fluid kernel FScalePlaneArea8u for the ScalePlaneArea8u operation.
//
// Two implementations exist and they use different scratch layouts, so
// initScratch() and run() must take the same decision:
//  - with USE_CVKL, a CPU reporting SSE4.2 and a strict down-scale in both
//    dimensions: initScratchArea_CVKL_U8 / calcAreaRow_CVKL_U8;
//  - otherwise: initScratchArea / calcAreaRow with areaDownscale8u::Mapper.
// NOTE(review): the third macro argument presumably tells the framework that
// the kernel uses a scratch buffer - confirm against the G-API docs.
GAPI_FLUID_KERNEL(FScalePlaneArea8u, ScalePlaneArea8u, true) {
    static const int Window = 1;
    static const int LPI = 4;
    static const auto Kind = cv::GFluidKernel::Kind::Resize;

    // Builds the scratch for the given input description and output size;
    // the interpolation argument is ignored.
    static void initScratch(const cv::GMatDesc& in,
                            Size outSz, int /*interp*/,
                            cv::gapi::fluid::Buffer &scratch) {
    #if USE_CVKL
        if (with_cpu_x86_sse42()) {
            const Size& inSz = in.size;
            if (inSz.width > outSz.width && inSz.height > outSz.height) {
                // CVKL code we use supports only downscale
                initScratchArea_CVKL_U8(in, outSz, scratch);
                return;
            }
        }
    #endif

        initScratchArea<areaDownscale8u::Mapper>(in, outSz, scratch);
    }

    // Nothing to reset.
    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
    }

    // Produces output lines from the input view; size and interpolation
    // arguments are ignored here. The path is selected with the same
    // condition as in initScratch(), using the sizes from the buffers' meta.
    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
    #if USE_CVKL
        if (with_cpu_x86_sse42()) {
            auto  inSz =  in.meta().size;
            auto outSz = out.meta().size;
            if (inSz.width > outSz.width && inSz.height > outSz.height) {
                // CVKL's code supports only downscale
                calcAreaRow_CVKL_U8(in, out, scratch);
                return;
            }
        }
    #endif

        calcAreaRow<uint8_t, areaDownscale8u::Mapper>(in, out, scratch);
    }
};
1532
// Fixed-point coefficients used by the NV12 -> RGB conversion below;
// products are shifted right by ITUR_BT_601_SHIFT bits after summation.
static constexpr int ITUR_BT_601_SHIFT = 20;
static constexpr int ITUR_BT_601_CY    = 1220542;   // Y  contribution to R, G and B
static constexpr int ITUR_BT_601_CUB   = 2116026;   // U  contribution to B
static constexpr int ITUR_BT_601_CUG   = -409993;   // U  contribution to G
static constexpr int ITUR_BT_601_CVG   = -852492;   // V  contribution to G
static constexpr int ITUR_BT_601_CVR   = 1673527;   // V  contribution to R
1539
1540 static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) {
1541     int uu, vv;
1542     uu = static_cast<int>(u) - 128;
1543     vv = static_cast<int>(v) - 128;
1544
1545     ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * vv;
1546     guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * vv + ITUR_BT_601_CUG * uu;
1547     buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
1548 }
1549
1550 static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, const int buv,
1551                                 uchar& r, uchar& g, uchar& b) {
1552     int yy = static_cast<int>(vy);
1553     int y = std::max(0, yy - 16) * ITUR_BT_601_CY;
1554     r = saturate_cast<uchar>((y + ruv) >> ITUR_BT_601_SHIFT);
1555     g = saturate_cast<uchar>((y + guv) >> ITUR_BT_601_SHIFT);
1556     b = saturate_cast<uchar>((y + buv) >> ITUR_BT_601_SHIFT);
1557 }
1558
1559 static void calculate_nv12_to_rgb_fallback(const  uchar **y_rows,
1560                                            const  uchar *uv_row,
1561                                                   uchar **out_rows,
1562                                            int buf_width) {
1563     for (int i = 0; i < buf_width; i += 2) {
1564         uchar u = uv_row[i];
1565         uchar v = uv_row[i + 1];
1566         int ruv, guv, buv;
1567         uvToRGBuv(u, v, ruv, guv, buv);
1568
1569         for (int y = 0; y < 2; y++) {
1570             for (int x = 0; x < 2; x++) {
1571                 uchar vy = y_rows[y][i + x];
1572                 uchar r, g, b;
1573                 yRGBuvToRGB(vy, ruv, guv, buv, r, g, b);
1574
1575                 out_rows[y][3*(i + x)]     = r;
1576                 out_rows[y][3*(i + x) + 1] = g;
1577                 out_rows[y][3*(i + x) + 2] = b;
1578             }
1579         }
1580     }
1581 }
1582
1583 GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) {
1584     static const int Window = 1;
1585     static const int LPI    = 2;
1586     static const auto Kind = cv::GFluidKernel::Kind::NV12toRGB;
1587
1588     static void run(const cv::gapi::fluid::View &in_y,
1589                     const cv::gapi::fluid::View &in_uv,
1590                           cv::gapi::fluid::Buffer &out) {
1591         const uchar* uv_row = in_uv.InLineB(0);
1592         const uchar* y_rows[2] = {in_y. InLineB(0), in_y. InLineB(1)};
1593         uchar* out_rows[2] = {out.OutLineB(0), out.OutLineB(1)};
1594
1595         int buf_width = out.length();
1596
1597         #if MANUAL_SIMD
1598             calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
1599         #else
1600             calculate_nv12_to_rgb_fallback(y_rows, uv_row, out_rows, buf_width);
1601         #endif
1602     }
1603 };
1604
1605 }  // namespace kernels
1606
1607 //----------------------------------------------------------------------
1608
1609 using namespace kernels;
1610
1611 cv::gapi::GKernelPackage preprocKernels() {
1612     return cv::gapi::kernels
1613         < FChanToPlane
1614         , FScalePlanes
1615         , FScalePlane
1616         , FScalePlane32f
1617         , FScalePlane8u
1618         , FUpscalePlaneArea8u
1619         , FUpscalePlaneArea32f
1620         , FScalePlaneArea8u
1621         , FScalePlaneArea32f
1622         , FMerge2
1623         , FMerge3
1624         , FMerge4
1625         , FSplit2
1626         , FSplit3
1627         , FSplit4
1628         , FNV12toRGB
1629         >();
1630 }
1631
1632 }  // namespace gapi
1633 }  // namespace InferenceEngine