inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ie_preprocess_gapi_kernels.hpp"
#include "ie_preprocess_gapi_kernels_impl.hpp"
#include "ie_preprocess_gapi_kernels_sse42.hpp"

// NB: include this before opencv_hal_sse.hpp
#include "nmmintrin.h"

// NB: define these before opencv_hal_sse.hpp
namespace cv {
namespace hal {

enum StoreMode {
    STORE_UNALIGNED = 0,
    STORE_ALIGNED = 1,
    STORE_ALIGNED_NOCACHE = 2
};

}  // namespace hal
}  // namespace cv

// NB: define these before opencv_hal_sse.hpp
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
#define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a)

// NB: define these before opencv_hal_sse.hpp
#ifdef CV_SSE4_2
  #undef CV_SSE4_2
  #undef CV_SSE4_1
  #undef CV_SSSE3
  #undef CV_SSE3
  #undef CV_SSE2
  #undef CV_SSE
#endif
#define CV_SSE4_2 1
#define CV_SSE4_1 1
#define CV_SSSE3  1
#define CV_SSE3   1
#define CV_SSE2   1
#define CV_SSE    1
#define CV_CPU_HAS_SUPPORT_SSE2 1
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN  // empty
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

// OpenCV universal intrinsics
#include "opencv_hal_sse.hpp"

// AFTER "opencv_hal_sse.hpp"
// (CV_SIMD128 defined there)
#if   !CV_SIMD128
#error CV_SIMD128 is required!
#endif

#include <cstring>

using namespace cv;

namespace InferenceEngine {
namespace gapi {
namespace kernels {

//----------------------------------------------------------------------

#if CV_SSE
static inline void v_deinterleave(const v_float32x4& low, const v_float32x4& high,
                                        v_float32x4& even,      v_float32x4& odd) {
    __m128 tmp0 = _mm_unpacklo_ps(low.val, high.val);
    __m128 tmp1 = _mm_unpackhi_ps(low.val, high.val);
    even.val = _mm_unpacklo_ps(tmp0, tmp1);
    odd .val = _mm_unpackhi_ps(tmp0, tmp1);
}
#endif

#if CV_SSE2
static inline void v_deinterleave(const v_uint8x16& i0, const v_uint8x16& i1,
                                  const v_uint8x16& i2, const v_uint8x16& i3,
                                        v_uint8x16& o0,       v_uint8x16& o1,
                                        v_uint8x16& o2,       v_uint8x16& o3) {
    __m128i u0 = i0.val;                     // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m128i u1 = i1.val;                     // a4 b4 c4 d4 ...
    __m128i u2 = i2.val;                     // a8 b8 c8 d8 ...
    __m128i u3 = i3.val;                     // a12 b12 c12 d12 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2);  // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2);  // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3);  // a4 a12 b4 b12 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3);  // a6 a14 b6 b14 ...

    u0 = _mm_unpacklo_epi8(v0, v2);          // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3);          // a2 a6 a10 a14 ...
    u2 = _mm_unpackhi_epi8(v0, v2);          // a1 a5 a9 a13 ...
    u3 = _mm_unpackhi_epi8(v1, v3);          // a3 a7 a11 a15 ...

    v0 = _mm_unpacklo_epi8(u0, u1);          // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3);          // a1 a3 a5 a7 ...
    v2 = _mm_unpackhi_epi8(u0, u1);          // c0 c2 c4 c6 ...
    v3 = _mm_unpackhi_epi8(u2, u3);          // c1 c3 c5 c7 ...

    o0.val = _mm_unpacklo_epi8(v0, v1);      // a0 a1 a2 a3 ...
    o1.val = _mm_unpackhi_epi8(v0, v1);      // b0 b1 b2 b3 ...
    o2.val = _mm_unpacklo_epi8(v2, v3);      // c0 c1 c2 c3 ...
    o3.val = _mm_unpackhi_epi8(v2, v3);      // d0 d1 d2 d3 ...
}

static inline v_uint8x16 v_interleave_low(const v_uint8x16& a, const v_uint8x16& b) {
    return v_uint8x16(_mm_unpacklo_epi8(a.val, b.val));
}

static inline v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8x16& b) {
    return v_uint8x16(_mm_unpackhi_epi8(a.val, b.val));
}

static inline v_int16x8 v_interleave_low(const v_int16x8& a, const v_int16x8& b) {
    return v_int16x8(_mm_unpacklo_epi16(a.val, b.val));
}

static inline v_int16x8 v_interleave_high(const v_int16x8& a, const v_int16x8& b) {
    return v_int16x8(_mm_unpackhi_epi16(a.val, b.val));
}

static inline v_uint16x8 v_expand_low(const v_uint8x16& a) {
    return v_uint16x8(_mm_unpacklo_epi8(a.val, _mm_setzero_si128()));
}

static inline v_uint16x8 v_expand_high(const v_uint8x16& a) {
    return v_uint16x8(_mm_unpackhi_epi8(a.val, _mm_setzero_si128()));
}

static inline v_uint8x16 v_saturate_u8(const v_int16x8& a) {
    v_uint8x16 r;
    r.val = _mm_packus_epi16(a.val, _mm_setzero_si128());
    return r;
}

static inline v_int16x8 v_saturate_s16(const v_int32x4& a) {
    v_int16x8 r;
    r.val = _mm_packs_epi32(a.val, _mm_setzero_si128());
    return r;
}

// for each j=index[k], load two chars src[j] and src[j+1]
static inline v_uint8x16 v_gather_pairs(const uchar src[], const v_int16x8& index) {
    v_uint8x16 r;
    r.val = _mm_setzero_si128();  // all 8 lanes are overwritten below; zero-init avoids reading an indeterminate value
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 0)]), 0);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 1)]), 1);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 2)]), 2);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 3)]), 3);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 4)]), 4);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 5)]), 5);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 6)]), 6);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 7)]), 7);
    return r;
}
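// A scalar sketch of the gather above (for illustration only; "lane16" is a
// hypothetical name for the k-th 16-bit lane of the result):
//     for (int k = 0; k < 8; k++) {
//         uint16_t pair;
//         std::memcpy(&pair, &src[index[k]], sizeof(pair));  // src[j] and src[j+1]
//         lane16[k] = pair;
//     }
// i.e. each lane packs the neighbouring pixel pair used by one horizontal
// linear interpolation.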

static inline v_int16x8 v_gather_chan(const uchar src[], const v_int16x8& index, int channel, int pos) {
    constexpr const int chanNum = 3;
    v_int16x8 r;
    r.val = _mm_setzero_si128();  // as above: every lane is overwritten, zero-init avoids reading an indeterminate value
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 0) + pos) + channel]), 0);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 1) + pos) + channel]), 1);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 2) + pos) + channel]), 2);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 3) + pos) + channel]), 3);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 4) + pos) + channel]), 4);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 5) + pos) + channel]), 5);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 6) + pos) + channel]), 6);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 7) + pos) + channel]), 7);
    return r;
}
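// In the same spirit, v_gather_chan() reads, per lane,
//     src[chanNum*(index[k] + pos) + channel]
// from an interleaved 3-channel row: "pos" selects the left (0) or right (1)
// pixel of an interpolation pair, "channel" the plane being produced.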

static inline void v_gather_pairs(const float src[], const v_int32x4& index,
                                  v_float32x4& low, v_float32x4& high) {
    int i[4];
    v_store(i, index);

    __m128 l = _mm_setzero_ps();
    l = _mm_loadl_pi(l, (const __m64*)&src[i[0]]);  // pair of floats
    l = _mm_loadh_pi(l, (const __m64*)&src[i[1]]);
    low.val = l;

    __m128 h = _mm_setzero_ps();
    h = _mm_loadl_pi(h, (const __m64*)&src[i[2]]);
    h = _mm_loadh_pi(h, (const __m64*)&src[i[3]]);
    high.val = h;
}

static inline v_int32x4 v_madd(const v_int16x8& a, const v_int16x8& b) {
    v_int32x4 r;
    r.val = _mm_madd_epi16(a.val, b.val);
    return r;
}

static inline v_int16x8 v_mulhi(const v_int16x8& a, short b) {
    v_int16x8 r;
    r.val = _mm_mulhi_epi16(a.val, _mm_set1_epi16(b));
    return r;
}

static inline v_uint16x8 v_mulhi(const v_uint16x8& a, v_uint16x8 b) {
    v_uint16x8 r;
    r.val = _mm_mulhi_epu16(a.val, b.val);
    return r;
}

static inline v_uint16x8 v_mulhi(const v_uint16x8& a, uint16_t b) {
    v_uint16x8 r;
    r.val = _mm_mulhi_epu16(a.val, _mm_set1_epi16(b));
    return r;
}

static inline v_int16x8 v_mulhrs(const v_int16x8& a, const v_int16x8& b) {
    v_int16x8 r;
    r.val = _mm_mulhrs_epi16(a.val, b.val);
    return r;
}

static inline v_int16x8 v_mulhrs(const v_int16x8& a, short b) {
    return v_mulhrs(a, v_setall_s16(b));
}
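// A note on the fixed-point blending used below (a sketch of the math, not part
// of the original code): _mm_mulhrs_epi16(a, b) yields, per 16-bit lane,
//     (a * b + 0x4000) >> 15
// (signed multiply, rounded, high bits kept). Because the two interpolation
// weights always sum to one, a blend  s0*w + s1*(1 - w)  is evaluated as
//     s1 + v_mulhrs(s0 - s1, w)
// which costs a single multiply per output and keeps the intermediate within
// int16 range for 8-bit source data.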
#endif  // SSE2

#if CV_SSSE3  // _mm_shuffle_epi8 below requires SSSE3
static inline void v_deinterleave_expand(const v_uint8x16& src, v_int16x8& even, v_int16x8& odd) {
    static const __m128i mask_even = _mm_setr_epi8(0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14, -1);
    static const __m128i mask_odd  = _mm_setr_epi8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1);
    even.val = _mm_shuffle_epi8(src.val, mask_even);
    odd .val = _mm_shuffle_epi8(src.val, mask_odd);
}
#endif

static inline v_float32x4 v_fma(const v_float32x4& a, float b, const v_float32x4& c) {
    return v_fma(a, v_setall_f32(b), c);
}

static inline v_int16x8 operator+ (const v_int16x8& a, short b) {
    return a + v_setall_s16(b);
}

static inline v_int16x8 operator- (short a, const v_int16x8& b) {
    return v_setall_s16(a) - b;
}

static inline v_float32x4 operator- (float a, const v_float32x4& b) {
    return v_setall_f32(a) - b;
}

static inline v_float32x4 operator* (const v_float32x4& a, float b) {
    return a * v_setall_f32(b);
}

//------------------------------------------------------------------------------

// Resize (bi-linear, 8U)
void calcRowLinear_8U(uint8_t *dst[],
                const uint8_t *src0[],
                const uint8_t *src1[],
                const short    alpha[],
                const short    clone[],  // 4 clones of alpha
                const short    mapsx[],
                const short    beta[],
                      uint8_t  tmp[],
                const Size   & inSz,
                const Size   & outSz,
                      int      lpi) {
    bool xRatioEq1 = inSz.width  == outSz.width;
    bool yRatioEq1 = inSz.height == outSz.height;

    if (!xRatioEq1 && !yRatioEq1) {
        if (4 == lpi) {
            // vertical pass
            GAPI_DbgAssert(inSz.width >= 8);

            __m128i b0 = _mm_set1_epi16(beta[0]);
            __m128i b1 = _mm_set1_epi16(beta[1]);
            __m128i b2 = _mm_set1_epi16(beta[2]);
            __m128i b3 = _mm_set1_epi16(beta[3]);

            for (int w = 0; w < inSz.width; ) {
                for (; w <= inSz.width - 8; w += 8) {
                #if USE_CVKL
                    //--------------------------------------------
                    // reworked from: ie_preprocess_data_sse42.cpp
                    //      function: resize_bilinear_u8
                    //         label: vertical_pass
                    //--------------------------------------------

                    __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[0][w])),
                                                                     *reinterpret_cast<const int64_t*>(&src0[1][w]), 1);
                    __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[2][w])),
                                                                     *reinterpret_cast<const int64_t*>(&src0[3][w]), 1);
                    __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[0][w])),
                                                                     *reinterpret_cast<const int64_t*>(&src1[1][w]), 1);
                    __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[2][w])),
                                                                     *reinterpret_cast<const int64_t*>(&src1[3][w]), 1);

                    __m128i val0_0 = _mm_cvtepu8_epi16(val0lo);
                    __m128i val0_2 = _mm_cvtepu8_epi16(val0hi);
                    __m128i val1_0 = _mm_cvtepu8_epi16(val1lo);
                    __m128i val1_2 = _mm_cvtepu8_epi16(val1hi);

                    __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
                    __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
                    __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
                    __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());

                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0);
                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1);
                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2);
                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3);

                    __m128i r0 = _mm_add_epi16(val1_0, t0);
                    __m128i r1 = _mm_add_epi16(val1_1, t1);
                    __m128i r2 = _mm_add_epi16(val1_2, t2);
                    __m128i r3 = _mm_add_epi16(val1_3, t3);

                    __m128i q0 = _mm_packus_epi16(r0, r1);
                    __m128i q1 = _mm_packus_epi16(r2, r3);

                    __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
                    __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);

                    __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
                    __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));

                    _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w +  0]), q4);
                    _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 16]), q5);

                #else
                    // let: t[i] = src0[i][w]*beta0[i] + src1[i][w]*beta1
                    // here: beta0[i] = beta[i], beta1 = 1 - beta0[i]
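                    // equivalently t[i] = src1[i][w] + (src0[i][w] - src1[i][w])*beta0[i],
                    // which is the form computed by v_mulhrs(s0 - s1, beta[i]) + s1 below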
                    v_int16x8 t0, t1, t2, t3;
                    {
                        v_int16x8 s0, s1;

                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[0][w]));
                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[0][w]));
                        t0 = v_mulhrs(s0 - s1, beta[0]) + s1;

                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[1][w]));
                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[1][w]));
                        t1 = v_mulhrs(s0 - s1, beta[1]) + s1;

                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[2][w]));
                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[2][w]));
                        t2 = v_mulhrs(s0 - s1, beta[2]) + s1;

                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[3][w]));
                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[3][w]));
                        t3 = v_mulhrs(s0 - s1, beta[3]) + s1;
                    }
                    // store as groups of 4 pixels: each group to have a pixel per row
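                    // (resulting tmp[] layout: tmp[4*(w+k) + l] holds the vertically
                    //  blended pixel of column w+k for LPI row l)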
                    {
                        v_uint8x16 a0, a1, a2, a3;
                        a0 = v_pack_u(t0, v_setall_s16(0));
                        a1 = v_pack_u(t1, v_setall_s16(0));
                        a2 = v_pack_u(t2, v_setall_s16(0));
                        a3 = v_pack_u(t3, v_setall_s16(0));

                        v_int16x8 b0, b1;
                        b0 = v_reinterpret_as_s16(v_interleave_low(a0, a1));  // 0th, 1st
                        b1 = v_reinterpret_as_s16(v_interleave_low(a2, a3));  // 2nd, 3rd

                        v_uint8x16 d0, d1;
                        d0 = v_reinterpret_as_u8(v_interleave_low(b0,  b1));
                        d1 = v_reinterpret_as_u8(v_interleave_high(b0, b1));

                        v_store(&tmp[4*w +  0], d0);
                        v_store(&tmp[4*w + 16], d1);
                    }
                #endif
                }

                if (w < inSz.width) {
                    w = inSz.width - 8;
                }
            }

            // horizontal pass
            GAPI_DbgAssert(outSz.width >= 8);
            for (int x = 0; x < outSz.width; ) {
                for (; x <= outSz.width - 8; x += 8) {
                #if USE_CVKL
                    //--------------------------------------------
                    // reworked from: ie_preprocess_data_sse42.cpp
                    //      function: resize_bilinear_u8
                    //         label: horizontal_pass
                    //--------------------------------------------

                #if 1
                    __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 *  x]));
                    __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 2)]));
                    __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 4)]));
                    __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 6)]));
                #else
                    // provided alpha[x..x+7] = { a0, a1, a2, a3, a4, a5, a6, a7},
                    // clone each a[i] 4 times - one item per each of LPI rows,
                    // so that a10 = {a0, a0, a0, a0, a1, a1, a1, a1}, etc.
                    __m128i a10, a32, a54, a76;
                    __m128i alpha0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&alpha[x]));
                    a10 = _mm_unpacklo_epi16(alpha0, alpha0);  // {a0, a0, a1, a1, a2, a2, a3, a3}
                    a32 = _mm_unpackhi_epi16(a10, a10);        // {a2, a2, a2, a2, a3, a3, a3, a3}
                    a10 = _mm_unpacklo_epi16(a10, a10);        // {a0, a0, a0, a0, a1, a1, a1, a1}
                    a54 = _mm_unpackhi_epi16(alpha0, alpha0);  // {a4, a4, a5, a5, a6, a6, a7, a7}
                    a76 = _mm_unpackhi_epi16(a54, a54);        // {a6, a6, a6, a6, a7, a7, a7, a7}
                    a54 = _mm_unpacklo_epi16(a54, a54);        // {a4, a4, a4, a4, a5, a5, a5, a5}
                #endif

                    __m128d val0d, val1d, val2d, val3d;
                    val0d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 0]]));
                    val0d = _mm_loadh_pd(val0d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 1]]));
                    val1d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 2]]));
                    val1d = _mm_loadh_pd(val1d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 3]]));
                    val2d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 4]]));
                    val2d = _mm_loadh_pd(val2d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 5]]));
                    val3d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 6]]));
                    val3d = _mm_loadh_pd(val3d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 7]]));

                    __m128i val_0 = _mm_castpd_si128(val0d);
                    __m128i val_1 = _mm_castpd_si128(val1d);
                    __m128i val_2 = _mm_castpd_si128(val2d);
                    __m128i val_3 = _mm_castpd_si128(val3d);

                    val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
                    val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
                    val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
                    val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));

                    __m128i val0_0 = _mm_cvtepu8_epi16(val_0);
                    __m128i val0_1 = _mm_cvtepu8_epi16(val_1);
                    __m128i val0_2 = _mm_cvtepu8_epi16(val_2);
                    __m128i val0_3 = _mm_cvtepu8_epi16(val_3);

                    __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
                    __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
                    __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
                    __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());

                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), a10);
                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), a32);
                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), a54);
                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), a76);

                    __m128i r0 = _mm_add_epi16(val1_0, t0);
                    __m128i r1 = _mm_add_epi16(val1_1, t1);
                    __m128i r2 = _mm_add_epi16(val1_2, t2);
                    __m128i r3 = _mm_add_epi16(val1_3, t3);

                    __m128i q0 = _mm_packus_epi16(r0, r1);
                    __m128i q1 = _mm_packus_epi16(r2, r3);

                    __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
                    __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));

                    __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
                    __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);

                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[0][x]),                q4);
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[1][x]), _mm_srli_si128(q4, 8));
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[2][x]),                q5);
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[3][x]), _mm_srli_si128(q5, 8));

                #else
                    // let: t be 2 pairs of groups of 4 pixels (each group is for 4 dst rows)
                    // each pair of groups corresponds to pixels indexed as sx0 and sx1=sx0+1
                    // so: low part of t0 is 2x4 pixels corresponding to sx0=mapsx[x+0], etc.
                    v_uint8x16 t0, t1, t2, t3;
                    {
                        t0.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 0]])),
                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 1]]), 1);
                        t1.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 2]])),
                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3]]), 1);
                        t2.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 4]])),
                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 5]]), 1);
                        t3.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 6]])),
                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 7]]), 1);
                    }

                    // let: r0 be pixels for 0th row, etc
                    v_uint8x16 r0, r1, r2, r3;
                    v_deinterleave(t0, t1, t2, t3, r0, r1, r2, r3);

                    // let: dl be resulting 8 pixels for l'th row
                    //      dl = alpha0*s0l + alpha1*s1l
                    // note that alpha0 + alpha1 = 1
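                    //      so, as above, dl = s1l + (s0l - s1l)*alpha0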
                    {
                        v_int16x8 s0, s1, d, alpha0;

                        alpha0 = v_load(&alpha[x]);  // 8 coefficients

                        v_deinterleave_expand(r0, s0, s1);
                        d = v_mulhrs(s0 - s1, alpha0) + s1;
                        v_pack_u_store(&dst[0][x], d);

                        v_deinterleave_expand(r1, s0, s1);
                        d = v_mulhrs(s0 - s1, alpha0) + s1;
                        v_pack_u_store(&dst[1][x], d);

                        v_deinterleave_expand(r2, s0, s1);
                        d = v_mulhrs(s0 - s1, alpha0) + s1;
                        v_pack_u_store(&dst[2][x], d);

                        v_deinterleave_expand(r3, s0, s1);
                        d = v_mulhrs(s0 - s1, alpha0) + s1;
                        v_pack_u_store(&dst[3][x], d);
                    }
                #endif
                }

                if (x < outSz.width) {
                    x = outSz.width - 8;
                }
            }

        } else {  // if any lpi
            for (int l = 0; l < lpi; l++) {
                short beta0 =                            beta[l];
            //  short beta1 = saturate_cast<short>(ONE - beta[l]);

                // vertical pass
                GAPI_DbgAssert(inSz.width >= 8);
                for (int w = 0; w < inSz.width; ) {
                    for (; w <= inSz.width - 8; w += 8) {
                        v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
                        v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
                        v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
                        v_pack_u_store(tmp + w, t);
                    }

                    if (w < inSz.width) {
                        w = inSz.width - 8;
                    }
                }

                // horizontal pass
                GAPI_DbgAssert(outSz.width >= 8);
                for (int x = 0; x < outSz.width; ) {
                    for (; x <= outSz.width - 8; x += 8) {
                        v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
                        v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
                        v_uint8x16 t = v_gather_pairs(tmp, sx);  // 8 pairs of src0 pixels
                        v_int16x8 t0, t1;
                        v_deinterleave_expand(t, t0, t1);        // tmp pixels as int16
                        v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
                        v_pack_u_store(&dst[l][x], d);
                    }

                    if (x < outSz.width) {
                        x = outSz.width - 8;
                    }
                }
            }
        }  // if lpi == 4

    } else if (!xRatioEq1) {
        GAPI_DbgAssert(yRatioEq1);

        if (4 == lpi) {
            // vertical pass
            GAPI_DbgAssert(inSz.width >= 16);
            for (int w = 0; w < inSz.width; ) {
                for (; w <= inSz.width - 16; w += 16) {
                    v_uint8x16 s0, s1, s2, s3;
                    s0 = v_load(&src0[0][w]);
                    s1 = v_load(&src0[1][w]);
                    s2 = v_load(&src0[2][w]);
                    s3 = v_load(&src0[3][w]);
                    v_store_interleave(&tmp[4*w], s0, s1, s2, s3);
                }

                if (w < inSz.width) {
                    w = inSz.width - 16;
                }
            }

            // horizontal pass
            GAPI_DbgAssert(outSz.width >= 8);
            for (int x = 0; x < outSz.width; ) {
                for (; x <= outSz.width - 8; x += 8) {
                    v_uint8x16 t0, t1, t2, t3;
                    t0.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 0]])),
                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 1]]), 1);
                    t1.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 2]])),
                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3]]), 1);
                    t2.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 4]])),
                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 5]]), 1);
                    t3.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 6]])),
                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 7]]), 1);

                    v_uint8x16 r0, r1, r2, r3;
                    v_deinterleave(t0, t1, t2, t3, r0, r1, r2, r3);

                    v_int16x8 s0, s1, d, alpha0;

                    alpha0 = v_load(&alpha[x]);  // 8 coefficients

                    v_deinterleave_expand(r0, s0, s1);
                    d = v_mulhrs(s0 - s1, alpha0) + s1;
                    v_pack_u_store(&dst[0][x], d);

                    v_deinterleave_expand(r1, s0, s1);
                    d = v_mulhrs(s0 - s1, alpha0) + s1;
                    v_pack_u_store(&dst[1][x], d);

                    v_deinterleave_expand(r2, s0, s1);
                    d = v_mulhrs(s0 - s1, alpha0) + s1;
                    v_pack_u_store(&dst[2][x], d);

                    v_deinterleave_expand(r3, s0, s1);
                    d = v_mulhrs(s0 - s1, alpha0) + s1;
                    v_pack_u_store(&dst[3][x], d);
                }

                if (x < outSz.width) {
                    x = outSz.width - 8;
                }
            }

        } else {  // any LPI
            for (int l = 0; l < lpi; l++) {
                const uchar *src = src0[l];

                // horizontal pass
                GAPI_DbgAssert(outSz.width >= 8);
                for (int x = 0; x < outSz.width; ) {
                    for (; x <= outSz.width - 8; x += 8) {
                        v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
                        v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
                        v_uint8x16 t = v_gather_pairs(src, sx);  // 8 pairs of src0 pixels
                        v_int16x8 t0, t1;
                        v_deinterleave_expand(t, t0, t1);        // tmp pixels as int16
                        v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
                        v_pack_u_store(&dst[l][x], d);
                    }

                    if (x < outSz.width) {
                        x = outSz.width - 8;
                    }
                }
            }
        }

    } else if (!yRatioEq1) {
        GAPI_DbgAssert(xRatioEq1);
        int length = inSz.width;  // == outSz.width

        for (int l = 0; l < lpi; l++) {
            short beta0 =                            beta[l];
        //  short beta1 = saturate_cast<short>(ONE - beta[l]);

            // vertical pass
            GAPI_DbgAssert(inSz.width >= 8);
            for (int w = 0; w < outSz.width; ) {
                for (; w <= length - 8; w += 8) {
                    v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(src0[l] + w));
                    v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(src1[l] + w));
                    v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
                    v_pack_u_store(dst[l] + w, t);
                }

                if (w < inSz.width) {
                    w = inSz.width - 8;
                }
            }
        }

    } else {
        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
        int length = inSz.width;  // == outSz.width

        for (int l = 0; l < lpi; l++) {
            memcpy(dst[l], src0[l], length);
        }
    }
}

// Resize (bi-linear, 8UC3)
void calcRowLinear_8UC3(std::array<std::array<uint8_t*, 4>, 3> &dst,
                  const uint8_t *src0[],
                  const uint8_t *src1[],
                  const short    alpha[],
                  const short    clone[],  // 4 clones of alpha
                  const short    mapsx[],
                  const short    beta[],
                        uint8_t  tmp[],
                  const Size    &inSz,
                  const Size    &outSz,
                        int      lpi) {
    constexpr const int chanNum = 3;
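    // NB: the sources are interleaved 3-channel rows (chanNum bytes per pixel),
    // while the output is planar: dst[channel][row][x]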

    if (4 == lpi) {
        // vertical pass
        GAPI_DbgAssert(inSz.width >= 8);

        __m128i b0 = _mm_set1_epi16(beta[0]);
        __m128i b1 = _mm_set1_epi16(beta[1]);
        __m128i b2 = _mm_set1_epi16(beta[2]);
        __m128i b3 = _mm_set1_epi16(beta[3]);

        for (int w = 0; w < inSz.width*chanNum; ) {
            for (; w <= inSz.width*chanNum - 8; w += 8) {
                //--------------------------------------------
                // reworked from: ie_preprocess_data_sse42.cpp
                //      function: resize_bilinear_u8
                //         label: vertical_pass
                //--------------------------------------------

                __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[0][w])),
                        *reinterpret_cast<const int64_t*>(&src0[1][w]), 1);
                __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[2][w])),
                        *reinterpret_cast<const int64_t*>(&src0[3][w]), 1);
                __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[0][w])),
                        *reinterpret_cast<const int64_t*>(&src1[1][w]), 1);
                __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[2][w])),
                        *reinterpret_cast<const int64_t*>(&src1[3][w]), 1);

                __m128i val0_0 = _mm_cvtepu8_epi16(val0lo);
                __m128i val0_2 = _mm_cvtepu8_epi16(val0hi);
                __m128i val1_0 = _mm_cvtepu8_epi16(val1lo);
                __m128i val1_2 = _mm_cvtepu8_epi16(val1hi);

                __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
                __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
                __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
                __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());

                __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0);
                __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1);
                __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2);
                __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3);

                __m128i r0 = _mm_add_epi16(val1_0, t0);
                __m128i r1 = _mm_add_epi16(val1_1, t1);
                __m128i r2 = _mm_add_epi16(val1_2, t2);
                __m128i r3 = _mm_add_epi16(val1_3, t3);

                __m128i q0 = _mm_packus_epi16(r0, r1);
                __m128i q1 = _mm_packus_epi16(r2, r3);

                __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
                __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);

                __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
                __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));

                _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w +  0]), q4);
                _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 16]), q5);
            }

            if (w < inSz.width*chanNum) {
                w = inSz.width*chanNum - 8;
            }
        }

        // horizontal pass
        GAPI_DbgAssert(outSz.width >= 8);
        for (int x = 0; x < outSz.width; ) {
            for (; x <= outSz.width - 8; x += 8) {
                //--------------------------------------------
                // reworked from: ie_preprocess_data_sse42.cpp
                //      function: resize_bilinear_u8
                //         label: horizontal_pass
                //--------------------------------------------

                __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 *  x]));
                __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 2)]));
                __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 4)]));
                __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 6)]));

                __m128i val_0 = _mm_setzero_si128();
                __m128i val_1 = _mm_setzero_si128();
                __m128i val_2 = _mm_setzero_si128();
                __m128i val_3 = _mm_setzero_si128();

                for (int c = 0; c < chanNum; c++) {
                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 0]      + c)]), 0);
                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 0] + 1) + c)]), 1);
                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 1]      + c)]), 2);
                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 1] + 1) + c)]), 3);

                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 2]      + c)]), 0);
                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 2] + 1) + c)]), 1);
                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 3]      + c)]), 2);
                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 3] + 1) + c)]), 3);

                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 4]      + c)]), 0);
                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 4] + 1) + c)]), 1);
                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 5]      + c)]), 2);
                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1) + c)]), 3);

                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 6]      + c)]), 0);
                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1) + c)]), 1);
                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 7]      + c)]), 2);
                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 7] + 1) + c)]), 3);

                    val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
                    val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
                    val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
                    val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));

                    __m128i val0_0 = _mm_cvtepu8_epi16(val_0);
                    __m128i val0_1 = _mm_cvtepu8_epi16(val_1);
                    __m128i val0_2 = _mm_cvtepu8_epi16(val_2);
                    __m128i val0_3 = _mm_cvtepu8_epi16(val_3);

                    __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
                    __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
                    __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
                    __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());

                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), a10);
                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), a32);
                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), a54);
                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), a76);

                    __m128i r0 = _mm_add_epi16(val1_0, t0);
                    __m128i r1 = _mm_add_epi16(val1_1, t1);
                    __m128i r2 = _mm_add_epi16(val1_2, t2);
                    __m128i r3 = _mm_add_epi16(val1_3, t3);

                    __m128i q0 = _mm_packus_epi16(r0, r1);
                    __m128i q1 = _mm_packus_epi16(r2, r3);

                    __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
                    __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));

                    __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
                    __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);

                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][0][x]),                q4);
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][1][x]), _mm_srli_si128(q4, 8));
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][2][x]),                q5);
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][3][x]), _mm_srli_si128(q5, 8));
                }
            }

            if (x < outSz.width) {
                x = outSz.width - 8;
            }
        }
    } else {  // if any lpi
        for (int l = 0; l < lpi; l++) {
            short beta0 = beta[l];

            // vertical pass
            GAPI_DbgAssert(inSz.width*chanNum >= 8);
            for (int w = 0; w < inSz.width*chanNum; ) {
                for (; w <= inSz.width*chanNum - 8; w += 8) {
                    v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
                    v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
                    v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
                    v_pack_u_store(tmp + w, t);
                }

                if (w < inSz.width*chanNum) {
                    w = inSz.width*chanNum - 8;
                }
            }

            // horizontal pass
            GAPI_DbgAssert(outSz.width >= 8);
            for (int x = 0; x < outSz.width; ) {
                for (; x <= outSz.width - 8; x += 8) {
                    for (int c = 0; c < chanNum; c++) {
                        v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
                        v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
                        v_int16x8 t0 = v_gather_chan(tmp, sx, c, 0);
                        v_int16x8 t1 = v_gather_chan(tmp, sx, c, 1);
                        v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
                        v_pack_u_store(&dst[c][l][x], d);
                    }
                }

                if (x < outSz.width) {
                    x = outSz.width - 8;
                }
            }
        }
    }
}

// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
                 const float *src0[],
                 const float *src1[],
                 const float  alpha[],
                 const int    mapsx[],
                 const float  beta[],
                 const Size & inSz,
                 const Size & outSz,
                       int    lpi) {
    bool xRatioEq1 = inSz.width  == outSz.width;
    bool yRatioEq1 = inSz.height == outSz.height;

    if (!xRatioEq1 && !yRatioEq1) {
        for (int l = 0; l < lpi; l++) {
            float beta0 = beta[l];
            float beta1 = 1 - beta0;
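            // per output pixel the blend is
            //     dst = beta0*(alpha0*s00 + alpha1*s01) + beta1*(alpha0*s10 + alpha1*s11),
            // evaluated below in the fused form b + (a - b)*w via v_fma()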

            int x = 0;

        #if CV_SIMD128
            for (; x <= outSz.width - 4; x += 4) {
                v_float32x4 alpha0 = v_load(&alpha[x]);
            //  v_float32x4 alpha1 = 1.f - alpha0;

                v_int32x4 sx = v_load(&mapsx[x]);

                v_float32x4 s0l, s0h, s00, s01;
                v_gather_pairs(src0[l], sx, s0l, s0h);
                v_deinterleave(s0l, s0h, s00, s01);

            //  v_float32x4 res0 = s00*alpha0 + s01*alpha1;
                v_float32x4 res0 = v_fma(s00 - s01, alpha0, s01);

                v_float32x4 s1l, s1h, s10, s11;
                v_gather_pairs(src1[l], sx, s1l, s1h);
                v_deinterleave(s1l, s1h, s10, s11);

            //  v_float32x4 res1 = s10*alpha0 + s11*alpha1;
                v_float32x4 res1 = v_fma(s10 - s11, alpha0, s11);

            //  v_float32x4 d = res0*beta0 + res1*beta1;
                v_float32x4 d = v_fma(res0 - res1, beta0, res1);

                v_store(&dst[l][x], d);
            }
        #endif

            for (; x < outSz.width; x++) {
                float alpha0 = alpha[x];
                float alpha1 = 1 - alpha0;
                int   sx0 = mapsx[x];
                int   sx1 = sx0 + 1;
                float res0 = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
                float res1 = src1[l][sx0]*alpha0 + src1[l][sx1]*alpha1;
                dst[l][x] = beta0*res0 + beta1*res1;
            }
        }

    } else if (!xRatioEq1) {
        GAPI_DbgAssert(yRatioEq1);

        for (int l = 0; l < lpi; l++) {
            int x = 0;

        #if CV_SIMD128
            for (; x <= outSz.width - 4; x += 4) {
                v_float32x4 alpha0 = v_load(&alpha[x]);
            //  v_float32x4 alpha1 = 1.f - alpha0;

                v_int32x4 sx = v_load(&mapsx[x]);

                v_float32x4 s0l, s0h, s00, s01;
                v_gather_pairs(src0[l], sx, s0l, s0h);
                v_deinterleave(s0l, s0h, s00, s01);

            //  v_float32x4 d = s00*alpha0 + s01*alpha1;
                v_float32x4 d = v_fma(s00 - s01, alpha0, s01);

                v_store(&dst[l][x], d);
            }
        #endif

            for (; x < outSz.width; x++) {
                float alpha0 = alpha[x];
                float alpha1 = 1 - alpha0;
                int   sx0 = mapsx[x];
                int   sx1 = sx0 + 1;
                dst[l][x] = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
            }
        }

    } else if (!yRatioEq1) {
        GAPI_DbgAssert(xRatioEq1);
        int length = inSz.width;  // == outSz.width

        for (int l = 0; l < lpi; l++) {
            float beta0 = beta[l];
            float beta1 = 1 - beta0;

            int x = 0;

        #if CV_SIMD128
            for (; x <= length - 4; x += 4) {
                v_float32x4 s0 = v_load(&src0[l][x]);
                v_float32x4 s1 = v_load(&src1[l][x]);

            //  v_float32x4 d = s0*beta0 + s1*beta1;
                v_float32x4 d = v_fma(s0 - s1, beta0, s1);

                v_store(&dst[l][x], d);
            }
        #endif

            for (; x < length; x++) {
                dst[l][x] = beta0*src0[l][x] + beta1*src1[l][x];
            }
        }

    } else {
        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
        int length = inSz.width;  // == outSz.width
        for (int l = 0; l < lpi; l++) {
            memcpy(dst[l], src0[l], length * sizeof(float));
        }
    }
}

//------------------------------------------------------------------------------

// vertical pass
template<typename T, typename A, typename I, typename W>
static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap, A yalpha,
                         W vbuf[]) {
    int y_1st = ymap.index0;
    int ylast = ymap.index1 - 1;

    // yratio > 1, so at least 2 rows
    GAPI_DbgAssert(y_1st < ylast);
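    // The area kernel accumulates, for every column w,
    //     vbuf[w] = alpha0*src[first][w] + yalpha*(sum of inner rows) + alpha1*src[last][w]
    // in the widened type W (Q8.8 for the 8U case); the horizontal pass (downx)
    // then reduces columns with the xalpha[] weights.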

    // 1st and last rows
    {
        int w = 0;

    #if CV_SIMD128
        if (std::is_same<T, uint8_t>::value) {
            for (; w <= inWidth - 8; w += 8) {
                v_uint16x8 vsrc0 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[0][w]));
                v_uint16x8 vsrc1 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[ylast - y_1st][w]));
                v_uint16x8 vres = v_mulhi(vsrc0 << 8, static_cast<Q0_16>(ymap.alpha0)) +
                                  v_mulhi(vsrc1 << 8, static_cast<Q0_16>(ymap.alpha1));
                v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
            }
        }
    #endif

        for (; w < inWidth; w++) {
            vbuf[w] = mulas(ymap.alpha0, src[0][w])
                    + mulas(ymap.alpha1, src[ylast - y_1st][w]);
        }
    }

    // inner rows (if any)
    for (int i = 1; i < ylast - y_1st; i++) {
        int w = 0;

    #if CV_SIMD128
        if (std::is_same<T, uint8_t>::value) {
            for (; w <= inWidth - 8; w += 8) {
                v_uint16x8 vsrc = v_load_expand(reinterpret_cast<const uint8_t*>(& src[i][w]));
                v_uint16x8 vres = v_load(reinterpret_cast<Q8_8*>(& vbuf[w]));
                vres = vres + v_mulhi(vsrc << 8, static_cast<Q0_16>(yalpha));
                v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
            }
        }
    #endif

        for (; w < inWidth; w++) {
            vbuf[w] += mulas(yalpha, src[i][w]);
        }
    }
}

// horizontal pass
template<typename T, typename A, typename I, typename W>
static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], const A xalpha[],
                         const W vbuf[]) {
#define HSUM(xmaxdf) \
    for (int x = 0; x < outWidth; x++) { \
        int      index =  xindex[x]; \
        const A *alpha = &xalpha[x * xmaxdf]; \
\
        W sum = 0; \
        for (int i = 0; i < xmaxdf; i++) { \
            sum += mulaw(alpha[i], vbuf[index + i]); \
        } \
\
        dst[x] = convert_cast<T>(sum); \
    }
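    // the branches below instantiate HSUM with a compile-time trip count for the
    // common xmaxdf values so the inner loop can be unrolled; the final branch is
    // the generic, variable-count fallback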

    if (2 == xmaxdf) {
        HSUM(2);
    } else if (3 == xmaxdf) {
        HSUM(3);
    } else if (4 == xmaxdf) {
        HSUM(4);
    } else if (5 == xmaxdf) {
        HSUM(5);
    } else if (6 == xmaxdf) {
        HSUM(6);
    } else if (7 == xmaxdf) {
        HSUM(7);
    } else if (8 == xmaxdf) {
        HSUM(8);
    } else {
        HSUM(xmaxdf);
    }
#undef HSUM
}

template<typename T, typename A, typename I, typename W>
static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Size& outSz,
    A yalpha, const MapperUnit<A, I>& ymap, int xmaxdf, const I xindex[], const A xalpha[],
    W vbuf[]) {
    bool xRatioEq1 = inSz.width  == outSz.width;
    bool yRatioEq1 = inSz.height == outSz.height;

    if (!yRatioEq1 && !xRatioEq1) {
        downy(src, inSz.width, ymap, yalpha, vbuf);
        downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);

    } else if (!yRatioEq1) {
        GAPI_DbgAssert(xRatioEq1);
        downy(src, inSz.width, ymap, yalpha, vbuf);
        for (int x = 0; x < outSz.width; x++) {
            dst[x] = convert_cast<T>(vbuf[x]);
        }

    } else if (!xRatioEq1) {
        GAPI_DbgAssert(yRatioEq1);
        for (int w = 0; w < inSz.width; w++) {
            vbuf[w] = convert_cast<W>(src[0][w]);
        }
        downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);

    } else {
        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
        memcpy(dst, src[0], outSz.width * sizeof(T));
    }
}

void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz,
    Q0_16 yalpha, const MapperUnit8U &ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[],
    Q8_8 vbuf[]) {
    calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}

void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, const Size& outSz,
    float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[], const float xalpha[],
    float vbuf[]) {
    calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}

//------------------------------------------------------------------------------
#if USE_CVKL

// from: ie_preprocess_data.hpp
static inline uint8_t saturateU32toU8(uint32_t v) {
    return static_cast<uint8_t>(v > UINT8_MAX ? UINT8_MAX : v);
}

// from: ie_preprocess_data_sse42.cpp
static inline uint16_t mulq16(uint16_t a, uint16_t b) {
    return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16);
}
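// mulq16() is a Q0.16 fixed-point multiply: with a and b read as a/65536 and
// b/65536, it returns their (truncated) product in the same format, e.g.
// mulq16(0x8000 /*0.5*/, 0x8000 /*0.5*/) == 0x4000 /*0.25*/.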

// extracted from: ie_preprocess_data_sse42.cpp
// (and reworked for 1-channel and fluid's src)
void calcRowArea_CVKL_U8_SSE42(const uchar  * src[],
                                     uchar    dst[],
                               const Size   & inSz,
                               const Size   & outSz,
                                     int      y,
                               const uint16_t xsi[],
                               const uint16_t ysi[],
                               const uint16_t xalpha[],
                               const uint16_t yalpha[],
                                     int      x_max_count,
                                     int      y_max_count,
                                     uint16_t vert_sum[]) {
    int dwidth  = outSz.width;
//  int dheight = outSz.height;
    int swidth  =  inSz.width;
    int sheight =  inSz.height;

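    // vert_sum[] is a caller-provided scratch area: judging by the pointer arithmetic
    // below, its first 2*swidth entries are reserved for the vertical sums, followed by
    // the per-output-pixel alpha tables (alpha0..alpha3) and four blocks of shuffle
    // indices (sxid0..sxid3) used by the specialized horizontal loops.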
    int vert_sum_size = 2*swidth;
//  uint16_t* vert_sum = yalpha + dheight*y_max_count;
    uint16_t* alpha0 = vert_sum + vert_sum_size;
    uint16_t* alpha1 = alpha0 + dwidth;
    uint16_t* alpha2 = alpha1 + dwidth;
    uint16_t* alpha3 = alpha2 + dwidth;
    uint16_t* sxid0 = alpha3 + dwidth;
    uint16_t* sxid1 = sxid0 + 4*dwidth;
    uint16_t* sxid2 = sxid1 + 4*dwidth;
    uint16_t* sxid3 = sxid2 + 4*dwidth;

    uint8_t * pdst_row  = dst;
    uint16_t* vert_sum_ = vert_sum;

    int ysi_row = ysi[y];

    memset(vert_sum_, 0, swidth * sizeof(uint16_t));

    for (int dy = 0; dy < y_max_count; dy++) {
        if (ysi_row + dy >= sheight)
            break;

        uint16_t yalpha_dy = yalpha[y * y_max_count + dy];
        const uint8_t *sptr_dy = src[dy];

        int x = 0;

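        // Vertical pass, 16 pixels per iteration: each source byte is widened to Q8.8
        // (moved into the high byte of a 16-bit lane), multiplied by the Q0.16 row
        // weight via _mm_mulhi_epu16, and accumulated into vert_sum_.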
        __m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy);
        for (; x <= swidth - 16; x += 16) {
            __m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x));

            // sptr_dy[x] << 8
            __m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval);
            __m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval);

            __m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0));
            __m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8));

            vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo));
            vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi));

            _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi);
        }

        for (; x < swidth; x++) {
            vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8));
        }
    }

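    // Horizontal pass: vert_sum_ now holds the vertically filtered row in Q8.8.
    // The reduction over x_max_count source taps per output pixel is specialized
    // for the common tap counts (2, 3, 4), with generic fallbacks further below.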
    if (x_max_count == 2) {
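        // 8 outputs per iteration: the precomputed sxid* byte-shuffle masks gather the
        // needed vert_sum_ entries from the loaded chunks via _mm_shuffle_epi8; lanes
        // outside a chunk are expected to be zeroed by the mask, so the partial gathers
        // can simply be OR-ed together. The 3- and 4-tap branches below follow the same
        // pattern with more chunks.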
        int x = 0;
        for (; x <= dwidth - 8; x += 8) {
            __m128i res = _mm_set1_epi16(1 << (8 - 1));

            int id0 = xsi[x];

            __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
            __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));

            __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2));
            __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8));

            __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2));
            __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8));

            __m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                             _mm_shuffle_epi8(chunk1, sx0_id1));
            __m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                             _mm_shuffle_epi8(chunk1, sx1_id1));

            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));

            res = _mm_srli_epi16(res, 8);
            res = _mm_packus_epi16(res, res);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
        }

        for (; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            int id = xsi[x];
            res += mulq16(alpha0[x], vert_sum_[id + 0]);
            res += mulq16(alpha1[x], vert_sum_[id + 1]);
            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    } else if (x_max_count == 3) {
        int x = 0;
        for (; x <= dwidth - 8; x += 8) {
            __m128i res = _mm_set1_epi16(1 << (8 - 1));

            int id0 = xsi[x];

            __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
            __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
            __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));

            __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3));
            __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8));
            __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16));

            __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3));
            __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8));
            __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16));

            __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3));
            __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8));
            __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16));

            __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                          _mm_shuffle_epi8(chunk1, sx0_id1)),
                                             _mm_shuffle_epi8(chunk2, sx0_id2));
            __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                          _mm_shuffle_epi8(chunk1, sx1_id1)),
                                             _mm_shuffle_epi8(chunk2, sx1_id2));
            __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                          _mm_shuffle_epi8(chunk1, sx2_id1)),
                                             _mm_shuffle_epi8(chunk2, sx2_id2));

            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));

            res = _mm_srli_epi16(res, 8);
            res = _mm_packus_epi16(res, res);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
        }

        for (; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            int id = xsi[x];
            res += mulq16(alpha0[x], vert_sum_[id + 0]);
            res += mulq16(alpha1[x], vert_sum_[id + 1]);
            res += mulq16(alpha2[x], vert_sum_[id + 2]);
            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    } else if (x_max_count == 4) {
        int x = 0;
        for (; x <= dwidth - 8; x += 8) {
            __m128i res = _mm_set1_epi16(1 << (8 - 1));

            int id0 = xsi[x];

            __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
            __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
            __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
            __m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24));

            __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4));
            __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8));
            __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16));
            __m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24));

            __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4));
            __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8));
            __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16));
            __m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24));

            __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4));
            __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8));
            __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16));
            __m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24));

            __m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4));
            __m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8));
            __m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16));
            __m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24));

            __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                          _mm_shuffle_epi8(chunk1, sx0_id1)),
                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2),
                                                          _mm_shuffle_epi8(chunk3, sx0_id3)));
            __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                          _mm_shuffle_epi8(chunk1, sx1_id1)),
                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2),
                                                          _mm_shuffle_epi8(chunk3, sx1_id3)));
            __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                          _mm_shuffle_epi8(chunk1, sx2_id1)),
                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2),
                                                          _mm_shuffle_epi8(chunk3, sx2_id3)));
            __m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0),
                                                          _mm_shuffle_epi8(chunk1, sx3_id1)),
                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2),
                                                          _mm_shuffle_epi8(chunk3, sx3_id3)));

            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3));

            res = _mm_srli_epi16(res, 8);
            res = _mm_packus_epi16(res, res);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
        }

        for (; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            int id = xsi[x];
            res += mulq16(alpha0[x], vert_sum_[id + 0]);
            res += mulq16(alpha1[x], vert_sum_[id + 1]);
            res += mulq16(alpha2[x], vert_sum_[id + 2]);
            res += mulq16(alpha3[x], vert_sum_[id + 3]);
            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    } else if (x_max_count <= 7) {
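        // Up to 7 taps: no shuffle tables are used; the alphas and vert_sum_ entries
        // for 8 output pixels are gathered lane by lane with _mm_setr_epi16.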
        int x = 0;
        for (; x <= dwidth - 8; x += 8) {
            __m128i res = _mm_set1_epi16(1 << (16 - 8 - 1));
            for (int i = 0; i < x_max_count; i++) {
                __m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i],
                                                xalpha[x * x_max_count + x_max_count * 1 + i],
                                                xalpha[x * x_max_count + x_max_count * 2 + i],
                                                xalpha[x * x_max_count + x_max_count * 3 + i],
                                                xalpha[x * x_max_count + x_max_count * 4 + i],
                                                xalpha[x * x_max_count + x_max_count * 5 + i],
                                                xalpha[x * x_max_count + x_max_count * 6 + i],
                                                xalpha[x * x_max_count + x_max_count * 7 + i]);
                __m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i],
                                                   vert_sum_[xsi[x + 1] + i],
                                                   vert_sum_[xsi[x + 2] + i],
                                                   vert_sum_[xsi[x + 3] + i],
                                                   vert_sum_[xsi[x + 4] + i],
                                                   vert_sum_[xsi[x + 5] + i],
                                                   vert_sum_[xsi[x + 6] + i],
                                                   vert_sum_[xsi[x + 7] + i]);

                res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum));
            }
            res = _mm_srli_epi16(res, 8);
            res = _mm_packus_epi16(res, res);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
        }

        for (; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            for (int i = 0; i < x_max_count; i++) {
                uint16_t a = xalpha[x * x_max_count + i];
                int sx = xsi[x] + i;

                res += mulq16(a, vert_sum_[sx]);
            }
            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    } else {
        for (int x = 0; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            __m128i vres = _mm_setzero_si128();
            int id = xsi[x];

            int i = 0;
            for (; i <= x_max_count - 8; i += 8) {
                __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i));
                __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i));

                vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s));
            }
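            // Horizontal sum of the 8 partial Q8.8 sums: the byte shifts fold all lanes
            // into lane 7, which is then extracted and added to res.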
            vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2));
            vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4));
            vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8));
            res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7));

            for (; i < x_max_count; i++) {
                uint16_t a = xalpha[x * x_max_count + i];
                uint16_t s = vert_sum_[id + i];

                res += mulq16(a, s);
            }

            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    }
}

#endif  // USE_CVKL
//------------------------------------------------------------------------------

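// Channel merge/split kernels. Each routine processes full 128-bit vectors; if the
// row length is not a multiple of the vector width but holds at least one full vector,
// the loop is re-entered once for the last (overlapping) vector, and any row shorter
// than a vector is handled entirely by the scalar tail loop.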
void mergeRow_8UC2(const uint8_t in0[],
                   const uint8_t in1[],
                         uint8_t out[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        v_store_interleave(&out[2*l], r0, r1);
    }

    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[2*l + 0] = in0[l];
        out[2*l + 1] = in1[l];
    }
}

void mergeRow_8UC3(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                         uint8_t out[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1, r2;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        r2 = v_load(&in2[l]);
        v_store_interleave(&out[3*l], r0, r1, r2);
    }

    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[3*l + 0] = in0[l];
        out[3*l + 1] = in1[l];
        out[3*l + 2] = in2[l];
    }
}

void mergeRow_8UC4(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   const uint8_t in3[],
                         uint8_t out[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1, r2, r3;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        r2 = v_load(&in2[l]);
        r3 = v_load(&in3[l]);
        v_store_interleave(&out[4*l], r0, r1, r2, r3);
    }

    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[4*l + 0] = in0[l];
        out[4*l + 1] = in1[l];
        out[4*l + 2] = in2[l];
        out[4*l + 3] = in3[l];
    }
}

void mergeRow_32FC2(const float in0[],
                    const float in1[],
                          float out[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        v_store_interleave(&out[2*l], r0, r1);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[2*l + 0] = in0[l];
        out[2*l + 1] = in1[l];
    }
}

void mergeRow_32FC3(const float in0[],
                    const float in1[],
                    const float in2[],
                          float out[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1, r2;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        r2 = v_load(&in2[l]);
        v_store_interleave(&out[3*l], r0, r1, r2);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[3*l + 0] = in0[l];
        out[3*l + 1] = in1[l];
        out[3*l + 2] = in2[l];
    }
}

void mergeRow_32FC4(const float in0[],
                    const float in1[],
                    const float in2[],
                    const float in3[],
                          float out[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1, r2, r3;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        r2 = v_load(&in2[l]);
        r3 = v_load(&in3[l]);
        v_store_interleave(&out[4*l], r0, r1, r2, r3);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[4*l + 0] = in0[l];
        out[4*l + 1] = in1[l];
        out[4*l + 2] = in2[l];
        out[4*l + 3] = in3[l];
    }
}

void splitRow_8UC2(const uint8_t in[],
                         uint8_t out0[],
                         uint8_t out1[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1;
        v_load_deinterleave(&in[2*l], r0, r1);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
    }
    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[2*l + 0];
        out1[l] = in[2*l + 1];
    }
}

void splitRow_8UC3(const uint8_t in[],
                         uint8_t out0[],
                         uint8_t out1[],
                         uint8_t out2[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1, r2;
        v_load_deinterleave(&in[3*l], r0, r1, r2);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
        v_store(&out2[l], r2);
    }
    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[3*l + 0];
        out1[l] = in[3*l + 1];
        out2[l] = in[3*l + 2];
    }
}

void splitRow_8UC4(const uint8_t in[],
                         uint8_t out0[],
                         uint8_t out1[],
                         uint8_t out2[],
                         uint8_t out3[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1, r2, r3;
        v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
        v_store(&out2[l], r2);
        v_store(&out3[l], r3);
    }
    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[4*l + 0];
        out1[l] = in[4*l + 1];
        out2[l] = in[4*l + 2];
        out3[l] = in[4*l + 3];
    }
}

void splitRow_32FC2(const float in[],
                          float out0[],
                          float out1[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1;
        v_load_deinterleave(&in[2*l], r0, r1);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[2*l + 0];
        out1[l] = in[2*l + 1];
    }
}

void splitRow_32FC3(const float in[],
                          float out0[],
                          float out1[],
                          float out2[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1, r2;
        v_load_deinterleave(&in[3*l], r0, r1, r2);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
        v_store(&out2[l], r2);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[3*l + 0];
        out1[l] = in[3*l + 1];
        out2[l] = in[3*l + 2];
    }
}

void splitRow_32FC4(const float in[],
                          float out0[],
                          float out1[],
                          float out2[],
                          float out3[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1, r2, r3;
        v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
        v_store(&out2[l], r2);
        v_store(&out3[l], r3);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[4*l + 0];
        out1[l] = in[4*l + 1];
        out2[l] = in[4*l + 2];
        out3[l] = in[4*l + 3];
    }
}

}  // namespace kernels
}  // namespace gapi
}  // namespace InferenceEngine