inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ie_preprocess_gapi_kernels.hpp"
#include "ie_preprocess_gapi_kernels_impl.hpp"
#include "ie_preprocess_gapi_kernels_sse42.hpp"

// NB: include this before opencv_hal_sse.hpp
#include "nmmintrin.h"

// NB: define these before opencv_hal_sse.hpp
namespace cv {
namespace hal {

enum StoreMode {
    STORE_UNALIGNED = 0,
    STORE_ALIGNED = 1,
    STORE_ALIGNED_NOCACHE = 2
};

}  // namespace hal
}  // namespace cv

// NB: define these before opencv_hal_sse.hpp
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
#define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a)

// NB: define these before opencv_hal_sse.hpp
#ifdef CV_SSE4_2
  #undef CV_SSE4_2
  #undef CV_SSE4_1
  #undef CV_SSSE3
  #undef CV_SSE3
  #undef CV_SSE2
  #undef CV_SSE
#endif
#define CV_SSE4_2 1
#define CV_SSE4_1 1
#define CV_SSSE3  1
#define CV_SSE3   1
#define CV_SSE2   1
#define CV_SSE    1
#define CV_CPU_HAS_SUPPORT_SSE2 1
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN  // empty
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

// OpenCV universal intrinsics
#include "opencv_hal_sse.hpp"

// AFTER "opencv_hal_sse.hpp"
// (CV_SIMD128 defined there)
#if   !CV_SIMD128
#error CV_SIMD128 is required!
#endif

#include <cstring>

using namespace cv;

namespace InferenceEngine {
namespace gapi {
namespace kernels {

//----------------------------------------------------------------------

#if CV_SSE
static inline void v_deinterleave(const v_float32x4& low, const v_float32x4& high,
                                        v_float32x4& even,      v_float32x4& odd) {
    __m128 tmp0 = _mm_unpacklo_ps(low.val, high.val);
    __m128 tmp1 = _mm_unpackhi_ps(low.val, high.val);
    even.val = _mm_unpacklo_ps(tmp0, tmp1);
    odd .val = _mm_unpackhi_ps(tmp0, tmp1);
}
#endif

#if CV_SSE2
static inline void v_deinterleave(const v_uint8x16& i0, const v_uint8x16& i1,
                                  const v_uint8x16& i2, const v_uint8x16& i3,
                                        v_uint8x16& o0,       v_uint8x16& o1,
                                        v_uint8x16& o2,       v_uint8x16& o3) {
    __m128i u0 = i0.val;                     // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m128i u1 = i1.val;                     // a4 b4 c4 d4 ...
    __m128i u2 = i2.val;                     // a8 b8 c8 d8 ...
    __m128i u3 = i3.val;                     // a12 b12 c12 d12 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2);  // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2);  // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3);  // a4 a12 b4 b12 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3);  // a6 a14 b6 b14 ...

    u0 = _mm_unpacklo_epi8(v0, v2);          // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3);          // a2 a6 a10 a14 ...
    u2 = _mm_unpackhi_epi8(v0, v2);          // a1 a5 a9 a13 ...
    u3 = _mm_unpackhi_epi8(v1, v3);          // a3 a7 a11 a15 ...

    v0 = _mm_unpacklo_epi8(u0, u1);          // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3);          // a1 a3 a5 a7 ...
    v2 = _mm_unpackhi_epi8(u0, u1);          // c0 c2 c4 c6 ...
    v3 = _mm_unpackhi_epi8(u2, u3);          // c1 c3 c5 c7 ...

    o0.val = _mm_unpacklo_epi8(v0, v1);      // a0 a1 a2 a3 ...
    o1.val = _mm_unpackhi_epi8(v0, v1);      // b0 b1 b2 b3 ...
    o2.val = _mm_unpacklo_epi8(v2, v3);      // c0 c1 c2 c3 ...
    o3.val = _mm_unpackhi_epi8(v2, v3);      // d0 d1 d2 d3 ...
}

static inline v_uint8x16 v_interleave_low(const v_uint8x16& a, const v_uint8x16& b) {
    return v_uint8x16(_mm_unpacklo_epi8(a.val, b.val));
}

static inline v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8x16& b) {
    return v_uint8x16(_mm_unpackhi_epi8(a.val, b.val));
}

static inline v_int16x8 v_interleave_low(const v_int16x8& a, const v_int16x8& b) {
    return v_int16x8(_mm_unpacklo_epi16(a.val, b.val));
}

static inline v_int16x8 v_interleave_high(const v_int16x8& a, const v_int16x8& b) {
    return v_int16x8(_mm_unpackhi_epi16(a.val, b.val));
}

static inline v_uint16x8 v_expand_low(const v_uint8x16& a) {
    return v_uint16x8(_mm_unpacklo_epi8(a.val, _mm_setzero_si128()));
}

static inline v_uint16x8 v_expand_high(const v_uint8x16& a) {
    return v_uint16x8(_mm_unpackhi_epi8(a.val, _mm_setzero_si128()));
}

static inline v_uint8x16 v_saturate_u8(const v_int16x8& a) {
    v_uint8x16 r;
    r.val = _mm_packus_epi16(a.val, _mm_setzero_si128());
    return r;
}

static inline v_int16x8 v_saturate_s16(const v_int32x4& a) {
    v_int16x8 r;
    r.val = _mm_packs_epi32(a.val, _mm_setzero_si128());
    return r;
}

// for each j=index[k], load two chars src[j] and src[j+1]
static inline v_uint8x16 v_gather_pairs(const uchar src[], const v_int16x8& index) {
    v_uint8x16 r;
    r.val = _mm_setzero_si128();  // all 8 lanes are overwritten below; zero-init avoids reading an indeterminate value
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 0)]), 0);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 1)]), 1);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 2)]), 2);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 3)]), 3);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 4)]), 4);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 5)]), 5);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 6)]), 6);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 7)]), 7);
    return r;
}
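// A scalar sketch of the gather above (for illustration only; "lane16" is a
// hypothetical name for the k-th 16-bit lane of the result):
//     for (int k = 0; k < 8; k++) {
//         uint16_t pair;
//         std::memcpy(&pair, &src[index[k]], sizeof(pair));  // src[j] and src[j+1]
//         lane16[k] = pair;
//     }
// i.e. each lane packs the neighbouring pixel pair used by one horizontal
// linear interpolation.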

static inline v_int16x8 v_gather_chan(const uchar src[], const v_int16x8& index, int channel, int pos) {
    constexpr const int chanNum = 3;
    v_int16x8 r;
    r.val = _mm_setzero_si128();  // as above: every lane is overwritten, zero-init avoids reading an indeterminate value
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 0) + pos) + channel]), 0);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 1) + pos) + channel]), 1);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 2) + pos) + channel]), 2);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 3) + pos) + channel]), 3);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 4) + pos) + channel]), 4);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 5) + pos) + channel]), 5);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 6) + pos) + channel]), 6);
    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 7) + pos) + channel]), 7);
    return r;
}
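// In the same spirit, v_gather_chan() reads, per lane,
//     src[chanNum*(index[k] + pos) + channel]
// from an interleaved 3-channel row: "pos" selects the left (0) or right (1)
// pixel of an interpolation pair, "channel" the plane being produced.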

static inline void v_gather_pairs(const float src[], const v_int32x4& index,
                                  v_float32x4& low, v_float32x4& high) {
    int i[4];
    v_store(i, index);

    __m128 l = _mm_setzero_ps();
    l = _mm_loadl_pi(l, (const __m64*)&src[i[0]]);  // pair of floats
    l = _mm_loadh_pi(l, (const __m64*)&src[i[1]]);
    low.val = l;

    __m128 h = _mm_setzero_ps();
    h = _mm_loadl_pi(h, (const __m64*)&src[i[2]]);
    h = _mm_loadh_pi(h, (const __m64*)&src[i[3]]);
    high.val = h;
}

static inline v_int32x4 v_madd(const v_int16x8& a, const v_int16x8& b) {
    v_int32x4 r;
    r.val = _mm_madd_epi16(a.val, b.val);
    return r;
}

static inline v_int16x8 v_mulhi(const v_int16x8& a, short b) {
    v_int16x8 r;
    r.val = _mm_mulhi_epi16(a.val, _mm_set1_epi16(b));
    return r;
}

static inline v_uint16x8 v_mulhi(const v_uint16x8& a, v_uint16x8 b) {
    v_uint16x8 r;
    r.val = _mm_mulhi_epu16(a.val, b.val);
    return r;
}

static inline v_uint16x8 v_mulhi(const v_uint16x8& a, uint16_t b) {
    v_uint16x8 r;
    r.val = _mm_mulhi_epu16(a.val, _mm_set1_epi16(b));
    return r;
}

static inline v_int16x8 v_mulhrs(const v_int16x8& a, const v_int16x8& b) {
    v_int16x8 r;
    r.val = _mm_mulhrs_epi16(a.val, b.val);
    return r;
}

static inline v_int16x8 v_mulhrs(const v_int16x8& a, short b) {
    return v_mulhrs(a, v_setall_s16(b));
}
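// A note on the fixed-point blending used below (a sketch of the math, not part
// of the original code): _mm_mulhrs_epi16(a, b) yields, per 16-bit lane,
//     (a * b + 0x4000) >> 15
// (signed multiply, rounded, high bits kept). Because the two interpolation
// weights always sum to one, a blend  s0*w + s1*(1 - w)  is evaluated as
//     s1 + v_mulhrs(s0 - s1, w)
// which costs a single multiply per output and keeps the intermediate within
// int16 range for 8-bit source data.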
#endif  // SSE2

#if CV_SSSE3  // _mm_shuffle_epi8 below requires SSSE3
static inline void v_deinterleave_expand(const v_uint8x16& src, v_int16x8& even, v_int16x8& odd) {
    static const __m128i mask_even = _mm_setr_epi8(0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14, -1);
    static const __m128i mask_odd  = _mm_setr_epi8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1);
    even.val = _mm_shuffle_epi8(src.val, mask_even);
    odd .val = _mm_shuffle_epi8(src.val, mask_odd);
}
#endif

static inline v_float32x4 v_fma(const v_float32x4& a, float b, const v_float32x4& c) {
    return v_fma(a, v_setall_f32(b), c);
}

static inline v_int16x8 operator+ (const v_int16x8& a, short b) {
    return a + v_setall_s16(b);
}

static inline v_int16x8 operator- (short a, const v_int16x8& b) {
    return v_setall_s16(a) - b;
}

static inline v_float32x4 operator- (float a, const v_float32x4& b) {
    return v_setall_f32(a) - b;
}

static inline v_float32x4 operator* (const v_float32x4& a, float b) {
    return a * v_setall_f32(b);
}

//------------------------------------------------------------------------------

// Resize (bi-linear, 8U)
void calcRowLinear_8U(uint8_t *dst[],
                const uint8_t *src0[],
                const uint8_t *src1[],
                const short    alpha[],
                const short    clone[],  // 4 clones of alpha
                const short    mapsx[],
                const short    beta[],
                      uint8_t  tmp[],
                const Size   & inSz,
                const Size   & outSz,
                      int      lpi) {
    bool xRatioEq1 = inSz.width  == outSz.width;
    bool yRatioEq1 = inSz.height == outSz.height;

    if (!xRatioEq1 && !yRatioEq1) {
        if (4 == lpi) {
            // vertical pass
            GAPI_DbgAssert(inSz.width >= 8);

            __m128i b0 = _mm_set1_epi16(beta[0]);
            __m128i b1 = _mm_set1_epi16(beta[1]);
            __m128i b2 = _mm_set1_epi16(beta[2]);
            __m128i b3 = _mm_set1_epi16(beta[3]);

            for (int w = 0; w < inSz.width; ) {
                for (; w <= inSz.width - 8; w += 8) {
                #if USE_CVKL
                    //--------------------------------------------
                    // reworked from: ie_preprocess_data_sse42.cpp
                    //      function: resize_bilinear_u8
                    //         label: vertical_pass
                    //--------------------------------------------

                    __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[0][w])),
                                                                     *reinterpret_cast<const int64_t*>(&src0[1][w]), 1);
                    __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[2][w])),
                                                                     *reinterpret_cast<const int64_t*>(&src0[3][w]), 1);
                    __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[0][w])),
                                                                     *reinterpret_cast<const int64_t*>(&src1[1][w]), 1);
                    __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[2][w])),
                                                                     *reinterpret_cast<const int64_t*>(&src1[3][w]), 1);

                    __m128i val0_0 = _mm_cvtepu8_epi16(val0lo);
                    __m128i val0_2 = _mm_cvtepu8_epi16(val0hi);
                    __m128i val1_0 = _mm_cvtepu8_epi16(val1lo);
                    __m128i val1_2 = _mm_cvtepu8_epi16(val1hi);

                    __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
                    __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
                    __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
                    __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());

                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0);
                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1);
                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2);
                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3);

                    __m128i r0 = _mm_add_epi16(val1_0, t0);
                    __m128i r1 = _mm_add_epi16(val1_1, t1);
                    __m128i r2 = _mm_add_epi16(val1_2, t2);
                    __m128i r3 = _mm_add_epi16(val1_3, t3);

                    __m128i q0 = _mm_packus_epi16(r0, r1);
                    __m128i q1 = _mm_packus_epi16(r2, r3);

                    __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
                    __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);

                    __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
                    __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));

                    _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w +  0]), q4);
                    _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 16]), q5);

                #else
                    // let: t[i] = src0[i][w]*beta0[i] + src1[i][w]*beta1
                    // here: beta0[i] = beta[i], beta1 = 1 - beta0[i]
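                    // equivalently t[i] = src1[i][w] + (src0[i][w] - src1[i][w])*beta0[i],
                    // which is the form computed by v_mulhrs(s0 - s1, beta[i]) + s1 below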
                    v_int16x8 t0, t1, t2, t3;
                    {
                        v_int16x8 s0, s1;

                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[0][w]));
                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[0][w]));
                        t0 = v_mulhrs(s0 - s1, beta[0]) + s1;

                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[1][w]));
                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[1][w]));
                        t1 = v_mulhrs(s0 - s1, beta[1]) + s1;

                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[2][w]));
                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[2][w]));
                        t2 = v_mulhrs(s0 - s1, beta[2]) + s1;

                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[3][w]));
                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[3][w]));
                        t3 = v_mulhrs(s0 - s1, beta[3]) + s1;
                    }
                    // store as groups of 4 pixels: each group to have a pixel per row
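                    // (resulting tmp[] layout: tmp[4*(w+k) + l] holds the vertically
                    //  blended pixel of column w+k for LPI row l)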
                    {
                        v_uint8x16 a0, a1, a2, a3;
                        a0 = v_pack_u(t0, v_setall_s16(0));
                        a1 = v_pack_u(t1, v_setall_s16(0));
                        a2 = v_pack_u(t2, v_setall_s16(0));
                        a3 = v_pack_u(t3, v_setall_s16(0));

                        v_int16x8 b0, b1;
                        b0 = v_reinterpret_as_s16(v_interleave_low(a0, a1));  // 0th, 1st
                        b1 = v_reinterpret_as_s16(v_interleave_low(a2, a3));  // 2nd, 3rd

                        v_uint8x16 d0, d1;
                        d0 = v_reinterpret_as_u8(v_interleave_low(b0,  b1));
                        d1 = v_reinterpret_as_u8(v_interleave_high(b0, b1));

                        v_store(&tmp[4*w +  0], d0);
                        v_store(&tmp[4*w + 16], d1);
                    }
                #endif
                }

                if (w < inSz.width) {
                    w = inSz.width - 8;
                }
            }

            // horizontal pass
            GAPI_DbgAssert(outSz.width >= 8);
            for (int x = 0; x < outSz.width; ) {
                for (; x <= outSz.width - 8; x += 8) {
                #if USE_CVKL
                    //--------------------------------------------
                    // reworked from: ie_preprocess_data_sse42.cpp
                    //      function: resize_bilinear_u8
                    //         label: horizontal_pass
                    //--------------------------------------------

                #if 1
                    __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 *  x]));
                    __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 2)]));
                    __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 4)]));
                    __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 6)]));
                #else
                    // provided alpha[x..x+7] = { a0, a1, a2, a3, a4, a5, a6, a7},
                    // clone each a[i] 4 times - one item per each of LPI rows,
                    // so that a10 = {a0, a0, a0, a0, a1, a1, a1, a1}, etc.
                    __m128i a10, a32, a54, a76;
                    __m128i alpha0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&alpha[x]));
                    a10 = _mm_unpacklo_epi16(alpha0, alpha0);  // {a0, a0, a1, a1, a2, a2, a3, a3}
                    a32 = _mm_unpackhi_epi16(a10, a10);        // {a2, a2, a2, a2, a3, a3, a3, a3}
                    a10 = _mm_unpacklo_epi16(a10, a10);        // {a0, a0, a0, a0, a1, a1, a1, a1}
                    a54 = _mm_unpackhi_epi16(alpha0, alpha0);  // {a4, a4, a5, a5, a6, a6, a7, a7}
                    a76 = _mm_unpackhi_epi16(a54, a54);        // {a6, a6, a6, a6, a7, a7, a7, a7}
                    a54 = _mm_unpacklo_epi16(a54, a54);        // {a4, a4, a4, a4, a5, a5, a5, a5}
                #endif

                    __m128d val0d, val1d, val2d, val3d;
                    val0d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 0]]));
                    val0d = _mm_loadh_pd(val0d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 1]]));
                    val1d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 2]]));
                    val1d = _mm_loadh_pd(val1d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 3]]));
                    val2d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 4]]));
                    val2d = _mm_loadh_pd(val2d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 5]]));
                    val3d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 6]]));
                    val3d = _mm_loadh_pd(val3d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 7]]));

                    __m128i val_0 = _mm_castpd_si128(val0d);
                    __m128i val_1 = _mm_castpd_si128(val1d);
                    __m128i val_2 = _mm_castpd_si128(val2d);
                    __m128i val_3 = _mm_castpd_si128(val3d);

                    val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
                    val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
                    val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
                    val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));

                    __m128i val0_0 = _mm_cvtepu8_epi16(val_0);
                    __m128i val0_1 = _mm_cvtepu8_epi16(val_1);
                    __m128i val0_2 = _mm_cvtepu8_epi16(val_2);
                    __m128i val0_3 = _mm_cvtepu8_epi16(val_3);

                    __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
                    __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
                    __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
                    __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());

                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), a10);
                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), a32);
                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), a54);
                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), a76);

                    __m128i r0 = _mm_add_epi16(val1_0, t0);
                    __m128i r1 = _mm_add_epi16(val1_1, t1);
                    __m128i r2 = _mm_add_epi16(val1_2, t2);
                    __m128i r3 = _mm_add_epi16(val1_3, t3);

                    __m128i q0 = _mm_packus_epi16(r0, r1);
                    __m128i q1 = _mm_packus_epi16(r2, r3);

                    __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
                    __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));

                    __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
                    __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);

                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[0][x]),                q4);
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[1][x]), _mm_srli_si128(q4, 8));
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[2][x]),                q5);
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[3][x]), _mm_srli_si128(q5, 8));

                #else
                    // let: t be 2 pairs of groups of 4 pixels (each group is for 4 dst rows)
                    // each pair of groups corresponds to pixels indexed as sx0 and sx1=sx0+1
                    // so: low part of t0 is 2x4 pixels corresponding to sx0=mapsx[x+0], etc.
                    v_uint8x16 t0, t1, t2, t3;
                    {
                        t0.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 0]])),
                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 1]]), 1);
                        t1.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 2]])),
                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3]]), 1);
                        t2.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 4]])),
                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 5]]), 1);
                        t3.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 6]])),
                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 7]]), 1);
                    }

                    // let: r0 be pixels for 0th row, etc
                    v_uint8x16 r0, r1, r2, r3;
                    v_deinterleave(t0, t1, t2, t3, r0, r1, r2, r3);

                    // let: dl be resulting 8 pixels for l'th row
                    //      dl = alpha0*s0l + alpha1*s1l
                    // note that alpha0 + alpha1 = 1
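                    //      so, as above, dl = s1l + (s0l - s1l)*alpha0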
                    {
                        v_int16x8 s0, s1, d, alpha0;

                        alpha0 = v_load(&alpha[x]);  // 8 coefficients

                        v_deinterleave_expand(r0, s0, s1);
                        d = v_mulhrs(s0 - s1, alpha0) + s1;
                        v_pack_u_store(&dst[0][x], d);

                        v_deinterleave_expand(r1, s0, s1);
                        d = v_mulhrs(s0 - s1, alpha0) + s1;
                        v_pack_u_store(&dst[1][x], d);

                        v_deinterleave_expand(r2, s0, s1);
                        d = v_mulhrs(s0 - s1, alpha0) + s1;
                        v_pack_u_store(&dst[2][x], d);

                        v_deinterleave_expand(r3, s0, s1);
                        d = v_mulhrs(s0 - s1, alpha0) + s1;
                        v_pack_u_store(&dst[3][x], d);
                    }
                #endif
                }

                if (x < outSz.width) {
                    x = outSz.width - 8;
                }
            }

        } else {  // if any lpi
            for (int l = 0; l < lpi; l++) {
                short beta0 =                            beta[l];
            //  short beta1 = saturate_cast<short>(ONE - beta[l]);

                // vertical pass
                GAPI_DbgAssert(inSz.width >= 8);
                for (int w = 0; w < inSz.width; ) {
                    for (; w <= inSz.width - 8; w += 8) {
                        v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
                        v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
                        v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
                        v_pack_u_store(tmp + w, t);
                    }

                    if (w < inSz.width) {
                        w = inSz.width - 8;
                    }
                }

                // horizontal pass
                GAPI_DbgAssert(outSz.width >= 8);
                for (int x = 0; x < outSz.width; ) {
                    for (; x <= outSz.width - 8; x += 8) {
                        v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
                        v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
                        v_uint8x16 t = v_gather_pairs(tmp, sx);  // 8 pairs of src0 pixels
                        v_int16x8 t0, t1;
                        v_deinterleave_expand(t, t0, t1);        // tmp pixels as int16
                        v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
                        v_pack_u_store(&dst[l][x], d);
                    }

                    if (x < outSz.width) {
                        x = outSz.width - 8;
                    }
                }
            }
        }  // if lpi == 4

    } else if (!xRatioEq1) {
        GAPI_DbgAssert(yRatioEq1);

        if (4 == lpi) {
            // vertical pass
            GAPI_DbgAssert(inSz.width >= 16);
            for (int w = 0; w < inSz.width; ) {
                for (; w <= inSz.width - 16; w += 16) {
                    v_uint8x16 s0, s1, s2, s3;
                    s0 = v_load(&src0[0][w]);
                    s1 = v_load(&src0[1][w]);
                    s2 = v_load(&src0[2][w]);
                    s3 = v_load(&src0[3][w]);
                    v_store_interleave(&tmp[4*w], s0, s1, s2, s3);
                }

                if (w < inSz.width) {
                    w = inSz.width - 16;
                }
            }

            // horizontal pass
            GAPI_DbgAssert(outSz.width >= 8);
            for (int x = 0; x < outSz.width; ) {
                for (; x <= outSz.width - 8; x += 8) {
                    v_uint8x16 t0, t1, t2, t3;
                    t0.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 0]])),
                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 1]]), 1);
                    t1.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 2]])),
                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3]]), 1);
                    t2.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 4]])),
                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 5]]), 1);
                    t3.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 6]])),
                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 7]]), 1);

                    v_uint8x16 r0, r1, r2, r3;
                    v_deinterleave(t0, t1, t2, t3, r0, r1, r2, r3);

                    v_int16x8 s0, s1, d, alpha0;

                    alpha0 = v_load(&alpha[x]);  // 8 coefficients

                    v_deinterleave_expand(r0, s0, s1);
                    d = v_mulhrs(s0 - s1, alpha0) + s1;
                    v_pack_u_store(&dst[0][x], d);

                    v_deinterleave_expand(r1, s0, s1);
                    d = v_mulhrs(s0 - s1, alpha0) + s1;
                    v_pack_u_store(&dst[1][x], d);

                    v_deinterleave_expand(r2, s0, s1);
                    d = v_mulhrs(s0 - s1, alpha0) + s1;
                    v_pack_u_store(&dst[2][x], d);

                    v_deinterleave_expand(r3, s0, s1);
                    d = v_mulhrs(s0 - s1, alpha0) + s1;
                    v_pack_u_store(&dst[3][x], d);
                }

                if (x < outSz.width) {
                    x = outSz.width - 8;
                }
            }

        } else {  // any LPI
            for (int l = 0; l < lpi; l++) {
                const uchar *src = src0[l];

                // horizontal pass
                GAPI_DbgAssert(outSz.width >= 8);
                for (int x = 0; x < outSz.width; ) {
                    for (; x <= outSz.width - 8; x += 8) {
                        v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
                        v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
                        v_uint8x16 t = v_gather_pairs(src, sx);  // 8 pairs of src0 pixels
                        v_int16x8 t0, t1;
                        v_deinterleave_expand(t, t0, t1);        // tmp pixels as int16
                        v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
                        v_pack_u_store(&dst[l][x], d);
                    }

                    if (x < outSz.width) {
                        x = outSz.width - 8;
                    }
                }
            }
        }

    } else if (!yRatioEq1) {
        GAPI_DbgAssert(xRatioEq1);
        int length = inSz.width;  // == outSz.width

        for (int l = 0; l < lpi; l++) {
            short beta0 =                            beta[l];
        //  short beta1 = saturate_cast<short>(ONE - beta[l]);

            // vertical pass
            GAPI_DbgAssert(inSz.width >= 8);
            for (int w = 0; w < outSz.width; ) {
                for (; w <= length - 8; w += 8) {
                    v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(src0[l] + w));
                    v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(src1[l] + w));
                    v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
                    v_pack_u_store(dst[l] + w, t);
                }

                if (w < inSz.width) {
                    w = inSz.width - 8;
                }
            }
        }

    } else {
        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
        int length = inSz.width;  // == outSz.width

        for (int l = 0; l < lpi; l++) {
            memcpy(dst[l], src0[l], length);
        }
    }
}

// Resize (bi-linear, 8UC3)
void calcRowLinear_8UC3(std::array<std::array<uint8_t*, 4>, 3> &dst,
                  const uint8_t *src0[],
                  const uint8_t *src1[],
                  const short    alpha[],
                  const short    clone[],  // 4 clones of alpha
                  const short    mapsx[],
                  const short    beta[],
                        uint8_t  tmp[],
                  const Size    &inSz,
                  const Size    &outSz,
                        int      lpi) {
    constexpr const int chanNum = 3;
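    // NB: the sources are interleaved 3-channel rows (chanNum bytes per pixel),
    // while the output is planar: dst[channel][row][x]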

    if (4 == lpi) {
        // vertical pass
        GAPI_DbgAssert(inSz.width >= 8);

        __m128i b0 = _mm_set1_epi16(beta[0]);
        __m128i b1 = _mm_set1_epi16(beta[1]);
        __m128i b2 = _mm_set1_epi16(beta[2]);
        __m128i b3 = _mm_set1_epi16(beta[3]);

        for (int w = 0; w < inSz.width*chanNum; ) {
            for (; w <= inSz.width*chanNum - 8; w += 8) {
                //--------------------------------------------
                // reworked from: ie_preprocess_data_sse42.cpp
                //      function: resize_bilinear_u8
                //         label: vertical_pass
                //--------------------------------------------

                __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[0][w])),
                        *reinterpret_cast<const int64_t*>(&src0[1][w]), 1);
                __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[2][w])),
                        *reinterpret_cast<const int64_t*>(&src0[3][w]), 1);
                __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[0][w])),
                        *reinterpret_cast<const int64_t*>(&src1[1][w]), 1);
                __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[2][w])),
                        *reinterpret_cast<const int64_t*>(&src1[3][w]), 1);

                __m128i val0_0 = _mm_cvtepu8_epi16(val0lo);
                __m128i val0_2 = _mm_cvtepu8_epi16(val0hi);
                __m128i val1_0 = _mm_cvtepu8_epi16(val1lo);
                __m128i val1_2 = _mm_cvtepu8_epi16(val1hi);

                __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
                __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
                __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
                __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());

                __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0);
                __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1);
                __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2);
                __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3);

                __m128i r0 = _mm_add_epi16(val1_0, t0);
                __m128i r1 = _mm_add_epi16(val1_1, t1);
                __m128i r2 = _mm_add_epi16(val1_2, t2);
                __m128i r3 = _mm_add_epi16(val1_3, t3);

                __m128i q0 = _mm_packus_epi16(r0, r1);
                __m128i q1 = _mm_packus_epi16(r2, r3);

                __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
                __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);

                __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
                __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));

                _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w +  0]), q4);
                _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 16]), q5);
            }

            if (w < inSz.width*chanNum) {
                w = inSz.width*chanNum - 8;
            }
        }

        // horizontal pass
        GAPI_DbgAssert(outSz.width >= 8);
        for (int x = 0; x < outSz.width; ) {
            for (; x <= outSz.width - 8; x += 8) {
                //--------------------------------------------
                // reworked from: ie_preprocess_data_sse42.cpp
                //      function: resize_bilinear_u8
                //         label: horizontal_pass
                //--------------------------------------------

                __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 *  x]));
                __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 2)]));
                __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 4)]));
                __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 6)]));

                __m128i val_0 = _mm_setzero_si128();
                __m128i val_1 = _mm_setzero_si128();
                __m128i val_2 = _mm_setzero_si128();
                __m128i val_3 = _mm_setzero_si128();

                for (int c = 0; c < chanNum; c++) {
                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 0]      + c)]), 0);
                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 0] + 1) + c)]), 1);
                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 1]      + c)]), 2);
                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 1] + 1) + c)]), 3);

                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 2]      + c)]), 0);
                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 2] + 1) + c)]), 1);
                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 3]      + c)]), 2);
                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 3] + 1) + c)]), 3);

                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 4]      + c)]), 0);
                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 4] + 1) + c)]), 1);
                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 5]      + c)]), 2);
                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1) + c)]), 3);

                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 6]      + c)]), 0);
                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1) + c)]), 1);
                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 7]      + c)]), 2);
                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 7] + 1) + c)]), 3);

                    val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
                    val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
                    val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
                    val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));

                    __m128i val0_0 = _mm_cvtepu8_epi16(val_0);
                    __m128i val0_1 = _mm_cvtepu8_epi16(val_1);
                    __m128i val0_2 = _mm_cvtepu8_epi16(val_2);
                    __m128i val0_3 = _mm_cvtepu8_epi16(val_3);

                    __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
                    __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
                    __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
                    __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());

                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), a10);
                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), a32);
                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), a54);
                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), a76);

                    __m128i r0 = _mm_add_epi16(val1_0, t0);
                    __m128i r1 = _mm_add_epi16(val1_1, t1);
                    __m128i r2 = _mm_add_epi16(val1_2, t2);
                    __m128i r3 = _mm_add_epi16(val1_3, t3);

                    __m128i q0 = _mm_packus_epi16(r0, r1);
                    __m128i q1 = _mm_packus_epi16(r2, r3);

                    __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
                    __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));

                    __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
                    __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);

                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][0][x]),                q4);
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][1][x]), _mm_srli_si128(q4, 8));
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][2][x]),                q5);
                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][3][x]), _mm_srli_si128(q5, 8));
                }
            }

            if (x < outSz.width) {
                x = outSz.width - 8;
            }
        }
    } else {  // if any lpi
        for (int l = 0; l < lpi; l++) {
            short beta0 = beta[l];

            // vertical pass
            GAPI_DbgAssert(inSz.width*chanNum >= 8);
            for (int w = 0; w < inSz.width*chanNum; ) {
                for (; w <= inSz.width*chanNum - 8; w += 8) {
                    v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
                    v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
                    v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
                    v_pack_u_store(tmp + w, t);
                }

                if (w < inSz.width*chanNum) {
                    w = inSz.width*chanNum - 8;
                }
            }

            // horizontal pass
            GAPI_DbgAssert(outSz.width >= 8);
            for (int x = 0; x < outSz.width; ) {
                for (; x <= outSz.width - 8; x += 8) {
                    for (int c = 0; c < chanNum; c++) {
                        v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
                        v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
                        v_int16x8 t0 = v_gather_chan(tmp, sx, c, 0);
                        v_int16x8 t1 = v_gather_chan(tmp, sx, c, 1);
                        v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
                        v_pack_u_store(&dst[c][l][x], d);
                    }
                }

                if (x < outSz.width) {
                    x = outSz.width - 8;
                }
            }
        }
    }
}

// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
                 const float *src0[],
                 const float *src1[],
                 const float  alpha[],
                 const int    mapsx[],
                 const float  beta[],
                 const Size & inSz,
                 const Size & outSz,
                       int    lpi) {
    bool xRatioEq1 = inSz.width  == outSz.width;
    bool yRatioEq1 = inSz.height == outSz.height;

    if (!xRatioEq1 && !yRatioEq1) {
        for (int l = 0; l < lpi; l++) {
            float beta0 = beta[l];
            float beta1 = 1 - beta0;
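            // per output pixel the blend is
            //     dst = beta0*(alpha0*s00 + alpha1*s01) + beta1*(alpha0*s10 + alpha1*s11),
            // evaluated below in the fused form b + (a - b)*w via v_fma()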

            int x = 0;

        #if CV_SIMD128
            for (; x <= outSz.width - 4; x += 4) {
                v_float32x4 alpha0 = v_load(&alpha[x]);
            //  v_float32x4 alpha1 = 1.f - alpha0;

                v_int32x4 sx = v_load(&mapsx[x]);

                v_float32x4 s0l, s0h, s00, s01;
                v_gather_pairs(src0[l], sx, s0l, s0h);
                v_deinterleave(s0l, s0h, s00, s01);

            //  v_float32x4 res0 = s00*alpha0 + s01*alpha1;
                v_float32x4 res0 = v_fma(s00 - s01, alpha0, s01);

                v_float32x4 s1l, s1h, s10, s11;
                v_gather_pairs(src1[l], sx, s1l, s1h);
                v_deinterleave(s1l, s1h, s10, s11);

            //  v_float32x4 res1 = s10*alpha0 + s11*alpha1;
                v_float32x4 res1 = v_fma(s10 - s11, alpha0, s11);

            //  v_float32x4 d = res0*beta0 + res1*beta1;
                v_float32x4 d = v_fma(res0 - res1, beta0, res1);

                v_store(&dst[l][x], d);
            }
        #endif

            for (; x < outSz.width; x++) {
                float alpha0 = alpha[x];
                float alpha1 = 1 - alpha0;
                int   sx0 = mapsx[x];
                int   sx1 = sx0 + 1;
                float res0 = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
                float res1 = src1[l][sx0]*alpha0 + src1[l][sx1]*alpha1;
                dst[l][x] = beta0*res0 + beta1*res1;
            }
        }

    } else if (!xRatioEq1) {
        GAPI_DbgAssert(yRatioEq1);

        for (int l = 0; l < lpi; l++) {
            int x = 0;

        #if CV_SIMD128
            for (; x <= outSz.width - 4; x += 4) {
                v_float32x4 alpha0 = v_load(&alpha[x]);
            //  v_float32x4 alpha1 = 1.f - alpha0;

                v_int32x4 sx = v_load(&mapsx[x]);

                v_float32x4 s0l, s0h, s00, s01;
                v_gather_pairs(src0[l], sx, s0l, s0h);
                v_deinterleave(s0l, s0h, s00, s01);

            //  v_float32x4 d = s00*alpha0 + s01*alpha1;
                v_float32x4 d = v_fma(s00 - s01, alpha0, s01);

                v_store(&dst[l][x], d);
            }
        #endif

            for (; x < outSz.width; x++) {
                float alpha0 = alpha[x];
                float alpha1 = 1 - alpha0;
                int   sx0 = mapsx[x];
                int   sx1 = sx0 + 1;
                dst[l][x] = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
            }
        }

    } else if (!yRatioEq1) {
        GAPI_DbgAssert(xRatioEq1);
        int length = inSz.width;  // == outSz.width

        for (int l = 0; l < lpi; l++) {
            float beta0 = beta[l];
            float beta1 = 1 - beta0;

            int x = 0;

        #if CV_SIMD128
            for (; x <= length - 4; x += 4) {
                v_float32x4 s0 = v_load(&src0[l][x]);
                v_float32x4 s1 = v_load(&src1[l][x]);

            //  v_float32x4 d = s0*beta0 + s1*beta1;
                v_float32x4 d = v_fma(s0 - s1, beta0, s1);

                v_store(&dst[l][x], d);
            }
        #endif

            for (; x < length; x++) {
                dst[l][x] = beta0*src0[l][x] + beta1*src1[l][x];
            }
        }

    } else {
        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
        int length = inSz.width;  // == outSz.width
        for (int l = 0; l < lpi; l++) {
            memcpy(dst[l], src0[l], length * sizeof(float));
        }
    }
}

//------------------------------------------------------------------------------

// vertical pass
template<typename T, typename A, typename I, typename W>
static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap, A yalpha,
                         W vbuf[]) {
    int y_1st = ymap.index0;
    int ylast = ymap.index1 - 1;

    // yratio > 1, so at least 2 rows
    GAPI_DbgAssert(y_1st < ylast);
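    // The area kernel accumulates, for every column w,
    //     vbuf[w] = alpha0*src[first][w] + yalpha*(sum of inner rows) + alpha1*src[last][w]
    // in the widened type W (Q8.8 for the 8U case); the horizontal pass (downx)
    // then reduces columns with the xalpha[] weights.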

    // 1st and last rows
    {
        int w = 0;

    #if CV_SIMD128
        if (std::is_same<T, uint8_t>::value) {
            for (; w <= inWidth - 8; w += 8) {
                v_uint16x8 vsrc0 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[0][w]));
                v_uint16x8 vsrc1 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[ylast - y_1st][w]));
                v_uint16x8 vres = v_mulhi(vsrc0 << 8, static_cast<Q0_16>(ymap.alpha0)) +
                                  v_mulhi(vsrc1 << 8, static_cast<Q0_16>(ymap.alpha1));
                v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
            }
        }
    #endif

        for (; w < inWidth; w++) {
            vbuf[w] = mulas(ymap.alpha0, src[0][w])
                    + mulas(ymap.alpha1, src[ylast - y_1st][w]);
        }
    }

    // inner rows (if any)
    for (int i = 1; i < ylast - y_1st; i++) {
        int w = 0;

    #if CV_SIMD128
        if (std::is_same<T, uint8_t>::value) {
            for (; w <= inWidth - 8; w += 8) {
                v_uint16x8 vsrc = v_load_expand(reinterpret_cast<const uint8_t*>(& src[i][w]));
                v_uint16x8 vres = v_load(reinterpret_cast<Q8_8*>(& vbuf[w]));
                vres = vres + v_mulhi(vsrc << 8, static_cast<Q0_16>(yalpha));
                v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
            }
        }
    #endif

        for (; w < inWidth; w++) {
            vbuf[w] += mulas(yalpha, src[i][w]);
        }
    }
}

// horizontal pass
template<typename T, typename A, typename I, typename W>
static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], const A xalpha[],
                         const W vbuf[]) {
#define HSUM(xmaxdf) \
    for (int x = 0; x < outWidth; x++) { \
        int      index =  xindex[x]; \
        const A *alpha = &xalpha[x * xmaxdf]; \
\
        W sum = 0; \
        for (int i = 0; i < xmaxdf; i++) { \
            sum += mulaw(alpha[i], vbuf[index + i]); \
        } \
\
        dst[x] = convert_cast<T>(sum); \
    }
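    // the branches below instantiate HSUM with a compile-time trip count for the
    // common xmaxdf values so the inner loop can be unrolled; the final branch is
    // the generic, variable-count fallback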

    if (2 == xmaxdf) {
        HSUM(2);
    } else if (3 == xmaxdf) {
        HSUM(3);
    } else if (4 == xmaxdf) {
        HSUM(4);
    } else if (5 == xmaxdf) {
        HSUM(5);
    } else if (6 == xmaxdf) {
        HSUM(6);
    } else if (7 == xmaxdf) {
        HSUM(7);
    } else if (8 == xmaxdf) {
        HSUM(8);
    } else {
        HSUM(xmaxdf);
    }
#undef HSUM
}

template<typename T, typename A, typename I, typename W>
static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Size& outSz,
    A yalpha, const MapperUnit<A, I>& ymap, int xmaxdf, const I xindex[], const A xalpha[],
    W vbuf[]) {
    bool xRatioEq1 = inSz.width  == outSz.width;
    bool yRatioEq1 = inSz.height == outSz.height;

    if (!yRatioEq1 && !xRatioEq1) {
        downy(src, inSz.width, ymap, yalpha, vbuf);
        downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);

    } else if (!yRatioEq1) {
        GAPI_DbgAssert(xRatioEq1);
        downy(src, inSz.width, ymap, yalpha, vbuf);
        for (int x = 0; x < outSz.width; x++) {
            dst[x] = convert_cast<T>(vbuf[x]);
        }

    } else if (!xRatioEq1) {
        GAPI_DbgAssert(yRatioEq1);
        for (int w = 0; w < inSz.width; w++) {
            vbuf[w] = convert_cast<W>(src[0][w]);
        }
        downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);

    } else {
        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
        memcpy(dst, src[0], outSz.width * sizeof(T));
    }
}

void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz,
    Q0_16 yalpha, const MapperUnit8U &ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[],
    Q8_8 vbuf[]) {
    calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}

void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, const Size& outSz,
    float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[], const float xalpha[],
    float vbuf[]) {
    calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}

//------------------------------------------------------------------------------
#if USE_CVKL

// from: ie_preprocess_data.hpp
static inline uint8_t saturateU32toU8(uint32_t v) {
    return static_cast<uint8_t>(v > UINT8_MAX ? UINT8_MAX : v);
}

// from: ie_preprocess_data_sse42.cpp
static inline uint16_t mulq16(uint16_t a, uint16_t b) {
    return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16);
}
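// mulq16() is a Q0.16 fixed-point multiply: with a and b read as a/65536 and
// b/65536, it returns their (truncated) product in the same format, e.g.
// mulq16(0x8000 /*0.5*/, 0x8000 /*0.5*/) == 0x4000 /*0.25*/.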

// extracted from: ie_preprocess_data_sse42.cpp
// (and reworked for 1-channel and fluid's src)
void calcRowArea_CVKL_U8_SSE42(const uchar  * src[],
                                     uchar    dst[],
                               const Size   & inSz,
                               const Size   & outSz,
                                     int      y,
                               const uint16_t xsi[],
                               const uint16_t ysi[],
                               const uint16_t xalpha[],
                               const uint16_t yalpha[],
                                     int      x_max_count,
                                     int      y_max_count,
                                     uint16_t vert_sum[]) {
    int dwidth  = outSz.width;
//  int dheight = outSz.height;
    int swidth  =  inSz.width;
    int sheight =  inSz.height;

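    // vert_sum[] is a caller-provided scratch area: judging by the pointer arithmetic
    // below, its first 2*swidth entries are reserved for the vertical sums, followed by
    // the per-output-pixel alpha tables (alpha0..alpha3) and four blocks of shuffle
    // indices (sxid0..sxid3) used by the specialized horizontal loops.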
    int vert_sum_size = 2*swidth;
//  uint16_t* vert_sum = yalpha + dheight*y_max_count;
    uint16_t* alpha0 = vert_sum + vert_sum_size;
    uint16_t* alpha1 = alpha0 + dwidth;
    uint16_t* alpha2 = alpha1 + dwidth;
    uint16_t* alpha3 = alpha2 + dwidth;
    uint16_t* sxid0 = alpha3 + dwidth;
    uint16_t* sxid1 = sxid0 + 4*dwidth;
    uint16_t* sxid2 = sxid1 + 4*dwidth;
    uint16_t* sxid3 = sxid2 + 4*dwidth;

    uint8_t * pdst_row  = dst;
    uint16_t* vert_sum_ = vert_sum;

    int ysi_row = ysi[y];

    memset(vert_sum_, 0, swidth * sizeof(uint16_t));

    for (int dy = 0; dy < y_max_count; dy++) {
        if (ysi_row + dy >= sheight)
            break;

        uint16_t yalpha_dy = yalpha[y * y_max_count + dy];
        const uint8_t *sptr_dy = src[dy];

        int x = 0;

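        // Vertical pass, 16 pixels per iteration: each source byte is widened to Q8.8
        // (moved into the high byte of a 16-bit lane), multiplied by the Q0.16 row
        // weight via _mm_mulhi_epu16, and accumulated into vert_sum_.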
        __m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy);
        for (; x <= swidth - 16; x += 16) {
            __m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x));

            // sptr_dy[x] << 8
            __m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval);
            __m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval);

            __m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0));
            __m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8));

            vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo));
            vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi));

            _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi);
        }

        for (; x < swidth; x++) {
            vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8));
        }
    }

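    // Horizontal pass: vert_sum_ now holds the vertically filtered row in Q8.8.
    // The reduction over x_max_count source taps per output pixel is specialized
    // for the common tap counts (2, 3, 4), with generic fallbacks further below.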
    if (x_max_count == 2) {
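        // 8 outputs per iteration: the precomputed sxid* byte-shuffle masks gather the
        // needed vert_sum_ entries from the loaded chunks via _mm_shuffle_epi8; lanes
        // outside a chunk are expected to be zeroed by the mask, so the partial gathers
        // can simply be OR-ed together. The 3- and 4-tap branches below follow the same
        // pattern with more chunks.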
        int x = 0;
        for (; x <= dwidth - 8; x += 8) {
            __m128i res = _mm_set1_epi16(1 << (8 - 1));

            int id0 = xsi[x];

            __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
            __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));

            __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2));
            __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8));

            __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2));
            __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8));

            __m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                             _mm_shuffle_epi8(chunk1, sx0_id1));
            __m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                             _mm_shuffle_epi8(chunk1, sx1_id1));

            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));

            res = _mm_srli_epi16(res, 8);
            res = _mm_packus_epi16(res, res);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
        }

        for (; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            int id = xsi[x];
            res += mulq16(alpha0[x], vert_sum_[id + 0]);
            res += mulq16(alpha1[x], vert_sum_[id + 1]);
            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    } else if (x_max_count == 3) {
        int x = 0;
        for (; x <= dwidth - 8; x += 8) {
            __m128i res = _mm_set1_epi16(1 << (8 - 1));

            int id0 = xsi[x];

            __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
            __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
            __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));

            __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3));
            __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8));
            __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16));

            __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3));
            __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8));
            __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16));

            __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3));
            __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8));
            __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16));

            __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                          _mm_shuffle_epi8(chunk1, sx0_id1)),
                                             _mm_shuffle_epi8(chunk2, sx0_id2));
            __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                          _mm_shuffle_epi8(chunk1, sx1_id1)),
                                             _mm_shuffle_epi8(chunk2, sx1_id2));
            __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                          _mm_shuffle_epi8(chunk1, sx2_id1)),
                                             _mm_shuffle_epi8(chunk2, sx2_id2));

            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));

            res = _mm_srli_epi16(res, 8);
            res = _mm_packus_epi16(res, res);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
        }

        for (; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            int id = xsi[x];
            res += mulq16(alpha0[x], vert_sum_[id + 0]);
            res += mulq16(alpha1[x], vert_sum_[id + 1]);
            res += mulq16(alpha2[x], vert_sum_[id + 2]);
            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    } else if (x_max_count == 4) {
        int x = 0;
        for (; x <= dwidth - 8; x += 8) {
            __m128i res = _mm_set1_epi16(1 << (8 - 1));

            int id0 = xsi[x];

            __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
            __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
            __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
            __m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24));

            __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4));
            __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8));
            __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16));
            __m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24));

            __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4));
            __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8));
            __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16));
            __m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24));

            __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4));
            __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8));
            __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16));
            __m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24));

            __m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4));
            __m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8));
            __m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16));
            __m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24));

            __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                          _mm_shuffle_epi8(chunk1, sx0_id1)),
                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2),
                                                          _mm_shuffle_epi8(chunk3, sx0_id3)));
            __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                          _mm_shuffle_epi8(chunk1, sx1_id1)),
                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2),
                                                          _mm_shuffle_epi8(chunk3, sx1_id3)));
            __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                          _mm_shuffle_epi8(chunk1, sx2_id1)),
                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2),
                                                          _mm_shuffle_epi8(chunk3, sx2_id3)));
            __m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0),
                                                          _mm_shuffle_epi8(chunk1, sx3_id1)),
                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2),
                                                          _mm_shuffle_epi8(chunk3, sx3_id3)));

            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3));

            res = _mm_srli_epi16(res, 8);
            res = _mm_packus_epi16(res, res);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
        }

        for (; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            int id = xsi[x];
            res += mulq16(alpha0[x], vert_sum_[id + 0]);
            res += mulq16(alpha1[x], vert_sum_[id + 1]);
            res += mulq16(alpha2[x], vert_sum_[id + 2]);
            res += mulq16(alpha3[x], vert_sum_[id + 3]);
            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    } else if (x_max_count <= 7) {
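        // Up to 7 taps: no shuffle tables are used; the alphas and vert_sum_ entries
        // for 8 output pixels are gathered lane by lane with _mm_setr_epi16.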
        int x = 0;
        for (; x <= dwidth - 8; x += 8) {
            __m128i res = _mm_set1_epi16(1 << (16 - 8 - 1));
            for (int i = 0; i < x_max_count; i++) {
                __m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i],
                                                xalpha[x * x_max_count + x_max_count * 1 + i],
                                                xalpha[x * x_max_count + x_max_count * 2 + i],
                                                xalpha[x * x_max_count + x_max_count * 3 + i],
                                                xalpha[x * x_max_count + x_max_count * 4 + i],
                                                xalpha[x * x_max_count + x_max_count * 5 + i],
                                                xalpha[x * x_max_count + x_max_count * 6 + i],
                                                xalpha[x * x_max_count + x_max_count * 7 + i]);
                __m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i],
                                                   vert_sum_[xsi[x + 1] + i],
                                                   vert_sum_[xsi[x + 2] + i],
                                                   vert_sum_[xsi[x + 3] + i],
                                                   vert_sum_[xsi[x + 4] + i],
                                                   vert_sum_[xsi[x + 5] + i],
                                                   vert_sum_[xsi[x + 6] + i],
                                                   vert_sum_[xsi[x + 7] + i]);

                res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum));
            }
            res = _mm_srli_epi16(res, 8);
            res = _mm_packus_epi16(res, res);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
        }

        for (; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            for (int i = 0; i < x_max_count; i++) {
                uint16_t a = xalpha[x * x_max_count + i];
                int sx = xsi[x] + i;

                res += mulq16(a, vert_sum_[sx]);
            }
            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    } else {
        for (int x = 0; x < dwidth; x++) {
            uint16_t res = 1 << (8 - 1);
            __m128i vres = _mm_setzero_si128();
            int id = xsi[x];

            int i = 0;
            for (; i <= x_max_count - 8; i += 8) {
                __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i));
                __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i));

                vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s));
            }
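            // Horizontal sum of the 8 partial Q8.8 sums: the byte shifts fold all lanes
            // into lane 7, which is then extracted and added to res.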
            vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2));
            vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4));
            vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8));
            res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7));

            for (; i < x_max_count; i++) {
                uint16_t a = xalpha[x * x_max_count + i];
                uint16_t s = vert_sum_[id + i];

                res += mulq16(a, s);
            }

            pdst_row[x] = saturateU32toU8(res >> 8);
        }
    }
}

#endif  // USE_CVKL
//------------------------------------------------------------------------------

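// Channel merge/split kernels. Each routine processes full 128-bit vectors; if the
// row length is not a multiple of the vector width but holds at least one full vector,
// the loop is re-entered once for the last (overlapping) vector, and any row shorter
// than a vector is handled entirely by the scalar tail loop.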
void mergeRow_8UC2(const uint8_t in0[],
                   const uint8_t in1[],
                         uint8_t out[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        v_store_interleave(&out[2*l], r0, r1);
    }

    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[2*l + 0] = in0[l];
        out[2*l + 1] = in1[l];
    }
}

void mergeRow_8UC3(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                         uint8_t out[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1, r2;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        r2 = v_load(&in2[l]);
        v_store_interleave(&out[3*l], r0, r1, r2);
    }

    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[3*l + 0] = in0[l];
        out[3*l + 1] = in1[l];
        out[3*l + 2] = in2[l];
    }
}

void mergeRow_8UC4(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   const uint8_t in3[],
                         uint8_t out[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1, r2, r3;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        r2 = v_load(&in2[l]);
        r3 = v_load(&in3[l]);
        v_store_interleave(&out[4*l], r0, r1, r2, r3);
    }

    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[4*l + 0] = in0[l];
        out[4*l + 1] = in1[l];
        out[4*l + 2] = in2[l];
        out[4*l + 3] = in3[l];
    }
}

void mergeRow_32FC2(const float in0[],
                    const float in1[],
                          float out[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        v_store_interleave(&out[2*l], r0, r1);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[2*l + 0] = in0[l];
        out[2*l + 1] = in1[l];
    }
}

void mergeRow_32FC3(const float in0[],
                    const float in1[],
                    const float in2[],
                          float out[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1, r2;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        r2 = v_load(&in2[l]);
        v_store_interleave(&out[3*l], r0, r1, r2);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[3*l + 0] = in0[l];
        out[3*l + 1] = in1[l];
        out[3*l + 2] = in2[l];
    }
}

void mergeRow_32FC4(const float in0[],
                    const float in1[],
                    const float in2[],
                    const float in3[],
                          float out[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1, r2, r3;
        r0 = v_load(&in0[l]);
        r1 = v_load(&in1[l]);
        r2 = v_load(&in2[l]);
        r3 = v_load(&in3[l]);
        v_store_interleave(&out[4*l], r0, r1, r2, r3);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out[4*l + 0] = in0[l];
        out[4*l + 1] = in1[l];
        out[4*l + 2] = in2[l];
        out[4*l + 3] = in3[l];
    }
}

void splitRow_8UC2(const uint8_t in[],
                         uint8_t out0[],
                         uint8_t out1[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1;
        v_load_deinterleave(&in[2*l], r0, r1);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
    }
    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[2*l + 0];
        out1[l] = in[2*l + 1];
    }
}

void splitRow_8UC3(const uint8_t in[],
                         uint8_t out0[],
                         uint8_t out1[],
                         uint8_t out2[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1, r2;
        v_load_deinterleave(&in[3*l], r0, r1, r2);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
        v_store(&out2[l], r2);
    }
    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[3*l + 0];
        out1[l] = in[3*l + 1];
        out2[l] = in[3*l + 2];
    }
}

void splitRow_8UC4(const uint8_t in[],
                         uint8_t out0[],
                         uint8_t out1[],
                         uint8_t out2[],
                         uint8_t out3[],
                             int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 16; l += 16) {
        v_uint8x16 r0, r1, r2, r3;
        v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
        v_store(&out2[l], r2);
        v_store(&out3[l], r3);
    }
    if (l < length && length >= 16) {
        l = length - 16;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[4*l + 0];
        out1[l] = in[4*l + 1];
        out2[l] = in[4*l + 2];
        out3[l] = in[4*l + 3];
    }
}

void splitRow_32FC2(const float in[],
                          float out0[],
                          float out1[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1;
        v_load_deinterleave(&in[2*l], r0, r1);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[2*l + 0];
        out1[l] = in[2*l + 1];
    }
}

void splitRow_32FC3(const float in[],
                          float out0[],
                          float out1[],
                          float out2[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1, r2;
        v_load_deinterleave(&in[3*l], r0, r1, r2);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
        v_store(&out2[l], r2);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[3*l + 0];
        out1[l] = in[3*l + 1];
        out2[l] = in[3*l + 2];
    }
}

void splitRow_32FC4(const float in[],
                          float out0[],
                          float out1[],
                          float out2[],
                          float out3[],
                            int length) {
    int l = 0;

#if CV_SIMD128
    cycle:
    for (; l <= length - 4; l += 4) {
        v_float32x4 r0, r1, r2, r3;
        v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
        v_store(&out0[l], r0);
        v_store(&out1[l], r1);
        v_store(&out2[l], r2);
        v_store(&out3[l], r3);
    }

    if (l < length && length >= 4) {
        l = length - 4;
        goto cycle;
    }
#endif

    for (; l < length; l++) {
        out0[l] = in[4*l + 0];
        out1[l] = in[4*l + 1];
        out2[l] = in[4*l + 2];
        out3[l] = in[4*l + 3];
    }
}

}  // namespace kernels
}  // namespace gapi
}  // namespace InferenceEngine