1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
5 #include "ie_preprocess_gapi_kernels.hpp"
6 #include "ie_preprocess_gapi_kernels_impl.hpp"
7 #include "ie_preprocess_gapi_kernels_sse42.hpp"
9 // NB: include this before opencv_hal_sse.hpp
10 #include "nmmintrin.h"
12 // NB: define these before opencv_hal_sse.hpp
19 STORE_ALIGNED_NOCACHE = 2
25 // NB: define these before opencv_hal_sse.hpp
26 #define OPENCV_HAL_ADD(a, b) ((a) + (b))
27 #define OPENCV_HAL_AND(a, b) ((a) & (b))
28 #define OPENCV_HAL_NOP(a) (a)
29 #define OPENCV_HAL_1ST(a, b) (a)
31 // NB: define these before opencv_hal_sse.hpp
46 #define CV_CPU_HAS_SUPPORT_SSE2 1
47 #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN // empty
48 #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
50 // OpenCV universal intrinsics
51 #include "opencv_hal_sse.hpp"
53 // AFTER "opencv_hal_sse.hpp"
54 // (CV_SIMD128 defined there)
56 #error CV_SIMD128 is required!
63 namespace InferenceEngine {
67 //----------------------------------------------------------------------
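// NB: the helpers below extend OpenCV's universal intrinsics (v_uint8x16,
// v_int16x8, v_float32x4, ...) with operations needed by the resize kernels:
// (de)interleaving, widening, saturating packs, indexed gathers and 16-bit
// fixed-point multiplies. They are thin wrappers over SSE intrinsics.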
70 static inline void v_deinterleave(const v_float32x4& low, const v_float32x4& high,
71 v_float32x4& even, v_float32x4& odd) {
72 __m128 tmp0 = _mm_unpacklo_ps(low.val, high.val);
73 __m128 tmp1 = _mm_unpackhi_ps(low.val, high.val);
74 even.val = _mm_unpacklo_ps(tmp0, tmp1);
75 odd .val = _mm_unpackhi_ps(tmp0, tmp1);
80 static inline void v_deinterleave(const v_uint8x16& i0, const v_uint8x16& i1,
81 const v_uint8x16& i2, const v_uint8x16& i3,
82 v_uint8x16& o0, v_uint8x16& o1,
83 v_uint8x16& o2, v_uint8x16& o3) {
84 __m128i u0 = i0.val; // a0 b0 c0 d0 a1 b1 c1 d1 ...
85 __m128i u1 = i1.val; // a4 b4 c4 d4 ...
86 __m128i u2 = i2.val; // a8 b8 c8 d8 ...
87 __m128i u3 = i3.val; // a12 b12 c12 d12 ...
89 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
90 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
91 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
92 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
94 u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
95 u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
96 u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
97 u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
99 v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
100 v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
101 v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
102 v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
104 o0.val = _mm_unpacklo_epi8(v0, v1); // a0 a1 a2 a3 ...
105 o1.val = _mm_unpackhi_epi8(v0, v1); // b0 b1 b2 b3 ...
106 o2.val = _mm_unpacklo_epi8(v2, v3); // c0 c1 c2 c3 ...
107 o3.val = _mm_unpackhi_epi8(v2, v3); // d0 d1 d2 d3 ...
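    // net effect: the 64 input bytes (16 groups of 4 bytes a,b,c,d spread over
    // i0..i3) are transposed into four planes: o0 = a0..a15, o1 = b0..b15,
    // o2 = c0..c15, o3 = d0..d15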
110 static inline v_uint8x16 v_interleave_low(const v_uint8x16& a, const v_uint8x16& b) {
111 return v_uint8x16(_mm_unpacklo_epi8(a.val, b.val));
114 static inline v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8x16& b) {
115 return v_uint8x16(_mm_unpackhi_epi8(a.val, b.val));
118 static inline v_int16x8 v_interleave_low(const v_int16x8& a, const v_int16x8& b) {
119 return v_int16x8(_mm_unpacklo_epi16(a.val, b.val));
122 static inline v_int16x8 v_interleave_high(const v_int16x8& a, const v_int16x8& b) {
123 return v_int16x8(_mm_unpackhi_epi16(a.val, b.val));
126 static inline v_uint16x8 v_expand_low(const v_uint8x16& a) {
127 return v_uint16x8(_mm_unpacklo_epi8(a.val, _mm_setzero_si128()));
130 static inline v_uint16x8 v_expand_high(const v_uint8x16& a) {
131 return v_uint16x8(_mm_unpackhi_epi8(a.val, _mm_setzero_si128()));
134 static inline v_uint8x16 v_saturate_u8(const v_int16x8& a) {
136 r.val = _mm_packus_epi16(a.val, _mm_setzero_si128());
140 static inline v_int16x8 v_saturate_s16(const v_int32x4& a) {
142 r.val = _mm_packs_epi32(a.val, _mm_setzero_si128());
146 // for each j=index[k], load two chars src[j] and src[j+1]
147 static inline v_uint8x16 v_gather_pairs(const uchar src[], const v_int16x8& index) {
149 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 0)]), 0);
150 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 1)]), 1);
151 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 2)]), 2);
152 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 3)]), 3);
153 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 4)]), 4);
154 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 5)]), 5);
155 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 6)]), 6);
156 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 7)]), 7);
160 static inline v_int16x8 v_gather_chan(const uchar src[], const v_int16x8& index, int channel, int pos) {
161 constexpr const int chanNum = 3;
163 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 0) + pos) + channel]), 0);
164 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 1) + pos) + channel]), 1);
165 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 2) + pos) + channel]), 2);
166 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 3) + pos) + channel]), 3);
167 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 4) + pos) + channel]), 4);
168 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 5) + pos) + channel]), 5);
169 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 6) + pos) + channel]), 6);
170 r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 7) + pos) + channel]), 7);
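    // each 16-bit lane k now holds src[chanNum*(index[k] + pos) + channel],
    // i.e. one channel of the pixel at index[k]+pos, zero-extended from u8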
174 static inline void v_gather_pairs(const float src[], const v_int32x4& index,
175 v_float32x4& low, v_float32x4& high) {
179 __m128 l = _mm_setzero_ps();
180 l = _mm_loadl_pi(l, (const __m64*)&src[i[0]]); // pair of floats
181 l = _mm_loadh_pi(l, (const __m64*)&src[i[1]]);
184 __m128 h = _mm_setzero_ps();
185 h = _mm_loadl_pi(h, (const __m64*)&src[i[2]]);
186 h = _mm_loadh_pi(h, (const __m64*)&src[i[3]]);
190 static inline v_int32x4 v_madd(const v_int16x8& a, const v_int16x8& b) {
192 r.val = _mm_madd_epi16(a.val, b.val);
196 static inline v_int16x8 v_mulhi(const v_int16x8& a, short b) {
198 r.val = _mm_mulhi_epi16(a.val, _mm_set1_epi16(b));
202 static inline v_uint16x8 v_mulhi(const v_uint16x8& a, v_uint16x8 b) {
204 r.val = _mm_mulhi_epu16(a.val, b.val);
208 static inline v_uint16x8 v_mulhi(const v_uint16x8& a, uint16_t b) {
210 r.val = _mm_mulhi_epu16(a.val, _mm_set1_epi16(b));
214 static inline v_int16x8 v_mulhrs(const v_int16x8& a, const v_int16x8& b) {
216 r.val = _mm_mulhrs_epi16(a.val, b.val);
220 static inline v_int16x8 v_mulhrs(const v_int16x8& a, short b) {
221 return v_mulhrs(a, v_setall_s16(b));
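// NB: v_mulhrs() maps to _mm_mulhrs_epi16(): per 16-bit lane it computes
// (a*b + 0x4000) >> 15, i.e. a multiply with rounding that keeps the high part.
// Together with the identity
//     a0*x + (1 - a0)*y == (x - y)*a0 + y
// it lets the kernels below blend two samples with one multiply and one add:
//     v_mulhrs(x - y, a0) + y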
226 static inline void v_deinterleave_expand(const v_uint8x16& src, v_int16x8& even, v_int16x8& odd) {
227 static const __m128i mask_even = _mm_setr_epi8(0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14, -1);
228 static const __m128i mask_odd = _mm_setr_epi8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1);
229 even.val = _mm_shuffle_epi8(src.val, mask_even);
230 odd .val = _mm_shuffle_epi8(src.val, mask_odd);
234 static inline v_float32x4 v_fma(const v_float32x4& a, float b, const v_float32x4& c) {
235 return v_fma(a, v_setall_f32(b), c);
238 static inline v_int16x8 operator+ (const v_int16x8& a, short b) {
239 return a + v_setall_s16(b);
242 static inline v_int16x8 operator- (short a, const v_int16x8& b) {
243 return v_setall_s16(a) - b;
246 static inline v_float32x4 operator- (float a, const v_float32x4& b) {
247 return v_setall_f32(a) - b;
250 static inline v_float32x4 operator* (const v_float32x4& a, float b) {
251 return a * v_setall_f32(b);
254 //------------------------------------------------------------------------------
256 // Resize (bi-linear, 8U)
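// Interpolates up to `lpi` output rows at once. For every output row, src0[]
// and src1[] point at the two neighbouring input rows and beta[] holds the
// vertical weight; mapsx[]/alpha[] hold the horizontal source index and weight
// (clone[] repeats each alpha 4 times for the 4-row fast path). Per output
// pixel, roughly:
//     t   = src1 + beta  * (src0 - src1)   // vertical pass, stored into tmp
//     dst = t1   + alpha * (t0 - t1)       // horizontal pass, t0 = t[sx], t1 = t[sx+1]
// with the products done in 16-bit fixed point via v_mulhrs() (see above).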
257 void calcRowLinear_8U(uint8_t *dst[],
258 const uint8_t *src0[],
259 const uint8_t *src1[],
261 const short clone[], // 4 clones of alpha
268 bool xRatioEq1 = inSz.width == outSz.width;
269 bool yRatioEq1 = inSz.height == outSz.height;
271 if (!xRatioEq1 && !yRatioEq1) {
274 GAPI_DbgAssert(inSz.width >= 8);
276 __m128i b0 = _mm_set1_epi16(beta[0]);
277 __m128i b1 = _mm_set1_epi16(beta[1]);
278 __m128i b2 = _mm_set1_epi16(beta[2]);
279 __m128i b3 = _mm_set1_epi16(beta[3]);
281 for (int w = 0; w < inSz.width; ) {
282 for (; w <= inSz.width - 8; w += 8) {
284 //--------------------------------------------
285 // reworked from: ie_preprocess_data_sse42.cpp
286 // function: resize_bilinear_u8
287 // label: vertical_pass
288 //--------------------------------------------
290 __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[0][w])),
291 *reinterpret_cast<const int64_t*>(&src0[1][w]), 1);
292 __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[2][w])),
293 *reinterpret_cast<const int64_t*>(&src0[3][w]), 1);
294 __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[0][w])),
295 *reinterpret_cast<const int64_t*>(&src1[1][w]), 1);
296 __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[2][w])),
297 *reinterpret_cast<const int64_t*>(&src1[3][w]), 1);
299 __m128i val0_0 = _mm_cvtepu8_epi16(val0lo);
300 __m128i val0_2 = _mm_cvtepu8_epi16(val0hi);
301 __m128i val1_0 = _mm_cvtepu8_epi16(val1lo);
302 __m128i val1_2 = _mm_cvtepu8_epi16(val1hi);
304 __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
305 __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
306 __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
307 __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());
309 __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0);
310 __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1);
311 __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2);
312 __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3);
314 __m128i r0 = _mm_add_epi16(val1_0, t0);
315 __m128i r1 = _mm_add_epi16(val1_1, t1);
316 __m128i r2 = _mm_add_epi16(val1_2, t2);
317 __m128i r3 = _mm_add_epi16(val1_3, t3);
319 __m128i q0 = _mm_packus_epi16(r0, r1);
320 __m128i q1 = _mm_packus_epi16(r2, r3);
322 __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
323 __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);
325 __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
326 __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
328 _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 0]), q4);
329 _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 16]), q5);
332                 // let: t[i] = src0[i][w]*beta0[i] + src1[i][w]*beta1[i]
333                 // here: beta0[i] = beta[i], beta1[i] = 1 - beta0[i]
334 v_int16x8 t0, t1, t2, t3;
338 s0 = v_reinterpret_as_s16(v_load_expand(&src0[0][w]));
339 s1 = v_reinterpret_as_s16(v_load_expand(&src1[0][w]));
340 t0 = v_mulhrs(s0 - s1, beta[0]) + s1;
342 s0 = v_reinterpret_as_s16(v_load_expand(&src0[1][w]));
343 s1 = v_reinterpret_as_s16(v_load_expand(&src1[1][w]));
344 t1 = v_mulhrs(s0 - s1, beta[1]) + s1;
346 s0 = v_reinterpret_as_s16(v_load_expand(&src0[2][w]));
347 s1 = v_reinterpret_as_s16(v_load_expand(&src1[2][w]));
348 t2 = v_mulhrs(s0 - s1, beta[2]) + s1;
350 s0 = v_reinterpret_as_s16(v_load_expand(&src0[3][w]));
351 s1 = v_reinterpret_as_s16(v_load_expand(&src1[3][w]));
352 t3 = v_mulhrs(s0 - s1, beta[3]) + s1;
354                 // store as groups of 4 pixels: each group holds one pixel per row
356 v_uint8x16 a0, a1, a2, a3;
357 a0 = v_pack_u(t0, v_setall_s16(0));
358 a1 = v_pack_u(t1, v_setall_s16(0));
359 a2 = v_pack_u(t2, v_setall_s16(0));
360 a3 = v_pack_u(t3, v_setall_s16(0));
363 b0 = v_reinterpret_as_s16(v_interleave_low(a0, a1)); // 0th, 1st
364 b1 = v_reinterpret_as_s16(v_interleave_low(a2, a3)); // 2nd, 3rd
367 d0 = v_reinterpret_as_u8(v_interleave_low(b0, b1));
368 d1 = v_reinterpret_as_u8(v_interleave_high(b0, b1));
370 v_store(&tmp[4*w + 0], d0);
371 v_store(&tmp[4*w + 16], d1);
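                // tmp is now laid out in groups of 4: tmp[4*x + i] is the
                // vertically-blended pixel of output row i at source column x,
                // so the horizontal pass can fetch columns sx and sx+1 for all
                // 4 rows with a single 8-byte load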
376 if (w < inSz.width) {
382 GAPI_DbgAssert(outSz.width >= 8);
383 for (int x = 0; x < outSz.width; ) {
384 for (; x <= outSz.width - 8; x += 8) {
386 //--------------------------------------------
387 // reworked from: ie_preprocess_data_sse42.cpp
388 // function: resize_bilinear_u8
389 // label: horizontal_pass
390 //--------------------------------------------
393 __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * x]));
394 __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 2)]));
395 __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 4)]));
396 __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 6)]));
398 // provided alpha[x..x+7] = { a0, a1, a2, a3, a4, a5, a6, a7},
399                 // clone each a[i] 4 times - one copy for each of the LPI rows,
400 // so that a10 = {a0, a0, a0, a0, a1, a1, a1, a1}, etc.
401 __m128i a10, a32, a54, a76;
402 __m128i alpha0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&alpha[x]));
403 a10 = _mm_unpacklo_epi16(alpha0, alpha0); // {a0, a0, a1, a1, a2, a2, a3, a3}
404 a32 = _mm_unpackhi_epi16(a10, a10); // {a2, a2, a2, a2, a3, a3, a3, a3}
405 a10 = _mm_unpacklo_epi16(a10, a10); // {a0, a0, a0, a0, a1, a1, a1, a1}
406 a54 = _mm_unpackhi_epi16(alpha0, alpha0); // {a4, a4, a5, a5, a6, a6, a7, a7}
407 a76 = _mm_unpackhi_epi16(a54, a54); // {a6, a6, a6, a6, a7, a7, a7, a7}
408 a54 = _mm_unpacklo_epi16(a54, a54); // {a4, a4, a4, a4, a5, a5, a5, a5}
411 __m128d val0d, val1d, val2d, val3d;
412 val0d = _mm_load_sd(/****/ reinterpret_cast<double*>(&tmp[4 * mapsx[x + 0]]));
413 val0d = _mm_loadh_pd(val0d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 1]]));
414 val1d = _mm_load_sd(/****/ reinterpret_cast<double*>(&tmp[4 * mapsx[x + 2]]));
415 val1d = _mm_loadh_pd(val1d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 3]]));
416 val2d = _mm_load_sd(/****/ reinterpret_cast<double*>(&tmp[4 * mapsx[x + 4]]));
417 val2d = _mm_loadh_pd(val2d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 5]]));
418 val3d = _mm_load_sd(/****/ reinterpret_cast<double*>(&tmp[4 * mapsx[x + 6]]));
419 val3d = _mm_loadh_pd(val3d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 7]]));
421 __m128i val_0 = _mm_castpd_si128(val0d);
422 __m128i val_1 = _mm_castpd_si128(val1d);
423 __m128i val_2 = _mm_castpd_si128(val2d);
424 __m128i val_3 = _mm_castpd_si128(val3d);
426 val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
427 val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
428 val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
429 val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));
431 __m128i val0_0 = _mm_cvtepu8_epi16(val_0);
432 __m128i val0_1 = _mm_cvtepu8_epi16(val_1);
433 __m128i val0_2 = _mm_cvtepu8_epi16(val_2);
434 __m128i val0_3 = _mm_cvtepu8_epi16(val_3);
436 __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
437 __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
438 __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
439 __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());
441 __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), a10);
442 __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), a32);
443 __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), a54);
444 __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), a76);
446 __m128i r0 = _mm_add_epi16(val1_0, t0);
447 __m128i r1 = _mm_add_epi16(val1_1, t1);
448 __m128i r2 = _mm_add_epi16(val1_2, t2);
449 __m128i r3 = _mm_add_epi16(val1_3, t3);
451 __m128i q0 = _mm_packus_epi16(r0, r1);
452 __m128i q1 = _mm_packus_epi16(r2, r3);
454 __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
455 __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
457 __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
458 __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);
460 _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[0][x]), q4);
461 _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[1][x]), _mm_srli_si128(q4, 8));
462 _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[2][x]), q5);
463 _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[3][x]), _mm_srli_si128(q5, 8));
466 // let: t be 2 pairs of groups of 4 pixels (each group is for 4 dst rows)
467                 // each pair of groups corresponds to pixels indexed as sx0 and sx1=sx0+1
468 // so: low part of t0 is 2x4 pixels corresponding to sx0=mapsx[x+0], etc.
469 v_uint8x16 t0, t1, t2, t3;
471 t0.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 0]])),
472 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 1]]), 1);
473 t1.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 2]])),
474 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3]]), 1);
475 t2.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 4]])),
476 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 5]]), 1);
477 t3.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 6]])),
478 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 7]]), 1);
481 // let: r0 be pixels for 0th row, etc
482 v_uint8x16 r0, r1, r2, r3;
483 v_deinterleave(t0, t1, t2, t3, r0, r1, r2, r3);
485                 // let: d[l] be the resulting 8 pixels for the l'th row:
486                 //      d[l] = alpha0*s0[l] + alpha1*s1[l]
487                 // note that alpha0 + alpha1 = 1
489 v_int16x8 s0, s1, d, alpha0;
491 alpha0 = v_load(&alpha[x]); // 8 coefficients
493 v_deinterleave_expand(r0, s0, s1);
494 d = v_mulhrs(s0 - s1, alpha0) + s1;
495 v_pack_u_store(&dst[0][x], d);
497 v_deinterleave_expand(r1, s0, s1);
498 d = v_mulhrs(s0 - s1, alpha0) + s1;
499 v_pack_u_store(&dst[1][x], d);
501 v_deinterleave_expand(r2, s0, s1);
502 d = v_mulhrs(s0 - s1, alpha0) + s1;
503 v_pack_u_store(&dst[2][x], d);
505 v_deinterleave_expand(r3, s0, s1);
506 d = v_mulhrs(s0 - s1, alpha0) + s1;
507 v_pack_u_store(&dst[3][x], d);
512 if (x < outSz.width) {
517 } else { // if any lpi
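        // generic case: process each of the lpi rows independently; blend the
        // two source rows into tmp (vertical pass), then gather (sx, sx+1)
        // pairs from tmp and blend them with alpha[] (horizontal pass)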
518 for (int l = 0; l < lpi; l++) {
519 short beta0 = beta[l];
520 // short beta1 = saturate_cast<short>(ONE - beta[l]);
523 GAPI_DbgAssert(inSz.width >= 8);
524 for (int w = 0; w < inSz.width; ) {
525 for (; w <= inSz.width - 8; w += 8) {
526 v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
527 v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
528 v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
529 v_pack_u_store(tmp + w, t);
532 if (w < inSz.width) {
538 GAPI_DbgAssert(outSz.width >= 8);
539 for (int x = 0; x < outSz.width; ) {
540 for (; x <= outSz.width - 8; x += 8) {
541 v_int16x8 a0 = v_load(&alpha[x]); // as signed Q1.1.14
542 v_int16x8 sx = v_load(&mapsx[x]); // as integer (int16)
543                     v_uint8x16 t = v_gather_pairs(tmp, sx);  // 8 pairs of tmp pixels
545 v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
546 v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
547 v_pack_u_store(&dst[l][x], d);
550 if (x < outSz.width) {
557 } else if (!xRatioEq1) {
558 GAPI_DbgAssert(yRatioEq1);
562 GAPI_DbgAssert(inSz.width >= 16);
563 for (int w = 0; w < inSz.width; ) {
564 for (; w <= inSz.width - 16; w += 16) {
565 v_uint8x16 s0, s1, s2, s3;
566 s0 = v_load(&src0[0][w]);
567 s1 = v_load(&src0[1][w]);
568 s2 = v_load(&src0[2][w]);
569 s3 = v_load(&src0[3][w]);
570 v_store_interleave(&tmp[4*w], s0, s1, s2, s3);
573 if (w < inSz.width) {
579 GAPI_DbgAssert(outSz.width >= 8);
580 for (int x = 0; x < outSz.width; ) {
581 for (; x <= outSz.width - 8; x += 8) {
582 v_uint8x16 t0, t1, t2, t3;
583 t0.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 0]])),
584 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 1]]), 1);
585 t1.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 2]])),
586 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3]]), 1);
587 t2.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 4]])),
588 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 5]]), 1);
589 t3.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 6]])),
590 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 7]]), 1);
592 v_uint8x16 r0, r1, r2, r3;
593 v_deinterleave(t0, t1, t2, t3, r0, r1, r2, r3);
595 v_int16x8 s0, s1, d, alpha0;
597 alpha0 = v_load(&alpha[x]); // 8 coefficients
599 v_deinterleave_expand(r0, s0, s1);
600 d = v_mulhrs(s0 - s1, alpha0) + s1;
601 v_pack_u_store(&dst[0][x], d);
603 v_deinterleave_expand(r1, s0, s1);
604 d = v_mulhrs(s0 - s1, alpha0) + s1;
605 v_pack_u_store(&dst[1][x], d);
607 v_deinterleave_expand(r2, s0, s1);
608 d = v_mulhrs(s0 - s1, alpha0) + s1;
609 v_pack_u_store(&dst[2][x], d);
611 v_deinterleave_expand(r3, s0, s1);
612 d = v_mulhrs(s0 - s1, alpha0) + s1;
613 v_pack_u_store(&dst[3][x], d);
616 if (x < outSz.width) {
622 for (int l = 0; l < lpi; l++) {
623 const uchar *src = src0[l];
626 GAPI_DbgAssert(outSz.width >= 8);
627 for (int x = 0; x < outSz.width; ) {
628 for (; x <= outSz.width - 8; x += 8) {
629 v_int16x8 a0 = v_load(&alpha[x]); // as signed Q1.1.14
630 v_int16x8 sx = v_load(&mapsx[x]); // as integer (int16)
631 v_uint8x16 t = v_gather_pairs(src, sx); // 8 pairs of src0 pixels
633                         v_deinterleave_expand(t, t0, t1);    // src pixels as int16
634 v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
635 v_pack_u_store(&dst[l][x], d);
638 if (x < outSz.width) {
645 } else if (!yRatioEq1) {
646 GAPI_DbgAssert(xRatioEq1);
647 int length = inSz.width; // == outSz.width
649 for (int l = 0; l < lpi; l++) {
650 short beta0 = beta[l];
651 // short beta1 = saturate_cast<short>(ONE - beta[l]);
654 GAPI_DbgAssert(inSz.width >= 8);
655 for (int w = 0; w < outSz.width; ) {
656 for (; w <= length - 8; w += 8) {
657 v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(src0[l] + w));
658 v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(src1[l] + w));
659 v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
660 v_pack_u_store(dst[l] + w, t);
663 if (w < inSz.width) {
670 GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
671 int length = inSz.width; // == outSz.width
673 for (int l = 0; l < lpi; l++) {
674 memcpy(dst[l], src0[l], length);
679 // Resize (bi-linear, 8UC3)
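// Same scheme as calcRowLinear_8U above, but the input rows are interleaved
// 3-channel (chanNum = 3): the vertical pass runs over inSz.width*chanNum
// bytes, while the horizontal pass gathers and blends each channel separately;
// the output is planar, dst[channel][row][x].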
680 void calcRowLinear_8UC3(std::array<std::array<uint8_t*, 4>, 3> &dst,
681 const uint8_t *src0[],
682 const uint8_t *src1[],
684 const short clone[], // 4 clones of alpha
691 constexpr const int chanNum = 3;
695 GAPI_DbgAssert(inSz.width >= 8);
697 __m128i b0 = _mm_set1_epi16(beta[0]);
698 __m128i b1 = _mm_set1_epi16(beta[1]);
699 __m128i b2 = _mm_set1_epi16(beta[2]);
700 __m128i b3 = _mm_set1_epi16(beta[3]);
702 for (int w = 0; w < inSz.width*chanNum; ) {
703 for (; w <= inSz.width*chanNum - 8; w += 8) {
704 //--------------------------------------------
705 // reworked from: ie_preprocess_data_sse42.cpp
706 // function: resize_bilinear_u8
707 // label: vertical_pass
708 //--------------------------------------------
710 __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[0][w])),
711 *reinterpret_cast<const int64_t*>(&src0[1][w]), 1);
712 __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[2][w])),
713 *reinterpret_cast<const int64_t*>(&src0[3][w]), 1);
714 __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[0][w])),
715 *reinterpret_cast<const int64_t*>(&src1[1][w]), 1);
716 __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[2][w])),
717 *reinterpret_cast<const int64_t*>(&src1[3][w]), 1);
719 __m128i val0_0 = _mm_cvtepu8_epi16(val0lo);
720 __m128i val0_2 = _mm_cvtepu8_epi16(val0hi);
721 __m128i val1_0 = _mm_cvtepu8_epi16(val1lo);
722 __m128i val1_2 = _mm_cvtepu8_epi16(val1hi);
724 __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
725 __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
726 __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
727 __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());
729 __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0);
730 __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1);
731 __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2);
732 __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3);
734 __m128i r0 = _mm_add_epi16(val1_0, t0);
735 __m128i r1 = _mm_add_epi16(val1_1, t1);
736 __m128i r2 = _mm_add_epi16(val1_2, t2);
737 __m128i r3 = _mm_add_epi16(val1_3, t3);
739 __m128i q0 = _mm_packus_epi16(r0, r1);
740 __m128i q1 = _mm_packus_epi16(r2, r3);
742 __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
743 __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);
745 __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
746 __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
748 _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 0]), q4);
749 _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 16]), q5);
752 if (w < inSz.width*chanNum) {
753 w = inSz.width*chanNum - 8;
758 GAPI_DbgAssert(outSz.width >= 8);
759 for (int x = 0; x < outSz.width; ) {
760 for (; x <= outSz.width - 8; x += 8) {
761 //--------------------------------------------
762 // reworked from: ie_preprocess_data_sse42.cpp
763 // function: resize_bilinear_u8
764 // label: horizontal_pass
765 //--------------------------------------------
767 __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * x]));
768 __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 2)]));
769 __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 4)]));
770 __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 6)]));
772 __m128i val_0 = _mm_setzero_si128();
773 __m128i val_1 = _mm_setzero_si128();
774 __m128i val_2 = _mm_setzero_si128();
775 __m128i val_3 = _mm_setzero_si128();
777 for (int c = 0; c < chanNum; c++) {
778 val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 0] + c)]), 0);
779 val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 0] + 1) + c)]), 1);
780 val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 1] + c)]), 2);
781 val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 1] + 1) + c)]), 3);
783 val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 2] + c)]), 0);
784 val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 2] + 1) + c)]), 1);
785 val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 3] + c)]), 2);
786 val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 3] + 1) + c)]), 3);
788 val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 4] + c)]), 0);
789 val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 4] + 1) + c)]), 1);
790 val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 5] + c)]), 2);
791 val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1) + c)]), 3);
793 val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 6] + c)]), 0);
794 val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1) + c)]), 1);
795 val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 7] + c)]), 2);
796 val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 7] + 1) + c)]), 3);
798 val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
799 val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
800 val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
801 val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));
803 __m128i val0_0 = _mm_cvtepu8_epi16(val_0);
804 __m128i val0_1 = _mm_cvtepu8_epi16(val_1);
805 __m128i val0_2 = _mm_cvtepu8_epi16(val_2);
806 __m128i val0_3 = _mm_cvtepu8_epi16(val_3);
808 __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
809 __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
810 __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
811 __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());
813 __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), a10);
814 __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), a32);
815 __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), a54);
816 __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), a76);
818 __m128i r0 = _mm_add_epi16(val1_0, t0);
819 __m128i r1 = _mm_add_epi16(val1_1, t1);
820 __m128i r2 = _mm_add_epi16(val1_2, t2);
821 __m128i r3 = _mm_add_epi16(val1_3, t3);
823 __m128i q0 = _mm_packus_epi16(r0, r1);
824 __m128i q1 = _mm_packus_epi16(r2, r3);
826 __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
827 __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
829 __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
830 __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);
832 _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][0][x]), q4);
833 _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][1][x]), _mm_srli_si128(q4, 8));
834 _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][2][x]), q5);
835 _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][3][x]), _mm_srli_si128(q5, 8));
839 if (x < outSz.width) {
843 } else { // if any lpi
844 for (int l = 0; l < lpi; l++) {
845 short beta0 = beta[l];
848 GAPI_DbgAssert(inSz.width*chanNum >= 8);
849 for (int w = 0; w < inSz.width*chanNum; ) {
850 for (; w <= inSz.width*chanNum - 8; w += 8) {
851 v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
852 v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
853 v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
854 v_pack_u_store(tmp + w, t);
857 if (w < inSz.width*chanNum) {
858 w = inSz.width*chanNum - 8;
863 GAPI_DbgAssert(outSz.width >= 8);
864 for (int x = 0; x < outSz.width; ) {
865 for (; x <= outSz.width - 8; x += 8) {
866 for (int c = 0; c < chanNum; c++) {
867 v_int16x8 a0 = v_load(&alpha[x]); // as signed Q1.1.14
868 v_int16x8 sx = v_load(&mapsx[x]); // as integer (int16)
869 v_int16x8 t0 = v_gather_chan(tmp, sx, c, 0);
870 v_int16x8 t1 = v_gather_chan(tmp, sx, c, 1);
871 v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
872 v_pack_u_store(&dst[c][l][x], d);
876 if (x < outSz.width) {
884 // Resize (bi-linear, 32F)
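// Float variant of the row-linear resize: weights stay in floating point and
// each blend uses the same identity as the fixed-point path, e.g.
//     res0 = s00*alpha0 + s01*alpha1 == (s00 - s01)*alpha0 + s01,
// implemented with a single v_fma(); alpha0 + alpha1 == 1, beta0 + beta1 == 1.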
885 void calcRowLinear_32F(float *dst[],
894 bool xRatioEq1 = inSz.width == outSz.width;
895 bool yRatioEq1 = inSz.height == outSz.height;
897 if (!xRatioEq1 && !yRatioEq1) {
898 for (int l = 0; l < lpi; l++) {
899 float beta0 = beta[l];
900 float beta1 = 1 - beta0;
905 for (; x <= outSz.width - 4; x += 4) {
906 v_float32x4 alpha0 = v_load(&alpha[x]);
907 // v_float32x4 alpha1 = 1.f - alpha0;
909 v_int32x4 sx = v_load(&mapsx[x]);
911 v_float32x4 s0l, s0h, s00, s01;
912 v_gather_pairs(src0[l], sx, s0l, s0h);
913 v_deinterleave(s0l, s0h, s00, s01);
915 // v_float32x4 res0 = s00*alpha0 + s01*alpha1;
916 v_float32x4 res0 = v_fma(s00 - s01, alpha0, s01);
918 v_float32x4 s1l, s1h, s10, s11;
919 v_gather_pairs(src1[l], sx, s1l, s1h);
920 v_deinterleave(s1l, s1h, s10, s11);
922 // v_float32x4 res1 = s10*alpha0 + s11*alpha1;
923 v_float32x4 res1 = v_fma(s10 - s11, alpha0, s11);
925 // v_float32x4 d = res0*beta0 + res1*beta1;
926 v_float32x4 d = v_fma(res0 - res1, beta0, res1);
928 v_store(&dst[l][x], d);
932 for (; x < outSz.width; x++) {
933 float alpha0 = alpha[x];
934 float alpha1 = 1 - alpha0;
937 float res0 = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
938 float res1 = src1[l][sx0]*alpha0 + src1[l][sx1]*alpha1;
939 dst[l][x] = beta0*res0 + beta1*res1;
943 } else if (!xRatioEq1) {
944 GAPI_DbgAssert(yRatioEq1);
946 for (int l = 0; l < lpi; l++) {
950 for (; x <= outSz.width - 4; x += 4) {
951 v_float32x4 alpha0 = v_load(&alpha[x]);
952 // v_float32x4 alpha1 = 1.f - alpha0;
954 v_int32x4 sx = v_load(&mapsx[x]);
956 v_float32x4 s0l, s0h, s00, s01;
957 v_gather_pairs(src0[l], sx, s0l, s0h);
958 v_deinterleave(s0l, s0h, s00, s01);
960 // v_float32x4 d = s00*alpha0 + s01*alpha1;
961 v_float32x4 d = v_fma(s00 - s01, alpha0, s01);
963 v_store(&dst[l][x], d);
967 for (; x < outSz.width; x++) {
968 float alpha0 = alpha[x];
969 float alpha1 = 1 - alpha0;
972 dst[l][x] = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
976 } else if (!yRatioEq1) {
977 GAPI_DbgAssert(xRatioEq1);
978 int length = inSz.width; // == outSz.width
980 for (int l = 0; l < lpi; l++) {
981 float beta0 = beta[l];
982 float beta1 = 1 - beta0;
987 for (; x <= length - 4; x += 4) {
988 v_float32x4 s0 = v_load(&src0[l][x]);
989 v_float32x4 s1 = v_load(&src1[l][x]);
991 // v_float32x4 d = s0*beta0 + s1*beta1;
992 v_float32x4 d = v_fma(s0 - s1, beta0, s1);
994 v_store(&dst[l][x], d);
998 for (; x < length; x++) {
999 dst[l][x] = beta0*src0[l][x] + beta1*src1[l][x];
1004 GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
1005 int length = inSz.width; // == outSz.width
1006 for (int l = 0; l < lpi; l++) {
1007 memcpy(dst[l], src0[l], length * sizeof(float));
1012 //------------------------------------------------------------------------------
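// Resize (area, i.e. pixel-mixing downscale): downy() accumulates the
// contributing input rows into the wide buffer vbuf[] using the per-row
// weights from ymap/yalpha; downx() then forms each output pixel as a
// weighted sum of up to xmaxdf consecutive vbuf entries (xindex/xalpha).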
1015 template<typename T, typename A, typename I, typename W>
1016 static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap, A yalpha,
1018 int y_1st = ymap.index0;
1019 int ylast = ymap.index1 - 1;
1021 // yratio > 1, so at least 2 rows
1022 GAPI_DbgAssert(y_1st < ylast);
1024 // 1st and last rows
1029 if (std::is_same<T, uint8_t>::value) {
1030 for (; w <= inWidth - 8; w += 8) {
1031 v_uint16x8 vsrc0 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[0][w]));
1032 v_uint16x8 vsrc1 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[ylast - y_1st][w]));
1033 v_uint16x8 vres = v_mulhi(vsrc0 << 8, static_cast<Q0_16>(ymap.alpha0)) +
1034 v_mulhi(vsrc1 << 8, static_cast<Q0_16>(ymap.alpha1));
1035 v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
1040 for (; w < inWidth; w++) {
1041 vbuf[w] = mulas(ymap.alpha0, src[0][w])
1042 + mulas(ymap.alpha1, src[ylast - y_1st][w]);
1046 // inner rows (if any)
1047 for (int i = 1; i < ylast - y_1st; i++) {
1051 if (std::is_same<T, uint8_t>::value) {
1052 for (; w <= inWidth - 8; w += 8) {
1053 v_uint16x8 vsrc = v_load_expand(reinterpret_cast<const uint8_t*>(& src[i][w]));
1054 v_uint16x8 vres = v_load(reinterpret_cast<Q8_8*>(& vbuf[w]));
1055 vres = vres + v_mulhi(vsrc << 8, static_cast<Q0_16>(yalpha));
1056 v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
1061 for (; w < inWidth; w++) {
1062 vbuf[w] += mulas(yalpha, src[i][w]);
1068 template<typename T, typename A, typename I, typename W>
1069 static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], const A xalpha[],
1071 #define HSUM(xmaxdf) \
1072 for (int x = 0; x < outWidth; x++) { \
1073 int index = xindex[x]; \
1074 const A *alpha = &xalpha[x * xmaxdf]; \
1077 for (int i = 0; i < xmaxdf; i++) { \
1078 sum += mulaw(alpha[i], vbuf[index + i]); \
1081 dst[x] = convert_cast<T>(sum); \
1086 } else if (3 == xmaxdf) {
1088 } else if (4 == xmaxdf) {
1090 } else if (5 == xmaxdf) {
1092 } else if (6 == xmaxdf) {
1094 } else if (7 == xmaxdf) {
1096 } else if (8 == xmaxdf) {
1104 template<typename T, typename A, typename I, typename W>
1105 static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Size& outSz,
1106 A yalpha, const MapperUnit<A, I>& ymap, int xmaxdf, const I xindex[], const A xalpha[],
1108 bool xRatioEq1 = inSz.width == outSz.width;
1109 bool yRatioEq1 = inSz.height == outSz.height;
1111 if (!yRatioEq1 && !xRatioEq1) {
1112 downy(src, inSz.width, ymap, yalpha, vbuf);
1113 downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);
1115 } else if (!yRatioEq1) {
1116 GAPI_DbgAssert(xRatioEq1);
1117 downy(src, inSz.width, ymap, yalpha, vbuf);
1118 for (int x = 0; x < outSz.width; x++) {
1119 dst[x] = convert_cast<T>(vbuf[x]);
1122 } else if (!xRatioEq1) {
1123 GAPI_DbgAssert(yRatioEq1);
1124 for (int w = 0; w < inSz.width; w++) {
1125 vbuf[w] = convert_cast<W>(src[0][w]);
1127 downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);
1130 GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
1131 memcpy(dst, src[0], outSz.width * sizeof(T));
1135 void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz,
1136 Q0_16 yalpha, const MapperUnit8U &ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[],
1138 calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
1141 void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, const Size& outSz,
1142 float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[], const float xalpha[],
1144 calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
1147 //------------------------------------------------------------------------------
1150 // from: ie_preprocess_data.hpp
1151 static inline uint8_t saturateU32toU8(uint32_t v) {
1152 return static_cast<uint8_t>(v > UINT8_MAX ? UINT8_MAX : v);
1155 // from: ie_preprocess_data_sse42.cpp
1156 static inline uint16_t mulq16(uint16_t a, uint16_t b) {
1157 return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16);
1160 // extracted from: ie_preprocess_data_sse42.cpp
1161 // (and reworked for 1-channel and fluid's src)
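// Vertical pass: the contributing source rows are accumulated into vert_sum[]
// as Q8.8 values, each row weighted by its Q0.16 yalpha via _mm_mulhi_epu16.
// Horizontal pass: up to x_max_count consecutive vert_sum entries are combined
// per output pixel with the xalpha weights, a 0.5 rounding bias (1 << 7) is
// added, and the result is shifted down by 8 and saturated to 8 bits.
// Specialised SIMD paths cover x_max_count == 2, 3, 4 and <= 7, with a generic
// fallback for larger counts.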
1162 void calcRowArea_CVKL_U8_SSE42(const uchar * src[],
1167 const uint16_t xsi[],
1168 const uint16_t ysi[],
1169 const uint16_t xalpha[],
1170 const uint16_t yalpha[],
1173 uint16_t vert_sum[]) {
1174 int dwidth = outSz.width;
1175 // int dheight = outSz.height;
1176 int swidth = inSz.width;
1177 int sheight = inSz.height;
1179     int vert_sum_size = 2*swidth;
1180     // uint16_t* vert_sum = yalpha + dheight*y_max_count;
1181     uint16_t* alpha0 = vert_sum + vert_sum_size;
1182 uint16_t* alpha1 = alpha0 + dwidth;
1183 uint16_t* alpha2 = alpha1 + dwidth;
1184 uint16_t* alpha3 = alpha2 + dwidth;
1185 uint16_t* sxid0 = alpha3 + dwidth;
1186 uint16_t* sxid1 = sxid0 + 4*dwidth;
1187 uint16_t* sxid2 = sxid1 + 4*dwidth;
1188 uint16_t* sxid3 = sxid2 + 4*dwidth;
1190 uint8_t * pdst_row = dst;
1191 uint16_t* vert_sum_ = vert_sum;
1193 int ysi_row = ysi[y];
1195 memset(vert_sum_, 0, swidth * sizeof(uint16_t));
1197 for (int dy = 0; dy < y_max_count; dy++) {
1198 if (ysi_row + dy >= sheight)
1201 uint16_t yalpha_dy = yalpha[y * y_max_count + dy];
1202 const uint8_t *sptr_dy = src[dy];
1206 __m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy);
1207 for (; x <= swidth - 16; x += 16) {
1208 __m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x));
1211 __m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval);
1212 __m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval);
1214 __m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0));
1215 __m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8));
1217 vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo));
1218 vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi));
1220 _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo);
1221 _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi);
1224 for (; x < swidth; x++) {
1225 vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8));
1229 if (x_max_count == 2) {
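            // 2 contributing columns per output pixel: sxid0/sxid1 are
            // precomputed pshufb masks which, applied to the two 8-entry chunks
            // of vert_sum_ loaded at xsi[x], gather the 1st and 2nd contributing
            // Q8.8 sums for 8 consecutive output pixels before the
            // alpha0/alpha1 weighting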
1231 for (; x <= dwidth - 8; x += 8) {
1232 __m128i res = _mm_set1_epi16(1 << (8 - 1));
1236 __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
1237 __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
1239 __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2));
1240 __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8));
1242 __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2));
1243 __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8));
1245 __m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
1246 _mm_shuffle_epi8(chunk1, sx0_id1));
1247 __m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
1248 _mm_shuffle_epi8(chunk1, sx1_id1));
1250 res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
1251 res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
1253 res = _mm_srli_epi16(res, 8);
1254 res = _mm_packus_epi16(res, res);
1255 _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
1258 for (; x < dwidth; x++) {
1259 uint16_t res = 1 << (8 - 1);
1261 res += mulq16(alpha0[x], vert_sum_[id + 0]);
1262 res += mulq16(alpha1[x], vert_sum_[id + 1]);
1263 pdst_row[x] = saturateU32toU8(res >> 8);
1265 } else if (x_max_count == 3) {
1267 for (; x <= dwidth - 8; x += 8) {
1268 __m128i res = _mm_set1_epi16(1 << (8 - 1));
1272 __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
1273 __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
1274 __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
1276 __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3));
1277 __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8));
1278 __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16));
1280 __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3));
1281 __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8));
1282 __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16));
1284 __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3));
1285 __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8));
1286 __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16));
1288 __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
1289 _mm_shuffle_epi8(chunk1, sx0_id1)),
1290 _mm_shuffle_epi8(chunk2, sx0_id2));
1291 __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
1292 _mm_shuffle_epi8(chunk1, sx1_id1)),
1293 _mm_shuffle_epi8(chunk2, sx1_id2));
1294 __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
1295 _mm_shuffle_epi8(chunk1, sx2_id1)),
1296 _mm_shuffle_epi8(chunk2, sx2_id2));
1298 res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
1299 res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
1300 res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
1302 res = _mm_srli_epi16(res, 8);
1303 res = _mm_packus_epi16(res, res);
1304 _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
1307 for (; x < dwidth; x++) {
1308 uint16_t res = 1 << (8 - 1);
1310 res += mulq16(alpha0[x], vert_sum_[id + 0]);
1311 res += mulq16(alpha1[x], vert_sum_[id + 1]);
1312 res += mulq16(alpha2[x], vert_sum_[id + 2]);
1313 pdst_row[x] = saturateU32toU8(res >> 8);
1315 } else if (x_max_count == 4) {
1317 for (; x <= dwidth - 8; x += 8) {
1318 __m128i res = _mm_set1_epi16(1 << (8 - 1));
1322 __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
1323 __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
1324 __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
1325 __m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24));
1327 __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4));
1328 __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8));
1329 __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16));
1330 __m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24));
1332 __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4));
1333 __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8));
1334 __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16));
1335 __m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24));
1337 __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4));
1338 __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8));
1339 __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16));
1340 __m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24));
1342 __m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4));
1343 __m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8));
1344 __m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16));
1345 __m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24));
1347 __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
1348 _mm_shuffle_epi8(chunk1, sx0_id1)),
1349 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2),
1350 _mm_shuffle_epi8(chunk3, sx0_id3)));
1351 __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
1352 _mm_shuffle_epi8(chunk1, sx1_id1)),
1353 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2),
1354 _mm_shuffle_epi8(chunk3, sx1_id3)));
1355 __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
1356 _mm_shuffle_epi8(chunk1, sx2_id1)),
1357 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2),
1358 _mm_shuffle_epi8(chunk3, sx2_id3)));
1359 __m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0),
1360 _mm_shuffle_epi8(chunk1, sx3_id1)),
1361 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2),
1362 _mm_shuffle_epi8(chunk3, sx3_id3)));
1364 res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
1365 res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
1366 res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
1367 res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3));
1369 res = _mm_srli_epi16(res, 8);
1370 res = _mm_packus_epi16(res, res);
1371 _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
1374 for (; x < dwidth; x++) {
1375 uint16_t res = 1 << (8 - 1);
1377 res += mulq16(alpha0[x], vert_sum_[id + 0]);
1378 res += mulq16(alpha1[x], vert_sum_[id + 1]);
1379 res += mulq16(alpha2[x], vert_sum_[id + 2]);
1380 res += mulq16(alpha3[x], vert_sum_[id + 3]);
1381 pdst_row[x] = saturateU32toU8(res >> 8);
1383 } else if (x_max_count <= 7) {
1385 for (; x <= dwidth - 8; x += 8) {
1386 __m128i res = _mm_set1_epi16(1 << (16 - 8 - 1));
1387 for (int i = 0; i < x_max_count; i++) {
1388 __m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i],
1389 xalpha[x * x_max_count + x_max_count * 1 + i],
1390 xalpha[x * x_max_count + x_max_count * 2 + i],
1391 xalpha[x * x_max_count + x_max_count * 3 + i],
1392 xalpha[x * x_max_count + x_max_count * 4 + i],
1393 xalpha[x * x_max_count + x_max_count * 5 + i],
1394 xalpha[x * x_max_count + x_max_count * 6 + i],
1395 xalpha[x * x_max_count + x_max_count * 7 + i]);
1396 __m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i],
1397 vert_sum_[xsi[x + 1] + i],
1398 vert_sum_[xsi[x + 2] + i],
1399 vert_sum_[xsi[x + 3] + i],
1400 vert_sum_[xsi[x + 4] + i],
1401 vert_sum_[xsi[x + 5] + i],
1402 vert_sum_[xsi[x + 6] + i],
1403 vert_sum_[xsi[x + 7] + i]);
1405 res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum));
1407 res = _mm_srli_epi16(res, 8);
1408 res = _mm_packus_epi16(res, res);
1409 _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
1412 for (; x < dwidth; x++) {
1413 uint16_t res = 1 << (8 - 1);
1414 for (int i = 0; i < x_max_count; i++) {
1415 uint16_t a = xalpha[x * x_max_count + i];
1416 int sx = xsi[x] + i;
1418 res += mulq16(a, vert_sum_[sx]);
1420 pdst_row[x] = saturateU32toU8(res >> 8);
1423 for (int x = 0; x < dwidth; x++) {
1424 uint16_t res = 1 << (8 - 1);
1425 __m128i vres = _mm_setzero_si128();
1429 for (; i <= x_max_count - 8; i += 8) {
1430 __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i));
1431 __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i));
1433 vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s));
1435 vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2));
1436 vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4));
1437 vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8));
1438 res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7));
1440 for (; i < x_max_count; i++) {
1441 uint16_t a = xalpha[x * x_max_count + i];
1442 uint16_t s = vert_sum_[id + i];
1444 res += mulq16(a, s);
1447 pdst_row[x] = saturateU32toU8(res >> 8);
1453 //------------------------------------------------------------------------------
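// Merge: interleave C planar input rows into one row, out[C*l + c] = in_c[l].
// Split: de-interleave one row into C planar output rows. Each kernel handles
// full vectors (16 bytes for 8U, 4 floats for 32F) with v_store_interleave /
// v_load_deinterleave; a remaining tail of a long row is covered by stepping
// back and re-doing the last full vector (cf. the `w = inSz.width*chanNum - 8`
// pattern above), and rows shorter than one vector fall through to the scalar
// loop.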
1455 void mergeRow_8UC2(const uint8_t in0[],
1456 const uint8_t in1[],
1463 for (; l <= length - 16; l += 16) {
1465 r0 = v_load(&in0[l]);
1466 r1 = v_load(&in1[l]);
1467 v_store_interleave(&out[2*l], r0, r1);
1470 if (l < length && length >= 16) {
1476 for (; l < length; l++) {
1477 out[2*l + 0] = in0[l];
1478 out[2*l + 1] = in1[l];
1482 void mergeRow_8UC3(const uint8_t in0[],
1483 const uint8_t in1[],
1484 const uint8_t in2[],
1491 for (; l <= length - 16; l += 16) {
1492 v_uint8x16 r0, r1, r2;
1493 r0 = v_load(&in0[l]);
1494 r1 = v_load(&in1[l]);
1495 r2 = v_load(&in2[l]);
1496 v_store_interleave(&out[3*l], r0, r1, r2);
1499 if (l < length && length >= 16) {
1505 for (; l < length; l++) {
1506 out[3*l + 0] = in0[l];
1507 out[3*l + 1] = in1[l];
1508 out[3*l + 2] = in2[l];
1512 void mergeRow_8UC4(const uint8_t in0[],
1513 const uint8_t in1[],
1514 const uint8_t in2[],
1515 const uint8_t in3[],
1522 for (; l <= length - 16; l += 16) {
1523 v_uint8x16 r0, r1, r2, r3;
1524 r0 = v_load(&in0[l]);
1525 r1 = v_load(&in1[l]);
1526 r2 = v_load(&in2[l]);
1527 r3 = v_load(&in3[l]);
1528 v_store_interleave(&out[4*l], r0, r1, r2, r3);
1531 if (l < length && length >= 16) {
1537 for (; l < length; l++) {
1538 out[4*l + 0] = in0[l];
1539 out[4*l + 1] = in1[l];
1540 out[4*l + 2] = in2[l];
1541 out[4*l + 3] = in3[l];
1545 void mergeRow_32FC2(const float in0[],
1553 for (; l <= length - 4; l += 4) {
1555 r0 = v_load(&in0[l]);
1556 r1 = v_load(&in1[l]);
1557 v_store_interleave(&out[2*l], r0, r1);
1560 if (l < length && length >= 4) {
1566 for (; l < length; l++) {
1567 out[2*l + 0] = in0[l];
1568 out[2*l + 1] = in1[l];
1572 void mergeRow_32FC3(const float in0[],
1581 for (; l <= length - 4; l += 4) {
1582 v_float32x4 r0, r1, r2;
1583 r0 = v_load(&in0[l]);
1584 r1 = v_load(&in1[l]);
1585 r2 = v_load(&in2[l]);
1586 v_store_interleave(&out[3*l], r0, r1, r2);
1589 if (l < length && length >= 4) {
1595 for (; l < length; l++) {
1596 out[3*l + 0] = in0[l];
1597 out[3*l + 1] = in1[l];
1598 out[3*l + 2] = in2[l];
1602 void mergeRow_32FC4(const float in0[],
1612 for (; l <= length - 4; l += 4) {
1613 v_float32x4 r0, r1, r2, r3;
1614 r0 = v_load(&in0[l]);
1615 r1 = v_load(&in1[l]);
1616 r2 = v_load(&in2[l]);
1617 r3 = v_load(&in3[l]);
1618 v_store_interleave(&out[4*l], r0, r1, r2, r3);
1621 if (l < length && length >= 4) {
1627 for (; l < length; l++) {
1628 out[4*l + 0] = in0[l];
1629 out[4*l + 1] = in1[l];
1630 out[4*l + 2] = in2[l];
1631 out[4*l + 3] = in3[l];
1635 void splitRow_8UC2(const uint8_t in[],
1643 for (; l <= length - 16; l += 16) {
1645 v_load_deinterleave(&in[2*l], r0, r1);
1646 v_store(&out0[l], r0);
1647 v_store(&out1[l], r1);
1649 if (l < length && length >= 16) {
1655 for (; l < length; l++) {
1656 out0[l] = in[2*l + 0];
1657 out1[l] = in[2*l + 1];
1661 void splitRow_8UC3(const uint8_t in[],
1670 for (; l <= length - 16; l += 16) {
1671 v_uint8x16 r0, r1, r2;
1672 v_load_deinterleave(&in[3*l], r0, r1, r2);
1673 v_store(&out0[l], r0);
1674 v_store(&out1[l], r1);
1675 v_store(&out2[l], r2);
1677 if (l < length && length >= 16) {
1683 for (; l < length; l++) {
1684 out0[l] = in[3*l + 0];
1685 out1[l] = in[3*l + 1];
1686 out2[l] = in[3*l + 2];
1690 void splitRow_8UC4(const uint8_t in[],
1700 for (; l <= length - 16; l += 16) {
1701 v_uint8x16 r0, r1, r2, r3;
1702 v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
1703 v_store(&out0[l], r0);
1704 v_store(&out1[l], r1);
1705 v_store(&out2[l], r2);
1706 v_store(&out3[l], r3);
1708 if (l < length && length >= 16) {
1714 for (; l < length; l++) {
1715 out0[l] = in[4*l + 0];
1716 out1[l] = in[4*l + 1];
1717 out2[l] = in[4*l + 2];
1718 out3[l] = in[4*l + 3];
1722 void splitRow_32FC2(const float in[],
1730 for (; l <= length - 4; l += 4) {
1732 v_load_deinterleave(&in[2*l], r0, r1);
1733 v_store(&out0[l], r0);
1734 v_store(&out1[l], r1);
1737 if (l < length && length >= 4) {
1743 for (; l < length; l++) {
1744 out0[l] = in[2*l + 0];
1745 out1[l] = in[2*l + 1];
1749 void splitRow_32FC3(const float in[],
1758 for (; l <= length - 4; l += 4) {
1759 v_float32x4 r0, r1, r2;
1760 v_load_deinterleave(&in[3*l], r0, r1, r2);
1761 v_store(&out0[l], r0);
1762 v_store(&out1[l], r1);
1763 v_store(&out2[l], r2);
1766 if (l < length && length >= 4) {
1772 for (; l < length; l++) {
1773 out0[l] = in[3*l + 0];
1774 out1[l] = in[3*l + 1];
1775 out2[l] = in[3*l + 2];
1779 void splitRow_32FC4(const float in[],
1789 for (; l <= length - 4; l += 4) {
1790 v_float32x4 r0, r1, r2, r3;
1791 v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
1792 v_store(&out0[l], r0);
1793 v_store(&out1[l], r1);
1794 v_store(&out2[l], r2);
1795 v_store(&out3[l], r3);
1798 if (l < length && length >= 4) {
1804 for (; l < length; l++) {
1805 out0[l] = in[4*l + 0];
1806 out1[l] = in[4*l + 1];
1807 out2[l] = in[4*l + 2];
1808 out3[l] = in[4*l + 3];
1812 } // namespace kernels
1814 } // namespace InferenceEngine