From: Anna Khakimova Date: Tue, 7 Jul 2020 08:38:59 +0000 (+0300) Subject: Preprocessing(GAPI): Universal intrinsics (AVX2) implementation of U8C1 linear Resize... X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=987cc5ee528a04b92d24aa8ab88f6a028d017af8;p=platform%2Fupstream%2Fdldt.git Preprocessing(GAPI): Universal intrinsics (AVX2) implementation of U8C1 linear Resize. (#942) * Preprocessing(GAPI): Universal intrinsics (AVX2) implementation of U8C1 linear Resize * Refactoring --- diff --git a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp index 71c23ce..d790517 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp @@ -131,6 +131,314 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf); } +static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0, + const v_uint8& val_1, + const v_uint8& val_2, + const v_uint8& val_3, + const v_int16& a10, + const v_int16& a32, + const v_int16& a54, + const v_int16& a76, + v_uint8& shuf_mask1, + v_uint8& shuf_mask2, + v_uint8& res1, v_uint8& res2) { + v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0)); + v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1)); + v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2)); + v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3)); + + v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0)); + v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1)); + v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2)); + v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3)); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1); + v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1); + + v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3); + v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3); + + v_uint8 q6 = v256_permute4x64<0xD8>(q4); + v_uint8 q7 = v256_permute4x64<0xD8>(q5); + + res1 = v_shuffle_s8(q6, shuf_mask2); + res2 = v_shuffle_s8(q7, shuf_mask2); +} + +static inline void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[], + uint8_t tmp[], const short beta[], + const int& length, const int& half_nlanes) { + v_int16 b0 = vx_setall_s16(beta[0]); + v_int16 b1 = vx_setall_s16(beta[1]); + v_int16 b2 = vx_setall_s16(beta[2]); + v_int16 b3 = vx_setall_s16(beta[3]); + + v_uint8 shuf_mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15, + 0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15); + for (int w = 0; w < length; ) { + for (; w <= length - half_nlanes; w += half_nlanes) { + v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]); + v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]); + v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]); + v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]); + + v_int16 val1_0 
= v_load_ccache_expand(&src1[0][w]); + v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]); + v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]); + v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 q2 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q0, q1); + v_uint8 q3 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q0, q1); + + v_uint8 q4 = v_shuffle_s8(q2, shuf_mask); + v_uint8 q5 = v_shuffle_s8(q3, shuf_mask); + + v_uint8 q6 = v256_permute2x128<0x20>(q4, q5); + v_uint8 q7 = v256_permute2x128<0x31>(q4, q5); + + vx_store(&tmp[4 * w + 0], q6); + vx_store(&tmp[4 * w + 2 * half_nlanes], q7); + } + + if (w < length) { + w = length - half_nlanes; + } + } +} + +static inline void insert64(v_uint8& val, const short mapsx[], + uint8_t tmp[], const int& x, const int& shift) { + val = v_insert64<0>(val, *reinterpret_cast(&tmp[4 * mapsx[x + shift + 0]])); + val = v_insert64<1>(val, *reinterpret_cast(&tmp[4 * mapsx[x + shift + 1]])); + val = v_insert64<2>(val, *reinterpret_cast(&tmp[4 * mapsx[x + shift + 2]])); + val = v_insert64<3>(val, *reinterpret_cast(&tmp[4 * mapsx[x + shift + 3]])); +} + +static inline v_uint8 setHorizontalShufMask1() { + return v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15, + 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15); +} + +static inline v_uint8 setHorizontalShufMask2() { + return v_setr_s8(0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15); +} + +static inline void horizontalPass_lpi4_8UC1(const short clone[], const short mapsx[], + uint8_t tmp[], uint8_t* dst[], const int& length, + const int& half_nlanes) { + v_uint8 val_0, val_1, val_2, val_3, res1, res2; + constexpr int shift = 4; + v_uint8 shuf_mask1 = setHorizontalShufMask1(); + v_uint8 shuf_mask2 = setHorizontalShufMask2();; + v_uint32 idxs = v_setr_s32(0, 2, 4, 6, 1, 3, 5, 7); + + for (int x = 0; x < length; ) { + for (; x <= length - half_nlanes; x += half_nlanes) { + v_int16 a10 = vx_load(&clone[4 * x]); + v_int16 a32 = vx_load(&clone[4 * (x + 4)]); + v_int16 a54 = vx_load(&clone[4 * (x + 8)]); + v_int16 a76 = vx_load(&clone[4 * (x + 12)]); + + insert64(val_0, mapsx, tmp, x, 0); + insert64(val_1, mapsx, tmp, x, shift); + insert64(val_2, mapsx, tmp, x, shift*2); + insert64(val_3, mapsx, tmp, x, shift*3); + + val_0 = v_permutevar8x32(val_0, idxs); + val_1 = v_permutevar8x32(val_1, idxs); + val_2 = v_permutevar8x32(val_2, idxs); + val_3 = v_permutevar8x32(val_3, idxs); + + main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3, + a10, a32, a54, a76, + shuf_mask1, shuf_mask2, + res1, res2); + + v_store_low(&dst[0][x], res1); + v_store_high(&dst[1][x], res1); + v_store_low(&dst[2][x], res2); + v_store_high(&dst[3][x], res2); + } + + if (x < length) { + x = length - half_nlanes; + } + } +} + +static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[], + uint8_t tmp[], const int& beta0, const int& half_nlanes, + const int& l, const int& length1, const int& length2) { + for (int w = 0; w < length2; ) { + for (; w <= length1 - half_nlanes; 
w += half_nlanes) { + v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); + v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); + v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; + v_pack_u_store(tmp + w, t); + } + + if (w < length1) { + w = length1 - half_nlanes; + } + } +} + +static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[], + uint8_t* dst[], const uchar tmp[], const int& l, + const int& half_nlanes, const int& length) { + for (int x = 0; x < length; ) { + for (; x <= length - half_nlanes; x += half_nlanes) { + v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 + v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) + v_uint8 t = v_gather_pairs(tmp, sx); // 8 pairs of src0 pixels + v_int16 t0, t1; + v_deinterleave_expand(t, t0, t1); // tmp pixels as int16 + v_int16 d = v_mulhrs(t0 - t1, a0) + t1; + v_pack_u_store(&dst[l][x], d); + } + + if (x < length) { + x = length - half_nlanes; + } + } +} + +// 8UC1 Resize (bi-linear) +void calcRowLinear_8UC1(uint8_t* dst[], + const uint8_t* src0[], + const uint8_t* src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi) { + bool xRatioEq = inSz.width == outSz.width; + bool yRatioEq = inSz.height == outSz.height; + + constexpr int nlanes = v_uint8::nlanes; + constexpr int half_nlanes = (nlanes / 2); + + if (!xRatioEq && !yRatioEq) { + if (4 == lpi) { + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + verticalPass_lpi4_8U(src0, src1, tmp, beta, inSz.width, half_nlanes); + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes); + + } else { // if any lpi + int inLength = inSz.width; + int outLength = outSz.width; + for (int l = 0; l < lpi; ++l) { + short beta0 = beta[l]; + + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength); + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength); + } + } // if lpi == 4 + + } else if (!xRatioEq) { + GAPI_DbgAssert(yRatioEq); + + if (4 == lpi) { + // vertical pass + GAPI_DbgAssert(inSz.width >= nlanes); + for (int w = 0; w < inSz.width; ) { + for (; w <= inSz.width - nlanes; w += nlanes) { + v_uint8 s0, s1, s2, s3; + s0 = vx_load(&src0[0][w]); + s1 = vx_load(&src0[1][w]); + s2 = vx_load(&src0[2][w]); + s3 = vx_load(&src0[3][w]); + v_store_interleave(&tmp[4 * w], s0, s1, s2, s3); + } + + if (w < inSz.width) { + w = inSz.width - nlanes; + } + } + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes); + + } else { // any LPI + for (int l = 0; l < lpi; ++l) { + const uchar *src = src0[l]; + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width); + } + } + + } else if (!yRatioEq) { + GAPI_DbgAssert(xRatioEq); + int inLength = inSz.width; + int outLength = outSz.width; + + for (int l = 0; l < lpi; ++l) { + short beta0 = beta[l]; + + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l, + inLength, outLength); + } + + } else { + GAPI_DbgAssert(xRatioEq && yRatioEq); + int length = inSz.width; + + for 
(int l = 0; l < lpi; ++l) { + memcpy(dst[l], src0[l], length); + } + } +} + template void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, const uint8_t *src0[], @@ -147,75 +455,18 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, const int shift = (half_nlanes / 4); if (4 == lpi) { - GAPI_DbgAssert(inSz.width >= half_nlanes); - - v_uint8 shuf_mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15, - 0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15); - - v_uint8 shuf_mask2 = v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14, - 1, 5, 9, 13, 3, 7, 11, 15, - 0, 4, 8, 12, 2, 6, 10, 14, - 1, 5, 9, 13, 3, 7, 11, 15); - - v_uint8 shuf_mask3 = v_setr_s8(0, 1, 8, 9, 2, 3, 10, 11, - 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, - 4, 5, 12, 13, 6, 7, 14, 15); - - // vertical pass - v_int16 b0 = vx_setall_s16(beta[0]); - v_int16 b1 = vx_setall_s16(beta[1]); - v_int16 b2 = vx_setall_s16(beta[2]); - v_int16 b3 = vx_setall_s16(beta[3]); - - for (int w = 0; w < inSz.width*chanNum; ) { - for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) { - v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]); - v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]); - v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]); - v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]); - - v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]); - v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]); - v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]); - v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_packus(r0, r1); - v_uint8 q1 = v_packus(r2, r3); - - v_uint8 q2 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q0, q1); - v_uint8 q3 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q0, q1); - - v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1); - v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1); - - v_uint8 q6 = v256_permute2x128<0x20>(q4, q5); - v_uint8 q7 = v256_permute2x128<0x31>(q4, q5); - - vx_store(&tmp[4 * w + 0], q6); - vx_store(&tmp[4 * w + 2 * half_nlanes], q7); - } - - if (w < inSz.width*chanNum) { - w = inSz.width*chanNum - half_nlanes; - } - } + GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); + verticalPass_lpi4_8U(src0, src1, tmp, beta, + inSz.width*chanNum, half_nlanes); // horizontal pass - v_uint8 val_0, val_1, val_2, val_3; GAPI_DbgAssert(outSz.width >= half_nlanes); + //This variables are here to initialize them once. This variant don't affect performance. 
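+            // The pairs gathered by v_gather_channel() below are blended in
+            // main_computation_horizontalPass_lpi4() as r = v1 + v_mulhrs(v0 - v1, alpha),
+            // with alpha taken from clone[] (signed Q1.1.14); the two packed results
+            // res1/res2 are then split across the four output rows of each channel.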
+ v_uint8 val_0, val_1, val_2, val_3, res1, res2; + + v_uint8 shuf_mask1 = setHorizontalShufMask1(); + v_uint8 shuf_mask2 = setHorizontalShufMask2(); + for (int x = 0; x < outSz.width; ) { for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { v_int16 a10 = vx_load(&clone[4 * x]); @@ -229,45 +480,15 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2); v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3); - v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0)); - v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1)); - v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2)); - v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3)); - - v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0)); - v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1)); - v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2)); - v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3)); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_packus(r0, r1); - v_uint8 q1 = v_packus(r2, r3); - - v_uint8 q2 = v_shuffle_s8(q0, shuf_mask2); - v_uint8 q3 = v_shuffle_s8(q1, shuf_mask2); - - v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3); - v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3); - - v_uint8 q6 = v256_permute4x64<0xD8>(q4); - v_uint8 q7 = v256_permute4x64<0xD8>(q5); - - v_uint8 q8 = v_shuffle_s8(q6, shuf_mask3); - v_uint8 q9 = v_shuffle_s8(q7, shuf_mask3); - - v_store_low(&dst[c][0][x], q8); - v_store_high(&dst[c][1][x], q8); - v_store_low(&dst[c][2][x], q9); - v_store_high(&dst[c][3][x], q9); + main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3, + a10, a32, a54, a76, + shuf_mask1, shuf_mask2, + res1, res2); + + v_store_low(&dst[c][0][x], res1); + v_store_high(&dst[c][1][x], res1); + v_store_low(&dst[c][2][x], res2); + v_store_high(&dst[c][3][x], res2); } } @@ -281,22 +502,11 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, // vertical pass GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); - for (int w = 0; w < inSz.width*chanNum; ) { - for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) { - v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); - v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); - v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; - v_pack_u_store(tmp + w, t); - } - - if (w < inSz.width*chanNum) { - w = inSz.width*chanNum - half_nlanes; - } - } + verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, + inSz.width*chanNum, inSz.width*chanNum); // horizontal pass GAPI_DbgAssert(outSz.width >= half_nlanes); - for (int x = 0; x < outSz.width; ) { for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { for (int c = 0; c < chanNum; ++c) { diff --git a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp index b2651bd..e2ddd76 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp @@ 
-42,44 +42,44 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[], //----------------------------------------------------------------------------- -// Resize (bi-linear, 8U) -void calcRowLinear_8U(uint8_t *dst[], - const uint8_t *src0[], - const uint8_t *src1[], - const short alpha[], - const short clone[], - const short mapsx[], - const short beta[], - uint8_t tmp[], - const Size & inSz, - const Size & outSz, - int lpi); +// Resize (bi-linear, 8UC1) +void calcRowLinear_8UC1(uint8_t* dst[], + const uint8_t* src0[], + const uint8_t* src1[], + const short alpha[], + const short clone[], + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); // Resize (bi-linear, 8UC3) void calcRowLinear_8U(C3, std::array, 3> &dst, - const uint8_t *src0[], - const uint8_t *src1[], + const uint8_t* src0[], + const uint8_t* src1[], const short alpha[], const short clone[], const short mapsx[], const short beta[], - uint8_t tmp[], - const Size &inSz, - const Size &outSz, - int lpi); + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); // Resize (bi-linear, 8UC4) void calcRowLinear_8U(C4, std::array, 4> &dst, - const uint8_t *src0[], - const uint8_t *src1[], + const uint8_t* src0[], + const uint8_t* src1[], const short alpha[], const short clone[], const short mapsx[], const short beta[], - uint8_t tmp[], - const Size &inSz, - const Size &outSz, - int lpi); + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); template void calcRowLinear_8UC(std::array, numChan> &dst, diff --git a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp index 8b994d8..8d32861 100644 --- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp @@ -50,18 +50,18 @@ namespace InferenceEngine { namespace gapi { namespace kernels { -// Resize (bi-linear, 8U) -void calcRowLinear_8U(uint8_t *dst[], - const uint8_t *src0[], - const uint8_t *src1[], - const short alpha[], - const short clone[], // 4 clones of alpha - const short mapsx[], - const short beta[], - uint8_t tmp[], - const Size & inSz, - const Size & outSz, - int lpi) { +// 8UC1 Resize (bi-linear) +void calcRowLinear_8UC1( uint8_t *dst[], + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi) { bool xRatioEq1 = inSz.width == outSz.width; bool yRatioEq1 = inSz.height == outSz.height; @@ -650,9 +650,9 @@ void calcRowLinear_8UC_Impl_(std::array, chanNum> &dst, GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); for (int w = 0; w < inSz.width*chanNum; ) { for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) { - v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w])); - v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w])); - v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1; + v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); + v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); + v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; v_pack_u_store(tmp + w, t); } @@ -666,11 +666,11 @@ void calcRowLinear_8UC_Impl_(std::array, chanNum> &dst, for (int x = 0; x < outSz.width; ) { for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { 
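+                // v_int16 / vx_load are the width-agnostic universal-intrinsic aliases;
+                // on this SSE4.2 path they resolve to the 128-bit v_int16x8 / v_load,
+                // so the loop body below matches the AVX2 kernels in spelling only.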
for (int c = 0; c < chanNum; c++) { - v_int16x8 a0 = v_load(&alpha[x]); // as signed Q1.1.14 - v_int16x8 sx = v_load(&mapsx[x]); // as integer (int16) - v_int16x8 t0 = v_gather_chan(tmp, sx, c, 0); - v_int16x8 t1 = v_gather_chan(tmp, sx, c, 1); - v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1; + v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 + v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) + v_int16 t0 = v_gather_chan(tmp, sx, c, 0); + v_int16 t1 = v_gather_chan(tmp, sx, c, 1); + v_int16 d = v_mulhrs(t0 - t1, a0) + t1; v_pack_u_store(&dst[c][l][x], d); } } diff --git a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp index e0a2664..bfc5755 100644 --- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp +++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp @@ -42,17 +42,17 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[], //---------------------------------------------------------------------- // Resize (bi-linear, 8U) -void calcRowLinear_8U(uint8_t *dst[], - const uint8_t *src0[], - const uint8_t *src1[], - const short alpha[], - const short clone[], - const short mapsx[], - const short beta[], - uint8_t tmp[], - const Size & inSz, - const Size & outSz, - int lpi); +void calcRowLinear_8UC1(uint8_t *dst[], + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); // Resize (bi-linear, 8UC3) void calcRowLinear_8U(C3, std::array, 3> &dst, diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index 2272ba5..4d2f854 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -805,12 +805,32 @@ static void calcRowLinear(const cv::gapi::fluid::View & in, src1[l] = in.InLine(index1); dst[l] = out.OutLine(l); } +#if 1 + #ifdef HAVE_AVX2 + if (with_cpu_x86_avx2()) { + if (std::is_same::value) { + if (inSz.width >= 32 && outSz.width >= 16) { + avx::calcRowLinear_8UC1(reinterpret_cast(dst), + reinterpret_cast(src0), + reinterpret_cast(src1), + reinterpret_cast(alpha), + reinterpret_cast(clone), + reinterpret_cast(mapsx), + reinterpret_cast(beta), + reinterpret_cast(tmp), + inSz, outSz, lpi); + return; + } + } + } + #endif +#endif #ifdef HAVE_SSE if (with_cpu_x86_sse42()) { if (std::is_same::value) { if (inSz.width >= 16 && outSz.width >= 8) { - calcRowLinear_8U(reinterpret_cast(dst), + calcRowLinear_8UC1(reinterpret_cast(dst), reinterpret_cast(src0), reinterpret_cast(src1), reinterpret_cast(alpha), diff --git a/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp b/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp index 046f604..c057e3a 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp @@ -36,7 +36,7 @@ inline __m256d _v256_shuffle_odd_64(const __m256d& v) { return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); } template -inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) +static inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) { return _mm256_permute2x128_si256(a, b, imm); } template @@ -52,7 +52,7 @@ static inline _Tpvec v256_permute2x128(const 
_Tpvec& a, const _Tpvec& b) { return _Tpvec(_v256_permute2x128(a.val, b.val)); } template -inline __m256i _v256_permute4x64(const __m256i& a) +static inline __m256i _v256_permute4x64(const __m256i& a) { return _mm256_permute4x64_epi64(a, imm); } template @@ -1956,9 +1956,14 @@ static inline v_uint8x32 v_setr_s8(char b0, char b1, char b2, char b3, char b4, char b30, char b31) { return v_uint8x32(_mm256_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7, - b8, b9, b10, b11, b12, b13, b14, b15, - b16, b17, b18, b19, b20, b21, b22, b23, - b24, b25, b26, b27, b28, b29, b30, b31)); + b8, b9, b10, b11, b12, b13, b14, b15, + b16, b17, b18, b19, b20, b21, b22, b23, + b24, b25, b26, b27, b28, b29, b30, b31)); +} + +static inline v_uint32x8 v_setr_s32(int b0, int b1, int b2, int b3, int b4, int b5, int b6, int b7) +{ + return v_uint32x8(_mm256_setr_epi32(b0, b1, b2, b3, b4, b5, b6, b7)); } inline void v_pack_store(schar* ptr, const v_int16x16& a) @@ -3001,36 +3006,30 @@ static inline void v_deinterleave(const v_float32x8& low, const v_float32x8& hig odd .val = _mm256_unpackhi_ps(tmp0, tmp1); } -static inline void v_deinterleave(const v_uint8x32& i0, const v_uint8x32& i1, - const v_uint8x32& i2, const v_uint8x32& i3, - v_uint8x32& o0, v_uint8x32& o1, - v_uint8x32& o2, v_uint8x32& o3) +static inline void v_deinterleave(const v_uint8x32& v0, const v_uint8x32& v1, + const v_uint8x32& v2, const v_uint8x32& v3, + v_uint8x32& a, v_uint8x32& b, + v_uint8x32& c, v_uint8x32& d, + v_uint8x32& shuf_mask) { - __m256i u0 = i0.val; // a0 b0 c0 d0 a1 b1 c1 d1 ... - __m256i u1 = i1.val; // a4 b4 c4 d4 ... - __m256i u2 = i2.val; // a8 b8 c8 d8 ... - __m256i u3 = i3.val; // a12 b12 c12 d12 ... + /* a0a1a2a3 b0b1b2b3 c0c1c2c3 d0d1d2d3 a16a17a18a19 b16b17b18b19 c16c17c18c19 d16d17d18d19 */ + __m256i u0 = _mm256_shuffle_epi8(v0.val, shuf_mask.val); + /* a4a5a6a7 b4b5b6b7 c4c5c6c7 d4d5d6d7 a20a21a22a23 b20b21b22b23 c20c21c22c23 d20d21d22d23 */ + __m256i u1 = _mm256_shuffle_epi8(v1.val, shuf_mask.val); + /* a8a9a10a11 b8b9b10b11 c8c9c10c11 d8d9d10d11 */ + __m256i u2 = _mm256_shuffle_epi8(v2.val, shuf_mask.val); + __m256i u3 = _mm256_shuffle_epi8(v3.val, shuf_mask.val); - __m256i v0 = _mm256_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... - __m256i v1 = _mm256_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... - __m256i v2 = _mm256_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... - __m256i v3 = _mm256_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ... + __m256i s0 = _mm256_blend_epi16(u0, _mm256_slli_si256(u1, 4), 0xCC /*0b11001100*/); // a0a1a2a3a4a5a6a7 c0c1c2c3c4c5c6c7 a16a17a18a19a29a21a22a23 ... + __m256i s1 = _mm256_blend_epi16(_mm256_srli_si256(u0, 4), u1, 0xCC /*0b11001100*/); + __m256i s2 = _mm256_blend_epi16(u2, _mm256_slli_si256(u3, 4), 0xCC /*0b11001100*/); + __m256i s3 = _mm256_blend_epi16(_mm256_srli_si256(u2, 4), u3, 0xCC /*0b11001100*/); - u0 = _mm256_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... - u1 = _mm256_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... - u2 = _mm256_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ... - u3 = _mm256_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ... - - v0 = _mm256_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ... - v1 = _mm256_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ... - v2 = _mm256_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ... - v3 = _mm256_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ... - - o0.val = _mm256_unpacklo_epi8(v0, v1); // a0 a1 a2 a3 ... - o1.val = _mm256_unpackhi_epi8(v0, v1); // b0 b1 b2 b3 ... - o2.val = _mm256_unpacklo_epi8(v2, v3); // c0 c1 c2 c3 ... - o3.val = _mm256_unpackhi_epi8(v2, v3); // d0 d1 d2 d3 ... 
-} + a.val = _mm256_blend_epi16(s0, _mm256_slli_si256(s2, 8), 0xF0 /*0b11110000*/); + c.val = _mm256_blend_epi16(_mm256_srli_si256(s0, 8), s2, 0xF0 /*0b11110000*/); + b.val = _mm256_blend_epi16(s1, _mm256_slli_si256(s3, 8), 0xF0 /*0b11110000*/); + d.val = _mm256_blend_epi16(_mm256_srli_si256(s1, 8), s3, 0xF0 /*0b11110000*/); + } static inline v_uint8x32 v_interleave_low(const v_uint8x32& a, const v_uint8x32& b) { @@ -3090,23 +3089,17 @@ static inline void v_deinterleave_expand(const v_uint8x32& src, v_int16x16& even static inline v_int16x16 v_mulhi(const v_int16x16& a, short b) { - v_int16x16 r; - r.val = _mm256_mulhi_epi16(a.val, _mm256_set1_epi16(b)); - return r; + return v_int16x16(_mm256_mulhi_epi16(a.val, _mm256_set1_epi16(b))); } -static inline v_uint16x16 v_mulhi(const v_uint16x16& a, v_uint16x16 b) +static inline v_uint16x16 v_mulhi(const v_uint16x16& a, v_uint16x16& b) { - v_uint16x16 r; - r.val = _mm256_mulhi_epu16(a.val, b.val); - return r; + return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); } static inline v_uint16x16 v_mulhi(const v_uint16x16& a, uint16_t b) { - v_uint16x16 r; - r.val = _mm256_mulhi_epu16(a.val, _mm256_set1_epi16(b)); - return r; + return v_uint16x16(_mm256_mulhi_epu16(a.val, _mm256_set1_epi16(b))); } static inline v_int16x16 v_mulhrs(const v_int16x16& a, const v_int16x16& b) @@ -3149,6 +3142,17 @@ static inline v_uint8x32 v_shuffle_s8(const v_uint8x32& a, const v_uint8x32& mas return v_uint8x32(_mm256_shuffle_epi8(a.val, mask.val)); } +template +static inline v_uint8x32 v_insert64(v_uint8x32& a, const int64_t& i) +{ + return v_uint8x32(_mm256_insert_epi64(a.val, i, index)); +} + +static inline v_uint8x32 v_permutevar8x32(v_uint8x32& a, v_uint32x8& idxs) +{ + return v_uint8x32(_mm256_permutevar8x32_epi32(a.val, idxs.val)); +} + static inline void v_gather_channel(v_uint8x32& vec, const uint8_t tmp[], const short mapsx[], int chanNum, int c, int x, int shift) { @@ -3163,6 +3167,29 @@ static inline void v_gather_channel(v_uint8x32& vec, const uint8_t tmp[], const vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 7); } +// for each j=index[k], load two chars src[j] and src[j+1] +static inline v_uint8x32 v_gather_pairs(const uchar src[], const v_int16x16& index) { + v_uint8x32 r; + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 0)]), 0); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 1)]), 1); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 2)]), 2); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 3)]), 3); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 4)]), 4); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 5)]), 5); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 6)]), 6); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 7)]), 7); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 8)]), 8); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 9)]), 9); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 10)]), 10); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 11)]), 11); + r.val = 
_mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 12)]), 12); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 13)]), 13); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 14)]), 14); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 15)]), 15); + + return r; +} + namespace { template static inline v_int16x16 v_gather_chan(const uchar src[], const v_int16x16& index, int channel, int pos) {
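
For readers following the arithmetic: both the vertical and the horizontal passes reduce to the same fixed-point blend, r = v1 + mulhrs(v0 - v1, coeff), where mulhrs has the PMULHRSW rounding semantics ((a * b + 2^14) >> 15) and the coefficients (beta[], alpha[], clone[]) are pre-scaled signed Q1.1.14 weights. The sketch below is a scalar model of that blend for a single sample; it is illustrative only, not part of the patch, and the helper names are made up.

#include <algorithm>
#include <cstdint>

// Scalar model of v_mulhrs / _mm256_mulhrs_epi16: round((a * b) / 2^15).
static inline int16_t mulhrs_scalar(int16_t a, int16_t b) {
    return static_cast<int16_t>((static_cast<int32_t>(a) * b + 0x4000) >> 15);
}

// One output sample of the bilinear blend done per SIMD lane above:
// v0, v1 are the two neighbouring source pixels (already widened to 16 bit),
// coeff is the Q1.1.14 weight (beta[] in the vertical pass, alpha[]/clone[] in the horizontal pass).
static inline uint8_t blend_u8_scalar(uint8_t v0, uint8_t v1, int16_t coeff) {
    const int16_t diff = static_cast<int16_t>(v0 - v1);              // v_sub_wrap
    const int16_t t    = mulhrs_scalar(diff, coeff);                 // v_mulhrs
    const int     r    = v1 + t;                                     // v_add_wrap
    return static_cast<uint8_t>(std::min(255, std::max(0, r)));      // v_packus / v_pack_u_store
}

In the lpi == 4 path the vertical pass stores its result row-interleaved by column, i.e. tmp[4 * w + l] holds output row l at input column w (the v_store_interleave() branch makes the same layout explicit). That is what lets horizontalPass_lpi4_8UC1() fetch the pixel pair (mapsx[x], mapsx[x] + 1) for all four rows with a single 8-byte load in insert64(). The dispatcher in ie_preprocess_gapi_kernels.cpp takes the AVX2 route only when inSz.width >= 32 and outSz.width >= 16, which satisfies the GAPI_DbgAssert guards on nlanes and half_nlanes in every branch.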