From: Anna Khakimova Date: Fri, 29 May 2020 12:44:12 +0000 (+0300) Subject: Pre-processing(GAPI): AVX2/AVX512 implementation of 3C/4C Resize via universal intrin... X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=be3b711972442435479136419a0f58b3713d4bc2;p=platform%2Fupstream%2Fdldt.git Pre-processing(GAPI): AVX2/AVX512 implementation of 3C/4C Resize via universal intrinsics. (#612) --- diff --git a/inference-engine/ie_bridges/c/src/CMakeLists.txt b/inference-engine/ie_bridges/c/src/CMakeLists.txt index ef8527a..ab981fd 100644 --- a/inference-engine/ie_bridges/c/src/CMakeLists.txt +++ b/inference-engine/ie_bridges/c/src/CMakeLists.txt @@ -21,6 +21,12 @@ target_include_directories(${TARGET_NAME} PUBLIC "${InferenceEngine_C_API_SOURCE add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}) +# Workaround to avoid warnings caused with bug in the avx512intrin.h of GCC5 +if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND + (CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 5.5)) + set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized") +endif() + # export export(TARGETS ${TARGET_NAME} NAMESPACE IE:: APPEND FILE "${CMAKE_BINARY_DIR}/targets.cmake") diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt index 9201a6e..adc52f0 100644 --- a/inference-engine/src/preprocessing/CMakeLists.txt +++ b/inference-engine/src/preprocessing/CMakeLists.txt @@ -168,6 +168,12 @@ target_link_libraries(${TARGET_NAME} PRIVATE fluid PUBLIC inference_engine ${INT target_include_directories(${TARGET_NAME} INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}") +# Workaround to avoid warnings caused with bug in the avx512intrin.h of GCC5 +if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND + (CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 5.5)) + set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized") +endif() + if(WIN32) set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) endif() diff --git a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp index da16de2..71c23ce 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp @@ -5,8 +5,6 @@ #include #include -#include "ie_preprocess_gapi_kernels.hpp" -#include "ie_preprocess_gapi_kernels_impl.hpp" #include "ie_preprocess_gapi_kernels_avx2.hpp" #include @@ -44,16 +42,6 @@ namespace kernels { namespace avx { -static inline v_uint16x16 v_expand_low(const v_uint8x32& a) { - return v_uint16x16(_mm256_unpacklo_epi8(a.val, _mm256_setzero_si256())); -} - -static inline v_uint16x16 v_expand_high(const v_uint8x32& a) { - return v_uint16x16(_mm256_unpackhi_epi8(a.val, _mm256_setzero_si256())); -} - -//------------------------------------------------------------------------------ - void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[], uint8_t out[], int length) { mergeRow_8UC2_Impl(in0, in1, out, length); @@ -114,8 +102,6 @@ void splitRow_32FC4(const float in[], float out0[], float out1[], splitRow_32FC4_Impl(in, out0, out1, out2, out3, length); } - - void calculate_nv12_to_rgb(const uchar **srcY, const uchar *srcUV, uchar **dstRGBx, @@ -145,6 +131,226 @@ void calcRowArea_32F(float dst[], const float *src[], const 
Size& inSz, calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf); } +template +void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size &inSz, + const Size &outSz, + int lpi) { + constexpr int half_nlanes = (v_uint8::nlanes / 2); + const int shift = (half_nlanes / 4); + + if (4 == lpi) { + GAPI_DbgAssert(inSz.width >= half_nlanes); + + v_uint8 shuf_mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15, + 0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15); + + v_uint8 shuf_mask2 = v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15, + 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15); + + v_uint8 shuf_mask3 = v_setr_s8(0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15); + + // vertical pass + v_int16 b0 = vx_setall_s16(beta[0]); + v_int16 b1 = vx_setall_s16(beta[1]); + v_int16 b2 = vx_setall_s16(beta[2]); + v_int16 b3 = vx_setall_s16(beta[3]); + + for (int w = 0; w < inSz.width*chanNum; ) { + for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) { + v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]); + v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]); + v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]); + v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]); + + v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]); + v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]); + v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]); + v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 q2 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q0, q1); + v_uint8 q3 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q0, q1); + + v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1); + v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1); + + v_uint8 q6 = v256_permute2x128<0x20>(q4, q5); + v_uint8 q7 = v256_permute2x128<0x31>(q4, q5); + + vx_store(&tmp[4 * w + 0], q6); + vx_store(&tmp[4 * w + 2 * half_nlanes], q7); + } + + if (w < inSz.width*chanNum) { + w = inSz.width*chanNum - half_nlanes; + } + } + + // horizontal pass + v_uint8 val_0, val_1, val_2, val_3; + GAPI_DbgAssert(outSz.width >= half_nlanes); + for (int x = 0; x < outSz.width; ) { + for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { + v_int16 a10 = vx_load(&clone[4 * x]); + v_int16 a32 = vx_load(&clone[4 * (x + 4)]); + v_int16 a54 = vx_load(&clone[4 * (x + 8)]); + v_int16 a76 = vx_load(&clone[4 * (x + 12)]); + + for (int c = 0; c < chanNum; ++c) { + v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0); + v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift); + v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2); + v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3); + + v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0)); + v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1)); + v_int16 val0_2 = 
v_reinterpret_as_s16(v_expand_low(val_2)); + v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3)); + + v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0)); + v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1)); + v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2)); + v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3)); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 q2 = v_shuffle_s8(q0, shuf_mask2); + v_uint8 q3 = v_shuffle_s8(q1, shuf_mask2); + + v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3); + v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3); + + v_uint8 q6 = v256_permute4x64<0xD8>(q4); + v_uint8 q7 = v256_permute4x64<0xD8>(q5); + + v_uint8 q8 = v_shuffle_s8(q6, shuf_mask3); + v_uint8 q9 = v_shuffle_s8(q7, shuf_mask3); + + v_store_low(&dst[c][0][x], q8); + v_store_high(&dst[c][1][x], q8); + v_store_low(&dst[c][2][x], q9); + v_store_high(&dst[c][3][x], q9); + } + } + + if (x < outSz.width) { + x = outSz.width - half_nlanes; + } + } + } else { // if any lpi + for (int l = 0; l < lpi; ++l) { + short beta0 = beta[l]; + + // vertical pass + GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); + for (int w = 0; w < inSz.width*chanNum; ) { + for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) { + v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); + v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); + v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; + v_pack_u_store(tmp + w, t); + } + + if (w < inSz.width*chanNum) { + w = inSz.width*chanNum - half_nlanes; + } + } + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + + for (int x = 0; x < outSz.width; ) { + for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { + for (int c = 0; c < chanNum; ++c) { + v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 + v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) + v_int16 t0 = v_gather_chan(tmp, sx, c, 0); + v_int16 t1 = v_gather_chan(tmp, sx, c, 1); + v_int16 d = v_mulhrs(t0 - t1, a0) + t1; + v_pack_u_store(&dst[c][l][x], d); + } + } + + if (x < outSz.width) { + x = outSz.width - half_nlanes; + } + } + } + } +} + +// Resize (bi-linear, 8UC3) +void calcRowLinear_8U(C3, std::array, 3> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size &inSz, + const Size &outSz, + int lpi) { + constexpr const int chanNum = 3; + + calcRowLinear_8UC_Impl(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi); +} + +// Resize (bi-linear, 8UC4) +void calcRowLinear_8U(C4, std::array, 4> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size &inSz, + const Size &outSz, + int lpi) { + constexpr const int chanNum = 4; + + calcRowLinear_8UC_Impl(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi); +} + void copyRow_8U(const uint8_t in[], uint8_t out[], int length) { 
copyRow_8U_impl(in, out, length); } diff --git a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp index 6b6e4cf..5b900d5 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp @@ -4,10 +4,7 @@ #include #include -#include -#include "ie_preprocess_gapi_kernels.hpp" -#include "ie_preprocess_gapi_kernels_impl.hpp" #include "ie_preprocess_gapi_kernels_avx512.hpp" #include @@ -38,17 +35,6 @@ namespace gapi { namespace kernels { namespace avx512 { -//---------------------------------------------------------------------- - -static inline v_uint16x32 v_expand_low(const v_uint8x64& a) { - return v_uint16x32(_mm512_unpacklo_epi8(a.val, _mm512_setzero_si512())); -} - -static inline v_uint16x32 v_expand_high(const v_uint8x64& a) { - return v_uint16x32(_mm512_unpackhi_epi8(a.val, _mm512_setzero_si512())); -} - -//------------------------------------------------------------------------------ void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[], uint8_t out[], int length) { @@ -110,8 +96,6 @@ void splitRow_32FC4(const float in[], float out0[], float out1[], splitRow_32FC4_Impl(in, out0, out1, out2, out3, length); } - - void calculate_nv12_to_rgb(const uchar **srcY, const uchar *srcUV, uchar **dstRGBx, @@ -141,6 +125,278 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf); } +// Resize (bi-linear, 8U, generic number of channels) +template +void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size &inSz, + const Size &outSz, + int lpi) { + constexpr int half_nlanes = (v_uint8::nlanes / 2); + const int shift = (half_nlanes / 4); + + if (4 == lpi) { + GAPI_DbgAssert(inSz.width >= half_nlanes); + + + v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15); + + v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15); + + v_uint32 idx1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0); + v_uint32 idx2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8); + v_uint32 idx3 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + v_uint32 idx4 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2); + + // vertical pass + v_int16 b0 = vx_setall_s16(beta[0]); + v_int16 b1 = vx_setall_s16(beta[1]); + v_int16 b2 = vx_setall_s16(beta[2]); + v_int16 b3 = vx_setall_s16(beta[3]); + + for (int w = 0; w < inSz.width*chanNum; ) { + for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) { + v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]); + v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]); + v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]); + v_int16 val0_3 = 
v_load_ccache_expand(&src0[3][w]); + + v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]); + v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]); + v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]); + v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); +#if 1 + v_uint8 q2 = v_permutex2_s32(q0, q1, idx1); + v_uint8 q3 = v_permutex2_s32(q0, q1, idx2); + + v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1); + v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1); + + //Second variant of decompose. It'll be usefull in the future. +#else + v_uint8 q2 = v_mblend_shiftleft(q0, q1); + v_uint8 q3 = v_mblend_shiftright(q0, q1); + + v_uint8 mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15, + 0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15, + 0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15, + 0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15); + + v_uint8 q4 = v_shuffle_s8(q2, mask1); + v_uint8 q5 = v_shuffle_s8(q3, mask1); + + v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0); + v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4); + + v_uint8 q6 = v_permutex2_s64(q4, q5, idx1); + v_uint8 q7 = v_permutex2_s64(q4, q5, idx2); +#endif + + vx_store(&tmp[4 * w + 0], q4); + vx_store(&tmp[4 * w + 2 * half_nlanes], q5); + } + + if (w < inSz.width*chanNum) { + w = inSz.width*chanNum - half_nlanes; + } + } + + // horizontal pass + v_uint8 val_0, val_1, val_2, val_3; + + GAPI_DbgAssert(outSz.width >= half_nlanes); + for (int x = 0; x < outSz.width; ) { + for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { + v_int16 a10 = vx_load(&clone[4 * x]); + v_int16 a32 = vx_load(&clone[4 * (x + 8)]); + v_int16 a54 = vx_load(&clone[4 * (x + 16)]); + v_int16 a76 = vx_load(&clone[4 * (x + 24)]); + + for (int c = 0; c < chanNum; ++c) { + v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0); + v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift); + v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2); + v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3); + + v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0)); + v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1)); + v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2)); + v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3)); + + v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0)); + v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1)); + v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2)); + v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3)); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1); + v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1); +#if 1 + v_uint8 q4 = 
v_permutex2_s32(q2, q3, idx3); + v_uint8 q5 = v_permutex2_s32(q2, q3, idx4); + + v_uint8 q6 = v_shuffle_s8(q4, shuf_mask2); + v_uint8 q7 = v_shuffle_s8(q5, shuf_mask2); + + + //Second variant of decompose. It'll be usefull in the future. +#else + v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3); + v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3); + + v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); + + v_uint8 q6 = v_permutex_s32(idx, q4); + v_uint8 q7 = v_permutex_s32(idx, q5); + + v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15); + + v_uint8 q8 = v_shuffle_s8(q6, mask2); + v_uint8 q9 = v_shuffle_s8(q7, mask2); +#endif + v_store_low(&dst[c][0][x], q6); + v_store_high(&dst[c][1][x], q6); + v_store_low(&dst[c][2][x], q7); + v_store_high(&dst[c][3][x], q7); + } + } + + if (x < outSz.width) { + x = outSz.width - half_nlanes; + } + } + } else { // if any lpi + for (int l = 0; l < lpi; ++l) { + short beta0 = beta[l]; + + // vertical pass + GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); + for (int w = 0; w < inSz.width*chanNum; ) { + for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) { + v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); + v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); + v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; + v_pack_u_store(tmp + w, t); + } + + if (w < inSz.width*chanNum) { + w = inSz.width*chanNum - half_nlanes; + } + } + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + + for (int x = 0; x < outSz.width; ) { + for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { + for (int c = 0; c < chanNum; ++c) { + v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 + v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) + v_int16 t0 = v_gather_chan(tmp, sx, c, 0); + v_int16 t1 = v_gather_chan(tmp, sx, c, 1); + v_int16 d = v_mulhrs(t0 - t1, a0) + t1; + v_pack_u_store(&dst[c][l][x], d); + } + } + + if (x < outSz.width) { + x = outSz.width - half_nlanes; + } + } + } + } +} + +// Resize (bi-linear, 8UC3) +void calcRowLinear_8U(C3, std::array, 3> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size &inSz, + const Size &outSz, + int lpi) { + constexpr const int chanNum = 3; + + calcRowLinear_8UC_Impl(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi); +} + +// Resize (bi-linear, 8UC4) +void calcRowLinear_8U(C4, std::array, 4> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size &inSz, + const Size &outSz, + int lpi) { + constexpr const int chanNum = 4; + + calcRowLinear_8UC_Impl(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi); +} + void copyRow_8U(const uint8_t in[], uint8_t out[], int length) { copyRow_8U_impl(in, out, length); } @@ -153,3 +409,4 @@ void copyRow_32F(const float in[], float out[], int length) { } // namespace kernels } // namespace gapi } // namespace InferenceEngine + diff --git 
a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp index cf121f4..8b994d8 100644 --- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp @@ -50,18 +50,6 @@ namespace InferenceEngine { namespace gapi { namespace kernels { -//---------------------------------------------------------------------- - -static inline v_uint16x8 v_expand_low(const v_uint8x16& a) { - return v_uint16x8(_mm_unpacklo_epi8(a.val, _mm_setzero_si128())); -} - -static inline v_uint16x8 v_expand_high(const v_uint8x16& a) { - return v_uint16x8(_mm_unpackhi_epi8(a.val, _mm_setzero_si128())); -} - -//------------------------------------------------------------------------------ - // Resize (bi-linear, 8U) void calcRowLinear_8U(uint8_t *dst[], const uint8_t *src0[], @@ -485,9 +473,12 @@ void calcRowLinear_8U(uint8_t *dst[], } } +// Resize 3C/4C universal intrinsic implementation for SSE42 version is a bit slower than original sometimes. +// Remove original implementation when I find a cause. +#if 1 // Resize (bi-linear, 8U, generic number of channels) template -void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, +void calcRowLinear_8UC_Impl_(std::array, chanNum> &dst, const uint8_t *src0[], const uint8_t *src1[], const short alpha[], @@ -498,9 +489,11 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, const Size &inSz, const Size &outSz, int lpi) { + const int half_nlanes = (v_uint8::nlanes / 2); + if (4 == lpi) { // vertical pass - GAPI_DbgAssert(inSz.width >= 8); + GAPI_DbgAssert(inSz.width >= half_nlanes); __m128i b0 = _mm_set1_epi16(beta[0]); __m128i b1 = _mm_set1_epi16(beta[1]); @@ -508,7 +501,7 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, __m128i b3 = _mm_set1_epi16(beta[3]); for (int w = 0; w < inSz.width*chanNum; ) { - for (; w <= inSz.width*chanNum - 8 && w >= 0; w += 8) { + for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) { //-------------------------------------------- // reworked from: ie_preprocess_data_sse42.cpp // function: resize_bilinear_u8 @@ -558,14 +551,14 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, } if (w < inSz.width*chanNum) { - w = inSz.width*chanNum - 8; + w = inSz.width*chanNum - half_nlanes; } } // horizontal pass - GAPI_DbgAssert(outSz.width >= 8); + GAPI_DbgAssert(outSz.width >= half_nlanes); for (int x = 0; x < outSz.width; ) { - for (; x <= outSz.width - 8 && x >= 0; x += 8) { + for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { //-------------------------------------------- // reworked from: ie_preprocess_data_sse42.cpp // function: resize_bilinear_u8 @@ -645,17 +638,18 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, } if (x < outSz.width) { - x = outSz.width - 8; + x = outSz.width - half_nlanes; } } + } else { // if any lpi for (int l = 0; l < lpi; l++) { short beta0 = beta[l]; // vertical pass - GAPI_DbgAssert(inSz.width*chanNum >= 8); + GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); for (int w = 0; w < inSz.width*chanNum; ) { - for (; w <= inSz.width*chanNum - 8; w += 8) { + for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) { v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w])); v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w])); v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1; @@ -663,14 +657,14 @@ void 
calcRowLinear_8UC_Impl(std::array, chanNum> &dst, } if (w < inSz.width*chanNum) { - w = inSz.width*chanNum - 8; + w = inSz.width*chanNum - half_nlanes; } } // horizontal pass - GAPI_DbgAssert(outSz.width >= 8); + GAPI_DbgAssert(outSz.width >= half_nlanes); for (int x = 0; x < outSz.width; ) { - for (; x <= outSz.width - 8 && x >= 0; x += 8) { + for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { for (int c = 0; c < chanNum; c++) { v_int16x8 a0 = v_load(&alpha[x]); // as signed Q1.1.14 v_int16x8 sx = v_load(&mapsx[x]); // as integer (int16) @@ -682,12 +676,186 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, } if (x < outSz.width) { - x = outSz.width - 8; + x = outSz.width - half_nlanes; + } + } + } + } +} +#else +// Resize 3C/4C universal intrinsic implementation for SSE42 version is a bit slower sometimes. +// Gonna turn it on when I find a cause. +template +void calcRowLinear_8UC_Impl_(std::array, chanNum> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size &inSz, + const Size &outSz, + int lpi) { + const int half_nlanes = (v_uint8::nlanes / 2); + + if (4 == lpi) { + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + + v_int16 b0 = vx_setall_s16(beta[0]); + v_int16 b1 = vx_setall_s16(beta[1]); + v_int16 b2 = vx_setall_s16(beta[2]); + v_int16 b3 = vx_setall_s16(beta[3]); + + for (int w = 0; w < inSz.width*chanNum; ) { + for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) { + v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w])); + v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w])); + v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w])); + v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w])); + + v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w])); + v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w])); + v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w])); + v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w])); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 q2 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q0, q1); + v_uint8 q3 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q0, q1); + + v_uint8 mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15); + + v_uint8 q4 = v_shuffle_s8(q2, mask); + v_uint8 q5 = v_shuffle_s8(q3, mask); + + vx_store(&tmp[4 * w + 0], q4); + vx_store(&tmp[4 * w + 2 * half_nlanes], q5); + } + + if (w < inSz.width*chanNum) { + w = inSz.width*chanNum - half_nlanes; + } + } + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + for (int x = 0; x < outSz.width; ) { + for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { + v_int16 a10 = vx_load(&clone[4 * x]); + v_int16 a32 = vx_load(&clone[4 * (x + 2)]); + v_int16 a54 = vx_load(&clone[4 * (x + 4)]); + v_int16 a76 = vx_load(&clone[4 * (x + 6)]); + + v_uint8 val_0 = vx_setzero_u8(); + v_uint8 val_1 = vx_setzero_u8(); + v_uint8 val_2 = vx_setzero_u8(); + 
v_uint8 val_3 = vx_setzero_u8(); + + for (int c = 0; c < chanNum; ++c) { + int shift = (half_nlanes / 4); + + v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0); + v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift); + v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2); + v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3); + + v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0)); + v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1)); + v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2)); + v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3)); + + v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0)); + v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1)); + v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2)); + v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3)); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 mask = v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15); + + v_uint8 q2 = v_shuffle_s8(q0, mask); + v_uint8 q3 = v_shuffle_s8(q1, mask); + + v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3); + v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3); + + v_store_low(&dst[c][0][x], q4); + v_store_high(&dst[c][1][x], q4); + v_store_low(&dst[c][2][x], q5); + v_store_high(&dst[c][3][x], q5); + } + } + + if (x < outSz.width) { + x = outSz.width - half_nlanes; + } + } + + } else { // if any lpi + for (int l = 0; l < lpi; ++l) { + short beta0 = beta[l]; + + // vertical pass + GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); + for (int w = 0; w < inSz.width*chanNum; ) { + for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) { + v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); + v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); + v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; + v_pack_u_store(tmp + w, t); + } + + if (w < inSz.width*chanNum) { + w = inSz.width*chanNum - half_nlanes; + } + } + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + for (int x = 0; x < outSz.width; ) { + for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { + for (int c = 0; c < chanNum; ++c) { + v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 + v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) + v_int16 t0 = v_gather_chan(tmp, sx, c, 0); + v_int16 t1 = v_gather_chan(tmp, sx, c, 1); + v_int16 d = v_mulhrs(t0 - t1, a0) + t1; + v_pack_u_store(&dst[c][l][x], d); + } + } + + if (x < outSz.width) { + x = outSz.width - half_nlanes; } } } } } +#endif // Resize (bi-linear, 8UC3) void calcRowLinear_8U(C3, std::array, 3> &dst, @@ -703,7 +871,7 @@ void calcRowLinear_8U(C3, std::array, 3> &dst, int lpi) { constexpr const int chanNum = 3; - calcRowLinear_8UC_Impl(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi); + calcRowLinear_8UC_Impl_(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi); } // Resize (bi-linear, 8UC4) @@ -719,7 +887,7 @@ void calcRowLinear_8U(C4, std::array, 4> &dst, const Size &outSz, int lpi) { constexpr const int chanNum = 4; - calcRowLinear_8UC_Impl(dst, src0, 
src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi); + calcRowLinear_8UC_Impl_(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi); } // Resize (bi-linear, 32F) diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index 667e9d6..2272ba5 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -894,24 +894,62 @@ static void calcRowLinearC(const cv::gapi::fluid::View & in, } } - #ifdef HAVE_SSE +#ifdef HAVE_AVX512 + if (with_cpu_x86_avx512_core()) { + if (std::is_same::value) { + if (inSz.width >= 64 && outSz.width >= 32) { + avx512::calcRowLinear_8UC(dst, + reinterpret_cast(src0), + reinterpret_cast(src1), + reinterpret_cast(alpha), + reinterpret_cast(clone), + reinterpret_cast(mapsx), + reinterpret_cast(beta), + reinterpret_cast(tmp), + inSz, outSz, lpi); + return; + } + } + } +#endif + +#ifdef HAVE_AVX2 + if (with_cpu_x86_avx2()) { + if (std::is_same::value) { + if (inSz.width >= 32 && outSz.width >= 16) { + avx::calcRowLinear_8UC(dst, + reinterpret_cast(src0), + reinterpret_cast(src1), + reinterpret_cast(alpha), + reinterpret_cast(clone), + reinterpret_cast(mapsx), + reinterpret_cast(beta), + reinterpret_cast(tmp), + inSz, outSz, lpi); + return; + } + } + } +#endif + +#ifdef HAVE_SSE if (with_cpu_x86_sse42()) { if (std::is_same::value) { if (inSz.width >= 16 && outSz.width >= 8) { calcRowLinear_8UC(dst, - reinterpret_cast(src0), - reinterpret_cast(src1), - reinterpret_cast(alpha), - reinterpret_cast(clone), - reinterpret_cast(mapsx), - reinterpret_cast(beta), - reinterpret_cast(tmp), - inSz, outSz, lpi); + reinterpret_cast(src0), + reinterpret_cast(src1), + reinterpret_cast(alpha), + reinterpret_cast(clone), + reinterpret_cast(mapsx), + reinterpret_cast(beta), + reinterpret_cast(tmp), + inSz, outSz, lpi); return; } } } - #endif // HAVE_SSE +#endif // HAVE_SSE auto length = out[0].get().length(); diff --git a/inference-engine/tests_deprecated/fluid_preproc/fluid_test_computations/CMakeLists.txt b/inference-engine/tests_deprecated/fluid_preproc/fluid_test_computations/CMakeLists.txt index 94b935f..36b3d9a 100644 --- a/inference-engine/tests_deprecated/fluid_preproc/fluid_test_computations/CMakeLists.txt +++ b/inference-engine/tests_deprecated/fluid_preproc/fluid_test_computations/CMakeLists.txt @@ -7,6 +7,12 @@ file(GLOB HDR *.hpp) add_library(fluid_test_computations SHARED ${SRC} ${HDR}) +# Workaround to avoid warnings caused with bug in the avx512intrin.h of GCC5 +if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND + (CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 5.5)) + set_target_properties(fluid_test_computations PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized") +endif() + target_include_directories(fluid_test_computations PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}") target_link_libraries(fluid_test_computations PRIVATE inference_engine_preproc_s inference_engine fluid) diff --git a/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp b/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp index eb592b1..046f604 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp @@ -48,7 +48,7 @@ inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b) { return _mm256_permute2f128_pd(a, b, imm); } template -inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b) +static inline _Tpvec 
v256_permute2x128(const _Tpvec& a, const _Tpvec& b) { return _Tpvec(_v256_permute2x128(a.val, b.val)); } template @@ -60,7 +60,7 @@ inline __m256d _v256_permute4x64(const __m256d& a) { return _mm256_permute4x64_pd(a, imm); } template -inline _Tpvec v256_permute4x64(const _Tpvec& a) +static inline _Tpvec v256_permute4x64(const _Tpvec& a) { return _Tpvec(_v256_permute4x64(a.val)); } inline __m128i _v256_extract_high(const __m256i& v) @@ -730,6 +730,11 @@ OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_e OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64) OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64) +static inline v_int16x16 v_load_ccache_expand(const uchar* ptr) +{ + return v_int16x16(_mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i*)ptr))); +} + inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b) { __m256i ad = _mm256_srai_epi16(a.val, 8); @@ -1925,6 +1930,37 @@ inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); } +static inline v_uint8x32 v_packus(const v_int16x16& a, const v_int16x16& b) +{ + return v_uint8x32(_mm256_packus_epi16(a.val, b.val)); +} + +template +static inline v_uint8x32 v_blend_shiftleft(const v_uint8x32& a, const v_uint8x32& b) +{ + return v_uint8x32(_mm256_blend_epi16(a.val, _mm256_slli_si256(b.val, shift), mask)); +} + +template +static inline v_uint8x32 v_blend_shiftright(const v_uint8x32& a, const v_uint8x32& b) +{ + return v_uint8x32(_mm256_blend_epi16(_mm256_srli_si256(a.val, shift), b.val, mask)); +} + +static inline v_uint8x32 v_setr_s8(char b0, char b1, char b2, char b3, char b4, + char b5, char b6, char b7, char b8, char b9, + char b10, char b11, char b12, char b13, char b14, + char b15, char b16, char b17, char b18, char b19, + char b20, char b21, char b22, char b23, char b24, + char b25, char b26, char b27, char b28, char b29, + char b30, char b31) +{ + return v_uint8x32(_mm256_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7, + b8, b9, b10, b11, b12, b13, b14, b15, + b16, b17, b18, b19, b20, b21, b22, b23, + b24, b25, b26, b27, b28, b29, b30, b31)); +} + inline void v_pack_store(schar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack(a, a)); } @@ -3075,9 +3111,7 @@ static inline v_uint16x16 v_mulhi(const v_uint16x16& a, uint16_t b) static inline v_int16x16 v_mulhrs(const v_int16x16& a, const v_int16x16& b) { - v_int16x16 r; - r.val = _mm256_mulhrs_epi16(a.val, b.val); - return r; + return v_int16x16(_mm256_mulhrs_epi16(a.val, b.val)); } static inline v_int16x16 v_mulhrs(const v_int16x16& a, short b) @@ -3110,6 +3144,49 @@ static inline v_float32x8 operator* (const v_float32x8& a, float b) return a * v256_setall_f32(b); } +static inline v_uint8x32 v_shuffle_s8(const v_uint8x32& a, const v_uint8x32& mask) +{ + return v_uint8x32(_mm256_shuffle_epi8(a.val, mask.val)); +} + +static inline void v_gather_channel(v_uint8x32& vec, const uint8_t tmp[], const short mapsx[], + int chanNum, int c, int x, int shift) +{ + vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 0] + c)]), 0); + vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 1] + c)]), 1); + vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 2] + c)]), 2); + vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 3] + c)]), 3); + + vec.val = 
_mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 4); + vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 5); + vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 2] + 1) + c)]), 6); + vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 7); +} + +namespace { + template + static inline v_int16x16 v_gather_chan(const uchar src[], const v_int16x16& index, int channel, int pos) { + v_int16x16 r; + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 0) + pos) + channel]), 0); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 1) + pos) + channel]), 1); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 2) + pos) + channel]), 2); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 3) + pos) + channel]), 3); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 4) + pos) + channel]), 4); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 5) + pos) + channel]), 5); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 6) + pos) + channel]), 6); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 7) + pos) + channel]), 7); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 8) + pos) + channel]), 8); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 9) + pos) + channel]), 9); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 10) + pos) + channel]), 10); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 11) + pos) + channel]), 11); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 12) + pos) + channel]), 12); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 13) + pos) + channel]), 13); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 14) + pos) + channel]), 14); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(_mm256_extract_epi16(index.val, 15) + pos) + channel]), 15); + return r; + } +} // namespace + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! 
@endcond diff --git a/inference-engine/thirdparty/ocv/opencv_hal_avx512.hpp b/inference-engine/thirdparty/ocv/opencv_hal_avx512.hpp index 2f88c19..1f786b7 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_avx512.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_avx512.hpp @@ -89,7 +89,7 @@ inline __m256 _v512_extract_high(const __m512& v) { return _mm512_extractf32x8_ps(v, 1); } inline __m256d _v512_extract_high(const __m512d& v) -{ return _mm512_extractf64x4_pd(v, 1); } +{ return _mm512_mask_extractf64x4_pd(_mm256_setzero_pd(), (__mmask8) -1, v, 1); } inline __m256i _v512_extract_low(const __m512i& v) { return _mm512_castsi512_si256(v); } @@ -1936,7 +1936,7 @@ OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32) /* pack */ // 16 inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b) -{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } +{ return v_int8x64(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8)-1, _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b) { @@ -1946,7 +1946,7 @@ inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b) inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b) { - return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val))); + return v_uint8x64(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8)-1, _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val))); } inline void v_pack_store(schar* ptr, const v_int16x32& a) @@ -2007,7 +2007,9 @@ void v_rshr_pack_store(schar* ptr, const v_int16x32& a) // 32 inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b) -{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); } +{ return v_int16x32(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8) -1, + _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), + _mm512_packs_epi32(a.val, b.val))); } inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b) { @@ -2016,7 +2018,9 @@ inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b) } inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b) -{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); } +{ return v_uint16x32(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8) -1, + _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), + _mm512_packus_epi32(a.val, b.val))); } inline void v_pack_store(short* ptr, const v_int32x16& a) { v_store_low(ptr, v_pack(a, a)); } @@ -2118,7 +2122,7 @@ void v_rshr_pack_store(int* ptr, const v_int64x8& a) // pack boolean inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b) -{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } +{ return v_uint8x64(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8) -1, _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c, const v_uint32x16& d) @@ -3069,9 +3073,7 @@ static inline v_uint16x32 v_mulhi(const v_uint16x32& a, uint16_t b) static inline v_int16x32 v_mulhrs(const v_int16x32& a, const v_int16x32& b) { - v_int16x32 r; - r.val = 
_mm512_mulhrs_epi16(a.val, b.val); - return r; + return v_int16x32(_mm512_mulhrs_epi16(a.val, b.val)); } static inline v_int16x32 v_mulhrs(const v_int16x32& a, short b) @@ -3104,6 +3106,188 @@ static inline v_float32x16 operator* (const v_float32x16& a, float b) return a * v512_setall_f32(b); } +template +static inline v_uint8x64 v_mask_blend_shiftleft(const v_uint8x64& a, const v_uint8x64& b) +{ + return v_uint8x64(_mm512_mask_blend_epi16(mask, + a.val, _mm512_bslli_epi128(b.val, shift))); +} + +template +static inline v_uint8x64 v_mask_blend_shiftright(const v_uint8x64& a, const v_uint8x64& b) +{ + return v_uint8x64(_mm512_mask_blend_epi16(mask, + _mm512_bsrli_epi128(a.val, shift), b.val)); +} + +static inline v_uint8x64 v_packus(const v_int16x32& a, const v_int16x32& b) +{ + return v_uint8x64(_mm512_packus_epi16(a.val, b.val)); +} + + +#define word(b0, b1, b2, b3) \ + (((uint32_t)((uint8_t)(b0)) << 0*8) \ + | ((uint32_t)((uint8_t)(b1)) << 1*8) \ + | ((uint32_t)((uint8_t)(b2)) << 2*8) \ + | ((uint32_t)((uint8_t)(b3)) << 3*8)) + +static inline v_uint8x64 v_setr_s8(char b0, char b1, char b2, char b3, char b4, + char b5, char b6, char b7, char b8, char b9, + char b10, char b11, char b12, char b13, char b14, + char b15, char b16, char b17, char b18, char b19, + char b20, char b21, char b22, char b23, char b24, + char b25, char b26, char b27, char b28, char b29, + char b30, char b31, char b32, char b33, char b34, + char b35, char b36, char b37, char b38, char b39, + char b40, char b41, char b42, char b43, char b44, + char b45, char b46, char b47, char b48, char b49, + char b50, char b51, char b52, char b53, char b54, + char b55, char b56, char b57, char b58, char b59, + char b60, char b61, char b62, char b63) +{ + return v_uint8x64(_mm512_setr_epi32(word(b0, b1, b2, b3), word(b4, b5, b6, b7), word(b8, b9, b10, b11), + word(b12, b13, b14, b15), word(b16, b17, b18, b19), word(b20, b21, b22, b23), + word(b24, b25, b26, b27), word(b28, b29, b30, b31), word(b32, b33, b34, b35), + word(b36, b37, b38, b39), word(b40, b41, b42, b43), word(b44, b45, b46, b47), + word(b48, b49, b50, b51), word(b52, b53, b54, b55), word(b56, b57, b58, b59), + word(b60, b61, b62, b63))); +} + +static inline v_uint64x8 v_set_s64(int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0) +{ + return v_uint64x8(_mm512_set_epi64(b7, b6, b5, b4, b3, b2, b1, b0)); +} + +static inline v_uint32x16 v_set_s32(int b15, int b14, int b13, int b12, int b11, int b10, int b9, int b8, + int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0) +{ + return v_uint32x16(_mm512_set_epi32(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0)); +} + +static inline v_uint8x64 v_shuffle_s8(const v_uint8x64& a, const v_uint8x64& mask) +{ + return v_uint8x64(_mm512_shuffle_epi8(a.val, mask.val)); +} +static inline v_int16x32 v_load_ccache_expand(const uchar* ptr) +{ + return v_int16x32(_mm512_cvtepu8_epi16(_mm256_lddqu_si256((const __m256i*)ptr))); \ +} +static inline __m512i v512_insert_epi16(__m512i target, const uchar x, const int index) +{ + return _mm512_mask_set1_epi16(target, 1UL << index, x); +} +static inline __m512i v512_insert_epi32(__m512i target, const int32_t x, const int index) +{ + return _mm512_mask_set1_epi32(target, 1UL << index, x); +} + +static inline void v_gather_channel(v_uint8x64& vec, const uint8_t tmp[], const short mapsx[], + int chanNum, int c, int x, int shift) +{ + __m256i vec1 = _mm256_setzero_si256(); + __m256i vec2 = _mm256_setzero_si256(); + + vec1 = _mm256_insert_epi32(vec1, 
*reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 0] + c)]), 0); + vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 1] + c)]), 1); + vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 2] + c)]), 2); + vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 3] + c)]), 3); + vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 4] + c)]), 4); + vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 5] + c)]), 5); + vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 6] + c)]), 6); + vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 7] + c)]), 7); + + vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 0); + vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 1); + vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 2] + 1) + c)]), 2); + vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 3); + vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 4] + 1) + c)]), 4); + vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 5] + 1) + c)]), 5); + vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 6] + 1) + c)]), 6); + vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 7] + 1) + c)]), 7); + + vec.val = _mm512_inserti32x8(_mm512_castsi256_si512(vec1), vec2, 1); +} + +static inline v_uint8x64 v_permutex2_s64(const v_uint8x64& a, const v_uint8x64& b, const v_uint64x8& idxs) +{ + return v_uint8x64(_mm512_permutex2var_epi64(a.val, idxs.val, b.val)); +} + +static inline v_uint8x64 v_permutex_s32(const v_uint8x64& a, const v_uint64x8 idxs) +{ + return v_uint8x64(_mm512_permutexvar_epi32(idxs.val, a.val)); +} + +static inline v_uint8x64 v_permutex2_s32(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16 idxs) +{ + return v_uint8x64(_mm512_permutex2var_epi32(a.val, idxs.val, b.val)); +} + +#if defined(__GNUC__) + +int _mm512_cvtsi512_si32(__m512i a) +{ + __v16si b = (__v16si)a; + return b[0]; +} + +#endif + +template +static inline int v512_extract_epi32(__m512i target) +{ + return _mm512_cvtsi512_si32(_mm512_mask_alignr_epi32(_mm512_setzero_si512(), (__mmask16)-1, target, target, index)); +} + +template +static inline int v512_extract_epi16(__m512i target) +{ + return (v512_extract_epi32(target) >> (index % 2 ? 
16 : 0)) & 0xFFFF; +} + +namespace { + template + static inline v_int16x32 v_gather_chan(const uchar src[], const v_int16x32& index, int channel, int pos) { + v_int16x32 r; + + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<0>(index.val) + pos) + channel]), 0); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<1>(index.val) + pos) + channel]), 1); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<2>(index.val) + pos) + channel]), 2); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<3>(index.val) + pos) + channel]), 3); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<4>(index.val) + pos) + channel]), 4); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<5>(index.val) + pos) + channel]), 5); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<6>(index.val) + pos) + channel]), 6); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<7>(index.val) + pos) + channel]), 7); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<8>(index.val) + pos) + channel]), 8); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<9>(index.val) + pos) + channel]), 9); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<10>(index.val) + pos) + channel]), 10); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<11>(index.val) + pos) + channel]), 11); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<12>(index.val) + pos) + channel]), 12); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<13>(index.val) + pos) + channel]), 13); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<14>(index.val) + pos) + channel]), 14); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<15>(index.val) + pos) + channel]), 15); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<16>(index.val) + pos) + channel]), 16); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<17>(index.val) + pos) + channel]), 17); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<18>(index.val) + pos) + channel]), 18); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<19>(index.val) + pos) + channel]), 19); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<20>(index.val) + pos) + channel]), 20); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<21>(index.val) + pos) + channel]), 21); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<22>(index.val) + pos) + channel]), 22); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<23>(index.val) + pos) + channel]), 23); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<24>(index.val) + pos) + channel]), 24); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<25>(index.val) + pos) + channel]), 25); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<26>(index.val) + pos) + channel]), 
26); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<27>(index.val) + pos) + channel]), 27); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<28>(index.val) + pos) + channel]), 28); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<29>(index.val) + pos) + channel]), 29); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<30>(index.val) + pos) + channel]), 30); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[chanNum*(v512_extract_epi16<31>(index.val) + pos) + channel]), 31); + + return r; + } +} // namespace + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/inference-engine/thirdparty/ocv/opencv_hal_sse.hpp b/inference-engine/thirdparty/ocv/opencv_hal_sse.hpp index cfeb296..1e75ee7 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_sse.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_sse.hpp @@ -371,6 +371,12 @@ inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); } //////////////// PACK /////////////// +static inline v_uint8x16 v_packus(const v_int16x8& a, const v_int16x8& b) { + v_uint8x16 res; + res.val = _mm_packus_epi16(a.val, b.val); + return res; +} + inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) { __m128i delta = _mm_set1_epi16(255); @@ -1526,7 +1532,17 @@ inline _Tpwsvec v_load_expand(const _Tps* ptr) \ { \ __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \ -} +}\ +inline _Tpwuvec v_expand_low(const _Tpuvec& a) { \ + _Tpwuvec res; \ + res.val = _mm_cvtepu8_epi16(a.val); \ + return res; \ +} \ +inline _Tpwuvec v_expand_high(const _Tpuvec& a) { \ + _Tpwuvec res; \ + res.val = _mm_unpackhi_epi8(a.val, _mm_setzero_si128()); \ + return res; \ +} \ OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) @@ -2921,6 +2937,12 @@ static inline v_int16x8 v_saturate_s16(const v_int32x4& a) { return r; } +static inline v_uint8x16 v_packus_s16(const v_int16x8& a, const v_int16x8& b) { + v_uint8x16 r; + r.val = _mm_packus_epi16(a.val, b.val); + return r; +} + // for each j=index[k], load two chars src[j] and src[j+1] static inline v_uint8x16 v_gather_pairs(const uchar src[], const v_int16x8& index) { v_uint8x16 r; @@ -3030,6 +3052,47 @@ static inline v_float32x4 operator* (const v_float32x4& a, float b) { return a * v_setall_f32(b); } +template +static inline v_uint8x16 v_blend_shiftleft(const v_uint8x16& a, const v_uint8x16& b) { + v_uint8x16 res; + res.val = _mm_blend_epi16(a.val, _mm_slli_si128(b.val, shift), mask /*0xCC 0b11001100*/); + return res; +} + +template +static inline v_uint8x16 v_blend_shiftright(const v_uint8x16& a, const v_uint8x16& b) { + v_uint8x16 res; + res.val = _mm_blend_epi16(_mm_srli_si128(a.val, shift), b.val, mask /*0xCC 0b11001100*/); + return res; +} + +static inline v_uint8x16 v_setr_s8(char b0, char b1, char b2, char b3, char b4, + char b5, char b6, char b7, char b8, char b9, + char b10, char b11, char b12, char b13, char b14, + char b15) { + v_uint8x16 res; + res.val = _mm_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8, + b9, b10, b11, b12, b13, b14, b15); + return res; +} + + +static inline v_uint8x16 
v_shuffle_s8(const v_uint8x16& a, const v_uint8x16& mask) { + v_uint8x16 res; + res.val = _mm_shuffle_epi8(a.val, mask.val); + return res; +} + +static inline void v_gather_channel(v_uint8x16& vec, const uint8_t tmp[], const short mapsx[], + int chanNum, int c, int x, int shift) +{ + vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 0] + c)]), 0); + vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + shift + 1] + c)]), 1); + + vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 2); + vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 3); +} + //! @} CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
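
A note on the arithmetic shared by all calcRowLinear_8UC_Impl variants above: each output lane is a blend of two 8-bit samples with a coefficient stored as signed Q1.1.14 fixed point (the alpha/beta arrays), computed as v_mulhrs(s0 - s1, coef) + s1 and then packed with unsigned saturation. Below is a minimal scalar sketch of that per-lane operation; the helper name is illustrative only (not from the patch), and the rounding term matches the mulhrs semantics ((a*b + 0x4000) >> 15) that v_mulhrs wraps on SSE4.2/AVX2/AVX-512.

#include <algorithm>
#include <cstdint>

// Scalar model of one lane of the vertical/horizontal pass:
//   t = mulhrs(s0 - s1, coef);  r = s1 + t;  pack with unsigned saturation.
static inline uint8_t blend_q1_1_14(uint8_t s0, uint8_t s1, int16_t coef /* Q1.1.14 */) {
    const int32_t diff = static_cast<int32_t>(s0) - static_cast<int32_t>(s1); // v_sub_wrap on u8 expanded to s16
    const int32_t t    = (diff * coef + (1 << 14)) >> 15;                     // v_mulhrs rounding
    const int32_t r    = static_cast<int32_t>(s1) + t;                        // v_add_wrap
    return static_cast<uint8_t>(std::min(255, std::max(0, r)));               // v_packus saturation
}

The SIMD kernels evaluate this for 8 (SSE4.2), 16 (AVX2) or 32 (AVX-512) lanes at a time (half_nlanes = v_uint8::nlanes / 2); the surrounding shuffle/blend/permute steps then reorder the interleaved per-row results so that whole output rows can be stored with v_store_low/v_store_high.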