Pre-processing(GAPI): AVX2/AVX512 implementation of 3C/4C Resize via universal intrinsics
author Anna Khakimova <anna.khakimova@intel.com>
Fri, 29 May 2020 12:44:12 +0000 (15:44 +0300)
committer GitHub <noreply@github.com>
Fri, 29 May 2020 12:44:12 +0000 (15:44 +0300)
inference-engine/ie_bridges/c/src/CMakeLists.txt
inference-engine/src/preprocessing/CMakeLists.txt
inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp
inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp
inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
inference-engine/tests_deprecated/fluid_preproc/fluid_test_computations/CMakeLists.txt
inference-engine/thirdparty/ocv/opencv_hal_avx.hpp
inference-engine/thirdparty/ocv/opencv_hal_avx512.hpp
inference-engine/thirdparty/ocv/opencv_hal_sse.hpp

index ef8527a..ab981fd 100644 (file)
@@ -21,6 +21,12 @@ target_include_directories(${TARGET_NAME} PUBLIC "${InferenceEngine_C_API_SOURCE
 
 add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
 
+# Workaround to avoid warnings caused by a bug in avx512intrin.h of GCC5
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND
+   (CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 5.5))
+    set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized")
+endif()
+
 # export
 
 export(TARGETS ${TARGET_NAME} NAMESPACE IE:: APPEND FILE "${CMAKE_BINARY_DIR}/targets.cmake")
index 9201a6e..adc52f0 100644 (file)
@@ -168,6 +168,12 @@ target_link_libraries(${TARGET_NAME} PRIVATE fluid PUBLIC inference_engine ${INT
 
 target_include_directories(${TARGET_NAME} INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}")
 
+# Workaround to avoid warnings caused by a bug in avx512intrin.h of GCC5
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND
+   (CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 5.5))
+    set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized")
+endif()
+
 if(WIN32)
     set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
 endif()
index da16de2..71c23ce 100644 (file)
@@ -5,8 +5,6 @@
 #include <algorithm>
 #include <utility>
 
-#include "ie_preprocess_gapi_kernels.hpp"
-#include "ie_preprocess_gapi_kernels_impl.hpp"
 #include "ie_preprocess_gapi_kernels_avx2.hpp"
 
 #include <immintrin.h>
@@ -44,16 +42,6 @@ namespace kernels {
 
 namespace avx {
 
-static inline v_uint16x16 v_expand_low(const v_uint8x32& a) {
-    return v_uint16x16(_mm256_unpacklo_epi8(a.val, _mm256_setzero_si256()));
-}
-
-static inline v_uint16x16 v_expand_high(const v_uint8x32& a) {
-    return v_uint16x16(_mm256_unpackhi_epi8(a.val, _mm256_setzero_si256()));
-}
-
-//------------------------------------------------------------------------------
-
 void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
                    uint8_t out[], int length) {
     mergeRow_8UC2_Impl(in0, in1, out, length);
@@ -114,8 +102,6 @@ void splitRow_32FC4(const float in[], float out0[], float out1[],
     splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
 }
 
-
-
 void calculate_nv12_to_rgb(const  uchar **srcY,
                            const  uchar *srcUV,
                                   uchar **dstRGBx,
@@ -145,6 +131,226 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
     calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
 }
 
+template<int chanNum>
+void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
+                            const uint8_t *src0[],
+                            const uint8_t *src1[],
+                            const short    alpha[],
+                            const short    clone[],  // 4 clones of alpha
+                            const short    mapsx[],
+                            const short    beta[],
+                                uint8_t    tmp[],
+                             const Size    &inSz,
+                             const Size    &outSz,
+                                    int    lpi) {
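+    // Bi-linear resize of a chanNum-channel interleaved 8U row block: the vertical pass
+    // blends rows src0/src1 by beta into tmp, the horizontal pass then blends the pixel
+    // pairs selected by mapsx with alpha into the per-channel, per-line rows of dst.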
+    constexpr int half_nlanes = (v_uint8::nlanes / 2);
+    const int shift = (half_nlanes / 4);
+
+    if (4 == lpi) {
+        GAPI_DbgAssert(inSz.width >= half_nlanes);
+
+        v_uint8 shuf_mask1 = v_setr_s8(0, 8,  4, 12, 1, 9,  5, 13,
+                                       2, 10, 6, 14, 3, 11, 7, 15,
+                                       0, 8,  4, 12, 1, 9,  5, 13,
+                                       2, 10, 6, 14, 3, 11, 7, 15);
+
+        v_uint8 shuf_mask2 = v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14,
+                                       1, 5, 9, 13, 3, 7, 11, 15,
+                                       0, 4, 8, 12, 2, 6, 10, 14,
+                                       1, 5, 9, 13, 3, 7, 11, 15);
+
+        v_uint8 shuf_mask3 = v_setr_s8(0, 1, 8,  9,  2, 3, 10, 11,
+                                       4, 5, 12, 13, 6, 7, 14, 15,
+                                       0, 1, 8,  9,  2, 3, 10, 11,
+                                       4, 5, 12, 13, 6, 7, 14, 15);
+
+        // vertical pass
+        v_int16 b0 = vx_setall_s16(beta[0]);
+        v_int16 b1 = vx_setall_s16(beta[1]);
+        v_int16 b2 = vx_setall_s16(beta[2]);
+        v_int16 b3 = vx_setall_s16(beta[3]);
+
+        for (int w = 0; w < inSz.width*chanNum; ) {
+            for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) {
+                v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]);
+                v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]);
+                v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]);
+                v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]);
+
+                v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]);
+                v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]);
+                v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]);
+                v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]);
+
+                v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+                v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+                v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+                v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+                v_int16 r0 = v_add_wrap(val1_0, t0);
+                v_int16 r1 = v_add_wrap(val1_1, t1);
+                v_int16 r2 = v_add_wrap(val1_2, t2);
+                v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                v_uint8 q0 = v_packus(r0, r1);
+                v_uint8 q1 = v_packus(r2, r3);
+
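+                // Interleave the four packed rows byte-wise so that tmp stores, for every
+                // input byte position, the 4 vertically blended values (one per output line)
+                // contiguously; the horizontal-pass gathers below read these 4-byte groups.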
+                v_uint8 q2 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q0, q1);
+                v_uint8 q3 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q0, q1);
+
+                v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1);
+                v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1);
+
+                v_uint8 q6 = v256_permute2x128<0x20>(q4, q5);
+                v_uint8 q7 = v256_permute2x128<0x31>(q4, q5);
+
+                vx_store(&tmp[4 * w + 0],  q6);
+                vx_store(&tmp[4 * w + 2 * half_nlanes], q7);
+            }
+
+            if (w < inSz.width*chanNum) {
+                w = inSz.width*chanNum - half_nlanes;
+            }
+        }
+
+        // horizontal pass
+        v_uint8 val_0, val_1, val_2, val_3;
+        GAPI_DbgAssert(outSz.width >= half_nlanes);
+        for (int x = 0; x < outSz.width; ) {
+            for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+                v_int16 a10 = vx_load(&clone[4 * x]);
+                v_int16 a32 = vx_load(&clone[4 * (x + 4)]);
+                v_int16 a54 = vx_load(&clone[4 * (x + 8)]);
+                v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
+
+                for (int c = 0; c < chanNum; ++c) {
+                    v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0);
+                    v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift);
+                    v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2);
+                    v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3);
+
+                    v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
+                    v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
+                    v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
+                    v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
+
+                    v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
+                    v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
+                    v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
+                    v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
+
+                    v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
+                    v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
+                    v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
+                    v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
+
+                    v_int16 r0 = v_add_wrap(val1_0, t0);
+                    v_int16 r1 = v_add_wrap(val1_1, t1);
+                    v_int16 r2 = v_add_wrap(val1_2, t2);
+                    v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                    v_uint8 q0 = v_packus(r0, r1);
+                    v_uint8 q1 = v_packus(r2, r3);
+
+                    v_uint8 q2 = v_shuffle_s8(q0, shuf_mask2);
+                    v_uint8 q3 = v_shuffle_s8(q1, shuf_mask2);
+
+                    v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3);
+                    v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3);
+
+                    v_uint8 q6 = v256_permute4x64<0xD8>(q4);
+                    v_uint8 q7 = v256_permute4x64<0xD8>(q5);
+
+                    v_uint8 q8 = v_shuffle_s8(q6, shuf_mask3);
+                    v_uint8 q9 = v_shuffle_s8(q7, shuf_mask3);
+
+                    v_store_low(&dst[c][0][x], q8);
+                    v_store_high(&dst[c][1][x], q8);
+                    v_store_low(&dst[c][2][x], q9);
+                    v_store_high(&dst[c][3][x], q9);
+                }
+            }
+
+            if (x < outSz.width) {
+                x = outSz.width - half_nlanes;
+            }
+        }
+    } else {  // if any lpi
+        for (int l = 0; l < lpi; ++l) {
+            short beta0 = beta[l];
+
+            // vertical pass
+            GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
+            for (int w = 0; w < inSz.width*chanNum; ) {
+                for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) {
+                    v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
+                    v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
+                    v_int16 t = v_mulhrs(s0 - s1, beta0) + s1;
+                    v_pack_u_store(tmp + w, t);
+                }
+
+                if (w < inSz.width*chanNum) {
+                    w = inSz.width*chanNum - half_nlanes;
+                }
+            }
+
+            // horizontal pass
+            GAPI_DbgAssert(outSz.width >= half_nlanes);
+
+            for (int x = 0; x < outSz.width; ) {
+                for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+                    for (int c = 0; c < chanNum; ++c) {
+                        v_int16 a0 = vx_load(&alpha[x]);        // as signed Q1.1.14
+                        v_int16 sx = vx_load(&mapsx[x]);        // as integer (int16)
+                        v_int16 t0 = v_gather_chan<chanNum>(tmp, sx, c, 0);
+                        v_int16 t1 = v_gather_chan<chanNum>(tmp, sx, c, 1);
+                        v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
+                        v_pack_u_store(&dst[c][l][x], d);
+                    }
+                }
+
+                if (x < outSz.width) {
+                    x = outSz.width - half_nlanes;
+                }
+            }
+        }
+    }
+}
+
+// Resize (bi-linear, 8UC3)
+void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                      const short    alpha[],
+                      const short    clone[],  // 4 clones of alpha
+                      const short    mapsx[],
+                      const short    beta[],
+                      uint8_t  tmp[],
+                      const Size    &inSz,
+                      const Size    &outSz,
+                      int      lpi) {
+    constexpr const int chanNum = 3;
+
+    calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
+// Resize (bi-linear, 8UC4)
+void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                      const short    alpha[],
+                      const short    clone[],  // 4 clones of alpha
+                      const short    mapsx[],
+                      const short    beta[],
+                      uint8_t  tmp[],
+                      const Size    &inSz,
+                      const Size    &outSz,
+                      int      lpi) {
+    constexpr const int chanNum = 4;
+
+    calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
 void copyRow_8U(const uint8_t in[], uint8_t out[], int length) {
     copyRow_8U_impl(in, out, length);
 }
index 6b6e4cf..5b900d5 100644 (file)
@@ -4,10 +4,7 @@
 
 #include <algorithm>
 #include <utility>
-#include <cstring>
 
-#include "ie_preprocess_gapi_kernels.hpp"
-#include "ie_preprocess_gapi_kernels_impl.hpp"
 #include "ie_preprocess_gapi_kernels_avx512.hpp"
 
 #include <immintrin.h>
@@ -38,17 +35,6 @@ namespace gapi {
 namespace kernels {
 
 namespace avx512 {
-//----------------------------------------------------------------------
-
-static inline v_uint16x32 v_expand_low(const v_uint8x64& a) {
-    return v_uint16x32(_mm512_unpacklo_epi8(a.val, _mm512_setzero_si512()));
-}
-
-static inline v_uint16x32 v_expand_high(const v_uint8x64& a) {
-    return v_uint16x32(_mm512_unpackhi_epi8(a.val, _mm512_setzero_si512()));
-}
-
-//------------------------------------------------------------------------------
 
 void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
                    uint8_t out[], int length) {
@@ -110,8 +96,6 @@ void splitRow_32FC4(const float in[], float out0[], float out1[],
     splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
 }
 
-
-
 void calculate_nv12_to_rgb(const  uchar **srcY,
                            const  uchar *srcUV,
                                   uchar **dstRGBx,
@@ -141,6 +125,278 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
     calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
 }
 
+// Resize (bi-linear, 8U, generic number of channels)
+template<int chanNum>
+void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
+                            const uint8_t *src0[],
+                            const uint8_t *src1[],
+                            const short    alpha[],
+                            const short    clone[],  // 4 clones of alpha
+                            const short    mapsx[],
+                            const short    beta[],
+                                uint8_t    tmp[],
+                             const Size    &inSz,
+                             const Size    &outSz,
+                                    int    lpi) {
+    constexpr int half_nlanes = (v_uint8::nlanes / 2);
+    const int shift = (half_nlanes / 4);
+
+    if (4 == lpi) {
+        GAPI_DbgAssert(inSz.width >= half_nlanes);
+
+        v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8,  12, 1, 5, 9,  13,
+                                       2, 6, 10, 14, 3, 7, 11, 15,
+                                       0, 4, 8,  12, 1, 5, 9,  13,
+                                       2, 6, 10, 14, 3, 7, 11, 15,
+                                       0, 4, 8,  12, 1, 5, 9,  13,
+                                       2, 6, 10, 14, 3, 7, 11, 15,
+                                       0, 4, 8,  12, 1, 5, 9,  13,
+                                       2, 6, 10, 14, 3, 7, 11, 15);
+
+        v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8,  9,  12, 13,
+                                       2, 3, 6, 7, 10, 11, 14, 15,
+                                       0, 1, 4, 5, 8,  9,  12, 13,
+                                       2, 3, 6, 7, 10, 11, 14, 15,
+                                       0, 1, 4, 5, 8,  9,  12, 13,
+                                       2, 3, 6, 7, 10, 11, 14, 15,
+                                       0, 1, 4, 5, 8,  9,  12, 13,
+                                       2, 3, 6, 7, 10, 11, 14, 15);
+
+        v_uint32 idx1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0);
+        v_uint32 idx2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8);
+        v_uint32 idx3 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
+        v_uint32 idx4 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
+
+        // vertical pass
+        v_int16 b0 = vx_setall_s16(beta[0]);
+        v_int16 b1 = vx_setall_s16(beta[1]);
+        v_int16 b2 = vx_setall_s16(beta[2]);
+        v_int16 b3 = vx_setall_s16(beta[3]);
+
+        for (int w = 0; w < inSz.width*chanNum; ) {
+            for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) {
+                v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]);
+                v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]);
+                v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]);
+                v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]);
+
+                v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]);
+                v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]);
+                v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]);
+                v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]);
+
+                v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+                v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+                v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+                v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+                v_int16 r0 = v_add_wrap(val1_0, t0);
+                v_int16 r1 = v_add_wrap(val1_1, t1);
+                v_int16 r2 = v_add_wrap(val1_2, t2);
+                v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                v_uint8 q0 = v_packus(r0, r1);
+                v_uint8 q1 = v_packus(r2, r3);
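+                // Both variants below interleave the four packed rows byte-wise so that tmp
+                // stores the 4 vertically blended values per input byte contiguously for the
+                // horizontal-pass gathers.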
+#if 1
+                v_uint8 q2 = v_permutex2_s32(q0, q1, idx1);
+                v_uint8 q3 = v_permutex2_s32(q0, q1, idx2);
+
+                v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1);
+                v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1);
+
+                // Second variant of the decompose; it will be useful in the future.
+#else
+                v_uint8 q2 = v_mblend_shiftleft(q0, q1);
+                v_uint8 q3 = v_mblend_shiftright(q0, q1);
+
+                v_uint8 mask1 = v_setr_s8(0, 8,  4, 12, 1, 9,  5, 13,
+                                          2, 10, 6, 14, 3, 11, 7, 15,
+                                          0, 8,  4, 12, 1, 9,  5, 13,
+                                          2, 10, 6, 14, 3, 11, 7, 15,
+                                          0, 8,  4, 12, 1, 9,  5, 13,
+                                          2, 10, 6, 14, 3, 11, 7, 15,
+                                          0, 8,  4, 12, 1, 9,  5, 13,
+                                          2, 10, 6, 14, 3, 11, 7, 15);
+
+                v_uint8 q4 = v_shuffle_s8(q2, mask1);
+                v_uint8 q5 = v_shuffle_s8(q3, mask1);
+
+                v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0);
+                v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4);
+
+                v_uint8 q6 = v_permutex2_s64(q4, q5, idx1);
+                v_uint8 q7 = v_permutex2_s64(q4, q5, idx2);
+#endif
+
+                vx_store(&tmp[4 * w + 0], q4);
+                vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
+            }
+
+            if (w < inSz.width*chanNum) {
+                w = inSz.width*chanNum - half_nlanes;
+            }
+        }
+
+        // horizontal pass
+        v_uint8 val_0, val_1, val_2, val_3;
+
+        GAPI_DbgAssert(outSz.width >= half_nlanes);
+        for (int x = 0; x < outSz.width; ) {
+            for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+                v_int16 a10 = vx_load(&clone[4 * x]);
+                v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
+                v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
+                v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
+
+                for (int c = 0; c < chanNum; ++c) {
+                    v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0);
+                    v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift);
+                    v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2);
+                    v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3);
+
+                    v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
+                    v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
+                    v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
+                    v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
+
+                    v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
+                    v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
+                    v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
+                    v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
+
+                    v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
+                    v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
+                    v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
+                    v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
+
+                    v_int16 r0 = v_add_wrap(val1_0, t0);
+                    v_int16 r1 = v_add_wrap(val1_1, t1);
+                    v_int16 r2 = v_add_wrap(val1_2, t2);
+                    v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                    v_uint8 q0 = v_packus(r0, r1);
+                    v_uint8 q1 = v_packus(r2, r3);
+
+                    v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1);
+                    v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1);
+#if 1
+                    v_uint8 q4 = v_permutex2_s32(q2, q3, idx3);
+                    v_uint8 q5 = v_permutex2_s32(q2, q3, idx4);
+
+                    v_uint8 q6 = v_shuffle_s8(q4, shuf_mask2);
+                    v_uint8 q7 = v_shuffle_s8(q5, shuf_mask2);
+
+                    // Second variant of the decompose; it will be useful in the future.
+#else
+                    v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
+                    v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
+
+                    v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+
+                    v_uint8 q6 = v_permutex_s32(idx, q4);
+                    v_uint8 q7 = v_permutex_s32(idx, q5);
+
+                    v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8,  9,  12, 13,
+                                              2, 3, 6, 7, 10, 11, 14, 15,
+                                              0, 1, 4, 5, 8,  9,  12, 13,
+                                              2, 3, 6, 7, 10, 11, 14, 15,
+                                              0, 1, 4, 5, 8,  9,  12, 13,
+                                              2, 3, 6, 7, 10, 11, 14, 15,
+                                              0, 1, 4, 5, 8,  9,  12, 13,
+                                              2, 3, 6, 7, 10, 11, 14, 15);
+
+                    v_uint8 q8 = v_shuffle_s8(q6, mask2);
+                    v_uint8 q9 = v_shuffle_s8(q7, mask2);
+#endif
+                    v_store_low(&dst[c][0][x],  q6);
+                    v_store_high(&dst[c][1][x], q6);
+                    v_store_low(&dst[c][2][x],  q7);
+                    v_store_high(&dst[c][3][x], q7);
+                }
+            }
+
+            if (x < outSz.width) {
+                x = outSz.width - half_nlanes;
+            }
+        }
+    } else {  // if any lpi
+        for (int l = 0; l < lpi; ++l) {
+            short beta0 = beta[l];
+
+            // vertical pass
+            GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
+            for (int w = 0; w < inSz.width*chanNum; ) {
+                for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) {
+                    v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
+                    v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
+                    v_int16 t = v_mulhrs(s0 - s1, beta0) + s1;
+                    v_pack_u_store(tmp + w, t);
+                }
+
+                if (w < inSz.width*chanNum) {
+                    w = inSz.width*chanNum - half_nlanes;
+                }
+            }
+
+            // horizontal pass
+            GAPI_DbgAssert(outSz.width >= half_nlanes);
+
+            for (int x = 0; x < outSz.width; ) {
+                for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+                    for (int c = 0; c < chanNum; ++c) {
+                        v_int16 a0 = vx_load(&alpha[x]);        // as signed Q1.1.14
+                        v_int16 sx = vx_load(&mapsx[x]);        // as integer (int16)
+                        v_int16 t0 = v_gather_chan<chanNum>(tmp, sx, c, 0);
+                        v_int16 t1 = v_gather_chan<chanNum>(tmp, sx, c, 1);
+                        v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
+                        v_pack_u_store(&dst[c][l][x], d);
+                    }
+                }
+
+                if (x < outSz.width) {
+                    x = outSz.width - half_nlanes;
+                }
+            }
+        }
+    }
+}
+
+// Resize (bi-linear, 8UC3)
+void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                      const short    alpha[],
+                      const short    clone[],  // 4 clones of alpha
+                      const short    mapsx[],
+                      const short    beta[],
+                      uint8_t  tmp[],
+                      const Size    &inSz,
+                      const Size    &outSz,
+                      int      lpi) {
+    constexpr const int chanNum = 3;
+
+    calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
+// Resize (bi-linear, 8UC4)
+void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                      const short    alpha[],
+                      const short    clone[],  // 4 clones of alpha
+                      const short    mapsx[],
+                      const short    beta[],
+                      uint8_t  tmp[],
+                      const Size    &inSz,
+                      const Size    &outSz,
+                      int      lpi) {
+    constexpr const int chanNum = 4;
+
+    calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
 void copyRow_8U(const uint8_t in[], uint8_t out[], int length) {
     copyRow_8U_impl(in, out, length);
 }
@@ -153,3 +409,4 @@ void copyRow_32F(const float in[], float out[], int length) {
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
+
index cf121f4..8b994d8 100644 (file)
@@ -50,18 +50,6 @@ namespace InferenceEngine {
 namespace gapi {
 namespace kernels {
 
-//----------------------------------------------------------------------
-
-static inline v_uint16x8 v_expand_low(const v_uint8x16& a) {
-    return v_uint16x8(_mm_unpacklo_epi8(a.val, _mm_setzero_si128()));
-}
-
-static inline v_uint16x8 v_expand_high(const v_uint8x16& a) {
-    return v_uint16x8(_mm_unpackhi_epi8(a.val, _mm_setzero_si128()));
-}
-
-//------------------------------------------------------------------------------
-
 // Resize (bi-linear, 8U)
 void calcRowLinear_8U(uint8_t *dst[],
                 const uint8_t *src0[],
@@ -485,9 +473,12 @@ void calcRowLinear_8U(uint8_t *dst[],
     }
 }
 
+// The universal-intrinsic 3C/4C resize implementation for SSE4.2 is sometimes a bit slower than the original one.
+// Remove the original implementation once the cause is found.
+#if 1
 // Resize (bi-linear, 8U, generic number of channels)
 template<int chanNum>
-void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
+void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
                   const uint8_t *src0[],
                   const uint8_t *src1[],
                   const short    alpha[],
@@ -498,9 +489,11 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
                   const Size    &inSz,
                   const Size    &outSz,
                         int      lpi) {
+    const int half_nlanes = (v_uint8::nlanes / 2);
+
     if (4 == lpi) {
         // vertical pass
-        GAPI_DbgAssert(inSz.width >= 8);
+        GAPI_DbgAssert(inSz.width >= half_nlanes);
 
         __m128i b0 = _mm_set1_epi16(beta[0]);
         __m128i b1 = _mm_set1_epi16(beta[1]);
@@ -508,7 +501,7 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
         __m128i b3 = _mm_set1_epi16(beta[3]);
 
         for (int w = 0; w < inSz.width*chanNum; ) {
-            for (; w <= inSz.width*chanNum - 8 && w >= 0; w += 8) {
+            for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) {
                 //--------------------------------------------
                 // reworked from: ie_preprocess_data_sse42.cpp
                 //      function: resize_bilinear_u8
@@ -558,14 +551,14 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
             }
 
             if (w < inSz.width*chanNum) {
-                w = inSz.width*chanNum - 8;
+                w = inSz.width*chanNum - half_nlanes;
             }
         }
 
         // horizontal pass
-        GAPI_DbgAssert(outSz.width >= 8);
+        GAPI_DbgAssert(outSz.width >= half_nlanes);
         for (int x = 0; x < outSz.width; ) {
-            for (; x <= outSz.width - 8 && x >= 0; x += 8) {
+            for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
                 //--------------------------------------------
                 // reworked from: ie_preprocess_data_sse42.cpp
                 //      function: resize_bilinear_u8
@@ -645,17 +638,18 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
             }
 
             if (x < outSz.width) {
-                x = outSz.width - 8;
+                x = outSz.width - half_nlanes;
             }
         }
+
     } else {  // if any lpi
         for (int l = 0; l < lpi; l++) {
             short beta0 = beta[l];
 
             // vertical pass
-            GAPI_DbgAssert(inSz.width*chanNum >= 8);
+            GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
             for (int w = 0; w < inSz.width*chanNum; ) {
-                for (; w <= inSz.width*chanNum - 8; w += 8) {
+                for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) {
                     v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
                     v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
                     v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
@@ -663,14 +657,14 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
                 }
 
                 if (w < inSz.width*chanNum) {
-                    w = inSz.width*chanNum - 8;
+                    w = inSz.width*chanNum - half_nlanes;
                 }
             }
 
             // horizontal pass
-            GAPI_DbgAssert(outSz.width >= 8);
+            GAPI_DbgAssert(outSz.width >= half_nlanes);
             for (int x = 0; x < outSz.width; ) {
-                for (; x <= outSz.width - 8 && x >= 0; x += 8) {
+                for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
                     for (int c = 0; c < chanNum; c++) {
                         v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
                         v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
@@ -682,12 +676,186 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
                 }
 
                 if (x < outSz.width) {
-                    x = outSz.width - 8;
+                    x = outSz.width - half_nlanes;
+                }
+            }
+        }
+    }
+}
+#else
+// The universal-intrinsic 3C/4C resize implementation for SSE4.2 is sometimes a bit slower.
+// It will be enabled once the cause is found.
+template<int chanNum>
+void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
+                            const uint8_t *src0[],
+                            const uint8_t *src1[],
+                            const short    alpha[],
+                            const short    clone[],  // 4 clones of alpha
+                            const short    mapsx[],
+                            const short    beta[],
+                            uint8_t  tmp[],
+                            const Size    &inSz,
+                            const Size    &outSz,
+                            int      lpi) {
+    const int half_nlanes = (v_uint8::nlanes / 2);
+
+    if (4 == lpi) {
+        // vertical pass
+        GAPI_DbgAssert(inSz.width >= half_nlanes);
+
+        v_int16 b0 = vx_setall_s16(beta[0]);
+        v_int16 b1 = vx_setall_s16(beta[1]);
+        v_int16 b2 = vx_setall_s16(beta[2]);
+        v_int16 b3 = vx_setall_s16(beta[3]);
+
+        for (int w = 0; w < inSz.width*chanNum; ) {
+            for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) {
+                v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
+                v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
+                v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
+                v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
+
+                v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
+                v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
+                v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
+                v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
+
+                v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+                v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+                v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+                v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+                v_int16 r0 = v_add_wrap(val1_0, t0);
+                v_int16 r1 = v_add_wrap(val1_1, t1);
+                v_int16 r2 = v_add_wrap(val1_2, t2);
+                v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                v_uint8 q0 = v_packus(r0, r1);
+                v_uint8 q1 = v_packus(r2, r3);
+
+                v_uint8 q2 = v_blend_shiftleft<0xCC  /*0b11001100*/, 4>(q0, q1);
+                v_uint8 q3 = v_blend_shiftright<0xCC  /*0b11001100*/, 4>(q0, q1);
+
+                v_uint8 mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15);
+
+                v_uint8 q4 = v_shuffle_s8(q2, mask);
+                v_uint8 q5 = v_shuffle_s8(q3, mask);
+
+                vx_store(&tmp[4 * w + 0], q4);
+                vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
+            }
+
+            if (w < inSz.width*chanNum) {
+                w = inSz.width*chanNum - half_nlanes;
+            }
+        }
+
+        // horizontal pass
+        GAPI_DbgAssert(outSz.width >= half_nlanes);
+        for (int x = 0; x < outSz.width; ) {
+            for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+                v_int16 a10 = vx_load(&clone[4 * x]);
+                v_int16 a32 = vx_load(&clone[4 * (x + 2)]);
+                v_int16 a54 = vx_load(&clone[4 * (x + 4)]);
+                v_int16 a76 = vx_load(&clone[4 * (x + 6)]);
+
+                v_uint8 val_0 = vx_setzero_u8();
+                v_uint8 val_1 = vx_setzero_u8();
+                v_uint8 val_2 = vx_setzero_u8();
+                v_uint8 val_3 = vx_setzero_u8();
+
+                for (int c = 0; c < chanNum; ++c) {
+                    int shift = (half_nlanes / 4);
+
+                    v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0);
+                    v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift);
+                    v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2);
+                    v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3);
+
+                    v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
+                    v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
+                    v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
+                    v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
+
+                    v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
+                    v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
+                    v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
+                    v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
+
+                    v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
+                    v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
+                    v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
+                    v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
+
+                    v_int16 r0 = v_add_wrap(val1_0, t0);
+                    v_int16 r1 = v_add_wrap(val1_1, t1);
+                    v_int16 r2 = v_add_wrap(val1_2, t2);
+                    v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                    v_uint8 q0 = v_packus(r0, r1);
+                    v_uint8 q1 = v_packus(r2, r3);
+
+                    v_uint8 mask = v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15);
+
+                    v_uint8 q2 = v_shuffle_s8(q0, mask);
+                    v_uint8 q3 = v_shuffle_s8(q1, mask);
+
+                    v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3);
+                    v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3);
+
+                    v_store_low(&dst[c][0][x], q4);
+                    v_store_high(&dst[c][1][x], q4);
+                    v_store_low(&dst[c][2][x], q5);
+                    v_store_high(&dst[c][3][x], q5);
+                }
+            }
+
+            if (x < outSz.width) {
+                x = outSz.width - half_nlanes;
+            }
+        }
+
+    } else {  // if any lpi
+        for (int l = 0; l < lpi; ++l) {
+            short beta0 = beta[l];
+
+            // vertical pass
+            GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
+            for (int w = 0; w < inSz.width*chanNum; ) {
+                for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) {
+                    v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
+                    v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
+                    v_int16 t = v_mulhrs(s0 - s1, beta0) + s1;
+                    v_pack_u_store(tmp + w, t);
+                }
+
+                if (w < inSz.width*chanNum) {
+                    w = inSz.width*chanNum - half_nlanes;
+                }
+            }
+
+            // horizontal pass
+            GAPI_DbgAssert(outSz.width >= half_nlanes);
+            for (int x = 0; x < outSz.width; ) {
+                for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+                    for (int c = 0; c < chanNum; ++c) {
+                        v_int16 a0 = vx_load(&alpha[x]);        // as signed Q1.1.14
+                        v_int16 sx = vx_load(&mapsx[x]);        // as integer (int16)
+                        v_int16 t0 = v_gather_chan<chanNum>(tmp, sx, c, 0);
+                        v_int16 t1 = v_gather_chan<chanNum>(tmp, sx, c, 1);
+                        v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
+                        v_pack_u_store(&dst[c][l][x], d);
+                    }
+                }
+
+                if (x < outSz.width) {
+                    x = outSz.width - half_nlanes;
                 }
             }
         }
     }
 }
+#endif
 
 // Resize (bi-linear, 8UC3)
 void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
@@ -703,7 +871,7 @@ void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
                         int      lpi) {
     constexpr const int chanNum = 3;
 
-    calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+    calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
 }
 
 // Resize (bi-linear, 8UC4)
@@ -719,7 +887,7 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
                   const Size    &outSz,
                         int      lpi) {
     constexpr const int chanNum = 4;
-    calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+    calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
 }
 
 // Resize (bi-linear, 32F)
index 667e9d6..2272ba5 100644 (file)
@@ -894,24 +894,62 @@ static void calcRowLinearC(const cv::gapi::fluid::View  & in,
         }
     }
 
-    #ifdef HAVE_SSE
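+    // Dispatch the widest available ISA first; the width checks require at least one full
+    // vector of input pixels and half a vector of output pixels for the corresponding ISA.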
+#ifdef HAVE_AVX512
+    if (with_cpu_x86_avx512_core()) {
+        if (std::is_same<T, uint8_t>::value) {
+            if (inSz.width >= 64 && outSz.width >= 32) {
+                avx512::calcRowLinear_8UC<numChan>(dst,
+                                                   reinterpret_cast<const uint8_t**>(src0),
+                                                   reinterpret_cast<const uint8_t**>(src1),
+                                                   reinterpret_cast<const short*>(alpha),
+                                                   reinterpret_cast<const short*>(clone),
+                                                   reinterpret_cast<const short*>(mapsx),
+                                                   reinterpret_cast<const short*>(beta),
+                                                   reinterpret_cast<uint8_t*>(tmp),
+                                                   inSz, outSz, lpi);
+                return;
+            }
+        }
+    }
+#endif
+
+#ifdef HAVE_AVX2
+    if (with_cpu_x86_avx2()) {
+        if (std::is_same<T, uint8_t>::value) {
+            if (inSz.width >= 32 && outSz.width >= 16) {
+                avx::calcRowLinear_8UC<numChan>(dst,
+                                                reinterpret_cast<const uint8_t**>(src0),
+                                                reinterpret_cast<const uint8_t**>(src1),
+                                                reinterpret_cast<const short*>(alpha),
+                                                reinterpret_cast<const short*>(clone),
+                                                reinterpret_cast<const short*>(mapsx),
+                                                reinterpret_cast<const short*>(beta),
+                                                reinterpret_cast<uint8_t*>(tmp),
+                                                inSz, outSz, lpi);
+                return;
+            }
+        }
+    }
+#endif
+
+#ifdef HAVE_SSE
     if (with_cpu_x86_sse42()) {
         if (std::is_same<T, uint8_t>::value) {
             if (inSz.width >= 16 && outSz.width >= 8) {
                 calcRowLinear_8UC<numChan>(dst,
-                                   reinterpret_cast<const uint8_t**>(src0),
-                                   reinterpret_cast<const uint8_t**>(src1),
-                                   reinterpret_cast<const short*>(alpha),
-                                   reinterpret_cast<const short*>(clone),
-                                   reinterpret_cast<const short*>(mapsx),
-                                   reinterpret_cast<const short*>(beta),
-                                   reinterpret_cast<uint8_t*>(tmp),
-                                   inSz, outSz, lpi);
+                                           reinterpret_cast<const uint8_t**>(src0),
+                                           reinterpret_cast<const uint8_t**>(src1),
+                                           reinterpret_cast<const short*>(alpha),
+                                           reinterpret_cast<const short*>(clone),
+                                           reinterpret_cast<const short*>(mapsx),
+                                           reinterpret_cast<const short*>(beta),
+                                           reinterpret_cast<uint8_t*>(tmp),
+                                           inSz, outSz, lpi);
                 return;
             }
         }
     }
-    #endif  // HAVE_SSE
+#endif  // HAVE_SSE
 
     auto length = out[0].get().length();
 
index 94b935f..36b3d9a 100644 (file)
@@ -7,6 +7,12 @@ file(GLOB HDR *.hpp)
 
 add_library(fluid_test_computations SHARED ${SRC} ${HDR})
 
+# Workaround to avoid warnings caused by a bug in avx512intrin.h of GCC5
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND
+   (CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 5.5))
+    set_target_properties(fluid_test_computations PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized")
+endif()
+
 target_include_directories(fluid_test_computations PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
 
 target_link_libraries(fluid_test_computations PRIVATE inference_engine_preproc_s inference_engine fluid)
index eb592b1..046f604 100644 (file)
@@ -48,7 +48,7 @@ inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
 { return _mm256_permute2f128_pd(a, b, imm); }
 
 template<int imm, typename _Tpvec>
-inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
+static inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
 { return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
 
 template<int imm>
@@ -60,7 +60,7 @@ inline __m256d _v256_permute4x64(const __m256d& a)
 { return _mm256_permute4x64_pd(a, imm); }
 
 template<int imm, typename _Tpvec>
-inline _Tpvec v256_permute4x64(const _Tpvec& a)
+static inline _Tpvec v256_permute4x64(const _Tpvec& a)
 { return _Tpvec(_v256_permute4x64<imm>(a.val)); }
 
 inline __m128i _v256_extract_high(const __m256i& v)
@@ -730,6 +730,11 @@ OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16,  v_int32x8,   short,    _mm256_cvtepi16_e
 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8,  v_uint64x4,  unsigned, _mm256_cvtepu32_epi64)
 OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8,   v_int64x4,   int,      _mm256_cvtepi32_epi64)
 
+static inline v_int16x16 v_load_ccache_expand(const uchar* ptr)
+{
+    return v_int16x16(_mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i*)ptr)));
+}
+
 inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
 {
     __m256i ad = _mm256_srai_epi16(a.val, 8);
@@ -1925,6 +1930,37 @@ inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
     return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
 }
 
+static inline v_uint8x32 v_packus(const v_int16x16& a, const v_int16x16& b)
+{
+    return v_uint8x32(_mm256_packus_epi16(a.val, b.val));
+}
+
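+// Blend helpers for the resize transposition: b is byte-shifted left (or a right) by
+// `shift` within each 128-bit lane, and `mask` selects 16-bit fields of the blend.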
+template<int mask, int shift>
+static inline v_uint8x32 v_blend_shiftleft(const v_uint8x32& a, const v_uint8x32& b)
+{
+    return v_uint8x32(_mm256_blend_epi16(a.val, _mm256_slli_si256(b.val, shift), mask));
+}
+
+template<int mask, int shift>
+static inline v_uint8x32 v_blend_shiftright(const v_uint8x32& a, const v_uint8x32& b)
+{
+    return v_uint8x32(_mm256_blend_epi16(_mm256_srli_si256(a.val, shift), b.val, mask));
+}
+
+static inline v_uint8x32 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
+                                   char b5, char b6, char b7, char b8, char b9,
+                                   char b10, char b11, char b12, char b13, char b14,
+                                   char b15, char b16, char b17, char b18, char b19,
+                                   char b20, char b21, char b22, char b23, char b24,
+                                   char b25, char b26, char b27, char b28, char b29,
+                                   char b30, char b31)
+{
+    return v_uint8x32(_mm256_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7,
+                        b8, b9, b10, b11, b12, b13, b14, b15,
+                        b16, b17, b18, b19, b20, b21, b22, b23,
+                        b24, b25, b26, b27, b28, b29, b30, b31));
+}
+
 inline void v_pack_store(schar* ptr, const v_int16x16& a)
 { v_store_low(ptr, v_pack(a, a)); }
 
@@ -3075,9 +3111,7 @@ static inline v_uint16x16 v_mulhi(const v_uint16x16& a, uint16_t b)
 
 static inline v_int16x16 v_mulhrs(const v_int16x16& a, const v_int16x16& b)
 {
-    v_int16x16 r;
-    r.val = _mm256_mulhrs_epi16(a.val, b.val);
-    return r;
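+    // pmulhrsw: per-lane (a * b + 2^14) >> 15, i.e. a rounded fixed-point multiply
+    // for the Q1.14 resize coefficients.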
+    return v_int16x16(_mm256_mulhrs_epi16(a.val, b.val));
 }
 
 static inline v_int16x16 v_mulhrs(const v_int16x16& a, short b)
@@ -3110,6 +3144,49 @@ static inline v_float32x8 operator* (const v_float32x8& a, float b)
     return a * v256_setall_f32(b);
 }
 
+static inline v_uint8x32 v_shuffle_s8(const v_uint8x32& a, const v_uint8x32& mask)
+{
+    return v_uint8x32(_mm256_shuffle_epi8(a.val, mask.val));
+}
+
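+// For channel c, gather the 4-byte groups (one byte per processed line) of the 4 output
+// pixels selected by mapsx[x + shift .. x + shift + 3] into the low 128-bit lane, and of
+// their right neighbours (mapsx[..] + 1) into the high lane.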
+static inline void v_gather_channel(v_uint8x32& vec, const uint8_t tmp[], const short mapsx[],
+                                    int chanNum, int c, int x, int shift)
+{
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 0] + c)]), 0);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 1] + c)]), 1);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 2] + c)]), 2);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 3] + c)]), 3);
+
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 4);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 5);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 2] + 1) + c)]), 6);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 7);
+}
+
+namespace {
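+    // Generic scalar gather: one uchar per lane at src[chanNum * (index[i] + pos) + channel],
+    // widened to int16 via per-element insert/extract.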
+    template<int chanNum>
+    static inline v_int16x16 v_gather_chan(const uchar src[], const v_int16x16& index, int channel, int pos) {
+        v_int16x16 r;
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 0) + pos) + channel]), 0);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 1) + pos) + channel]), 1);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 2) + pos) + channel]), 2);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 3) + pos) + channel]), 3);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 4) + pos) + channel]), 4);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 5) + pos) + channel]), 5);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 6) + pos) + channel]), 6);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 7) + pos) + channel]), 7);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 8) + pos) + channel]), 8);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 9) + pos) + channel]), 9);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 10) + pos) + channel]), 10);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 11) + pos) + channel]), 11);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 12) + pos) + channel]), 12);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 13) + pos) + channel]), 13);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 14) + pos) + channel]), 14);
+        r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 15) + pos) + channel]), 15);
+        return r;
+    }
+}  // namespace
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
index 2f88c19..1f786b7 100644 (file)
@@ -89,7 +89,7 @@ inline __m256  _v512_extract_high(const __m512& v)
 { return _mm512_extractf32x8_ps(v, 1); }
 
 inline __m256d _v512_extract_high(const __m512d& v)
-{ return _mm512_extractf64x4_pd(v, 1); }
+{ return _mm512_mask_extractf64x4_pd(_mm256_setzero_pd(), (__mmask8) -1, v, 1); }
 
 inline __m256i _v512_extract_low(const __m512i& v)
 { return _mm512_castsi512_si256(v); }
@@ -1936,7 +1936,7 @@ OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16,  schar, _mm512_cvtepi8_epi32)
 /* pack */
 // 16
 inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
-{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
+{ return v_int8x64(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8)-1, _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
 
 inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
 {
@@ -1946,7 +1946,7 @@ inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
 
 inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
 {
-    return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
+    return v_uint8x64(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8)-1, _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
 }
 
 inline void v_pack_store(schar* ptr, const v_int16x32& a)
@@ -2007,7 +2007,9 @@ void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
 
 // 32
 inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
-{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
+{ return v_int16x32(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8) -1,
+                                                 _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0),
+                                                 _mm512_packs_epi32(a.val, b.val))); }
 
 inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
 {
@@ -2016,7 +2018,9 @@ inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
 }
 
 inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
-{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
+{ return v_uint16x32(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8) -1,
+                                                  _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0),
+                                                  _mm512_packus_epi32(a.val, b.val))); }
 
 inline void v_pack_store(short* ptr, const v_int32x16& a)
 { v_store_low(ptr, v_pack(a, a)); }
@@ -2118,7 +2122,7 @@ void v_rshr_pack_store(int* ptr, const v_int64x8& a)
 
 // pack boolean
 inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
-{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
+{ return v_uint8x64(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8) -1, _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
 
 inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
                            const v_uint32x16& c, const v_uint32x16& d)
@@ -3069,9 +3073,7 @@ static inline v_uint16x32 v_mulhi(const v_uint16x32& a, uint16_t b)
 
 static inline v_int16x32 v_mulhrs(const v_int16x32& a, const v_int16x32& b)
 {
-    v_int16x32 r;
-    r.val = _mm512_mulhrs_epi16(a.val, b.val);
-    return r;
+    return v_int16x32(_mm512_mulhrs_epi16(a.val, b.val));
 }
 
 static inline v_int16x32 v_mulhrs(const v_int16x32& a, short b)
@@ -3104,6 +3106,188 @@ static inline v_float32x16 operator* (const v_float32x16& a, float b)
     return a * v512_setall_f32(b);
 }
 
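+// Blend helpers: per-16-bit-lane merge of two vectors where one operand is first
+// byte-shifted within each 128-bit lane; the compile-time 'mask' selects which
+// lanes come from which operand.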
+template<int mask, int shift>
+static inline v_uint8x64 v_mask_blend_shiftleft(const v_uint8x64& a, const v_uint8x64& b)
+{
+    return v_uint8x64(_mm512_mask_blend_epi16(mask,
+                                              a.val, _mm512_bslli_epi128(b.val, shift)));
+}
+
+template<int mask, int shift>
+static inline v_uint8x64 v_mask_blend_shiftright(const v_uint8x64& a, const v_uint8x64& b)
+{
+    return v_uint8x64(_mm512_mask_blend_epi16(mask,
+                                              _mm512_bsrli_epi128(a.val, shift), b.val));
+}
+
+static inline v_uint8x64 v_packus(const v_int16x32& a, const v_int16x32& b)
+{
+    return v_uint8x64(_mm512_packus_epi16(a.val, b.val));
+}
+
+#define word(b0, b1, b2, b3)                 \
+        (((uint32_t)((uint8_t)(b0)) << 0*8)  \
+      | ((uint32_t)((uint8_t)(b1))  << 1*8)  \
+      | ((uint32_t)((uint8_t)(b2))  << 2*8)  \
+      | ((uint32_t)((uint8_t)(b3))  << 3*8))
+
+static inline v_uint8x64 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
+                                   char b5, char b6, char b7, char b8, char b9,
+                                   char b10, char b11, char b12, char b13, char b14,
+                                   char b15, char b16, char b17, char b18, char b19,
+                                   char b20, char b21, char b22, char b23, char b24,
+                                   char b25, char b26, char b27, char b28, char b29,
+                                   char b30, char b31, char b32, char b33, char b34,
+                                   char b35, char b36, char b37, char b38, char b39,
+                                   char b40, char b41, char b42, char b43, char b44,
+                                   char b45, char b46, char b47, char b48, char b49,
+                                   char b50, char b51, char b52, char b53, char b54,
+                                   char b55, char b56, char b57, char b58, char b59,
+                                   char b60, char b61, char b62, char b63)
+{
+    return v_uint8x64(_mm512_setr_epi32(word(b0, b1, b2, b3), word(b4, b5, b6, b7), word(b8, b9, b10, b11),
+                                        word(b12, b13, b14, b15), word(b16, b17, b18, b19), word(b20, b21, b22, b23),
+                                        word(b24, b25, b26, b27), word(b28, b29, b30, b31), word(b32, b33, b34, b35),
+                                        word(b36, b37, b38, b39), word(b40, b41, b42, b43), word(b44, b45, b46, b47),
+                                        word(b48, b49, b50, b51), word(b52, b53, b54, b55), word(b56, b57, b58, b59),
+                                        word(b60, b61, b62, b63)));
+}
+
+static inline v_uint64x8 v_set_s64(int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0)
+{
+    return v_uint64x8(_mm512_set_epi64(b7, b6, b5, b4, b3, b2, b1, b0));
+}
+
+static inline v_uint32x16 v_set_s32(int b15, int b14, int b13, int b12, int b11, int b10, int b9, int b8,
+                                    int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0)
+{
+    return v_uint32x16(_mm512_set_epi32(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0));
+}
+
+static inline v_uint8x64 v_shuffle_s8(const v_uint8x64& a, const v_uint8x64& mask)
+{
+    return v_uint8x64(_mm512_shuffle_epi8(a.val, mask.val));
+}
+static inline v_int16x32 v_load_ccache_expand(const uchar* ptr)
+{
+    return v_int16x32(_mm512_cvtepu8_epi16(_mm256_lddqu_si256((const __m256i*)ptr)));
+}
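+// Emulated lane insertion: returns 'target' with the 16-bit (or 32-bit) lane at
+// 'index' replaced by 'x', implemented as a masked broadcast with a one-bit mask.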
+static inline __m512i v512_insert_epi16(__m512i target, const uchar x, const int index)
+{
+    return _mm512_mask_set1_epi16(target, 1UL << index, x);
+}
+static inline __m512i v512_insert_epi32(__m512i target, const int32_t x, const int index)
+{
+    return _mm512_mask_set1_epi32(target, 1UL << index, x);
+}
+
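+// For channel 'c', gathers the 4-byte groups addressed by eight consecutive
+// mapsx[] entries (low 256 bits of 'vec') and by their right neighbours
+// (high 256 bits) from 'tmp'.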
+static inline void v_gather_channel(v_uint8x64& vec, const uint8_t tmp[], const short mapsx[],
+                                    int chanNum, int c, int x, int shift)
+{
+    __m256i vec1 = _mm256_setzero_si256();
+    __m256i vec2 = _mm256_setzero_si256();
+
+    vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 0] + c)]), 0);
+    vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 1] + c)]), 1);
+    vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 2] + c)]), 2);
+    vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 3] + c)]), 3);
+    vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 4] + c)]), 4);
+    vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 5] + c)]), 5);
+    vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 6] + c)]), 6);
+    vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 7] + c)]), 7);
+
+    vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 0);
+    vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 1);
+    vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 2] + 1) + c)]), 2);
+    vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 3);
+    vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 4] + 1) + c)]), 4);
+    vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 5] + 1) + c)]), 5);
+    vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 6] + 1) + c)]), 6);
+    vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 7] + 1) + c)]), 7);
+
+    vec.val = _mm512_inserti32x8(_mm512_castsi256_si512(vec1), vec2, 1);
+}
+
+static inline v_uint8x64 v_permutex2_s64(const v_uint8x64& a, const v_uint8x64& b, const v_uint64x8& idxs)
+{
+    return v_uint8x64(_mm512_permutex2var_epi64(a.val, idxs.val, b.val));
+}
+
+static inline v_uint8x64 v_permutex_s32(const v_uint8x64& a, const v_uint64x8& idxs)
+{
+    return v_uint8x64(_mm512_permutexvar_epi32(idxs.val, a.val));
+}
+
+static inline v_uint8x64 v_permutex2_s32(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& idxs)
+{
+    return v_uint8x64(_mm512_permutex2var_epi32(a.val, idxs.val, b.val));
+}
+
+#if defined(__GNUC__)
+
+// GCC may not provide _mm512_cvtsi512_si32; define a local inline fallback so it
+// has internal linkage and is safe to include from multiple translation units.
+static inline int _mm512_cvtsi512_si32(__m512i a)
+{
+    __v16si b = (__v16si)a;
+    return b[0];
+}
+
+#endif
+
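+// Emulated lane extraction: a masked valign rotates the requested 32-bit lane
+// into position 0, which is then read out as a scalar; the 16-bit variant picks
+// the high or low half of that lane.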
+template <int index>
+static inline int v512_extract_epi32(__m512i target)
+{
+    return _mm512_cvtsi512_si32(_mm512_mask_alignr_epi32(_mm512_setzero_si512(), (__mmask16)-1, target, target, index));
+}
+
+template <int index>
+static inline int v512_extract_epi16(__m512i target)
+{
+    return (v512_extract_epi32<index/2>(target) >> (index % 2 ? 16 : 0)) & 0xFFFF;
+}
+
+namespace {
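+    // Per-lane gather: 16-bit lane i of the result holds the byte
+    // src[chanNum*(index[i] + pos) + channel], zero-extended from 8 bits.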
+    template<int chanNum>
+    static inline v_int16x32 v_gather_chan(const uchar src[], const v_int16x32& index, int channel, int pos) {
+        v_int16x32 r;
+
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<0>(index.val) + pos) + channel]), 0);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<1>(index.val) + pos) + channel]), 1);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<2>(index.val) + pos) + channel]), 2);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<3>(index.val) + pos) + channel]), 3);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<4>(index.val) + pos) + channel]), 4);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<5>(index.val) + pos) + channel]), 5);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<6>(index.val) + pos) + channel]), 6);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<7>(index.val) + pos) + channel]), 7);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<8>(index.val) + pos) + channel]), 8);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<9>(index.val) + pos) + channel]), 9);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<10>(index.val) + pos) + channel]), 10);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<11>(index.val) + pos) + channel]), 11);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<12>(index.val) + pos) + channel]), 12);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<13>(index.val) + pos) + channel]), 13);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<14>(index.val) + pos) + channel]), 14);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<15>(index.val) + pos) + channel]), 15);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<16>(index.val) + pos) + channel]), 16);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<17>(index.val) + pos) + channel]), 17);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<18>(index.val) + pos) + channel]), 18);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<19>(index.val) + pos) + channel]), 19);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<20>(index.val) + pos) + channel]), 20);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<21>(index.val) + pos) + channel]), 21);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<22>(index.val) + pos) + channel]), 22);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<23>(index.val) + pos) + channel]), 23);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<24>(index.val) + pos) + channel]), 24);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<25>(index.val) + pos) + channel]), 25);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<26>(index.val) + pos) + channel]), 26);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<27>(index.val) + pos) + channel]), 27);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<28>(index.val) + pos) + channel]), 28);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<29>(index.val) + pos) + channel]), 29);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<30>(index.val) + pos) + channel]), 30);
+        r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<31>(index.val) + pos) + channel]), 31);
+
+        return r;
+    }
+}  // namespace
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
index cfeb296..1e75ee7 100644 (file)
@@ -371,6 +371,12 @@ inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x
 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
 
 //////////////// PACK ///////////////
+static inline v_uint8x16 v_packus(const v_int16x8& a, const v_int16x8& b) {
+    v_uint8x16 res;
+    res.val = _mm_packus_epi16(a.val, b.val);
+    return res;
+}
+
 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
 {
     __m128i delta = _mm_set1_epi16(255);
@@ -1526,7 +1532,17 @@ inline _Tpwsvec v_load_expand(const _Tps* ptr) \
 { \
     __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
     return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
-}
+}\
+inline _Tpwuvec v_expand_low(const _Tpuvec& a) {                    \
+    /* zero-extend the low half of 'a' into the wider lanes */      \
+    _Tpwuvec res;                                                    \
+    res.val = _mm_unpacklo_##suffix(a.val, _mm_setzero_si128());    \
+    return res;                                                      \
+}                                                                    \
+inline _Tpwuvec v_expand_high(const _Tpuvec& a) {                   \
+    /* zero-extend the high half of 'a' into the wider lanes */     \
+    _Tpwuvec res;                                                    \
+    res.val = _mm_unpackhi_##suffix(a.val, _mm_setzero_si128());    \
+    return res;                                                      \
+}
 
 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
@@ -2921,6 +2937,12 @@ static inline v_int16x8 v_saturate_s16(const v_int32x4& a) {
     return r;
 }
 
+static inline v_uint8x16 v_packus_s16(const v_int16x8& a, const v_int16x8& b) {
+    v_uint8x16 r;
+    r.val = _mm_packus_epi16(a.val, b.val);
+    return r;
+}
+
 // for each j=index[k], load two chars src[j] and src[j+1]
 static inline v_uint8x16 v_gather_pairs(const uchar src[], const v_int16x8& index) {
     v_uint8x16 r;
@@ -3030,6 +3052,47 @@ static inline v_float32x4 operator* (const v_float32x4& a, float b) {
     return a * v_setall_f32(b);
 }
 
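+// Blend helpers: per-16-bit-lane merge of two vectors where one operand is first
+// byte-shifted; the compile-time 'mask' selects which lanes come from which operand.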
+template<int mask, int shift>
+static inline v_uint8x16 v_blend_shiftleft(const v_uint8x16& a, const v_uint8x16& b) {
+    v_uint8x16 res;
+    res.val = _mm_blend_epi16(a.val, _mm_slli_si128(b.val, shift), mask /*0xCC 0b11001100*/);
+    return res;
+}
+
+template<int mask, int shift>
+static inline v_uint8x16 v_blend_shiftright(const v_uint8x16& a, const v_uint8x16& b) {
+    v_uint8x16 res;
+    res.val = _mm_blend_epi16(_mm_srli_si128(a.val, shift), b.val, mask /*0xCC 0b11001100*/);
+    return res;
+}
+
+static inline v_uint8x16 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
+                                   char b5, char b6, char b7, char b8, char b9,
+                                   char b10, char b11, char b12, char b13, char b14,
+                                   char b15) {
+    v_uint8x16 res;
+    res.val = _mm_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8,
+                            b9, b10, b11, b12, b13, b14, b15);
+    return res;
+}
+
+static inline v_uint8x16 v_shuffle_s8(const v_uint8x16& a, const v_uint8x16& mask) {
+    v_uint8x16 res;
+    res.val = _mm_shuffle_epi8(a.val, mask.val);
+    return res;
+}
+
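+// For channel 'c', gathers the 4-byte groups addressed by two consecutive mapsx[]
+// entries and by their right neighbours from 'tmp' into the four 32-bit lanes of 'vec'.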
+static inline void v_gather_channel(v_uint8x16& vec, const uint8_t tmp[], const short mapsx[],
+                                    int chanNum, int c, int x, int shift)
+{
+    vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 0] + c)]), 0);
+    vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + shift + 1] + c)]), 1);
+
+    vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 2);
+    vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 3);
+}
+
 //! @}
 
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END