add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
+# Workaround to avoid warnings caused by a bug in avx512intrin.h of GCC5
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND
+ (CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 5.5))
+ set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized")
+endif()
+
# export
export(TARGETS ${TARGET_NAME} NAMESPACE IE:: APPEND FILE "${CMAKE_BINARY_DIR}/targets.cmake")
target_include_directories(${TARGET_NAME} INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}")
+# Workaround to avoid warnings caused by a bug in avx512intrin.h of GCC5
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND
+ (CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 5.5))
+ set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized")
+endif()
+
if(WIN32)
set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
endif()
#include <algorithm>
#include <utility>
-#include "ie_preprocess_gapi_kernels.hpp"
-#include "ie_preprocess_gapi_kernels_impl.hpp"
#include "ie_preprocess_gapi_kernels_avx2.hpp"
#include <immintrin.h>
namespace avx {
-static inline v_uint16x16 v_expand_low(const v_uint8x32& a) {
- return v_uint16x16(_mm256_unpacklo_epi8(a.val, _mm256_setzero_si256()));
-}
-
-static inline v_uint16x16 v_expand_high(const v_uint8x32& a) {
- return v_uint16x16(_mm256_unpackhi_epi8(a.val, _mm256_setzero_si256()));
-}
-
-//------------------------------------------------------------------------------
-
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
uint8_t out[], int length) {
mergeRow_8UC2_Impl(in0, in1, out, length);
splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
-
-
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
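+// Resize (bi-linear, 8U, generic number of channels)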
+template<int chanNum>
+void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
+ const uint8_t *src0[],
+ const uint8_t *src1[],
+ const short alpha[],
+ const short clone[], // 4 clones of alpha
+ const short mapsx[],
+ const short beta[],
+ uint8_t tmp[],
+ const Size &inSz,
+ const Size &outSz,
+ int lpi) {
+ constexpr int half_nlanes = (v_uint8::nlanes / 2);
+ const int shift = (half_nlanes / 4);
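+    // half_nlanes is half of the uint8 SIMD width (16 for AVX2); both passes advance by this
+    // many elements, and 'shift' steps the horizontal-pass gathers through groups of 4 pixels.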
+
+ if (4 == lpi) {
+ GAPI_DbgAssert(inSz.width >= half_nlanes);
+
+ v_uint8 shuf_mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
+ 2, 10, 6, 14, 3, 11, 7, 15,
+ 0, 8, 4, 12, 1, 9, 5, 13,
+ 2, 10, 6, 14, 3, 11, 7, 15);
+
+ v_uint8 shuf_mask2 = v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14,
+ 1, 5, 9, 13, 3, 7, 11, 15,
+ 0, 4, 8, 12, 2, 6, 10, 14,
+ 1, 5, 9, 13, 3, 7, 11, 15);
+
+ v_uint8 shuf_mask3 = v_setr_s8(0, 1, 8, 9, 2, 3, 10, 11,
+ 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11,
+ 4, 5, 12, 13, 6, 7, 14, 15);
+
+ // vertical pass
+ v_int16 b0 = vx_setall_s16(beta[0]);
+ v_int16 b1 = vx_setall_s16(beta[1]);
+ v_int16 b2 = vx_setall_s16(beta[2]);
+ v_int16 b3 = vx_setall_s16(beta[3]);
+
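+        // Each output value is a blend of the two source rows, r = val1 + (val0 - val1) * beta,
+        // evaluated in Q1.15 fixed point with rounding (v_mulhrs), for four output lines at once.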
+ for (int w = 0; w < inSz.width*chanNum; ) {
+ for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) {
+ v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]);
+ v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]);
+ v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]);
+ v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]);
+
+ v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]);
+ v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]);
+ v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]);
+ v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]);
+
+ v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+ v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+ v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+ v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+ v_int16 r0 = v_add_wrap(val1_0, t0);
+ v_int16 r1 = v_add_wrap(val1_1, t1);
+ v_int16 r2 = v_add_wrap(val1_2, t2);
+ v_int16 r3 = v_add_wrap(val1_3, t3);
+
+ v_uint8 q0 = v_packus(r0, r1);
+ v_uint8 q1 = v_packus(r2, r3);
+
+ v_uint8 q2 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q0, q1);
+ v_uint8 q3 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q0, q1);
+
+ v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1);
+ v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1);
+
+ v_uint8 q6 = v256_permute2x128<0x20>(q4, q5);
+ v_uint8 q7 = v256_permute2x128<0x31>(q4, q5);
+
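+                // Store so that the four interpolated lines of every position w end up adjacent
+                // in tmp (tmp[4*w .. 4*w+3]); the horizontal pass reads them back as 32-bit loads.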
+ vx_store(&tmp[4 * w + 0], q6);
+ vx_store(&tmp[4 * w + 2 * half_nlanes], q7);
+ }
+
+ if (w < inSz.width*chanNum) {
+ w = inSz.width*chanNum - half_nlanes;
+ }
+ }
+
+ // horizontal pass
+ v_uint8 val_0, val_1, val_2, val_3;
+ GAPI_DbgAssert(outSz.width >= half_nlanes);
+ for (int x = 0; x < outSz.width; ) {
+ for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+ v_int16 a10 = vx_load(&clone[4 * x]);
+ v_int16 a32 = vx_load(&clone[4 * (x + 4)]);
+ v_int16 a54 = vx_load(&clone[4 * (x + 8)]);
+ v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
+
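+                // For each channel, gather the 4-line groups of the source pixel mapsx[x+i] and of
+                // its right neighbour mapsx[x+i]+1, then interpolate out = val1 + (val0 - val1) * alpha
+                // in Q1.15 with rounding; clone[] holds each alpha replicated four times to match the lanes.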
+ for (int c = 0; c < chanNum; ++c) {
+ v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0);
+ v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift);
+ v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2);
+ v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3);
+
+ v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
+ v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
+ v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
+ v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
+
+ v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
+ v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
+ v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
+ v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
+
+ v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
+ v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
+ v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
+ v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
+
+ v_int16 r0 = v_add_wrap(val1_0, t0);
+ v_int16 r1 = v_add_wrap(val1_1, t1);
+ v_int16 r2 = v_add_wrap(val1_2, t2);
+ v_int16 r3 = v_add_wrap(val1_3, t3);
+
+ v_uint8 q0 = v_packus(r0, r1);
+ v_uint8 q1 = v_packus(r2, r3);
+
+ v_uint8 q2 = v_shuffle_s8(q0, shuf_mask2);
+ v_uint8 q3 = v_shuffle_s8(q1, shuf_mask2);
+
+ v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3);
+ v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3);
+
+ v_uint8 q6 = v256_permute4x64<0xD8>(q4);
+ v_uint8 q7 = v256_permute4x64<0xD8>(q5);
+
+ v_uint8 q8 = v_shuffle_s8(q6, shuf_mask3);
+ v_uint8 q9 = v_shuffle_s8(q7, shuf_mask3);
+
+ v_store_low(&dst[c][0][x], q8);
+ v_store_high(&dst[c][1][x], q8);
+ v_store_low(&dst[c][2][x], q9);
+ v_store_high(&dst[c][3][x], q9);
+ }
+ }
+
+ if (x < outSz.width) {
+ x = outSz.width - half_nlanes;
+ }
+ }
+ } else { // if any lpi
+ for (int l = 0; l < lpi; ++l) {
+ short beta0 = beta[l];
+
+ // vertical pass
+ GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
+ for (int w = 0; w < inSz.width*chanNum; ) {
+ for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) {
+ v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
+ v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
+ v_int16 t = v_mulhrs(s0 - s1, beta0) + s1;
+ v_pack_u_store(tmp + w, t);
+ }
+
+ if (w < inSz.width*chanNum) {
+ w = inSz.width*chanNum - half_nlanes;
+ }
+ }
+
+ // horizontal pass
+ GAPI_DbgAssert(outSz.width >= half_nlanes);
+
+ for (int x = 0; x < outSz.width; ) {
+ for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+ for (int c = 0; c < chanNum; ++c) {
+ v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
+ v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
+ v_int16 t0 = v_gather_chan<chanNum>(tmp, sx, c, 0);
+ v_int16 t1 = v_gather_chan<chanNum>(tmp, sx, c, 1);
+ v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
+ v_pack_u_store(&dst[c][l][x], d);
+ }
+ }
+
+ if (x < outSz.width) {
+ x = outSz.width - half_nlanes;
+ }
+ }
+ }
+ }
+}
+
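+// The unnamed C3/C4 parameters below are tag types used only to select the 3- vs 4-channel overload.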
+// Resize (bi-linear, 8UC3)
+void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
+ const uint8_t *src0[],
+ const uint8_t *src1[],
+ const short alpha[],
+ const short clone[], // 4 clones of alpha
+ const short mapsx[],
+ const short beta[],
+ uint8_t tmp[],
+ const Size &inSz,
+ const Size &outSz,
+ int lpi) {
+ constexpr const int chanNum = 3;
+
+ calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
+// Resize (bi-linear, 8UC4)
+void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
+ const uint8_t *src0[],
+ const uint8_t *src1[],
+ const short alpha[],
+ const short clone[], // 4 clones of alpha
+ const short mapsx[],
+ const short beta[],
+ uint8_t tmp[],
+ const Size &inSz,
+ const Size &outSz,
+ int lpi) {
+ constexpr const int chanNum = 4;
+
+ calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
void copyRow_8U(const uint8_t in[], uint8_t out[], int length) {
copyRow_8U_impl(in, out, length);
}
#include <algorithm>
#include <utility>
-#include <cstring>
-#include "ie_preprocess_gapi_kernels.hpp"
-#include "ie_preprocess_gapi_kernels_impl.hpp"
#include "ie_preprocess_gapi_kernels_avx512.hpp"
#include <immintrin.h>
namespace kernels {
namespace avx512 {
-//----------------------------------------------------------------------
-
-static inline v_uint16x32 v_expand_low(const v_uint8x64& a) {
- return v_uint16x32(_mm512_unpacklo_epi8(a.val, _mm512_setzero_si512()));
-}
-
-static inline v_uint16x32 v_expand_high(const v_uint8x64& a) {
- return v_uint16x32(_mm512_unpackhi_epi8(a.val, _mm512_setzero_si512()));
-}
-
-//------------------------------------------------------------------------------
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
uint8_t out[], int length) {
splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
-
-
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
+// Resize (bi-linear, 8U, generic number of channels)
+template<int chanNum>
+void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
+ const uint8_t *src0[],
+ const uint8_t *src1[],
+ const short alpha[],
+ const short clone[], // 4 clones of alpha
+ const short mapsx[],
+ const short beta[],
+ uint8_t tmp[],
+ const Size &inSz,
+ const Size &outSz,
+ int lpi) {
+ constexpr int half_nlanes = (v_uint8::nlanes / 2);
+ const int shift = (half_nlanes / 4);
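+    // half_nlanes is half of the uint8 SIMD width (32 for AVX-512); 'shift' steps the
+    // horizontal-pass gathers through groups of 8 output pixels.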
+
+ if (4 == lpi) {
+ GAPI_DbgAssert(inSz.width >= half_nlanes);
+
+
+ v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15,
+ 0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15,
+ 0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15,
+ 0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15);
+
+ v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15);
+
+ v_uint32 idx1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0);
+ v_uint32 idx2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8);
+ v_uint32 idx3 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
+ v_uint32 idx4 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
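+        // _mm512_shuffle_epi8 reorders bytes only within each 128-bit lane, so the idx1..idx4
+        // permutes below handle the cross-lane part of the interleave.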
+
+ // vertical pass
+ v_int16 b0 = vx_setall_s16(beta[0]);
+ v_int16 b1 = vx_setall_s16(beta[1]);
+ v_int16 b2 = vx_setall_s16(beta[2]);
+ v_int16 b3 = vx_setall_s16(beta[3]);
+
+ for (int w = 0; w < inSz.width*chanNum; ) {
+ for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) {
+ v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]);
+ v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]);
+ v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]);
+ v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]);
+
+ v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]);
+ v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]);
+ v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]);
+ v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]);
+
+ v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+ v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+ v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+ v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+ v_int16 r0 = v_add_wrap(val1_0, t0);
+ v_int16 r1 = v_add_wrap(val1_1, t1);
+ v_int16 r2 = v_add_wrap(val1_2, t2);
+ v_int16 r3 = v_add_wrap(val1_3, t3);
+
+ v_uint8 q0 = v_packus(r0, r1);
+ v_uint8 q1 = v_packus(r2, r3);
+#if 1
+ v_uint8 q2 = v_permutex2_s32(q0, q1, idx1);
+ v_uint8 q3 = v_permutex2_s32(q0, q1, idx2);
+
+ v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1);
+ v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1);
+
+                    // Second variant of the decomposition; it may be useful in the future.
+#else
+ v_uint8 q2 = v_mblend_shiftleft(q0, q1);
+ v_uint8 q3 = v_mblend_shiftright(q0, q1);
+
+ v_uint8 mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
+ 2, 10, 6, 14, 3, 11, 7, 15,
+ 0, 8, 4, 12, 1, 9, 5, 13,
+ 2, 10, 6, 14, 3, 11, 7, 15,
+ 0, 8, 4, 12, 1, 9, 5, 13,
+ 2, 10, 6, 14, 3, 11, 7, 15,
+ 0, 8, 4, 12, 1, 9, 5, 13,
+ 2, 10, 6, 14, 3, 11, 7, 15);
+
+ v_uint8 q4 = v_shuffle_s8(q2, mask1);
+ v_uint8 q5 = v_shuffle_s8(q3, mask1);
+
+ v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0);
+ v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4);
+
+ v_uint8 q6 = v_permutex2_s64(q4, q5, idx1);
+ v_uint8 q7 = v_permutex2_s64(q4, q5, idx2);
+#endif
+
+ vx_store(&tmp[4 * w + 0], q4);
+ vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
+ }
+
+ if (w < inSz.width*chanNum) {
+ w = inSz.width*chanNum - half_nlanes;
+ }
+ }
+
+ // horizontal pass
+ v_uint8 val_0, val_1, val_2, val_3;
+
+ GAPI_DbgAssert(outSz.width >= half_nlanes);
+ for (int x = 0; x < outSz.width; ) {
+ for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+ v_int16 a10 = vx_load(&clone[4 * x]);
+ v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
+ v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
+ v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
+
+ for (int c = 0; c < chanNum; ++c) {
+ v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0);
+ v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift);
+ v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2);
+ v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3);
+
+ v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
+ v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
+ v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
+ v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
+
+ v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
+ v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
+ v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
+ v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
+
+ v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
+ v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
+ v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
+ v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
+
+ v_int16 r0 = v_add_wrap(val1_0, t0);
+ v_int16 r1 = v_add_wrap(val1_1, t1);
+ v_int16 r2 = v_add_wrap(val1_2, t2);
+ v_int16 r3 = v_add_wrap(val1_3, t3);
+
+ v_uint8 q0 = v_packus(r0, r1);
+ v_uint8 q1 = v_packus(r2, r3);
+
+ v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1);
+ v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1);
+#if 1
+ v_uint8 q4 = v_permutex2_s32(q2, q3, idx3);
+ v_uint8 q5 = v_permutex2_s32(q2, q3, idx4);
+
+ v_uint8 q6 = v_shuffle_s8(q4, shuf_mask2);
+ v_uint8 q7 = v_shuffle_s8(q5, shuf_mask2);
+
+
+                    // Second variant of the decomposition; it may be useful in the future.
+#else
+ v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
+ v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
+
+ v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+
+ v_uint8 q6 = v_permutex_s32(idx, q4);
+ v_uint8 q7 = v_permutex_s32(idx, q5);
+
+ v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15);
+
+ v_uint8 q8 = v_shuffle_s8(q6, mask2);
+ v_uint8 q9 = v_shuffle_s8(q7, mask2);
+#endif
+ v_store_low(&dst[c][0][x], q6);
+ v_store_high(&dst[c][1][x], q6);
+ v_store_low(&dst[c][2][x], q7);
+ v_store_high(&dst[c][3][x], q7);
+ }
+ }
+
+ if (x < outSz.width) {
+ x = outSz.width - half_nlanes;
+ }
+ }
+ } else { // if any lpi
+ for (int l = 0; l < lpi; ++l) {
+ short beta0 = beta[l];
+
+ // vertical pass
+ GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
+ for (int w = 0; w < inSz.width*chanNum; ) {
+ for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) {
+ v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
+ v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
+ v_int16 t = v_mulhrs(s0 - s1, beta0) + s1;
+ v_pack_u_store(tmp + w, t);
+ }
+
+ if (w < inSz.width*chanNum) {
+ w = inSz.width*chanNum - half_nlanes;
+ }
+ }
+
+ // horizontal pass
+ GAPI_DbgAssert(outSz.width >= half_nlanes);
+
+ for (int x = 0; x < outSz.width; ) {
+ for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+ for (int c = 0; c < chanNum; ++c) {
+ v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
+ v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
+ v_int16 t0 = v_gather_chan<chanNum>(tmp, sx, c, 0);
+ v_int16 t1 = v_gather_chan<chanNum>(tmp, sx, c, 1);
+ v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
+ v_pack_u_store(&dst[c][l][x], d);
+ }
+ }
+
+ if (x < outSz.width) {
+ x = outSz.width - half_nlanes;
+ }
+ }
+ }
+ }
+}
+
+// Resize (bi-linear, 8UC3)
+void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
+ const uint8_t *src0[],
+ const uint8_t *src1[],
+ const short alpha[],
+ const short clone[], // 4 clones of alpha
+ const short mapsx[],
+ const short beta[],
+ uint8_t tmp[],
+ const Size &inSz,
+ const Size &outSz,
+ int lpi) {
+ constexpr const int chanNum = 3;
+
+ calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
+// Resize (bi-linear, 8UC4)
+void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
+ const uint8_t *src0[],
+ const uint8_t *src1[],
+ const short alpha[],
+ const short clone[], // 4 clones of alpha
+ const short mapsx[],
+ const short beta[],
+ uint8_t tmp[],
+ const Size &inSz,
+ const Size &outSz,
+ int lpi) {
+ constexpr const int chanNum = 4;
+
+ calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
void copyRow_8U(const uint8_t in[], uint8_t out[], int length) {
copyRow_8U_impl(in, out, length);
}
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine
+
namespace gapi {
namespace kernels {
-//----------------------------------------------------------------------
-
-static inline v_uint16x8 v_expand_low(const v_uint8x16& a) {
- return v_uint16x8(_mm_unpacklo_epi8(a.val, _mm_setzero_si128()));
-}
-
-static inline v_uint16x8 v_expand_high(const v_uint8x16& a) {
- return v_uint16x8(_mm_unpackhi_epi8(a.val, _mm_setzero_si128()));
-}
-
-//------------------------------------------------------------------------------
-
// Resize (bi-linear, 8U)
void calcRowLinear_8U(uint8_t *dst[],
const uint8_t *src0[],
}
}
+// The universal-intrinsic 3C/4C resize implementation for SSE4.2 is sometimes a bit slower than the
+// original one, so the original is kept enabled below; remove it once the cause of the slowdown is found.
+#if 1
// Resize (bi-linear, 8U, generic number of channels)
template<int chanNum>
-void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
+void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const Size &inSz,
const Size &outSz,
int lpi) {
+ const int half_nlanes = (v_uint8::nlanes / 2);
+
if (4 == lpi) {
// vertical pass
- GAPI_DbgAssert(inSz.width >= 8);
+ GAPI_DbgAssert(inSz.width >= half_nlanes);
__m128i b0 = _mm_set1_epi16(beta[0]);
__m128i b1 = _mm_set1_epi16(beta[1]);
__m128i b3 = _mm_set1_epi16(beta[3]);
for (int w = 0; w < inSz.width*chanNum; ) {
- for (; w <= inSz.width*chanNum - 8 && w >= 0; w += 8) {
+ for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) {
//--------------------------------------------
// reworked from: ie_preprocess_data_sse42.cpp
// function: resize_bilinear_u8
}
if (w < inSz.width*chanNum) {
- w = inSz.width*chanNum - 8;
+ w = inSz.width*chanNum - half_nlanes;
}
}
// horizontal pass
- GAPI_DbgAssert(outSz.width >= 8);
+ GAPI_DbgAssert(outSz.width >= half_nlanes);
for (int x = 0; x < outSz.width; ) {
- for (; x <= outSz.width - 8 && x >= 0; x += 8) {
+ for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
//--------------------------------------------
// reworked from: ie_preprocess_data_sse42.cpp
// function: resize_bilinear_u8
}
if (x < outSz.width) {
- x = outSz.width - 8;
+ x = outSz.width - half_nlanes;
}
}
+
} else { // if any lpi
for (int l = 0; l < lpi; l++) {
short beta0 = beta[l];
// vertical pass
- GAPI_DbgAssert(inSz.width*chanNum >= 8);
+ GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
for (int w = 0; w < inSz.width*chanNum; ) {
- for (; w <= inSz.width*chanNum - 8; w += 8) {
+ for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) {
v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
}
if (w < inSz.width*chanNum) {
- w = inSz.width*chanNum - 8;
+ w = inSz.width*chanNum - half_nlanes;
}
}
// horizontal pass
- GAPI_DbgAssert(outSz.width >= 8);
+ GAPI_DbgAssert(outSz.width >= half_nlanes);
for (int x = 0; x < outSz.width; ) {
- for (; x <= outSz.width - 8 && x >= 0; x += 8) {
+ for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
for (int c = 0; c < chanNum; c++) {
v_int16x8 a0 = v_load(&alpha[x]); // as signed Q1.1.14
v_int16x8 sx = v_load(&mapsx[x]); // as integer (int16)
}
if (x < outSz.width) {
- x = outSz.width - 8;
+ x = outSz.width - half_nlanes;
+ }
+ }
+ }
+ }
+}
+#else
+// Universal-intrinsic 3C/4C resize implementation for SSE4.2; it is sometimes a bit slower than the
+// original, so it stays disabled until the cause of the slowdown is found.
+template<int chanNum>
+void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
+ const uint8_t *src0[],
+ const uint8_t *src1[],
+ const short alpha[],
+ const short clone[], // 4 clones of alpha
+ const short mapsx[],
+ const short beta[],
+ uint8_t tmp[],
+ const Size &inSz,
+ const Size &outSz,
+ int lpi) {
+ const int half_nlanes = (v_uint8::nlanes / 2);
+
+ if (4 == lpi) {
+ // vertical pass
+ GAPI_DbgAssert(inSz.width >= half_nlanes);
+
+ v_int16 b0 = vx_setall_s16(beta[0]);
+ v_int16 b1 = vx_setall_s16(beta[1]);
+ v_int16 b2 = vx_setall_s16(beta[2]);
+ v_int16 b3 = vx_setall_s16(beta[3]);
+
+ for (int w = 0; w < inSz.width*chanNum; ) {
+ for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) {
+ v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
+ v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
+ v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
+ v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
+
+ v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
+ v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
+ v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
+ v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
+
+ v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+ v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+ v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+ v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+ v_int16 r0 = v_add_wrap(val1_0, t0);
+ v_int16 r1 = v_add_wrap(val1_1, t1);
+ v_int16 r2 = v_add_wrap(val1_2, t2);
+ v_int16 r3 = v_add_wrap(val1_3, t3);
+
+ v_uint8 q0 = v_packus(r0, r1);
+ v_uint8 q1 = v_packus(r2, r3);
+
+ v_uint8 q2 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q0, q1);
+ v_uint8 q3 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q0, q1);
+
+ v_uint8 mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15);
+
+ v_uint8 q4 = v_shuffle_s8(q2, mask);
+ v_uint8 q5 = v_shuffle_s8(q3, mask);
+
+ vx_store(&tmp[4 * w + 0], q4);
+ vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
+ }
+
+ if (w < inSz.width*chanNum) {
+ w = inSz.width*chanNum - half_nlanes;
+ }
+ }
+
+ // horizontal pass
+ GAPI_DbgAssert(outSz.width >= half_nlanes);
+ for (int x = 0; x < outSz.width; ) {
+ for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+ v_int16 a10 = vx_load(&clone[4 * x]);
+ v_int16 a32 = vx_load(&clone[4 * (x + 2)]);
+ v_int16 a54 = vx_load(&clone[4 * (x + 4)]);
+ v_int16 a76 = vx_load(&clone[4 * (x + 6)]);
+
+ v_uint8 val_0 = vx_setzero_u8();
+ v_uint8 val_1 = vx_setzero_u8();
+ v_uint8 val_2 = vx_setzero_u8();
+ v_uint8 val_3 = vx_setzero_u8();
+
+ for (int c = 0; c < chanNum; ++c) {
+ int shift = (half_nlanes / 4);
+
+ v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0);
+ v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift);
+ v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2);
+ v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3);
+
+ v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
+ v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
+ v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
+ v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
+
+ v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
+ v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
+ v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
+ v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
+
+ v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
+ v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
+ v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
+ v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
+
+ v_int16 r0 = v_add_wrap(val1_0, t0);
+ v_int16 r1 = v_add_wrap(val1_1, t1);
+ v_int16 r2 = v_add_wrap(val1_2, t2);
+ v_int16 r3 = v_add_wrap(val1_3, t3);
+
+ v_uint8 q0 = v_packus(r0, r1);
+ v_uint8 q1 = v_packus(r2, r3);
+
+ v_uint8 mask = v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15);
+
+ v_uint8 q2 = v_shuffle_s8(q0, mask);
+ v_uint8 q3 = v_shuffle_s8(q1, mask);
+
+ v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3);
+ v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3);
+
+ v_store_low(&dst[c][0][x], q4);
+ v_store_high(&dst[c][1][x], q4);
+ v_store_low(&dst[c][2][x], q5);
+ v_store_high(&dst[c][3][x], q5);
+ }
+ }
+
+ if (x < outSz.width) {
+ x = outSz.width - half_nlanes;
+ }
+ }
+
+ } else { // if any lpi
+ for (int l = 0; l < lpi; ++l) {
+ short beta0 = beta[l];
+
+ // vertical pass
+ GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
+ for (int w = 0; w < inSz.width*chanNum; ) {
+ for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) {
+ v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
+ v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
+ v_int16 t = v_mulhrs(s0 - s1, beta0) + s1;
+ v_pack_u_store(tmp + w, t);
+ }
+
+ if (w < inSz.width*chanNum) {
+ w = inSz.width*chanNum - half_nlanes;
+ }
+ }
+
+ // horizontal pass
+ GAPI_DbgAssert(outSz.width >= half_nlanes);
+ for (int x = 0; x < outSz.width; ) {
+ for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+ for (int c = 0; c < chanNum; ++c) {
+ v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
+ v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
+ v_int16 t0 = v_gather_chan<chanNum>(tmp, sx, c, 0);
+ v_int16 t1 = v_gather_chan<chanNum>(tmp, sx, c, 1);
+ v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
+ v_pack_u_store(&dst[c][l][x], d);
+ }
+ }
+
+ if (x < outSz.width) {
+ x = outSz.width - half_nlanes;
}
}
}
}
}
+#endif
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
int lpi) {
constexpr const int chanNum = 3;
- calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+ calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 8UC4)
const Size &outSz,
int lpi) {
constexpr const int chanNum = 4;
- calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+ calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
}
}
- #ifdef HAVE_SSE
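+// Try the widest available ISA first; the width checks ensure each vector kernel has enough
+// pixels for its minimum SIMD step.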
+#ifdef HAVE_AVX512
+ if (with_cpu_x86_avx512_core()) {
+ if (std::is_same<T, uint8_t>::value) {
+ if (inSz.width >= 64 && outSz.width >= 32) {
+ avx512::calcRowLinear_8UC<numChan>(dst,
+ reinterpret_cast<const uint8_t**>(src0),
+ reinterpret_cast<const uint8_t**>(src1),
+ reinterpret_cast<const short*>(alpha),
+ reinterpret_cast<const short*>(clone),
+ reinterpret_cast<const short*>(mapsx),
+ reinterpret_cast<const short*>(beta),
+ reinterpret_cast<uint8_t*>(tmp),
+ inSz, outSz, lpi);
+ return;
+ }
+ }
+ }
+#endif
+
+#ifdef HAVE_AVX2
+ if (with_cpu_x86_avx2()) {
+ if (std::is_same<T, uint8_t>::value) {
+ if (inSz.width >= 32 && outSz.width >= 16) {
+ avx::calcRowLinear_8UC<numChan>(dst,
+ reinterpret_cast<const uint8_t**>(src0),
+ reinterpret_cast<const uint8_t**>(src1),
+ reinterpret_cast<const short*>(alpha),
+ reinterpret_cast<const short*>(clone),
+ reinterpret_cast<const short*>(mapsx),
+ reinterpret_cast<const short*>(beta),
+ reinterpret_cast<uint8_t*>(tmp),
+ inSz, outSz, lpi);
+ return;
+ }
+ }
+ }
+#endif
+
+#ifdef HAVE_SSE
if (with_cpu_x86_sse42()) {
if (std::is_same<T, uint8_t>::value) {
if (inSz.width >= 16 && outSz.width >= 8) {
calcRowLinear_8UC<numChan>(dst,
- reinterpret_cast<const uint8_t**>(src0),
- reinterpret_cast<const uint8_t**>(src1),
- reinterpret_cast<const short*>(alpha),
- reinterpret_cast<const short*>(clone),
- reinterpret_cast<const short*>(mapsx),
- reinterpret_cast<const short*>(beta),
- reinterpret_cast<uint8_t*>(tmp),
- inSz, outSz, lpi);
+ reinterpret_cast<const uint8_t**>(src0),
+ reinterpret_cast<const uint8_t**>(src1),
+ reinterpret_cast<const short*>(alpha),
+ reinterpret_cast<const short*>(clone),
+ reinterpret_cast<const short*>(mapsx),
+ reinterpret_cast<const short*>(beta),
+ reinterpret_cast<uint8_t*>(tmp),
+ inSz, outSz, lpi);
return;
}
}
}
- #endif // HAVE_SSE
+#endif // HAVE_SSE
auto length = out[0].get().length();
add_library(fluid_test_computations SHARED ${SRC} ${HDR})
+# Workaround to avoid warnings caused by a bug in avx512intrin.h of GCC5
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND
+ (CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 5.5))
+ set_target_properties(fluid_test_computations PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized")
+endif()
+
target_include_directories(fluid_test_computations PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
target_link_libraries(fluid_test_computations PRIVATE inference_engine_preproc_s inference_engine fluid)
{ return _mm256_permute2f128_pd(a, b, imm); }
template<int imm, typename _Tpvec>
-inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
+static inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
template<int imm>
{ return _mm256_permute4x64_pd(a, imm); }
template<int imm, typename _Tpvec>
-inline _Tpvec v256_permute4x64(const _Tpvec& a)
+static inline _Tpvec v256_permute4x64(const _Tpvec& a)
{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
inline __m128i _v256_extract_high(const __m256i& v)
OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64)
OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64)
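+// Load 16 unaligned bytes and zero-extend them to 16 x int16.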
+static inline v_int16x16 v_load_ccache_expand(const uchar* ptr)
+{
+ return v_int16x16(_mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i*)ptr)));
+}
+
inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
__m256i ad = _mm256_srai_epi16(a.val, 8);
return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
}
+static inline v_uint8x32 v_packus(const v_int16x16& a, const v_int16x16& b)
+{
+ return v_uint8x32(_mm256_packus_epi16(a.val, b.val));
+}
+
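+// Blend the 16-bit fields of one vector with a byte-shifted copy of the other; the resize
+// kernels use these helpers to interleave two packed row vectors.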
+template<int mask, int shift>
+static inline v_uint8x32 v_blend_shiftleft(const v_uint8x32& a, const v_uint8x32& b)
+{
+ return v_uint8x32(_mm256_blend_epi16(a.val, _mm256_slli_si256(b.val, shift), mask));
+}
+
+template<int mask, int shift>
+static inline v_uint8x32 v_blend_shiftright(const v_uint8x32& a, const v_uint8x32& b)
+{
+ return v_uint8x32(_mm256_blend_epi16(_mm256_srli_si256(a.val, shift), b.val, mask));
+}
+
+static inline v_uint8x32 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
+ char b5, char b6, char b7, char b8, char b9,
+ char b10, char b11, char b12, char b13, char b14,
+ char b15, char b16, char b17, char b18, char b19,
+ char b20, char b21, char b22, char b23, char b24,
+ char b25, char b26, char b27, char b28, char b29,
+ char b30, char b31)
+{
+ return v_uint8x32(_mm256_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7,
+ b8, b9, b10, b11, b12, b13, b14, b15,
+ b16, b17, b18, b19, b20, b21, b22, b23,
+ b24, b25, b26, b27, b28, b29, b30, b31));
+}
+
inline void v_pack_store(schar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }
static inline v_int16x16 v_mulhrs(const v_int16x16& a, const v_int16x16& b)
{
- v_int16x16 r;
- r.val = _mm256_mulhrs_epi16(a.val, b.val);
- return r;
+ return v_int16x16(_mm256_mulhrs_epi16(a.val, b.val));
}
static inline v_int16x16 v_mulhrs(const v_int16x16& a, short b)
return a * v256_setall_f32(b);
}
+static inline v_uint8x32 v_shuffle_s8(const v_uint8x32& a, const v_uint8x32& mask)
+{
+ return v_uint8x32(_mm256_shuffle_epi8(a.val, mask.val));
+}
+
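+// Gather the 4-byte line groups of four source pixels mapsx[x+shift+0..3] into the low 128 bits
+// and of their right neighbours (mapsx[..] + 1) into the high 128 bits, for channel c.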
+static inline void v_gather_channel(v_uint8x32& vec, const uint8_t tmp[], const short mapsx[],
+ int chanNum, int c, int x, int shift)
+{
+ vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 0] + c)]), 0);
+ vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 1] + c)]), 1);
+ vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 2] + c)]), 2);
+ vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 3] + c)]), 3);
+
+ vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 4);
+ vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 5);
+ vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 2] + 1) + c)]), 6);
+ vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 7);
+}
+
+namespace {
+ template<int chanNum>
+ static inline v_int16x16 v_gather_chan(const uchar src[], const v_int16x16& index, int channel, int pos) {
+ v_int16x16 r;
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 0) + pos) + channel]), 0);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 1) + pos) + channel]), 1);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 2) + pos) + channel]), 2);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 3) + pos) + channel]), 3);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 4) + pos) + channel]), 4);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 5) + pos) + channel]), 5);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 6) + pos) + channel]), 6);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 7) + pos) + channel]), 7);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 8) + pos) + channel]), 8);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 9) + pos) + channel]), 9);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 10) + pos) + channel]), 10);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 11) + pos) + channel]), 11);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 12) + pos) + channel]), 12);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 13) + pos) + channel]), 13);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 14) + pos) + channel]), 14);
+ r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm256_extract_epi16(index.val, 15) + pos) + channel]), 15);
+ return r;
+ }
+} // namespace
+
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
{ return _mm512_extractf32x8_ps(v, 1); }
inline __m256d _v512_extract_high(const __m512d& v)
-{ return _mm512_extractf64x4_pd(v, 1); }
+{ return _mm512_mask_extractf64x4_pd(_mm256_setzero_pd(), (__mmask8) -1, v, 1); }
inline __m256i _v512_extract_low(const __m512i& v)
{ return _mm512_castsi512_si256(v); }
/* pack */
// 16
inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
-{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
+{ return v_int8x64(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8)-1, _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
{
inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
{
- return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
+ return v_uint8x64(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8)-1, _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
}
inline void v_pack_store(schar* ptr, const v_int16x32& a)
// 32
inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
-{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
+{ return v_int16x32(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8) -1,
+ _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0),
+ _mm512_packs_epi32(a.val, b.val))); }
inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
{
}
inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
-{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
+{ return v_uint16x32(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8) -1,
+ _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0),
+ _mm512_packus_epi32(a.val, b.val))); }
inline void v_pack_store(short* ptr, const v_int32x16& a)
{ v_store_low(ptr, v_pack(a, a)); }
// pack boolean
inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
-{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
+{ return v_uint8x64(_mm512_mask_permutexvar_epi64(_mm512_setzero_si512(), (__mmask8) -1, _v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
const v_uint32x16& c, const v_uint32x16& d)
static inline v_int16x32 v_mulhrs(const v_int16x32& a, const v_int16x32& b)
{
- v_int16x32 r;
- r.val = _mm512_mulhrs_epi16(a.val, b.val);
- return r;
+ return v_int16x32(_mm512_mulhrs_epi16(a.val, b.val));
}
static inline v_int16x32 v_mulhrs(const v_int16x32& a, short b)
return a * v512_setall_f32(b);
}
+template<int mask, int shift>
+static inline v_uint8x64 v_mask_blend_shiftleft(const v_uint8x64& a, const v_uint8x64& b)
+{
+ return v_uint8x64(_mm512_mask_blend_epi16(mask,
+ a.val, _mm512_bslli_epi128(b.val, shift)));
+}
+
+template<int mask, int shift>
+static inline v_uint8x64 v_mask_blend_shiftright(const v_uint8x64& a, const v_uint8x64& b)
+{
+ return v_uint8x64(_mm512_mask_blend_epi16(mask,
+ _mm512_bsrli_epi128(a.val, shift), b.val));
+}
+
+static inline v_uint8x64 v_packus(const v_int16x32& a, const v_int16x32& b)
+{
+ return v_uint8x64(_mm512_packus_epi16(a.val, b.val));
+}
+
+
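+// Pack four bytes into one little-endian 32-bit word so that the 64-byte constants below can be
+// assembled with _mm512_setr_epi32.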
+#define word(b0, b1, b2, b3) \
+ (((uint32_t)((uint8_t)(b0)) << 0*8) \
+ | ((uint32_t)((uint8_t)(b1)) << 1*8) \
+ | ((uint32_t)((uint8_t)(b2)) << 2*8) \
+ | ((uint32_t)((uint8_t)(b3)) << 3*8))
+
+static inline v_uint8x64 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
+ char b5, char b6, char b7, char b8, char b9,
+ char b10, char b11, char b12, char b13, char b14,
+ char b15, char b16, char b17, char b18, char b19,
+ char b20, char b21, char b22, char b23, char b24,
+ char b25, char b26, char b27, char b28, char b29,
+ char b30, char b31, char b32, char b33, char b34,
+ char b35, char b36, char b37, char b38, char b39,
+ char b40, char b41, char b42, char b43, char b44,
+ char b45, char b46, char b47, char b48, char b49,
+ char b50, char b51, char b52, char b53, char b54,
+ char b55, char b56, char b57, char b58, char b59,
+ char b60, char b61, char b62, char b63)
+{
+ return v_uint8x64(_mm512_setr_epi32(word(b0, b1, b2, b3), word(b4, b5, b6, b7), word(b8, b9, b10, b11),
+ word(b12, b13, b14, b15), word(b16, b17, b18, b19), word(b20, b21, b22, b23),
+ word(b24, b25, b26, b27), word(b28, b29, b30, b31), word(b32, b33, b34, b35),
+ word(b36, b37, b38, b39), word(b40, b41, b42, b43), word(b44, b45, b46, b47),
+ word(b48, b49, b50, b51), word(b52, b53, b54, b55), word(b56, b57, b58, b59),
+ word(b60, b61, b62, b63)));
+}
+
+static inline v_uint64x8 v_set_s64(int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0)
+{
+ return v_uint64x8(_mm512_set_epi64(b7, b6, b5, b4, b3, b2, b1, b0));
+}
+
+static inline v_uint32x16 v_set_s32(int b15, int b14, int b13, int b12, int b11, int b10, int b9, int b8,
+ int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0)
+{
+ return v_uint32x16(_mm512_set_epi32(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0));
+}
+
+static inline v_uint8x64 v_shuffle_s8(const v_uint8x64& a, const v_uint8x64& mask)
+{
+ return v_uint8x64(_mm512_shuffle_epi8(a.val, mask.val));
+}
+static inline v_int16x32 v_load_ccache_expand(const uchar* ptr)
+{
+    return v_int16x32(_mm512_cvtepu8_epi16(_mm256_lddqu_si256((const __m256i*)ptr)));
+}
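+// Emulate single-lane insertion through a one-bit write mask (AVX-512 has no direct
+// _mm512_insert_epi16 / _mm512_insert_epi32 intrinsics).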
+static inline __m512i v512_insert_epi16(__m512i target, const uchar x, const int index)
+{
+ return _mm512_mask_set1_epi16(target, 1UL << index, x);
+}
+static inline __m512i v512_insert_epi32(__m512i target, const int32_t x, const int index)
+{
+ return _mm512_mask_set1_epi32(target, 1UL << index, x);
+}
+
+static inline void v_gather_channel(v_uint8x64& vec, const uint8_t tmp[], const short mapsx[],
+ int chanNum, int c, int x, int shift)
+{
+ __m256i vec1 = _mm256_setzero_si256();
+ __m256i vec2 = _mm256_setzero_si256();
+
+ vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 0] + c)]), 0);
+ vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 1] + c)]), 1);
+ vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 2] + c)]), 2);
+ vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 3] + c)]), 3);
+ vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 4] + c)]), 4);
+ vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 5] + c)]), 5);
+ vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 6] + c)]), 6);
+ vec1 = _mm256_insert_epi32(vec1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 7] + c)]), 7);
+
+ vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 0);
+ vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 1);
+ vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 2] + 1) + c)]), 2);
+ vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 3);
+ vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 4] + 1) + c)]), 4);
+ vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 5] + 1) + c)]), 5);
+ vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 6] + 1) + c)]), 6);
+ vec2 = _mm256_insert_epi32(vec2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 7] + 1) + c)]), 7);
+
+ vec.val = _mm512_inserti32x8(_mm512_castsi256_si512(vec1), vec2, 1);
+}
+
+static inline v_uint8x64 v_permutex2_s64(const v_uint8x64& a, const v_uint8x64& b, const v_uint64x8& idxs)
+{
+ return v_uint8x64(_mm512_permutex2var_epi64(a.val, idxs.val, b.val));
+}
+
+static inline v_uint8x64 v_permutex_s32(const v_uint8x64& a, const v_uint64x8 idxs)
+{
+ return v_uint8x64(_mm512_permutexvar_epi32(idxs.val, a.val));
+}
+
+static inline v_uint8x64 v_permutex2_s32(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16 idxs)
+{
+ return v_uint8x64(_mm512_permutex2var_epi32(a.val, idxs.val, b.val));
+}
+
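+// Some GCC versions do not provide _mm512_cvtsi512_si32, so define a local fallback.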
+#if defined(__GNUC__)
+
+static inline int _mm512_cvtsi512_si32(__m512i a)
+{
+ __v16si b = (__v16si)a;
+ return b[0];
+}
+
+#endif
+
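+// Extract one lane: rotate the requested 32-bit element into position 0 with a masked alignr
+// and read it; the 16-bit variant then selects the proper half of that 32-bit word.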
+template <int index>
+static inline int v512_extract_epi32(__m512i target)
+{
+ return _mm512_cvtsi512_si32(_mm512_mask_alignr_epi32(_mm512_setzero_si512(), (__mmask16)-1, target, target, index));
+}
+
+template <int index>
+static inline int v512_extract_epi16(__m512i target)
+{
+ return (v512_extract_epi32<index/2>(target) >> (index % 2 ? 16 : 0)) & 0xFFFF;
+}
+
+namespace {
+ template<int chanNum>
+ static inline v_int16x32 v_gather_chan(const uchar src[], const v_int16x32& index, int channel, int pos) {
+ v_int16x32 r;
+
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<0>(index.val) + pos) + channel]), 0);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<1>(index.val) + pos) + channel]), 1);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<2>(index.val) + pos) + channel]), 2);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<3>(index.val) + pos) + channel]), 3);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<4>(index.val) + pos) + channel]), 4);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<5>(index.val) + pos) + channel]), 5);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<6>(index.val) + pos) + channel]), 6);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<7>(index.val) + pos) + channel]), 7);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<8>(index.val) + pos) + channel]), 8);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<9>(index.val) + pos) + channel]), 9);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<10>(index.val) + pos) + channel]), 10);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<11>(index.val) + pos) + channel]), 11);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<12>(index.val) + pos) + channel]), 12);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<13>(index.val) + pos) + channel]), 13);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<14>(index.val) + pos) + channel]), 14);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<15>(index.val) + pos) + channel]), 15);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<16>(index.val) + pos) + channel]), 16);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<17>(index.val) + pos) + channel]), 17);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<18>(index.val) + pos) + channel]), 18);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<19>(index.val) + pos) + channel]), 19);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<20>(index.val) + pos) + channel]), 20);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<21>(index.val) + pos) + channel]), 21);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<22>(index.val) + pos) + channel]), 22);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<23>(index.val) + pos) + channel]), 23);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<24>(index.val) + pos) + channel]), 24);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<25>(index.val) + pos) + channel]), 25);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<26>(index.val) + pos) + channel]), 26);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<27>(index.val) + pos) + channel]), 27);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<28>(index.val) + pos) + channel]), 28);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<29>(index.val) + pos) + channel]), 29);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<30>(index.val) + pos) + channel]), 30);
+ r.val = v512_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(v512_extract_epi16<31>(index.val) + pos) + channel]), 31);
+
+ return r;
+ }
+} // namespace
+
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
//////////////// PACK ///////////////
+static inline v_uint8x16 v_packus(const v_int16x8& a, const v_int16x8& b) {
+ v_uint8x16 res;
+ res.val = _mm_packus_epi16(a.val, b.val);
+ return res;
+}
+
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
__m128i delta = _mm_set1_epi16(255);
{ \
__m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
-}
+}\
+inline _Tpwuvec v_expand_low(const _Tpuvec& a) { \
+ _Tpwuvec res; \
+    res.val = _mm_unpacklo_##suffix(a.val, _mm_setzero_si128()); \
+ return res; \
+} \
+inline _Tpwuvec v_expand_high(const _Tpuvec& a) { \
+ _Tpwuvec res; \
+    res.val = _mm_unpackhi_##suffix(a.val, _mm_setzero_si128()); \
+ return res; \
+} \
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
return r;
}
+static inline v_uint8x16 v_packus_s16(const v_int16x8& a, const v_int16x8& b) {
+ v_uint8x16 r;
+ r.val = _mm_packus_epi16(a.val, b.val);
+ return r;
+}
+
// for each j=index[k], load two chars src[j] and src[j+1]
static inline v_uint8x16 v_gather_pairs(const uchar src[], const v_int16x8& index) {
v_uint8x16 r;
return a * v_setall_f32(b);
}
+template<int mask, int shift>
+static inline v_uint8x16 v_blend_shiftleft(const v_uint8x16& a, const v_uint8x16& b) {
+ v_uint8x16 res;
+ res.val = _mm_blend_epi16(a.val, _mm_slli_si128(b.val, shift), mask /*0xCC 0b11001100*/);
+ return res;
+}
+
+template<int mask, int shift>
+static inline v_uint8x16 v_blend_shiftright(const v_uint8x16& a, const v_uint8x16& b) {
+ v_uint8x16 res;
+ res.val = _mm_blend_epi16(_mm_srli_si128(a.val, shift), b.val, mask /*0xCC 0b11001100*/);
+ return res;
+}
+
+static inline v_uint8x16 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
+ char b5, char b6, char b7, char b8, char b9,
+ char b10, char b11, char b12, char b13, char b14,
+ char b15) {
+ v_uint8x16 res;
+ res.val = _mm_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8,
+ b9, b10, b11, b12, b13, b14, b15);
+ return res;
+}
+
+
+static inline v_uint8x16 v_shuffle_s8(const v_uint8x16& a, const v_uint8x16& mask) {
+ v_uint8x16 res;
+ res.val = _mm_shuffle_epi8(a.val, mask.val);
+ return res;
+}
+
+static inline void v_gather_channel(v_uint8x16& vec, const uint8_t tmp[], const short mapsx[],
+ int chanNum, int c, int x, int shift)
+{
+ vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 0] + c)]), 0);
+ vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 1] + c)]), 1);
+
+ vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 2);
+ vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 3);
+}
+
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END