From 9548093b466b9f515b00f3719f87631e7af39c22 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Thu, 11 Oct 2018 20:00:12 +0300 Subject: [PATCH] Horizontal line processing for pyrDown() reworked using wide universal intrinsics. --- .../core/include/opencv2/core/hal/intrin_avx.hpp | 10 + .../core/include/opencv2/core/hal/intrin_cpp.hpp | 1 - .../core/include/opencv2/core/hal/intrin_neon.hpp | 89 +- .../core/include/opencv2/core/hal/intrin_sse.hpp | 12 +- .../core/include/opencv2/core/hal/intrin_vsx.hpp | 4 + modules/imgproc/src/pyramids.cpp | 894 ++++++++++++++------- 6 files changed, 694 insertions(+), 316 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 913c915..a6725c8 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1610,6 +1610,16 @@ inline v_int16x16 v_pack_triplets(const v_int16x16& vec) } inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } +inline v_int32x8 v_pack_triplets(const v_int32x8& vec) +{ + return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } +inline v_float32x8 v_pack_triplets(const v_float32x8& vec) +{ + return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} + ////////// Matrix operations ///////// inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 5cfaea7..757c67b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1908,7 +1908,6 @@ template inline v_reg<_Tp, n> v_interleave_quads(const v_re template inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec) { v_reg c; - int j = 0; for (int i = 0; i < n/4; i++) { c.s[3*i ] = vec.s[4*i ]; diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index f674791..e131909 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1597,29 +1597,49 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx) } inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) { - short CV_DECL_ALIGNED(32) elems[8] = + schar CV_DECL_ALIGNED(32) elems[16] = { - *(short*)(tab+idx[0]), - *(short*)(tab+idx[1]), - *(short*)(tab+idx[2]), - *(short*)(tab+idx[3]), - *(short*)(tab+idx[4]), - *(short*)(tab+idx[5]), - *(short*)(tab+idx[6]), - *(short*)(tab+idx[7]) + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[4]], + tab[idx[4] + 1], + tab[idx[5]], + tab[idx[5] + 1], + tab[idx[6]], + tab[idx[6] + 1], + tab[idx[7]], + tab[idx[7] + 1] }; - return v_int8x16(vreinterpretq_s8_s16(vld1q_s16(elems))); + return v_int8x16(vld1q_s8(elems)); } inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) { - int CV_DECL_ALIGNED(32) elems[4] = + schar CV_DECL_ALIGNED(32) elems[16] = { - *(int*)(tab + idx[0]), - 
*(int*)(tab + idx[1]), - *(int*)(tab + idx[2]), - *(int*)(tab + idx[3]) + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[2] + 2], + tab[idx[2] + 3], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[3] + 2], + tab[idx[3] + 3] }; - return v_int8x16(vreinterpretq_s8_s32(vld1q_s32(elems))); + return v_int8x16(vld1q_s8(elems)); } inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } @@ -1642,23 +1662,22 @@ inline v_int16x8 v_lut(const short* tab, const int* idx) } inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) { - int CV_DECL_ALIGNED(32) elems[4] = + short CV_DECL_ALIGNED(32) elems[8] = { - *(int*)(tab + idx[0]), - *(int*)(tab + idx[1]), - *(int*)(tab + idx[2]), - *(int*)(tab + idx[3]) + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1] }; - return v_int16x8(vreinterpretq_s16_s32(vld1q_s32(elems))); + return v_int16x8(vld1q_s16(elems)); } inline v_int16x8 v_lut_quads(const short* tab, const int* idx) { - int64 CV_DECL_ALIGNED(32) elems[2] = - { - *(int64*)(tab + idx[0]), - *(int64*)(tab + idx[1]) - }; - return v_int16x8(vreinterpretq_s16_s64(vld1q_s64(elems))); + return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1]))); } inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } @@ -1677,12 +1696,7 @@ inline v_int32x4 v_lut(const int* tab, const int* idx) } inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) { - int64 CV_DECL_ALIGNED(32) elems[2] = - { - *(int64*)(tab + idx[0]), - *(int64*)(tab + idx[1]) - }; - return v_int32x4(vreinterpretq_s32_s64(vld1q_s64(elems))); + return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1]))); } inline v_int32x4 v_lut_quads(const int* tab, const int* idx) { @@ -1800,7 +1814,8 @@ inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } inline v_int16x8 v_interleave_quads(const v_int16x8& vec) { - return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100))))); + int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val)); + return v_int16x8(vcombine_s16(res.val[0], res.val[1])); } inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } @@ -1824,6 +1839,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec) } inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } +inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } +inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } +inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } + #if CV_SIMD128_64F inline v_float64x2 v_lut(const 
double* tab, const int* idx) { diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index dcfae9a..a5adad0 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -2789,7 +2789,7 @@ inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) } inline v_int32x4 v_lut_quads(const int* tab, const int* idx) { - return v_int32x4(_mm_load_si128((const __m128i*)(tab + idx[0]))); + return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0]))); } inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); } inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); } @@ -2801,7 +2801,7 @@ inline v_int64x2 v_lut(const int64_t* tab, const int* idx) } inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) { - return v_int64x2(_mm_load_si128((const __m128i*)(tab + idx[0]))); + return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0]))); } inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } @@ -2817,7 +2817,7 @@ inline v_float64x2 v_lut(const double* tab, const int* idx) { return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]])); } -inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_load_si128((const __m128i*)(tab + idx[0])))); } +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); } inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) { @@ -2932,7 +2932,7 @@ inline v_int8x16 v_pack_triplets(const v_int8x16& vec) return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))); #else __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF); - __m128i a = _mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))); + __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1); return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2)); #endif } @@ -2948,6 +2948,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec) } inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } +inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } +inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } +inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } + ////////////// FP16 support /////////////////////////// inline v_float32x4 v_load_expand(const float16_t* ptr) diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index ddda1d1..4e0c75f 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -1160,6 +1160,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec) } inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return 
v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } +inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } +inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } +inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } + /////// FP16 support //////// // [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt) diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index d4efeec..5a229b6 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -64,333 +64,662 @@ template struct FltCast rtype operator ()(type1 arg) const { return arg*(T)(1./(1 << shift)); } }; -template struct PyrDownNoVec +template int PyrDownVecH(const T1*, T2*, int) { - int operator()(T1**, T2*, int, int) const { return 0; } -}; + // row[x ] = src[x * 2 + 2*cn ] * 6 + (src[x * 2 + cn ] + src[x * 2 + 3*cn ]) * 4 + src[x * 2 ] + src[x * 2 + 4*cn ]; + // row[x + 1] = src[x * 2 + 2*cn+1] * 6 + (src[x * 2 + cn+1] + src[x * 2 + 3*cn+1]) * 4 + src[x * 2 + 1] + src[x * 2 + 4*cn+1]; + // .... + // row[x + cn-1] = src[x * 2 + 3*cn-1] * 6 + (src[x * 2 + 2*cn-1] + src[x * 2 + 4*cn-1]) * 4 + src[x * 2 + cn-1] + src[x * 2 + 5*cn-1]; + return 0; +} -template struct PyrUpNoVec +template int PyrUpVecH(const T1*, T2*, int) { - int operator()(T1**, T2**, int, int) const { return 0; } -}; + return 0; +} + +template int PyrDownVecV(T1**, T2*, int) { return 0; } + +template int PyrUpVecV(T1**, T2**, int) { return 0; } #if CV_SIMD -struct PyrDownVec_32s8u +template<> int PyrDownVecH(const uchar* src, int* row, int width) { - int operator()(int** src, uchar* dst, int, int width) const + int x = 0; + const uchar *src0 = src, *src2 = src + 2, *src4 = src + 3; + + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) + v_store(row, v_dotprod(v_reinterpret_as_s16(vx_load_expand(src0)), v_1_4) + + v_dotprod(v_reinterpret_as_s16(vx_load_expand(src2)), v_6_4) + + (v_reinterpret_as_s32(vx_load_expand(src4)) >> 16)); + vx_cleanup(); + + return x; +} +template<> int PyrDownVecH(const uchar* src, int* row, int width) +{ + int x = 0; + const uchar *src0 = src, *src2 = src + 4, *src4 = src + 6; + + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) + v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src0))), v_1_4) + + v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src2))), v_6_4) + + (v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4))) >> 16)); + vx_cleanup(); + + return x; +} +template<> int PyrDownVecH(const uchar* src, int* row, int width) +{ + int idx[v_int8::nlanes/2 + 4]; + for (int i = 0; i < v_int8::nlanes/4 + 2; i++) { - int x = 0; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes ) - { - v_uint16 r0, r1, r2, r3, r4, t0, t1; - r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); - r1 = 
v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); - r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); - r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); - r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); - t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); - r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes))); - r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes))); - r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes))); - r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes))); - r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes))); - t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); - v_store(dst + x, v_rshr_pack<8>(t0, t1)); - } - if (x <= width - v_int16::nlanes) - { - v_uint16 r0, r1, r2, r3, r4, t0; - r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); - r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); - r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); - r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); - r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); - t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); - v_rshr_pack_store<8>(dst + x, t0); - x += v_uint16::nlanes; - } - typedef int CV_DECL_ALIGNED(1) unaligned_int; - for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) - { - v_int32x4 r0, r1, r2, r3, r4, t0; - r0 = v_load(row0 + x); - r1 = v_load(row1 + x); - r2 = v_load(row2 + x); - r3 = v_load(row3 + x); - r4 = v_load(row4 + x); - t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); - - *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); - } + idx[i] = 6*i; + idx[i + v_int8::nlanes/4 + 2] = 6*i + 3; + } - return x; + int x = 0; + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + for (; x <= width - v_int8::nlanes; x += 3*v_int8::nlanes/4, src += 6*v_int8::nlanes/4, row += 3*v_int8::nlanes/4) + { + v_uint16 r0l, r0h, r1l, r1h, r2l, r2h, r3l, r3h, r4l, r4h; + v_expand(vx_lut_quads(src, idx ), r0l, r0h); + v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 2), r1l, r1h); + v_expand(vx_lut_quads(src, idx + 1 ), r2l, r2h); + v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 3), r3l, r3h); + v_expand(vx_lut_quads(src, idx + 2 ), r4l, r4h); + + v_zip(r2l, r1l + r3l, r1l, r3l); + v_zip(r2h, r1h + r3h, r1h, r3h); + r0l += r4l; r0h += r4h; + + v_store(row , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1l), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0l)))); + v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3l), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0l)))); + v_store(row + 6*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1h), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0h)))); + v_store(row + 9*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3h), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0h)))); } -}; + vx_cleanup(); -struct PyrDownVec_32f + return x; +} +template<> int 
PyrDownVecH(const uchar* src, int* row, int width) { - int operator()(float** src, float* dst, int, int width) const + int x = 0; + const uchar *src0 = src, *src2 = src + 8, *src4 = src + 12; + + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) + v_store(row, v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src0))), v_1_4) + + v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src2))), v_6_4) + + (v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4))) >> 16)); + vx_cleanup(); + + return x; +} + +template<> int PyrDownVecH(const short* src, int* row, int width) +{ + int x = 0; + const short *src0 = src, *src2 = src + 2, *src4 = src + 3; + + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) + v_store(row, v_dotprod(vx_load(src0), v_1_4) + + v_dotprod(vx_load(src2), v_6_4) + + (v_reinterpret_as_s32(vx_load(src4)) >> 16)); + vx_cleanup(); + + return x; +} +template<> int PyrDownVecH(const short* src, int* row, int width) +{ + int x = 0; + const short *src0 = src, *src2 = src + 4, *src4 = src + 6; + + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) + v_store(row, v_dotprod(v_interleave_pairs(vx_load(src0)), v_1_4) + + v_dotprod(v_interleave_pairs(vx_load(src2)), v_6_4) + + (v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4))) >> 16)); + vx_cleanup(); + + return x; +} +template<> int PyrDownVecH(const short* src, int* row, int width) +{ + int idx[v_int16::nlanes/2 + 4]; + for (int i = 0; i < v_int16::nlanes/4 + 2; i++) { - int x = 0; - const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + idx[i] = 6*i; + idx[i + v_int16::nlanes/4 + 2] = 6*i + 3; + } - v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) - { - v_float32 r0, r1, r2, r3, r4; - r0 = vx_load(row0 + x); - r1 = vx_load(row1 + x); - r2 = vx_load(row2 + x); - r3 = vx_load(row3 + x); - r4 = vx_load(row4 + x); - v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale); - } + int x = 0; + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4) + { + v_int16 r0, r1, r2, r3, r4; + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + r4 = vx_lut_quads(src, idx + 2); + v_store(row, v_pack_triplets(v_dotprod(r0, v_1_4) + v_dotprod(r2, v_6_4) + v_expand_low(r4))); + v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4))); + } + vx_cleanup(); + + return x; +} +template<> int 
PyrDownVecH(const short* src, int* row, int width) +{ + int idx[v_int16::nlanes/2 + 4]; + for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + { + idx[i] = 8*i; + idx[i + v_int16::nlanes/4 + 2] = 8*i + 4; + } - return x; + int x = 0; + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes) + { + v_int16 r0, r1, r2, r3, r4; + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + r4 = vx_lut_quads(src, idx + 2); + v_store(row, v_dotprod(r0, v_1_4) + v_dotprod(r2, v_6_4) + v_expand_low(r4)); + v_store(row + v_int32::nlanes, v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4)); } -}; + vx_cleanup(); -#if CV_SSE4_1 || CV_NEON || CV_VSX + return x; +} -struct PyrDownVec_32s16u +template<> int PyrDownVecH(const ushort* src, int* row, int width) +{ + int x = 0; + const ushort *src0 = src, *src2 = src + 2, *src4 = src + 3; + + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + v_uint16 v_half = vx_setall_u16(0x8000); + v_int32 v_half15 = vx_setall_s32(0x00078000); + for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) + v_store(row, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src0), v_half)), v_1_4) + + v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src2), v_half)), v_6_4) + + v_reinterpret_as_s32(v_reinterpret_as_u32(vx_load(src4)) >> 16) + v_half15); + vx_cleanup(); + + return x; +} +template<> int PyrDownVecH(const ushort* src, int* row, int width) +{ + int x = 0; + const ushort *src0 = src, *src2 = src + 4, *src4 = src + 6; + + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + v_uint16 v_half = vx_setall_u16(0x8000); + v_int32 v_half15 = vx_setall_s32(0x00078000); + for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) + v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src0), v_half))), v_1_4) + + v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src2), v_half))), v_6_4) + + v_reinterpret_as_s32(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4))) >> 16) + v_half15); + vx_cleanup(); + + return x; +} +template<> int PyrDownVecH(const ushort* src, int* row, int width) { - int operator()(int** src, ushort* dst, int, int width) const + int idx[v_int16::nlanes/2 + 4]; + for (int i = 0; i < v_int16::nlanes/4 + 2; i++) { - int x = 0; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + idx[i] = 6*i; + idx[i + v_int16::nlanes/4 + 2] = 6*i + 3; + } - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) - { - v_int32 r00 = vx_load(row0 + x), - r01 = vx_load(row0 + x + v_int32::nlanes), - r10 = vx_load(row1 + x), - r11 = vx_load(row1 + x + v_int32::nlanes), - r20 = vx_load(row2 + x), - r21 = vx_load(row2 + x + v_int32::nlanes), - r30 = vx_load(row3 + x), - r31 = vx_load(row3 + x + v_int32::nlanes), - r40 = vx_load(row4 + x), - r41 = vx_load(row4 + x + v_int32::nlanes); - v_store(dst + x, 
v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), - r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); - } - if (x <= width - v_int32::nlanes) - { - v_int32 r00 = vx_load(row0 + x), - r10 = vx_load(row1 + x), - r20 = vx_load(row2 + x), - r30 = vx_load(row3 + x), - r40 = vx_load(row4 + x); - v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); - x += v_int32::nlanes; - } + int x = 0; + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + v_uint16 v_half = vx_setall_u16(0x8000); + v_int32 v_half15 = vx_setall_s32(0x00078000); + for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4) + { + v_uint16 r0, r1, r2, r3, r4; + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + r4 = vx_lut_quads(src, idx + 2); + v_store(row , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) + + v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) + + v_reinterpret_as_s32(v_expand_low(r4)) + v_half15)); + v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) + + v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) + + v_reinterpret_as_s32(v_expand_high(r4)) + v_half15)); + } + vx_cleanup(); - return x; + return x; +} +template<> int PyrDownVecH(const ushort* src, int* row, int width) +{ + int idx[v_int16::nlanes/2 + 4]; + for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + { + idx[i] = 8*i; + idx[i + v_int16::nlanes/4 + 2] = 8*i + 4; } -}; -#else + int x = 0; + v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); + v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); + v_uint16 v_half = vx_setall_u16(0x8000); + v_int32 v_half15 = vx_setall_s32(0x00078000); + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes) + { + v_uint16 r0, r1, r2, r3, r4; + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + r4 = vx_lut_quads(src, idx + 2); + v_store(row , v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) + + v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) + + v_reinterpret_as_s32(v_expand_low(r4)) + v_half15); + v_store(row + v_int32::nlanes, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) + + v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) + + v_reinterpret_as_s32(v_expand_high(r4)) + v_half15); + } + vx_cleanup(); -typedef PyrDownNoVec PyrDownVec_32s16u; + return x; +} -#endif +template<> int PyrDownVecH(const float* src, float* row, int width) +{ + int x = 0; + const float *src0 = src, *src2 = src + 2, *src4 = src + 4; -struct PyrDownVec_32s16s + v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); + for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src0 += 2*v_float32::nlanes, src2 += 2*v_float32::nlanes, src4 += 2*v_float32::nlanes, row+=v_float32::nlanes) + { + v_float32 r0, r1, r2, r3, r4, rtmp; + v_load_deinterleave(src0, r0, r1); + v_load_deinterleave(src2, r2, r3); + v_load_deinterleave(src4, r4, rtmp); + v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))); + } + vx_cleanup(); + + return x; +} 
+template<> int PyrDownVecH(const float* src, float* row, int width) { - int operator()(int** src, short* dst, int, int width) const + int x = 0; + const float *src0 = src, *src2 = src + 4, *src4 = src + 6; + + v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); + for (; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, src0 += 4*v_float32::nlanes, src2 += 4*v_float32::nlanes, src4 += 4*v_float32::nlanes, row += 2*v_float32::nlanes) { - int x = 0; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + v_float32 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, r4a, r4b, rtmpa, rtmpb; + v_load_deinterleave(src0, r0a, r0b, r1a, r1b); + v_load_deinterleave(src2, r2a, r2b, r3a, r3b); + v_load_deinterleave(src4, rtmpa, rtmpb, r4a, r4b); + v_store_interleave(row, v_muladd(r2a, _6, v_muladd(r1a + r3a, _4, r0a + r4a)), v_muladd(r2b, _6, v_muladd(r1b + r3b, _4, r0b + r4b))); + } + vx_cleanup(); - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) - { - v_int32 r00 = vx_load(row0 + x), - r01 = vx_load(row0 + x + v_int32::nlanes), - r10 = vx_load(row1 + x), - r11 = vx_load(row1 + x + v_int32::nlanes), - r20 = vx_load(row2 + x), - r21 = vx_load(row2 + x + v_int32::nlanes), - r30 = vx_load(row3 + x), - r31 = vx_load(row3 + x + v_int32::nlanes), - r40 = vx_load(row4 + x), - r41 = vx_load(row4 + x + v_int32::nlanes); - v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), - r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); - } - if (x <= width - v_int32::nlanes) - { - v_int32 r00 = vx_load(row0 + x), - r10 = vx_load(row1 + x), - r20 = vx_load(row2 + x), - r30 = vx_load(row3 + x), - r40 = vx_load(row4 + x); - v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); - x += v_int32::nlanes; - } + return x; +} +template<> int PyrDownVecH(const float* src, float* row, int width) +{ + int idx[v_float32::nlanes/2 + 4]; + for (int i = 0; i < v_float32::nlanes/4 + 2; i++) + { + idx[i] = 6*i; + idx[i + v_float32::nlanes/4 + 2] = 6*i + 3; + } - return x; + int x = 0; + v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); + for (; x <= width - v_float32::nlanes; x += 3*v_float32::nlanes/4, src += 6*v_float32::nlanes/4, row += 3*v_float32::nlanes/4) + { + v_float32 r0 = vx_lut_quads(src, idx); + v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2); + v_float32 r2 = vx_lut_quads(src, idx + 1); + v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3); + v_float32 r4 = vx_lut_quads(src, idx + 2); + v_store(row, v_pack_triplets(v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)))); } -}; + vx_cleanup(); -struct PyrUpVec_32s8u + return x; +} +template<> int PyrDownVecH(const float* src, float* row, int width) { - int operator()(int** src, uchar** dst, int, int width) const + int idx[v_float32::nlanes/2 + 4]; + for (int i = 0; i < v_float32::nlanes/4 + 2; i++) { - int x = 0; - uchar *dst0 = dst[0], *dst1 = dst[1]; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; + idx[i] = 8*i; + idx[i + v_float32::nlanes/4 + 2] = 8*i + 4; + } - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) - { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), 
vx_load(row2 + x + v_int32::nlanes)), - v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11); - v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11))); - v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); - } - if(x <= width - v_uint16::nlanes) - { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10; - v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10)); - v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); - x += v_uint16::nlanes; - } - for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) - { - v_int32 v_r00 = vx_load(row0 + x), - v_r10 = vx_load(row1 + x), - v_r20 = vx_load(row2 + x); - v_int32 v_2r10 = v_r10 + v_r10; - v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2); - *(int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); - *(int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0(); - } + int x = 0; + v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); + for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src += 2*v_float32::nlanes, row += v_float32::nlanes) + { + v_float32 r0 = vx_lut_quads(src, idx); + v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2); + v_float32 r2 = vx_lut_quads(src, idx + 1); + v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3); + v_float32 r4 = vx_lut_quads(src, idx + 2); + v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))); + } + vx_cleanup(); + + return x; +} + +#if CV_SIMD_64F +template<> int PyrDownVecH(const double* src, double* row, int width) +{ + int x = 0; + const double *src0 = src, *src2 = src + 2, *src4 = src + 4; - return x; + v_float64 _4 = vx_setall_f64(4.f), _6 = vx_setall_f64(6.f); + for (; x <= width - v_float64::nlanes; x += v_float64::nlanes, src0 += 2*v_float64::nlanes, src2 += 2*v_float64::nlanes, src4 += 2*v_float64::nlanes, row += v_float64::nlanes) + { + v_float64 r0, r1, r2, r3, r4, rtmp; + v_load_deinterleave(src0, r0, r1); + v_load_deinterleave(src2, r2, r3); + v_load_deinterleave(src4, r4, rtmp); + v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))); } -}; + vx_cleanup(); + + return x; +} +#endif -struct PyrUpVec_32s16s +template<> int PyrDownVecV(int** src, uchar* dst, int width) { - int operator()(int** src, short** dst, int, int width) const + int x = 0; + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes ) + { + v_uint16 r0, r1, r2, r3, r4, t0, t1; + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); + t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 
+ x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes))); + t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + v_store(dst + x, v_rshr_pack<8>(t0, t1)); + } + if (x <= width - v_int16::nlanes) { - int x = 0; - short *dst0 = dst[0], *dst1 = dst[1]; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; + v_uint16 r0, r1, r2, r3, r4, t0; + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); + t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + v_rshr_pack_store<8>(dst + x, t0); + x += v_uint16::nlanes; + } + typedef int CV_DECL_ALIGNED(1) unaligned_int; + for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) + { + v_int32x4 r0, r1, r2, r3, r4, t0; + r0 = v_load(row0 + x); + r1 = v_load(row1 + x); + r2 = v_load(row2 + x); + r3 = v_load(row3 + x); + r4 = v_load(row4 + x); + t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + + *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); + } + vx_cleanup(); - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) - { - v_int32 v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), - v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), - v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); - v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); - } - if(x <= width - v_int32::nlanes) - { - v_int32 v_r00 = vx_load(row0 + x), - v_r10 = vx_load(row1 + x), - v_r20 = vx_load(row2 + x); - v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2); - x += v_int32::nlanes; - } + return x; +} - return x; +template <> +int PyrDownVecV(float** src, float* dst, int width) +{ + int x = 0; + const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + + v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256); + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + { + v_float32 r0, r1, r2, r3, r4; + r0 = vx_load(row0 + x); + r1 = vx_load(row1 + x); + r2 = vx_load(row2 + x); + r3 = vx_load(row3 + x); + r4 = vx_load(row4 + x); + v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale); } -}; + vx_cleanup(); -#if CV_SSE4_1 || CV_NEON || CV_VSX + return x; +} -struct PyrUpVec_32s16u +template <> int PyrDownVecV(int** src, ushort* dst, int width) { - int operator()(int** src, ushort** dst, int, int width) const + int x = 0; + const int *row0 = src[0], *row1 = src[1], *row2 = 
src[2], *row3 = src[3], *row4 = src[4]; + + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + { + v_int32 r00 = vx_load(row0 + x), + r01 = vx_load(row0 + x + v_int32::nlanes), + r10 = vx_load(row1 + x), + r11 = vx_load(row1 + x + v_int32::nlanes), + r20 = vx_load(row2 + x), + r21 = vx_load(row2 + x + v_int32::nlanes), + r30 = vx_load(row3 + x), + r31 = vx_load(row3 + x + v_int32::nlanes), + r40 = vx_load(row4 + x), + r41 = vx_load(row4 + x + v_int32::nlanes); + v_store(dst + x, v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), + r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); + } + if (x <= width - v_int32::nlanes) { - int x = 0; - ushort *dst0 = dst[0], *dst1 = dst[1]; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; + v_int32 r00 = vx_load(row0 + x), + r10 = vx_load(row1 + x), + r20 = vx_load(row2 + x), + r30 = vx_load(row3 + x), + r40 = vx_load(row4 + x); + v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); + x += v_int32::nlanes; + } + vx_cleanup(); - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) - { - v_int32 v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), - v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), - v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); - v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); - } - if(x <= width - v_int32::nlanes) - { - v_int32 v_r00 = vx_load(row0 + x), - v_r10 = vx_load(row1 + x), - v_r20 = vx_load(row2 + x); - v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); - x += v_int32::nlanes; - } + return x; +} - return x; +template <> int PyrDownVecV(int** src, short* dst, int width) +{ + int x = 0; + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + { + v_int32 r00 = vx_load(row0 + x), + r01 = vx_load(row0 + x + v_int32::nlanes), + r10 = vx_load(row1 + x), + r11 = vx_load(row1 + x + v_int32::nlanes), + r20 = vx_load(row2 + x), + r21 = vx_load(row2 + x + v_int32::nlanes), + r30 = vx_load(row3 + x), + r31 = vx_load(row3 + x + v_int32::nlanes), + r40 = vx_load(row4 + x), + r41 = vx_load(row4 + x + v_int32::nlanes); + v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), + r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); } -}; + if (x <= width - v_int32::nlanes) + { + v_int32 r00 = vx_load(row0 + x), + r10 = vx_load(row1 + x), + r20 = vx_load(row2 + x), + r30 = vx_load(row3 + x), + r40 = vx_load(row4 + x); + v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); + x += v_int32::nlanes; + } + vx_cleanup(); -#else + return x; +} -typedef PyrUpNoVec PyrUpVec_32s16u; +template <> int PyrUpVecV(int** src, uchar** dst, int width) +{ + int x = 0; + uchar *dst0 = dst[0], *dst1 = dst[1]; + const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; -#endif // CV_SSE4_1 + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + { + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), + v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), + 
v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)), + v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes)); + v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11); + v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11))); + v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); + } + if(x <= width - v_uint16::nlanes) + { + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)); + v_int16 v_2r10 = v_r10 + v_r10; + v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10)); + v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); + x += v_uint16::nlanes; + } + typedef int CV_DECL_ALIGNED(1) unaligned_int; + for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) + { + v_int32 v_r00 = vx_load(row0 + x), + v_r10 = vx_load(row1 + x), + v_r20 = vx_load(row2 + x); + v_int32 v_2r10 = v_r10 + v_r10; + v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2); + *(unaligned_int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); + *(unaligned_int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0(); + } + vx_cleanup(); -struct PyrUpVec_32f + return x; +} + +template <> int PyrUpVecV(int** src, short** dst, int width) { - int operator()(float** src, float** dst, int, int width) const + int x = 0; + short *dst0 = dst[0], *dst1 = dst[1]; + const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; + + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + { + v_int32 v_r00 = vx_load(row0 + x), + v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r10 = vx_load(row1 + x), + v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r20 = vx_load(row2 + x), + v_r21 = vx_load(row2 + x + v_int32::nlanes); + v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); + } + if(x <= width - v_int32::nlanes) { - int x = 0; - const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; - float *dst0 = dst[0], *dst1 = dst[1]; + v_int32 v_r00 = vx_load(row0 + x), + v_r10 = vx_load(row1 + x), + v_r20 = vx_load(row2 + x); + v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); + v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2); + x += v_int32::nlanes; + } + vx_cleanup(); - v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) - { - v_float32 v_r0 = vx_load(row0 + x), - v_r1 = vx_load(row1 + x), - v_r2 = vx_load(row2 + x); - v_store(dst1 + x, v_scale4 * (v_r1 + v_r2)); - v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2)); - } + return x; +} + +template <> int PyrUpVecV(int** src, ushort** dst, int width) +{ + int x = 0; + ushort *dst0 = dst[0], *dst1 = dst[1]; + const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - return x; + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + { + v_int32 v_r00 = vx_load(row0 + x), + v_r01 = vx_load(row0 + x + 
v_int32::nlanes), + v_r10 = vx_load(row1 + x), + v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r20 = vx_load(row2 + x), + v_r21 = vx_load(row2 + x + v_int32::nlanes); + v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); } -}; + if(x <= width - v_int32::nlanes) + { + v_int32 v_r00 = vx_load(row0 + x), + v_r10 = vx_load(row1 + x), + v_r20 = vx_load(row2 + x); + v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); + v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); + x += v_int32::nlanes; + } + vx_cleanup(); -#else + return x; +} + +template <> int PyrUpVecV(float** src, float** dst, int width) +{ + int x = 0; + const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; + float *dst0 = dst[0], *dst1 = dst[1]; -typedef PyrDownNoVec PyrDownVec_32s8u; -typedef PyrDownNoVec PyrDownVec_32s16u; -typedef PyrDownNoVec PyrDownVec_32s16s; -typedef PyrDownNoVec PyrDownVec_32f; + v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f); + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + { + v_float32 v_r0 = vx_load(row0 + x), + v_r1 = vx_load(row1 + x), + v_r2 = vx_load(row2 + x); + v_store(dst1 + x, v_scale4 * (v_r1 + v_r2)); + v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2)); + } + vx_cleanup(); -typedef PyrUpNoVec PyrUpVec_32s8u; -typedef PyrUpNoVec PyrUpVec_32s16s; -typedef PyrUpNoVec PyrUpVec_32s16u; -typedef PyrUpNoVec PyrUpVec_32f; + return x; +} #endif -template void +template void pyrDown_( const Mat& _src, Mat& _dst, int borderType ) { const int PD_SZ = 5; @@ -408,7 +737,6 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType ) int* tabM = _tabM.data(); WT* rows[PD_SZ]; CastOp castOp; - VecOp vecOp; CV_Assert( ssize.width > 0 && ssize.height > 0 && std::abs(dsize.width*2 - ssize.width) <= 2 && @@ -460,12 +788,25 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType ) if( cn == 1 ) { + x += PyrDownVecH(src + x * 2 - 2, row + x, width0 - x); for( ; x < width0; x++ ) row[x] = src[x*2]*6 + (src[x*2 - 1] + src[x*2 + 1])*4 + src[x*2 - 2] + src[x*2 + 2]; } + else if( cn == 2 ) + { + x += PyrDownVecH(src + x * 2 - 4, row + x, width0 - x); + for( ; x < width0; x += 2 ) + { + const T* s = src + x*2; + WT t0 = s[0] * 6 + (s[-2] + s[2]) * 4 + s[-4] + s[4]; + WT t1 = s[1] * 6 + (s[-1] + s[3]) * 4 + s[-3] + s[5]; + row[x] = t0; row[x + 1] = t1; + } + } else if( cn == 3 ) { + x += PyrDownVecH(src + x * 2 - 6, row + x, width0 - x); for( ; x < width0; x += 3 ) { const T* s = src + x*2; @@ -477,6 +818,7 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType ) } else if( cn == 4 ) { + x += PyrDownVecH(src + x * 2 - 8, row + x, width0 - x); for( ; x < width0; x += 4 ) { const T* s = src + x*2; @@ -508,14 +850,14 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType ) rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep; row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4]; - x = vecOp(rows, dst, (int)_dst.step, dsize.width); + x = PyrDownVecV(rows, dst, dsize.width); for( ; x < dsize.width; x++ ) dst[x] = castOp(row2[x]*6 + (row1[x] + row3[x])*4 + row0[x] + row4[x]); } } -template void +template void pyrUp_( const Mat& _src, Mat& _dst, int) { const int PU_SZ = 3; @@ -532,7 +874,7 @@ pyrUp_( const Mat& _src, Mat& _dst, int) WT* rows[PU_SZ]; T* dsts[2]; CastOp castOp; - VecOp vecOp; + //PyrUpVecH 
vecOpH; CV_Assert( std::abs(dsize.width - ssize.width*2) == dsize.width % 2 && std::abs(dsize.height - ssize.height*2) == dsize.height % 2); @@ -598,7 +940,7 @@ pyrUp_( const Mat& _src, Mat& _dst, int) row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; dsts[0] = dst0; dsts[1] = dst1; - x = vecOp(rows, dsts, (int)_dst.step, dsize.width); + x = PyrUpVecV(rows, dsts, dsize.width); for( ; x < dsize.width; x++ ) { T t1 = castOp((row1[x] + row2[x])*4); @@ -912,15 +1254,15 @@ void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borde PyrFunc func = 0; if( depth == CV_8U ) - func = pyrDown_, PyrDownVec_32s8u>; + func = pyrDown_< FixPtCast >; else if( depth == CV_16S ) - func = pyrDown_, PyrDownVec_32s16s >; + func = pyrDown_< FixPtCast >; else if( depth == CV_16U ) - func = pyrDown_, PyrDownVec_32s16u >; + func = pyrDown_< FixPtCast >; else if( depth == CV_32F ) - func = pyrDown_, PyrDownVec_32f>; + func = pyrDown_< FltCast >; else if( depth == CV_64F ) - func = pyrDown_, PyrDownNoVec >; + func = pyrDown_< FltCast >; else CV_Error( CV_StsUnsupportedFormat, "" ); @@ -1020,15 +1362,15 @@ void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderT PyrFunc func = 0; if( depth == CV_8U ) - func = pyrUp_, PyrUpVec_32s8u >; + func = pyrUp_< FixPtCast >; else if( depth == CV_16S ) - func = pyrUp_, PyrUpVec_32s16s >; + func = pyrUp_< FixPtCast >; else if( depth == CV_16U ) - func = pyrUp_, PyrUpVec_32s16u >; + func = pyrUp_< FixPtCast >; else if( depth == CV_32F ) - func = pyrUp_, PyrUpVec_32f >; + func = pyrUp_< FltCast >; else if( depth == CV_64F ) - func = pyrUp_, PyrUpNoVec >; + func = pyrUp_< FltCast >; else CV_Error( CV_StsUnsupportedFormat, "" ); -- 2.7.4
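
For reference, a minimal scalar sketch of the horizontal pass that the new PyrDownVecH specializations vectorize: per channel, a 1-4-6-4-1 tap centered on every second source column, exactly as in the commented formula and the scalar fallback loops in pyrDown_(). The helper name and signature below are illustrative only, not part of the patch; src is assumed to point at the leftmost tap (i.e. the caller passes src + x*2 - 2*cn, as the calls in pyrDown_() do) and width counts output elements.

// Illustrative scalar equivalent of PyrDownVecH (not part of the patch).
template<typename T1, typename T2>
void pyrDownRowScalar(const T1* src, T2* row, int width, int cn)
{
    for (int x = 0; x < width; x += cn)      // one output pixel (cn elements) per iteration
        for (int c = 0; c < cn; c++)
            row[x + c] = src[x*2 + 2*cn + c] * 6
                       + (src[x*2 + cn + c] + src[x*2 + 3*cn + c]) * 4
                       +  src[x*2 + c] + src[x*2 + 4*cn + c];
}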
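
The 8-bit and 16-bit kernels lean on v_dotprod to fold two taps into one multiply-add: vx_setall_u32(0x00040001) reinterpreted as int16 yields the lane pairs (1, 4), vx_setall_u32(0x00040006) yields (6, 4), and the fifth tap is recovered by loading the (s3, s4) pair one element early and shifting each 32-bit lane right by 16. Below is a scalar model of what a single 32-bit output lane computes in the cn == 1 case; the function name is hypothetical and little-endian lane order is assumed, as in the universal intrinsics.

// Scalar model of one output lane of the uchar/short cn==1 kernels above.
static inline int pyrDownLaneModel(const short* s)   // s points at tap 0
{
    int acc = s[0]*1 + s[1]*4;   // v_dotprod(load(src0), v_1_4)
    acc    += s[2]*6 + s[3]*4;   // v_dotprod(load(src2), v_6_4)
    acc    += s[4];              // obtained in the vector code by loading the
                                 // (s3, s4) pair at src + 3 and shifting the
                                 // 32-bit lane right by 16
    return acc;                  // == s0 + 4*s1 + 6*s2 + 4*s3 + s4
}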
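
The ushort kernels cannot feed unsigned data to the signed dot product directly, so they bias it: v_sub_wrap subtracts 0x8000 from each of the four weighted taps, and v_half15 adds the bias back afterwards. Since those taps carry weights 1 + 4 + 6 + 4 = 15, the compensation constant is 15 * 0x8000 = 0x00078000; the fifth tap is taken from the untouched unsigned data (via the >> 16 shift or v_expand) and needs no correction. A small consistency check, not part of the patch:

static_assert(15 * 0x8000 == 0x00078000, "bias compensation for the 1-4-6-4 taps");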
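
Finally, the cn == 3 kernels rely on the new 32-bit v_pack_triplets overloads: each v_lut_quads load fetches four elements (the three channels of one pixel plus one element of the next), and v_pack_triplets compacts the results so that only the three valid channels per group of four lanes are stored. The sketch below mirrors the generic intrin_cpp.hpp loop in this patch; it is a model, not the intrinsic itself. For 128-bit registers with 32-bit lanes there is only one group of four, which is why the new SSE, NEON and VSX overloads simply return their input unchanged, while the AVX2 overload permutes lanes 0,1,2,4,5,6 to the front.

// Scalar model of v_pack_triplets: keep the first three lanes of every group
// of four, packed contiguously; the trailing lanes are unspecified padding.
template<typename T>
void pack_triplets_model(const T* in, T* out, int nlanes)
{
    for (int i = 0; i < nlanes/4; i++)
    {
        out[3*i    ] = in[4*i    ];
        out[3*i + 1] = in[4*i + 1];
        out[3*i + 2] = in[4*i + 2];
    }
    // out[3*nlanes/4 ... nlanes-1] are left unspecified, as in the intrinsic
}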