From 9fbd1d68ad95d7bad2d09501110081ecd82d5ad8 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 1 May 2015 21:49:11 +0300 Subject: [PATCH] refactored div & pow funcs; added tests for special cases in pow() function. fixed http://code.opencv.org/issues/3935 possibly fixed http://code.opencv.org/issues/3594 --- modules/core/src/arithm.cpp | 748 ++++++++++++++----------- modules/core/src/lapack.cpp | 4 +- modules/core/src/mathfuncs.cpp | 436 +++++++++----- modules/core/src/precomp.hpp | 2 + modules/core/test/test_mat.cpp | 7 + modules/core/test/test_math.cpp | 69 ++- modules/hal/include/opencv2/hal/intrin_sse.hpp | 10 + 7 files changed, 792 insertions(+), 484 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 15b0759..1b87059 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2781,15 +2781,23 @@ struct Div_SIMD } }; -#if CV_SSE2 +template +struct Recip_SIMD +{ + int operator() (const T *, T *, int, double) const + { + return 0; + } +}; -#if CV_SSE4_1 + +#if CV_SIMD128 template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const { @@ -2798,50 +2806,44 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src1 + x)), v_zero); - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); - - __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - - v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + v_uint16x8 v_src1 = v_load_expand(src1 + x); + v_uint16x8 v_src2 = v_load_expand(src2 + x); + + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 
= v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const { @@ -2850,50 +2852,44 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((const __m128i *)(src1 + x))), 8); - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); - - __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - - v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + v_int16x8 v_src1 = v_load_expand(src1 + x); + v_int16x8 v_src2 = v_load_expand(src2 + x); + + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#if CV_SSE4_1 template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const { @@ -2902,49 +2898,43 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = 
_mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_uint16x8 v_src1 = v_load(src1 + x); + v_uint16x8 v_src2 = v_load(src2 + x); + + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); - __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - - v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 - template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const { @@ -2953,36 +2943,32 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_int16x8 v_src1 = v_load(src1 + x); + v_int16x8 v_src2 = v_load(src2 + x); - __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - 
__m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - - v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; @@ -2993,7 +2979,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const { @@ -3002,100 +2988,82 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); - for ( ; x <= width - 4; x += 4) + for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_int32x4 t0 = v_load(src1 + x); + v_int32x4 t1 = v_load(src1 + x + 4); + v_int32x4 t2 = v_load(src2 + x); + v_int32x4 t3 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst)); + res0 = v_select(t2 == v_zero, v_zero, res0); + res1 = v_select(t3 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); } return x; } }; -#endif -template static void -div_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) +template <> +struct Div_SIMD { - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; + bool haveSIMD; + Div_SIMD() { haveSIMD = 
checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) + int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const { - int i = vop(src1, src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for( ; i <= size.width - 4; i += 4 ) + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) { - if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 ) - { - double a = (double)src2[i] * src2[i+1]; - double b = (double)src2[i+2] * src2[i+3]; - double d = scale/(a * b); - b *= d; - a *= d; - - T z0 = saturate_cast(src2[i+1] * ((double)src1[i] * b)); - T z1 = saturate_cast(src2[i] * ((double)src1[i+1] * b)); - T z2 = saturate_cast(src2[i+3] * ((double)src1[i+2] * a)); - T z3 = saturate_cast(src2[i+2] * ((double)src1[i+3] * a)); - - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } - else - { - T z0 = src2[i] != 0 ? saturate_cast(src1[i]*scale/src2[i]) : 0; - T z1 = src2[i+1] != 0 ? saturate_cast(src1[i+1]*scale/src2[i+1]) : 0; - T z2 = src2[i+2] != 0 ? saturate_cast(src1[i+2]*scale/src2[i+2]) : 0; - T z3 = src2[i+3] != 0 ? saturate_cast(src1[i+3]*scale/src2[i+3]) : 0; + v_float32x4 f0 = v_load(src1 + x); + v_float32x4 f1 = v_load(src1 + x + 4); + v_float32x4 f2 = v_load(src2 + x); + v_float32x4 f3 = v_load(src2 + x + 4); - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } + v_float32x4 res0 = f0 * v_scale / f2; + v_float32x4 res1 = f1 * v_scale / f3; + + res0 = v_select(f2 == v_zero, v_zero, res0); + res1 = v_select(f3 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); } - #endif - for( ; i < size.width; i++ ) - dst[i] = src2[i] != 0 ? 
saturate_cast(src1[i]*scale/src2[i]) : 0; - } -} -template -struct Recip_SIMD -{ - int operator() (const T *, T *, int, double) const - { - return 0; + return x; } }; -#if CV_SSE2 -#if CV_SSE4_1 +///////////////////////// RECIPROCAL ////////////////////// template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const uchar * src2, uchar * dst, int width, double scale) const { @@ -3104,43 +3072,39 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); + v_uint16x8 v_src2 = v_load_expand(src2 + x); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const schar * src2, schar * dst, int width, double scale) const { @@ -3149,43 +3113,39 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); + v_int16x8 v_src2 = v_load_expand(src2 + x); + + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), 
_mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + f0 = v_scale / f0; + f1 = v_scale / f1; - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#if CV_SSE4_1 template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const ushort * src2, ushort * dst, int width, double scale) const { @@ -3194,42 +3154,38 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_uint16x8 v_src2 = v_load(src2 + x); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 - template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const short * src2, short * dst, int width, double scale) const { @@ -3238,29 +3194,27 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_int16x8 v_src2 = 
v_load(src2 + x); + + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + f0 = v_scale / f0; + f1 = v_scale / f1; - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; @@ -3271,7 +3225,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const int * src2, int * dst, int width, double scale) const { @@ -3280,22 +3234,136 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); + + for ( ; x <= width - 8; x += 8) + { + v_int32x4 t0 = v_load(src2 + x); + v_int32x4 t1 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t0 == v_zero, v_zero, res0); + res1 = v_select(t1 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const float * src2, float * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) + { + v_float32x4 f0 = v_load(src2 + x); + v_float32x4 f1 = v_load(src2 + x + 4); + + v_float32x4 res0 = v_scale / f0; + v_float32x4 res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + +#if CV_SIMD128_64F + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); for ( ; x <= width - 4; x += 4) { - __m128i v_src2 = 
_mm_loadu_si128((const __m128i *)(src2 + x));
+            v_float64x2 f0 = v_load(src1 + x);
+            v_float64x2 f1 = v_load(src1 + x + 2);
+            v_float64x2 f2 = v_load(src2 + x);
+            v_float64x2 f3 = v_load(src2 + x + 2);
+
+            v_float64x2 res0 = f0 * v_scale / f2;
+            v_float64x2 res1 = f1 * v_scale / f3;
+
+            res0 = v_select(f2 == v_zero, v_zero, res0);
+            res1 = v_select(f3 == v_zero, v_zero, res1);
+
+            v_store(dst + x, res0);
+            v_store(dst + x + 2, res1);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Recip_SIMD
+{
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
+
+    int operator() (const double * src2, double * dst, int width, double scale) const
+    {
+        int x = 0;
 
-        __m128d v_src2d = _mm_cvtepi32_pd(v_src2);
-        __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+        if (!haveSIMD)
+            return x;
 
-        v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8));
-        __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+        v_float64x2 v_scale = v_setall_f64(scale);
+        v_float64x2 v_zero = v_setzero_f64();
 
-        __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
-        __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero);
-        _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst));
+        for ( ; x <= width - 4; x += 4)
+        {
+            v_float64x2 f0 = v_load(src2 + x);
+            v_float64x2 f1 = v_load(src2 + x + 2);
+
+            v_float64x2 res0 = v_scale / f0;
+            v_float64x2 res1 = v_scale / f1;
+
+            res0 = v_select(f0 == v_zero, v_zero, res0);
+            res1 = v_select(f1 == v_zero, v_zero, res1);
+
+            v_store(dst + x, res0);
+            v_store(dst + x + 2, res1);
         }
 
         return x;
@@ -3304,51 +3372,91 @@ struct Recip_SIMD
 
 #endif
 
+#endif
+
 template static void
-recip_( const T*, size_t, const T* src2, size_t step2,
-        T* dst, size_t step, Size size, double scale )
+div_i( const T* src1, size_t step1, const T* src2, size_t step2,
+       T* dst, size_t step, Size size, double scale )
+{
+    step1 /= sizeof(src1[0]);
+    step2 /= sizeof(src2[0]);
+    step /= sizeof(dst[0]);
+
+    Div_SIMD vop;
+    float scale_f = (float)scale;
+
+    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+    {
+        int i = vop(src1, src2, dst, size.width, scale);
+        for( ; i < size.width; i++ )
+        {
+            T num = src1[i], denom = src2[i];
+            dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0;
+        }
+    }
+}
+
+template static void
+div_f( const T* src1, size_t step1, const T* src2, size_t step2,
+       T* dst, size_t step, Size size, double scale )
+{
+    T scale_f = (T)scale;
+    step1 /= sizeof(src1[0]);
+    step2 /= sizeof(src2[0]);
+    step /= sizeof(dst[0]);
+
+    Div_SIMD vop;
+
+    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+    {
+        int i = vop(src1, src2, dst, size.width, scale);
+        for( ; i < size.width; i++ )
+        {
+            T num = src1[i], denom = src2[i];
+            dst[i] = denom != 0 ? 
saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +recip_i( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) { step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); Recip_SIMD vop; + float scale_f = (float)scale; for( ; size.height--; src2 += step2, dst += step ) { int i = vop(src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for( ; i <= size.width - 4; i += 4 ) + for( ; i < size.width; i++ ) { - if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 ) - { - double a = (double)src2[i] * src2[i+1]; - double b = (double)src2[i+2] * src2[i+3]; - double d = scale/(a * b); - b *= d; - a *= d; - - T z0 = saturate_cast(src2[i+1] * b); - T z1 = saturate_cast(src2[i] * b); - T z2 = saturate_cast(src2[i+3] * a); - T z3 = saturate_cast(src2[i+2] * a); - - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } - else - { - T z0 = src2[i] != 0 ? saturate_cast(scale/src2[i]) : 0; - T z1 = src2[i+1] != 0 ? saturate_cast(scale/src2[i+1]) : 0; - T z2 = src2[i+2] != 0 ? saturate_cast(scale/src2[i+2]) : 0; - T z3 = src2[i+3] != 0 ? saturate_cast(scale/src2[i+3]) : 0; - - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } + T denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; } - #endif + } +} + +template static void +recip_f( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) +{ + T scale_f = (T)scale; + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Recip_SIMD vop; + + for( ; size.height--; src2 += step2, dst += step ) + { + int i = vop(src2, dst, size.width, scale); for( ; i < size.width; i++ ) - dst[i] = src2[i] != 0 ? saturate_cast(scale/src2[i]) : 0; + { + T denom = src2[i]; + dst[i] = denom != 0 ? 
saturate_cast(scale_f/denom) : (T)0; + } } } @@ -3459,87 +3567,87 @@ static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t st uchar* dst, size_t step, Size sz, void* scale) { if( src1 ) - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); else - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const 
double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index 3fbe85b..e246a7c 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -502,7 +502,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, { sd = i < n ? W[i] : 0; - while( sd <= minval ) + for( int ii = 0; ii < 100 && sd <= minval; ii++ ) { // if we got a zero singular value, then in order to get the corresponding left singular vector // we generate a random vector, project it to the previously computed left singular vectors, @@ -541,7 +541,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, sd = std::sqrt(sd); } - s = (_Tp)(1/sd); + s = (_Tp)(sd > minval ? 1/sd : 0.); for( k = 0; k < m; k++ ) At[i*astep + k] *= s; } diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index e96eaeb..8f7fc7b 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -889,38 +889,41 @@ struct iPow_SIMD } }; -#if CV_NEON +#if CV_SIMD128 template <> struct iPow_SIMD { - int operator() ( const uchar * src, uchar * dst, int len, int power) + int operator() ( const uchar * src, uchar * dst, int len, int power ) { int i = 0; - uint32x4_t v_1 = vdupq_n_u32(1u); + v_uint32x4 v_1 = v_setall_u32(1u); for ( ; i <= len - 8; i += 8) { - uint32x4_t v_a1 = v_1, v_a2 = v_1; - uint16x8_t v_src = vmovl_u8(vld1_u8(src + i)); - uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src)); + v_uint32x4 v_a1 = v_1, v_a2 = v_1; + v_uint16x8 v = v_load_expand(src + i); + v_uint32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_u32(v_b1, v_b1); - v_b2 = vmulq_u32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); - vst1_u8(dst + i, vqmovn_u16(vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2)))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_pack_store(dst + i, v); } return i; @@ -933,30 +936,33 @@ struct iPow_SIMD int operator() ( const schar * src, schar * dst, int len, int power) { int i = 0; - int32x4_t v_1 = vdupq_n_s32(1); + v_int32x4 v_1 = v_setall_s32(1); for ( ; i <= len - 8; i += 8) { - int32x4_t v_a1 = v_1, v_a2 = v_1; - int16x8_t v_src = vmovl_s8(vld1_s8(src + i)); - int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src)); + v_int32x4 v_a1 = v_1, v_a2 = v_1; + v_int16x8 v = v_load_expand(src + i); + v_int32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_s32(v_b1, v_b1); - v_b2 = vmulq_s32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; 
p >>= 1; } - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); - vst1_s8(dst + i, vqmovn_s16(vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2)))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_pack_store(dst + i, v); } return i; @@ -969,30 +975,33 @@ struct iPow_SIMD int operator() ( const ushort * src, ushort * dst, int len, int power) { int i = 0; - uint32x4_t v_1 = vdupq_n_u32(1u); + v_uint32x4 v_1 = v_setall_u32(1u); for ( ; i <= len - 8; i += 8) { - uint32x4_t v_a1 = v_1, v_a2 = v_1; - uint16x8_t v_src = vld1q_u16(src + i); - uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src)); + v_uint32x4 v_a1 = v_1, v_a2 = v_1; + v_uint16x8 v = v_load(src + i); + v_uint32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_u32(v_b1, v_b1); - v_b2 = vmulq_u32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); - vst1q_u16(dst + i, vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_store(dst + i, v); } return i; @@ -1005,60 +1014,70 @@ struct iPow_SIMD int operator() ( const short * src, short * dst, int len, int power) { int i = 0; - int32x4_t v_1 = vdupq_n_s32(1); + v_int32x4 v_1 = v_setall_s32(1); for ( ; i <= len - 8; i += 8) { - int32x4_t v_a1 = v_1, v_a2 = v_1; - int16x8_t v_src = vld1q_s16(src + i); - int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src)); + v_int32x4 v_a1 = v_1, v_a2 = v_1; + v_int16x8 v = v_load(src + i); + v_int32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_s32(v_b1, v_b1); - v_b2 = vmulq_s32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); - vst1q_s16(dst + i, vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_store(dst + i, v); } return i; } }; - template <> struct iPow_SIMD { int operator() ( const int * src, int * dst, int len, int power) { int i = 0; - int32x4_t v_1 = vdupq_n_s32(1); + v_int32x4 v_1 = v_setall_s32(1); - for ( ; i <= len - 4; i += 4) + for ( ; i <= len - 8; i += 8) { - int32x4_t v_b = vld1q_s32(src + i), v_a = v_1; + v_int32x4 v_a1 = v_1, v_a2 = v_1; + v_int32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4); int p = power; while( p > 1 ) { if (p & 1) - v_a = vmulq_s32(v_a, v_b); - v_b = vmulq_s32(v_b, v_b); + { + v_a1 *= v_b1; + v_a2 *= v_b2; + } + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a = vmulq_s32(v_a, v_b); - vst1q_s32(dst + i, v_a); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v_store(dst + i, v_a1); + v_store(dst + i + 4, v_a2); } return i; @@ -1071,42 +1090,143 @@ struct iPow_SIMD int operator() ( const float * src, float * dst, int len, int power) { int i = 0; - float32x4_t v_1 = vdupq_n_f32(1.0f); + v_float32x4 v_1 = v_setall_f32(1.f); + + for ( ; i <= len - 8; i += 8) + { + v_float32x4 v_a1 = v_1, v_a2 = v_1; + v_float32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4); + int p = std::abs(power); + if( power < 0 ) + { + v_b1 = v_1 / v_b1; + v_b2 = v_1 / v_b2; + } + + while( p > 1 ) + { + if (p & 1) + { + v_a1 *= v_b1; + v_a2 *= v_b2; + } + v_b1 *= v_b1; + v_b2 *= v_b2; + p >>= 1; + } 
+ + v_a1 *= v_b1; + v_a2 *= v_b2; + + v_store(dst + i, v_a1); + v_store(dst + i + 4, v_a2); + } + + return i; + } +}; + +#if CV_SIMD128_64F +template <> +struct iPow_SIMD +{ + int operator() ( const double * src, double * dst, int len, int power) + { + int i = 0; + v_float64x2 v_1 = v_setall_f64(1.); for ( ; i <= len - 4; i += 4) { - float32x4_t v_b = vld1q_f32(src + i), v_a = v_1; - int p = power; + v_float64x2 v_a1 = v_1, v_a2 = v_1; + v_float64x2 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 2); + int p = std::abs(power); + if( power < 0 ) + { + v_b1 = v_1 / v_b1; + v_b2 = v_1 / v_b2; + } while( p > 1 ) { if (p & 1) - v_a = vmulq_f32(v_a, v_b); - v_b = vmulq_f32(v_b, v_b); + { + v_a1 *= v_b1; + v_a2 *= v_b2; + } + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a = vmulq_f32(v_a, v_b); - vst1q_f32(dst + i, v_a); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v_store(dst + i, v_a1); + v_store(dst + i + 2, v_a2); } return i; } }; +#endif #endif template static void -iPow_( const T* src, T* dst, int len, int power ) +iPow_i( const T* src, T* dst, int len, int power ) { - iPow_SIMD vop; - int i = vop(src, dst, len, power); + if( power < 0 ) + { + T tab[5] = + { + power == -1 ? saturate_cast(-1) : 0, (power & 1) ? -1 : 1, + std::numeric_limits::max(), 1, power == -1 ? 1 : 0 + }; + for( int i = 0; i < len; i++ ) + { + T val = src[i]; + dst[i] = cv_abs(val) <= 2 ? tab[val + 2] : (T)0; + } + } + else + { + iPow_SIMD vop; + int i = vop(src, dst, len, power); + + for( ; i < len; i++ ) + { + WT a = 1, b = src[i]; + int p = power; + while( p > 1 ) + { + if( p & 1 ) + a *= b; + b *= b; + p >>= 1; + } + + a *= b; + dst[i] = saturate_cast(a); + } + } +} + +template +static void +iPow_f( const T* src, T* dst, int len, int power0 ) +{ + iPow_SIMD vop; + int i = vop(src, dst, len, power0); + int power = std::abs(power0); for( ; i < len; i++ ) { - WT a = 1, b = src[i]; + T a = 1, b = src[i]; int p = power; + if( power0 < 0 ) + b = 1/b; + while( p > 1 ) { if( p & 1 ) @@ -1116,44 +1236,43 @@ iPow_( const T* src, T* dst, int len, int power ) } a *= b; - dst[i] = saturate_cast(a); + dst[i] = a; } } - static void iPow8u(const uchar* src, uchar* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow8s(const schar* src, schar* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow16u(const ushort* src, ushort* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow16s(const short* src, short* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow32s(const int* src, int* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow32f(const float* src, float* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_f(src, dst, len, power); } static void iPow64f(const double* src, double* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_f(src, dst, len, power); } @@ -1176,14 +1295,25 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst, bool doubleSupport = d.doubleFPConfig() > 0; _dst.createSameSize(_src, type); - if (is_ipower && (ipower == 0 || ipower == 1)) + if (is_ipower) { if (ipower == 0) + { _dst.setTo(Scalar::all(1)); - else if (ipower == 1) + return true; + } + if (ipower == 1) + { _src.copyTo(_dst); - - return true; + return true; + } + if( ipower < 0 ) + { + if( depth == CV_32F || depth == CV_64F ) + is_ipower = 
false; + else + return false; + } } if (depth == CV_64F && !doubleSupport) @@ -1238,15 +1368,6 @@ void pow( InputArray _src, double power, OutputArray _dst ) if( is_ipower && !(ocl::Device::getDefault().isIntel() && useOpenCL && depth != CV_64F)) { - if( ipower < 0 ) - { - divide( Scalar::all(1), _src, _dst ); - if( ipower == -1 ) - return; - ipower = -ipower; - same = true; - } - switch( ipower ) { case 0: @@ -1257,41 +1378,7 @@ void pow( InputArray _src, double power, OutputArray _dst ) _src.copyTo(_dst); return; case 2: -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (depth == CV_32F && !same && ( (_src.dims() <= 2 && !ocl::useOpenCL()) || - (_src.dims() > 2 && _src.isContinuous() && _dst.isContinuous()) )) - { - Mat src = _src.getMat(); - _dst.create( src.dims, src.size, type ); - Mat dst = _dst.getMat(); - - Size size = src.size(); - int srcstep = (int)src.step, dststep = (int)dst.step, esz = CV_ELEM_SIZE(type); - if (src.isContinuous() && dst.isContinuous()) - { - size.width = (int)src.total(); - size.height = 1; - srcstep = dststep = (int)src.total() * esz; - } - size.width *= cn; - - IppStatus status = ippiSqr_32f_C1R(src.ptr(), srcstep, dst.ptr(), dststep, ippiSize(size.width, size.height)); - - if (status >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - if (same) - multiply(_dst, _dst, _dst); - else - multiply(_src, _src, _dst); + multiply(_src, _src, _dst); return; } } @@ -1301,15 +1388,9 @@ void pow( InputArray _src, double power, OutputArray _dst ) CV_OCL_RUN(useOpenCL, ocl_pow(same ? _dst : _src, power, _dst, is_ipower, ipower)) - Mat src, dst; - if (same) - src = dst = _dst.getMat(); - else - { - src = _src.getMat(); - _dst.create( src.dims, src.size, type ); - dst = _dst.getMat(); - } + Mat src = _src.getMat(); + _dst.create( src.dims, src.size, type ); + Mat dst = _dst.getMat(); const Mat* arrays[] = {&src, &dst, 0}; uchar* ptrs[2]; @@ -1335,52 +1416,103 @@ void pow( InputArray _src, double power, OutputArray _dst ) } else { -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (src.isContinuous() && dst.isContinuous()) - { - IppStatus status = depth == CV_32F ? - ippsPowx_32f_A21(src.ptr(), (Ipp32f)power, dst.ptr(), (Ipp32s)(src.total() * cn)) : - ippsPowx_64f_A50(src.ptr(), power, dst.ptr(), (Ipp32s)(src.total() * cn)); - - if (status >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - int j, k, blockSize = std::min(len, ((BLOCK_SIZE + cn-1)/cn)*cn); size_t esz1 = src.elemSize1(); + AutoBuffer buf; + Cv32suf inf32, nan32; + Cv64suf inf64, nan64; + float* fbuf = 0; + double* dbuf = 0; + inf32.i = 0x7f800000; + nan32.i = 0x7fffffff; + inf64.i = CV_BIG_INT(0x7FF0000000000000); + nan64.i = CV_BIG_INT(0x7FFFFFFFFFFFFFFF); + + if( src.ptr() == dst.ptr() ) + { + buf.allocate(blockSize*esz1); + fbuf = (float*)(uchar*)buf; + dbuf = (double*)(uchar*)buf; + } for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( j = 0; j < len; j += blockSize ) { int bsz = std::min(len - j, blockSize); + + #if defined(HAVE_IPP) + CV_IPP_CHECK() + { + IppStatus status = depth == CV_32F ? + ippsPowx_32f_A21((const float*)ptrs[0], (float)power, (float*)ptrs[1], bsz) : + ippsPowx_64f_A50((const double*)ptrs[0], (double)power, (double*)ptrs[1], bsz); + + if (status >= 0) + { + CV_IMPL_ADD(CV_IMPL_IPP); + ptrs[0] += bsz*esz1; + ptrs[1] += bsz*esz1; + continue; + } + setIppErrorStatus(); + } + #endif + if( depth == CV_32F ) { - const float* x = (const float*)ptrs[0]; + float* x0 = (float*)ptrs[0]; + float* x = fbuf ? 
fbuf : x0; float* y = (float*)ptrs[1]; + if( x != x0 ) + memcpy(x, x0, bsz*esz1); + Log_32f(x, y, bsz); for( k = 0; k < bsz; k++ ) y[k] = (float)(y[k]*power); Exp_32f(y, y, bsz); + for( k = 0; k < bsz; k++ ) + { + if( x0[k] <= 0 ) + { + if( x0[k] == 0.f ) + { + if( power < 0 ) + y[k] = inf32.f; + } + else + y[k] = nan32.f; + } + } } else { - const double* x = (const double*)ptrs[0]; + double* x0 = (double*)ptrs[0]; + double* x = dbuf ? dbuf : x0; double* y = (double*)ptrs[1]; + if( x != x0 ) + memcpy(x, x0, bsz*esz1); + Log_64f(x, y, bsz); for( k = 0; k < bsz; k++ ) y[k] *= power; Exp_64f(y, y, bsz); + + for( k = 0; k < bsz; k++ ) + { + if( x0[k] <= 0 ) + { + if( x0[k] == 0. ) + { + if( power < 0 ) + y[k] = inf64.f; + } + else + y[k] = nan64.f; + } + } } ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz1; diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index e5007e5..0b65c13 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -290,4 +290,6 @@ extern bool __termination; // skip some cleanups, because process is terminating } +#include "opencv2/hal/intrin.hpp" + #endif /*_CXCORE_INTERNAL_H_*/ diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 60b2444..417ab64 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -1210,6 +1210,13 @@ TEST(Core_Mat, copyNx1ToVector) ASSERT_PRED_FORMAT2(cvtest::MatComparator(0, 0), ref_dst16, cv::Mat_(dst16)); } +TEST(Core_Matx, fromMat_) +{ + Mat_ a = (Mat_(2,2) << 10, 11, 12, 13); + Matx22d b(a); + ASSERT_EQ( norm(a, b, NORM_INF), 0.); +} + TEST(Core_SVD, orthogonality) { for( int i = 0; i < 2; i++ ) diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 5d48320..23a1452 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -232,7 +232,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((uchar*)a_data)[j]; - ((uchar*)b_data)[j] = (uchar)(val <= 1 ? val : + ((uchar*)b_data)[j] = (uchar)(val == 0 ? 255 : val == 1 ? 1 : val == 2 && ipower == -1 ? 1 : 0); } else @@ -247,17 +247,17 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) if( ipower < 0 ) for( j = 0; j < ncols; j++ ) { - int val = ((char*)a_data)[j]; - ((char*)b_data)[j] = (char)((val&~1)==0 ? val : + int val = ((schar*)a_data)[j]; + ((schar*)b_data)[j] = (schar)(val == 0 ? 127 : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0); } else for( j = 0; j < ncols; j++ ) { - int val = ((char*)a_data)[j]; + int val = ((schar*)a_data)[j]; val = ipow( val, ipower ); - ((char*)b_data)[j] = saturate_cast(val); + ((schar*)b_data)[j] = saturate_cast(val); } break; case CV_16U: @@ -265,7 +265,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((ushort*)a_data)[j]; - ((ushort*)b_data)[j] = (ushort)((val&~1)==0 ? val : + ((ushort*)b_data)[j] = (ushort)(val == 0 ? 65535 : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0); } @@ -282,7 +282,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((short*)a_data)[j]; - ((short*)b_data)[j] = (short)((val&~1)==0 ? val : + ((short*)b_data)[j] = (short)(val == 0 ? 32767 : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 
1 : 0); } @@ -299,7 +299,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((int*)a_data)[j]; - ((int*)b_data)[j] = (val&~1)==0 ? val : + ((int*)b_data)[j] = val == 0 ? INT_MAX : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0; } @@ -351,8 +351,6 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) } } - - ///////////////////////////////////////// matrix tests //////////////////////////////////////////// class Core_MatrixTest : public cvtest::ArrayTest @@ -2822,4 +2820,55 @@ TEST(CovariationMatrixVectorOfMatWithMean, accuracy) ASSERT_EQ(sDiff.dot(sDiff), 0.0); } +TEST(Core_Pow, special) +{ + for( int i = 0; i < 100; i++ ) + { + int n = theRNG().uniform(1, 30); + Mat mtx0(1, n, CV_8S), mtx, result; + randu(mtx0, -5, 5); + + int type = theRNG().uniform(0, 2) ? CV_64F : CV_32F; + double eps = type == CV_32F ? 1e-3 : 1e-10; + mtx0.convertTo(mtx, type); + // generate power from [-n, n] interval with 1/8 step - enough to check various cases. + const int max_pf = 3; + int pf = theRNG().uniform(0, max_pf*2+1); + double power = ((1 << pf) - (1 << (max_pf*2-1)))/16.; + int ipower = cvRound(power); + bool is_ipower = ipower == power; + cv::pow(mtx, power, result); + for( int j = 0; j < n; j++ ) + { + double val = type == CV_32F ? (double)mtx.at(j) : mtx.at(j); + double r = type == CV_32F ? (double)result.at(j) : result.at(j); + double r0; + if( power == 0. ) + r0 = 1; + else if( is_ipower ) + { + r0 = 1; + for( int k = 0; k < std::abs(ipower); k++ ) + r0 *= val; + if( ipower < 0 ) + r0 = 1./r0; + } + else + r0 = std::pow(val, power); + if( cvIsInf(r0) ) + { + ASSERT_TRUE(cvIsInf(r)); + } + else if( cvIsNaN(r0) ) + { + ASSERT_TRUE(cvIsNaN(r)); + } + else + { + ASSERT_LT(fabs(r - r0), eps); + } + } + } +} + /* End of file. */ diff --git a/modules/hal/include/opencv2/hal/intrin_sse.hpp b/modules/hal/include/opencv2/hal/intrin_sse.hpp index 3b77a11..69171e2 100644 --- a/modules/hal/include/opencv2/hal/intrin_sse.hpp +++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp @@ -614,6 +614,16 @@ inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) __m128i d1 = _mm_unpackhi_epi32(c0, c1); return v_int32x4(_mm_unpacklo_epi64(d0, d1)); } +inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) +{ + a = a * b; + return a; +} +inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b) +{ + a = a * b; + return a; +} inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d) -- 2.7.4
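
Note on the integer-power scheme used in this patch: iPow_i(), iPow_f() and the iPow_SIMD specializations all evaluate x^power by exponentiation by squaring, and pow() maps the special cases exercised by the new Core_Pow.special test (zero raised to a negative power, negative base raised to a non-integer power) to +inf and NaN respectively. The snippet below is a minimal standalone C++ sketch of that scheme, not OpenCV code; the helper name ipow_sketch is invented for illustration.

// Standalone illustration only -- not part of the patch; ipow_sketch is a made-up name.
#include <cmath>
#include <cstdio>
#include <cstdlib>

static double ipow_sketch(double b, int power)
{
    if (power == 0)
        return 1.;          // pow() handles a zero exponent before reaching this loop
    int p = std::abs(power);
    if (power < 0)
        b = 1./b;           // negative integer power: invert the base once up front
    double a = 1.;
    while (p > 1)
    {
        if (p & 1)
            a *= b;         // odd remaining exponent: fold the current base into the result
        b *= b;             // square the base, halve the remaining exponent
        p >>= 1;
    }
    return a*b;             // one final multiply for the leading bit of the exponent
}

int main()
{
    printf("3^10     = %g\n", ipow_sketch(3., 10));   // 59049
    printf("2^-3     = %g\n", ipow_sketch(2., -3));   // 0.125
    printf("0^-2     = %g\n", ipow_sketch(0., -2));   // inf: zero base, negative power
    printf("(-8)^0.5 = %g\n", std::pow(-8., 0.5));    // nan: negative base, non-integer power
    return 0;
}

The loop performs O(log2 |power|) multiplications instead of |power| - 1, which is why the same structure is reused in both the scalar tails and the universal-intrinsic loops of the patch.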