}
};
-#if CV_SSE2
+template <typename T>
+struct Recip_SIMD
+{
+ int operator() (const T *, T *, int, double) const
+ {
+ return 0;
+ }
+};
-#if CV_SSE4_1
+
+#if CV_SIMD128
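+// The Div_SIMD / Recip_SIMD specializations below replace the raw SSE2/SSE4.1
+// intrinsics with OpenCV universal intrinsics, so a single implementation
+// compiles to SSE2 on x86 and NEON on ARM. The shared pattern: widen the
+// inputs (v_load_expand / v_expand), convert to float32, compute
+// src1 * scale / src2, round, pack back with saturation, and use v_select to
+// force lanes with a zero divisor to 0, matching the scalar x/0 == 0 convention.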
template <>
struct Div_SIMD<uchar>
{
bool haveSIMD;
- Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+ Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
- __m128i v_src1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src1 + x)), v_zero);
- __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
- __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero);
-
- __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero);
- __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
- __m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
-
- v_src1i = _mm_unpackhi_epi16(v_src1, v_zero);
- v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
- v_src1d = _mm_cvtepi32_pd(v_src1i);
- v_src2d = _mm_cvtepi32_pd(v_src2i);
- v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
-
- __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
- _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
+ v_uint16x8 v_src1 = v_load_expand(src1 + x);
+ v_uint16x8 v_src2 = v_load_expand(src2 + x);
+
+ v_uint32x4 t0, t1, t2, t3;
+ v_expand(v_src1, t0, t1);
+ v_expand(v_src2, t2, t3);
+
+ v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
+ v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
+
+ v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
+ v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
+
+ f0 = f0 * v_scale / f2;
+ f1 = f1 * v_scale / f3;
+
+ v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
+ v_uint16x8 res = v_pack_u(i0, i1);
+
+ res = v_select(v_src2 == v_zero, v_zero, res);
+ v_pack_store(dst + x, res);
}
return x;
}
};
-#endif // CV_SSE4_1
template <>
struct Div_SIMD<schar>
{
bool haveSIMD;
- Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+ Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
- __m128i v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((const __m128i *)(src1 + x))), 8);
- __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
- __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8);
-
- __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16);
- __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
- __m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
-
- v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16);
- v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
- v_src1d = _mm_cvtepi32_pd(v_src1i);
- v_src2d = _mm_cvtepi32_pd(v_src2i);
- v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
-
- __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
- _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
+ v_int16x8 v_src1 = v_load_expand(src1 + x);
+ v_int16x8 v_src2 = v_load_expand(src2 + x);
+
+ v_int32x4 t0, t1, t2, t3;
+ v_expand(v_src1, t0, t1);
+ v_expand(v_src2, t2, t3);
+
+ v_float32x4 f0 = v_cvt_f32(t0);
+ v_float32x4 f1 = v_cvt_f32(t1);
+
+ v_float32x4 f2 = v_cvt_f32(t2);
+ v_float32x4 f3 = v_cvt_f32(t3);
+
+ f0 = f0 * v_scale / f2;
+ f1 = f1 * v_scale / f3;
+
+ v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
+ v_int16x8 res = v_pack(i0, i1);
+
+ res = v_select(v_src2 == v_zero, v_zero, res);
+ v_pack_store(dst + x, res);
}
return x;
}
};
-#if CV_SSE4_1
template <>
struct Div_SIMD<ushort>
{
bool haveSIMD;
- Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+ Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
- __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
- __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+ v_uint16x8 v_src1 = v_load(src1 + x);
+ v_uint16x8 v_src2 = v_load(src2 + x);
+
+ v_uint32x4 t0, t1, t2, t3;
+ v_expand(v_src1, t0, t1);
+ v_expand(v_src2, t2, t3);
- __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero);
- __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
- __m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
-
- v_src1i = _mm_unpackhi_epi16(v_src1, v_zero);
- v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
- v_src1d = _mm_cvtepi32_pd(v_src1i);
- v_src2d = _mm_cvtepi32_pd(v_src2i);
- v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
-
- __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1)));
+ v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
+ v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
+
+ v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
+ v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
+
+ f0 = f0 * v_scale / f2;
+ f1 = f1 * v_scale / f3;
+
+ v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
+ v_uint16x8 res = v_pack_u(i0, i1);
+
+ res = v_select(v_src2 == v_zero, v_zero, res);
+ v_store(dst + x, res);
}
return x;
}
};
-#endif // CV_SSE4_1
-
template <>
struct Div_SIMD<short>
{
bool haveSIMD;
- Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+ Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
- __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
- __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+ v_int16x8 v_src1 = v_load(src1 + x);
+ v_int16x8 v_src2 = v_load(src2 + x);
- __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16);
- __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
- __m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
-
- v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16);
- v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
- v_src1d = _mm_cvtepi32_pd(v_src1i);
- v_src2d = _mm_cvtepi32_pd(v_src2i);
- v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
- __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
-
- __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1)));
+ v_int32x4 t0, t1, t2, t3;
+ v_expand(v_src1, t0, t1);
+ v_expand(v_src2, t2, t3);
+
+ v_float32x4 f0 = v_cvt_f32(t0);
+ v_float32x4 f1 = v_cvt_f32(t1);
+
+ v_float32x4 f2 = v_cvt_f32(t2);
+ v_float32x4 f3 = v_cvt_f32(t3);
+
+ f0 = f0 * v_scale / f2;
+ f1 = f1 * v_scale / f3;
+
+ v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
+ v_int16x8 res = v_pack(i0, i1);
+
+ res = v_select(v_src2 == v_zero, v_zero, res);
+ v_store(dst + x, res);
}
return x;
}
};

template <>
struct Div_SIMD<int>
{
bool haveSIMD;
- Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+ Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_int32x4 v_zero = v_setzero_s32();
- for ( ; x <= width - 4; x += 4)
+ for ( ; x <= width - 8; x += 8)
{
- __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
- __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+ v_int32x4 t0 = v_load(src1 + x);
+ v_int32x4 t1 = v_load(src1 + x + 4);
+ v_int32x4 t2 = v_load(src2 + x);
+ v_int32x4 t3 = v_load(src2 + x + 4);
+
+ v_float32x4 f0 = v_cvt_f32(t0);
+ v_float32x4 f1 = v_cvt_f32(t1);
+ v_float32x4 f2 = v_cvt_f32(t2);
+ v_float32x4 f3 = v_cvt_f32(t3);
- __m128d v_src1d = _mm_cvtepi32_pd(v_src1);
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+ f0 = f0 * v_scale / f2;
+ f1 = f1 * v_scale / f3;
- v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1, 8));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+ v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
- __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
- __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst));
+ res0 = v_select(t2 == v_zero, v_zero, res0);
+ res1 = v_select(t3 == v_zero, v_zero, res1);
+ v_store(dst + x, res0);
+ v_store(dst + x + 4, res1);
}
return x;
}
};
-#endif
-template<typename T> static void
-div_( const T* src1, size_t step1, const T* src2, size_t step2,
- T* dst, size_t step, Size size, double scale )
+template <>
+struct Div_SIMD<float>
{
- step1 /= sizeof(src1[0]);
- step2 /= sizeof(src2[0]);
- step /= sizeof(dst[0]);
-
- Div_SIMD<T> vop;
+ bool haveSIMD;
+ Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
- for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+ int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
{
- int i = vop(src1, src2, dst, size.width, scale);
- #if CV_ENABLE_UNROLLED
- for( ; i <= size.width - 4; i += 4 )
+ int x = 0;
+
+ if (!haveSIMD)
+ return x;
+
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_float32x4 v_zero = v_setzero_f32();
+
+ for ( ; x <= width - 8; x += 8)
{
- if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
- {
- double a = (double)src2[i] * src2[i+1];
- double b = (double)src2[i+2] * src2[i+3];
- double d = scale/(a * b);
- b *= d;
- a *= d;
-
- T z0 = saturate_cast<T>(src2[i+1] * ((double)src1[i] * b));
- T z1 = saturate_cast<T>(src2[i] * ((double)src1[i+1] * b));
- T z2 = saturate_cast<T>(src2[i+3] * ((double)src1[i+2] * a));
- T z3 = saturate_cast<T>(src2[i+2] * ((double)src1[i+3] * a));
-
- dst[i] = z0; dst[i+1] = z1;
- dst[i+2] = z2; dst[i+3] = z3;
- }
- else
- {
- T z0 = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
- T z1 = src2[i+1] != 0 ? saturate_cast<T>(src1[i+1]*scale/src2[i+1]) : 0;
- T z2 = src2[i+2] != 0 ? saturate_cast<T>(src1[i+2]*scale/src2[i+2]) : 0;
- T z3 = src2[i+3] != 0 ? saturate_cast<T>(src1[i+3]*scale/src2[i+3]) : 0;
+ v_float32x4 f0 = v_load(src1 + x);
+ v_float32x4 f1 = v_load(src1 + x + 4);
+ v_float32x4 f2 = v_load(src2 + x);
+ v_float32x4 f3 = v_load(src2 + x + 4);
- dst[i] = z0; dst[i+1] = z1;
- dst[i+2] = z2; dst[i+3] = z3;
- }
+ v_float32x4 res0 = f0 * v_scale / f2;
+ v_float32x4 res1 = f1 * v_scale / f3;
+
+ res0 = v_select(f2 == v_zero, v_zero, res0);
+ res1 = v_select(f3 == v_zero, v_zero, res1);
+
+ v_store(dst + x, res0);
+ v_store(dst + x + 4, res1);
}
- #endif
- for( ; i < size.width; i++ )
- dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
- }
-}
-template <typename T>
-struct Recip_SIMD
-{
- int operator() (const T *, T *, int, double) const
- {
- return 0;
+ return x;
}
};
-#if CV_SSE2
-#if CV_SSE4_1
+///////////////////////// RECIPROCAL //////////////////////
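+// Recip_SIMD mirrors Div_SIMD but computes scale / src2 only; the
+// widen -> convert -> divide -> round -> pack -> v_select pipeline is the same.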
template <>
struct Recip_SIMD<uchar>
{
bool haveSIMD;
- Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+ Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const uchar * src2, uchar * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
- __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
- __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero);
+ v_uint16x8 v_src2 = v_load_expand(src2 + x);
- __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+ v_uint32x4 t0, t1;
+ v_expand(v_src2, t0, t1);
- v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
- v_src2d = _mm_cvtepi32_pd(v_src2i);
- v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+ v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
+ v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
- __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
- _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
+ f0 = v_scale / f0;
+ f1 = v_scale / f1;
+
+ v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
+ v_uint16x8 res = v_pack_u(i0, i1);
+
+ res = v_select(v_src2 == v_zero, v_zero, res);
+ v_pack_store(dst + x, res);
}
return x;
}
};
-#endif // CV_SSE4_1
template <>
struct Recip_SIMD<schar>
{
bool haveSIMD;
- Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+ Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const schar * src2, schar * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
- __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
- __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8);
+ v_int16x8 v_src2 = v_load_expand(src2 + x);
+
+ v_int32x4 t0, t1;
+ v_expand(v_src2, t0, t1);
- __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+ v_float32x4 f0 = v_cvt_f32(t0);
+ v_float32x4 f1 = v_cvt_f32(t1);
- v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
- v_src2d = _mm_cvtepi32_pd(v_src2i);
- v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+ f0 = v_scale / f0;
+ f1 = v_scale / f1;
- __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
- _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
+ v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
+ v_int16x8 res = v_pack(i0, i1);
+
+ res = v_select(v_src2 == v_zero, v_zero, res);
+ v_pack_store(dst + x, res);
}
return x;
}
};
-#if CV_SSE4_1
template <>
struct Recip_SIMD<ushort>
{
bool haveSIMD;
- Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+ Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const ushort * src2, ushort * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
- __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+ v_uint16x8 v_src2 = v_load(src2 + x);
- __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+ v_uint32x4 t0, t1;
+ v_expand(v_src2, t0, t1);
- v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
- v_src2d = _mm_cvtepi32_pd(v_src2i);
- v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+ v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
+ v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
- __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1)));
+ f0 = v_scale / f0;
+ f1 = v_scale / f1;
+
+ v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
+ v_uint16x8 res = v_pack_u(i0, i1);
+
+ res = v_select(v_src2 == v_zero, v_zero, res);
+ v_store(dst + x, res);
}
return x;
}
};
-#endif // CV_SSE4_1
-
template <>
struct Recip_SIMD<short>
{
bool haveSIMD;
- Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+ Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const short * src2, short * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
- __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+ v_int16x8 v_src2 = v_load(src2 + x);
+
+ v_int32x4 t0, t1;
+ v_expand(v_src2, t0, t1);
+
+ v_float32x4 f0 = v_cvt_f32(t0);
+ v_float32x4 f1 = v_cvt_f32(t1);
- __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+ f0 = v_scale / f0;
+ f1 = v_scale / f1;
- v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
- v_src2d = _mm_cvtepi32_pd(v_src2i);
- v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
- v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
- __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+ v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
+ v_int16x8 res = v_pack(i0, i1);
- __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1)));
+ res = v_select(v_src2 == v_zero, v_zero, res);
+ v_store(dst + x, res);
}
return x;
}
};

template <>
struct Recip_SIMD<int>
{
bool haveSIMD;
- Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+ Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const int * src2, int * dst, int width, double scale) const
{
int x = 0;

if (!haveSIMD)
return x;
- __m128d v_scale = _mm_set1_pd(scale);
- __m128i v_zero = _mm_setzero_si128();
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_int32x4 v_zero = v_setzero_s32();
+
+ for ( ; x <= width - 8; x += 8)
+ {
+ v_int32x4 t0 = v_load(src2 + x);
+ v_int32x4 t1 = v_load(src2 + x + 4);
+
+ v_float32x4 f0 = v_cvt_f32(t0);
+ v_float32x4 f1 = v_cvt_f32(t1);
+
+ f0 = v_scale / f0;
+ f1 = v_scale / f1;
+
+ v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
+
+ res0 = v_select(t0 == v_zero, v_zero, res0);
+ res1 = v_select(t1 == v_zero, v_zero, res1);
+ v_store(dst + x, res0);
+ v_store(dst + x + 4, res1);
+ }
+
+ return x;
+ }
+};
+
+
+template <>
+struct Recip_SIMD<float>
+{
+ bool haveSIMD;
+ Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
+
+ int operator() (const float * src2, float * dst, int width, double scale) const
+ {
+ int x = 0;
+
+ if (!haveSIMD)
+ return x;
+
+ v_float32x4 v_scale = v_setall_f32((float)scale);
+ v_float32x4 v_zero = v_setzero_f32();
+
+ for ( ; x <= width - 8; x += 8)
+ {
+ v_float32x4 f0 = v_load(src2 + x);
+ v_float32x4 f1 = v_load(src2 + x + 4);
+
+ v_float32x4 res0 = v_scale / f0;
+ v_float32x4 res1 = v_scale / f1;
+
+ res0 = v_select(f0 == v_zero, v_zero, res0);
+ res1 = v_select(f1 == v_zero, v_zero, res1);
+
+ v_store(dst + x, res0);
+ v_store(dst + x + 4, res1);
+ }
+
+ return x;
+ }
+};
+
+#if CV_SIMD128_64F
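+// v_float64x2 exists only where the backend has 64-bit float lanes
+// (e.g. SSE2, AArch64 NEON); elsewhere CV_SIMD128_64F is 0 and the double
+// variants fall back to the scalar tail loops in div_f / recip_f.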
+
+template <>
+struct Div_SIMD<double>
+{
+ bool haveSIMD;
+ Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
+
+ int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
+ {
+ int x = 0;
+
+ if (!haveSIMD)
+ return x;
+
+ v_float64x2 v_scale = v_setall_f64(scale);
+ v_float64x2 v_zero = v_setzero_f64();
for ( ; x <= width - 4; x += 4)
{
- __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+ v_float64x2 f0 = v_load(src1 + x);
+ v_float64x2 f1 = v_load(src1 + x + 2);
+ v_float64x2 f2 = v_load(src2 + x);
+ v_float64x2 f3 = v_load(src2 + x + 2);
+
+ v_float64x2 res0 = f0 * v_scale / f2;
+ v_float64x2 res1 = f1 * v_scale / f3;
+
+ res0 = v_select(f2 == v_zero, v_zero, res0);
+ res1 = v_select(f3 == v_zero, v_zero, res1);
+
+ v_store(dst + x, res0);
+ v_store(dst + x + 2, res1);
+ }
+
+ return x;
+ }
+};
+
+template <>
+struct Recip_SIMD<double>
+{
+ bool haveSIMD;
+ Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
+
+ int operator() (const double * src2, double * dst, int width, double scale) const
+ {
+ int x = 0;
- __m128d v_src2d = _mm_cvtepi32_pd(v_src2);
- __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+ if (!haveSIMD)
+ return x;
- v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8));
- __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+ v_float64x2 v_scale = v_setall_f64(scale);
+ v_float64x2 v_zero = v_setzero_f64();
- __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
- __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst));
+ for ( ; x <= width - 4; x += 4)
+ {
+ v_float64x2 f0 = v_load(src2 + x);
+ v_float64x2 f1 = v_load(src2 + x + 2);
+
+ v_float64x2 res0 = v_scale / f0;
+ v_float64x2 res1 = v_scale / f1;
+
+ res0 = v_select(f0 == v_zero, v_zero, res0);
+ res1 = v_select(f1 == v_zero, v_zero, res1);
+
+ v_store(dst + x, res0);
+ v_store(dst + x + 2, res1);
}
return x;
}
};

#endif
+#endif
+
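+// div_ is split in two: div_i for integer types (the scale factor is applied
+// in float, the quotient saturate_cast back to T) and div_f for float/double
+// (the scale stays in the destination type). The SIMD functor returns how
+// many elements it handled; the scalar loop finishes the remainder, and also
+// the whole row when SIMD is unavailable and the functor returns 0. The same
+// split is applied to recip_i / recip_f below.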
template<typename T> static void
-recip_( const T*, size_t, const T* src2, size_t step2,
- T* dst, size_t step, Size size, double scale )
+div_i( const T* src1, size_t step1, const T* src2, size_t step2,
+ T* dst, size_t step, Size size, double scale )
+{
+ step1 /= sizeof(src1[0]);
+ step2 /= sizeof(src2[0]);
+ step /= sizeof(dst[0]);
+
+ Div_SIMD<T> vop;
+ float scale_f = (float)scale;
+
+ for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+ {
+ int i = vop(src1, src2, dst, size.width, scale);
+ for( ; i < size.width; i++ )
+ {
+ T num = src1[i], denom = src2[i];
+ dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
+ }
+ }
+}
+
+template<typename T> static void
+div_f( const T* src1, size_t step1, const T* src2, size_t step2,
+ T* dst, size_t step, Size size, double scale )
+{
+ T scale_f = (T)scale;
+ step1 /= sizeof(src1[0]);
+ step2 /= sizeof(src2[0]);
+ step /= sizeof(dst[0]);
+
+ Div_SIMD<T> vop;
+
+ for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+ {
+ int i = vop(src1, src2, dst, size.width, scale);
+ for( ; i < size.width; i++ )
+ {
+ T num = src1[i], denom = src2[i];
+ dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
+ }
+ }
+}
+
+template<typename T> static void
+recip_i( const T*, size_t, const T* src2, size_t step2,
+ T* dst, size_t step, Size size, double scale )
{
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Recip_SIMD<T> vop;
+ float scale_f = (float)scale;
for( ; size.height--; src2 += step2, dst += step )
{
int i = vop(src2, dst, size.width, scale);
- #if CV_ENABLE_UNROLLED
- for( ; i <= size.width - 4; i += 4 )
+ for( ; i < size.width; i++ )
{
- if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
- {
- double a = (double)src2[i] * src2[i+1];
- double b = (double)src2[i+2] * src2[i+3];
- double d = scale/(a * b);
- b *= d;
- a *= d;
-
- T z0 = saturate_cast<T>(src2[i+1] * b);
- T z1 = saturate_cast<T>(src2[i] * b);
- T z2 = saturate_cast<T>(src2[i+3] * a);
- T z3 = saturate_cast<T>(src2[i+2] * a);
-
- dst[i] = z0; dst[i+1] = z1;
- dst[i+2] = z2; dst[i+3] = z3;
- }
- else
- {
- T z0 = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
- T z1 = src2[i+1] != 0 ? saturate_cast<T>(scale/src2[i+1]) : 0;
- T z2 = src2[i+2] != 0 ? saturate_cast<T>(scale/src2[i+2]) : 0;
- T z3 = src2[i+3] != 0 ? saturate_cast<T>(scale/src2[i+3]) : 0;
-
- dst[i] = z0; dst[i+1] = z1;
- dst[i+2] = z2; dst[i+3] = z3;
- }
+ T denom = src2[i];
+ dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
}
- #endif
+ }
+}
+
+template<typename T> static void
+recip_f( const T*, size_t, const T* src2, size_t step2,
+ T* dst, size_t step, Size size, double scale )
+{
+ T scale_f = (T)scale;
+ step2 /= sizeof(src2[0]);
+ step /= sizeof(dst[0]);
+
+ Recip_SIMD<T> vop;
+
+ for( ; size.height--; src2 += step2, dst += step )
+ {
+ int i = vop(src2, dst, size.width, scale);
for( ; i < size.width; i++ )
- dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
+ {
+ T denom = src2[i];
+ dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
+ }
}
}

static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* scale)
{
if( src1 )
- div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
else
- recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, Size sz, void* scale)
{
- div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, Size sz, void* scale)
{
- div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, Size sz, void* scale)
{
- div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, Size sz, void* scale)
{
- div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, Size sz, void* scale)
{
- div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, Size sz, void* scale)
{
- div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* scale)
{
- recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, Size sz, void* scale)
{
- recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, Size sz, void* scale)
{
- recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, Size sz, void* scale)
{
- recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, Size sz, void* scale)
{
- recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, Size sz, void* scale)
{
- recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, Size sz, void* scale)
{
- recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
+ recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

template <typename T, typename WT>
struct iPow_SIMD
{
int operator() ( const T *, T *, int, int)
{
return 0;
}
};
-#if CV_NEON
+#if CV_SIMD128
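+// iPow uses exponentiation by squaring: with the invariant result == a * b^p,
+// each iteration folds b into a when the low bit of p is set, squares b and
+// halves p. E.g. power = 13 (binary 1101) folds b^1 and b^4 inside the loop,
+// and the final a *= b after it contributes the remaining b^8.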
template <>
-struct iPow_SIMD<uchar, int>
+struct iPow_SIMD<uchar, unsigned>
{
- int operator() ( const uchar * src, uchar * dst, int len, int power)
+ int operator() ( const uchar * src, uchar * dst, int len, int power )
{
int i = 0;
- uint32x4_t v_1 = vdupq_n_u32(1u);
+ v_uint32x4 v_1 = v_setall_u32(1u);
for ( ; i <= len - 8; i += 8)
{
- uint32x4_t v_a1 = v_1, v_a2 = v_1;
- uint16x8_t v_src = vmovl_u8(vld1_u8(src + i));
- uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src));
+ v_uint32x4 v_a1 = v_1, v_a2 = v_1;
+ v_uint16x8 v = v_load_expand(src + i);
+ v_uint32x4 v_b1, v_b2;
+ v_expand(v, v_b1, v_b2);
int p = power;
while( p > 1 )
{
if (p & 1)
{
- v_a1 = vmulq_u32(v_a1, v_b1);
- v_a2 = vmulq_u32(v_a2, v_b2);
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
}
- v_b1 = vmulq_u32(v_b1, v_b1);
- v_b2 = vmulq_u32(v_b2, v_b2);
+ v_b1 *= v_b1;
+ v_b2 *= v_b2;
p >>= 1;
}
- v_a1 = vmulq_u32(v_a1, v_b1);
- v_a2 = vmulq_u32(v_a2, v_b2);
- vst1_u8(dst + i, vqmovn_u16(vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2))));
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+
+ v = v_pack(v_a1, v_a2);
+ v_pack_store(dst + i, v);
}
return i;
}
};

template <>
struct iPow_SIMD<schar, int>
{
int operator() ( const schar * src, schar * dst, int len, int power)
{
int i = 0;
- int32x4_t v_1 = vdupq_n_s32(1);
+ v_int32x4 v_1 = v_setall_s32(1);
for ( ; i <= len - 8; i += 8)
{
- int32x4_t v_a1 = v_1, v_a2 = v_1;
- int16x8_t v_src = vmovl_s8(vld1_s8(src + i));
- int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src));
+ v_int32x4 v_a1 = v_1, v_a2 = v_1;
+ v_int16x8 v = v_load_expand(src + i);
+ v_int32x4 v_b1, v_b2;
+ v_expand(v, v_b1, v_b2);
int p = power;
while( p > 1 )
{
if (p & 1)
{
- v_a1 = vmulq_s32(v_a1, v_b1);
- v_a2 = vmulq_s32(v_a2, v_b2);
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
}
- v_b1 = vmulq_s32(v_b1, v_b1);
- v_b2 = vmulq_s32(v_b2, v_b2);
+ v_b1 *= v_b1;
+ v_b2 *= v_b2;
p >>= 1;
}
- v_a1 = vmulq_s32(v_a1, v_b1);
- v_a2 = vmulq_s32(v_a2, v_b2);
- vst1_s8(dst + i, vqmovn_s16(vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2))));
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+
+ v = v_pack(v_a1, v_a2);
+ v_pack_store(dst + i, v);
}
return i;
}
};

template <>
-struct iPow_SIMD<ushort, int>
+struct iPow_SIMD<ushort, unsigned>
{
int operator() ( const ushort * src, ushort * dst, int len, int power)
{
int i = 0;
- uint32x4_t v_1 = vdupq_n_u32(1u);
+ v_uint32x4 v_1 = v_setall_u32(1u);
for ( ; i <= len - 8; i += 8)
{
- uint32x4_t v_a1 = v_1, v_a2 = v_1;
- uint16x8_t v_src = vld1q_u16(src + i);
- uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src));
+ v_uint32x4 v_a1 = v_1, v_a2 = v_1;
+ v_uint16x8 v = v_load(src + i);
+ v_uint32x4 v_b1, v_b2;
+ v_expand(v, v_b1, v_b2);
int p = power;
while( p > 1 )
{
if (p & 1)
{
- v_a1 = vmulq_u32(v_a1, v_b1);
- v_a2 = vmulq_u32(v_a2, v_b2);
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
}
- v_b1 = vmulq_u32(v_b1, v_b1);
- v_b2 = vmulq_u32(v_b2, v_b2);
+ v_b1 *= v_b1;
+ v_b2 *= v_b2;
p >>= 1;
}
- v_a1 = vmulq_u32(v_a1, v_b1);
- v_a2 = vmulq_u32(v_a2, v_b2);
- vst1q_u16(dst + i, vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2)));
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+
+ v = v_pack(v_a1, v_a2);
+ v_store(dst + i, v);
}
return i;
}
};

template <>
struct iPow_SIMD<short, int>
{
int operator() ( const short * src, short * dst, int len, int power)
{
int i = 0;
- int32x4_t v_1 = vdupq_n_s32(1);
+ v_int32x4 v_1 = v_setall_s32(1);
for ( ; i <= len - 8; i += 8)
{
- int32x4_t v_a1 = v_1, v_a2 = v_1;
- int16x8_t v_src = vld1q_s16(src + i);
- int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src));
+ v_int32x4 v_a1 = v_1, v_a2 = v_1;
+ v_int16x8 v = v_load(src + i);
+ v_int32x4 v_b1, v_b2;
+ v_expand(v, v_b1, v_b2);
int p = power;
while( p > 1 )
{
if (p & 1)
{
- v_a1 = vmulq_s32(v_a1, v_b1);
- v_a2 = vmulq_s32(v_a2, v_b2);
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
}
- v_b1 = vmulq_s32(v_b1, v_b1);
- v_b2 = vmulq_s32(v_b2, v_b2);
+ v_b1 *= v_b1;
+ v_b2 *= v_b2;
p >>= 1;
}
- v_a1 = vmulq_s32(v_a1, v_b1);
- v_a2 = vmulq_s32(v_a2, v_b2);
- vst1q_s16(dst + i, vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2)));
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+
+ v = v_pack(v_a1, v_a2);
+ v_store(dst + i, v);
}
return i;
}
};
-
template <>
struct iPow_SIMD<int, int>
{
int operator() ( const int * src, int * dst, int len, int power)
{
int i = 0;
- int32x4_t v_1 = vdupq_n_s32(1);
+ v_int32x4 v_1 = v_setall_s32(1);
- for ( ; i <= len - 4; i += 4)
+ for ( ; i <= len - 8; i += 8)
{
- int32x4_t v_b = vld1q_s32(src + i), v_a = v_1;
+ v_int32x4 v_a1 = v_1, v_a2 = v_1;
+ v_int32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
int p = power;
while( p > 1 )
{
if (p & 1)
- v_a = vmulq_s32(v_a, v_b);
- v_b = vmulq_s32(v_b, v_b);
+ {
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+ }
+ v_b1 *= v_b1;
+ v_b2 *= v_b2;
p >>= 1;
}
- v_a = vmulq_s32(v_a, v_b);
- vst1q_s32(dst + i, v_a);
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+
+ v_store(dst + i, v_a1);
+ v_store(dst + i + 4, v_a2);
}
return i;
}
};

template <>
struct iPow_SIMD<float, float>
{
int operator() ( const float * src, float * dst, int len, int power)
{
int i = 0;
- float32x4_t v_1 = vdupq_n_f32(1.0f);
+ v_float32x4 v_1 = v_setall_f32(1.f);
+
+ for ( ; i <= len - 8; i += 8)
+ {
+ v_float32x4 v_a1 = v_1, v_a2 = v_1;
+ v_float32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
+ int p = std::abs(power);
+ if( power < 0 )
+ {
+ v_b1 = v_1 / v_b1;
+ v_b2 = v_1 / v_b2;
+ }
+
+ while( p > 1 )
+ {
+ if (p & 1)
+ {
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+ }
+ v_b1 *= v_b1;
+ v_b2 *= v_b2;
+ p >>= 1;
+ }
+
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+
+ v_store(dst + i, v_a1);
+ v_store(dst + i + 4, v_a2);
+ }
+
+ return i;
+ }
+};
+
+#if CV_SIMD128_64F
+template <>
+struct iPow_SIMD<double, double>
+{
+ int operator() ( const double * src, double * dst, int len, int power)
+ {
+ int i = 0;
+ v_float64x2 v_1 = v_setall_f64(1.);
for ( ; i <= len - 4; i += 4)
{
- float32x4_t v_b = vld1q_f32(src + i), v_a = v_1;
- int p = power;
+ v_float64x2 v_a1 = v_1, v_a2 = v_1;
+ v_float64x2 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 2);
+ int p = std::abs(power);
+ if( power < 0 )
+ {
+ v_b1 = v_1 / v_b1;
+ v_b2 = v_1 / v_b2;
+ }
while( p > 1 )
{
if (p & 1)
- v_a = vmulq_f32(v_a, v_b);
- v_b = vmulq_f32(v_b, v_b);
+ {
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+ }
+ v_b1 *= v_b1;
+ v_b2 *= v_b2;
p >>= 1;
}
- v_a = vmulq_f32(v_a, v_b);
- vst1q_f32(dst + i, v_a);
+ v_a1 *= v_b1;
+ v_a2 *= v_b2;
+
+ v_store(dst + i, v_a1);
+ v_store(dst + i + 2, v_a2);
}
return i;
}
};
+#endif
#endif
template<typename T, typename WT>
static void
-iPow_( const T* src, T* dst, int len, int power )
+iPow_i( const T* src, T* dst, int len, int power )
{
- iPow_SIMD<T, WT> vop;
- int i = vop(src, dst, len, power);
+ if( power < 0 )
+ {
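+ // For a negative power, only integer inputs in [-2, 2] can produce a
+ // nonzero result, so a 5-entry table indexed by val + 2 covers them:
+ // 0^negative saturates to the type maximum, +/-1 follow the parity of
+ // the power, and +/-2 are nonzero only for power == -1.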
+ T tab[5] =
+ {
+ power == -1 ? saturate_cast<T>(-1) : 0, (power & 1) ? -1 : 1,
+ std::numeric_limits<T>::max(), 1, power == -1 ? 1 : 0
+ };
+ for( int i = 0; i < len; i++ )
+ {
+ T val = src[i];
+ dst[i] = cv_abs(val) <= 2 ? tab[val + 2] : (T)0;
+ }
+ }
+ else
+ {
+ iPow_SIMD<T, WT> vop;
+ int i = vop(src, dst, len, power);
+
+ for( ; i < len; i++ )
+ {
+ WT a = 1, b = src[i];
+ int p = power;
+ while( p > 1 )
+ {
+ if( p & 1 )
+ a *= b;
+ b *= b;
+ p >>= 1;
+ }
+
+ a *= b;
+ dst[i] = saturate_cast<T>(a);
+ }
+ }
+}
+
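+// iPow_f covers float/double: a negative power is evaluated as (1/b)^|power|
+// (here and in the SIMD specializations above), and the result is stored
+// directly, without saturate_cast.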
+template<typename T>
+static void
+iPow_f( const T* src, T* dst, int len, int power0 )
+{
+ iPow_SIMD<T, T> vop;
+ int i = vop(src, dst, len, power0);
+ int power = std::abs(power0);
for( ; i < len; i++ )
{
- WT a = 1, b = src[i];
+ T a = 1, b = src[i];
int p = power;
+ if( power0 < 0 )
+ b = 1/b;
+
while( p > 1 )
{
if( p & 1 )
a *= b;
b *= b;
p >>= 1;
}
a *= b;
- dst[i] = saturate_cast<T>(a);
+ dst[i] = a;
}
}
-
static void iPow8u(const uchar* src, uchar* dst, int len, int power)
{
- iPow_<uchar, int>(src, dst, len, power);
+ iPow_i<uchar, unsigned>(src, dst, len, power);
}
static void iPow8s(const schar* src, schar* dst, int len, int power)
{
- iPow_<schar, int>(src, dst, len, power);
+ iPow_i<schar, int>(src, dst, len, power);
}
static void iPow16u(const ushort* src, ushort* dst, int len, int power)
{
- iPow_<ushort, int>(src, dst, len, power);
+ iPow_i<ushort, unsigned>(src, dst, len, power);
}
static void iPow16s(const short* src, short* dst, int len, int power)
{
- iPow_<short, int>(src, dst, len, power);
+ iPow_i<short, int>(src, dst, len, power);
}
static void iPow32s(const int* src, int* dst, int len, int power)
{
- iPow_<int, int>(src, dst, len, power);
+ iPow_i<int, int>(src, dst, len, power);
}
static void iPow32f(const float* src, float* dst, int len, int power)
{
- iPow_<float, float>(src, dst, len, power);
+ iPow_f<float>(src, dst, len, power);
}
static void iPow64f(const double* src, double* dst, int len, int power)
{
- iPow_<double, double>(src, dst, len, power);
+ iPow_f<double>(src, dst, len, power);
}
bool doubleSupport = d.doubleFPConfig() > 0;
_dst.createSameSize(_src, type);
- if (is_ipower && (ipower == 0 || ipower == 1))
+ if (is_ipower)
{
if (ipower == 0)
+ {
_dst.setTo(Scalar::all(1));
- else if (ipower == 1)
+ return true;
+ }
+ if (ipower == 1)
+ {
_src.copyTo(_dst);
-
- return true;
+ return true;
+ }
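+ // Negative integer powers: float depths are rerouted to the generic
+ // (non-integer) kernel by clearing is_ipower; integer depths make the
+ // OpenCL path report failure so the CPU implementation takes over.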
+ if( ipower < 0 )
+ {
+ if( depth == CV_32F || depth == CV_64F )
+ is_ipower = false;
+ else
+ return false;
+ }
}
if (depth == CV_64F && !doubleSupport)
if( is_ipower && !(ocl::Device::getDefault().isIntel() && useOpenCL && depth != CV_64F))
{
- if( ipower < 0 )
- {
- divide( Scalar::all(1), _src, _dst );
- if( ipower == -1 )
- return;
- ipower = -ipower;
- same = true;
- }
-
switch( ipower )
{
case 0:
_dst.createSameSize(_src, type);
_dst.setTo(Scalar::all(1));
return;
case 1:
_src.copyTo(_dst);
return;
case 2:
-#if defined(HAVE_IPP)
- CV_IPP_CHECK()
- {
- if (depth == CV_32F && !same && ( (_src.dims() <= 2 && !ocl::useOpenCL()) ||
- (_src.dims() > 2 && _src.isContinuous() && _dst.isContinuous()) ))
- {
- Mat src = _src.getMat();
- _dst.create( src.dims, src.size, type );
- Mat dst = _dst.getMat();
-
- Size size = src.size();
- int srcstep = (int)src.step, dststep = (int)dst.step, esz = CV_ELEM_SIZE(type);
- if (src.isContinuous() && dst.isContinuous())
- {
- size.width = (int)src.total();
- size.height = 1;
- srcstep = dststep = (int)src.total() * esz;
- }
- size.width *= cn;
-
- IppStatus status = ippiSqr_32f_C1R(src.ptr<Ipp32f>(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
-
- if (status >= 0)
- {
- CV_IMPL_ADD(CV_IMPL_IPP);
- return;
- }
- setIppErrorStatus();
- }
- }
-#endif
- if (same)
- multiply(_dst, _dst, _dst);
- else
- multiply(_src, _src, _dst);
+ multiply(_src, _src, _dst);
return;
}
}
CV_OCL_RUN(useOpenCL,
ocl_pow(same ? _dst : _src, power, _dst, is_ipower, ipower))
- Mat src, dst;
- if (same)
- src = dst = _dst.getMat();
- else
- {
- src = _src.getMat();
- _dst.create( src.dims, src.size, type );
- dst = _dst.getMat();
- }
+ Mat src = _src.getMat();
+ _dst.create( src.dims, src.size, type );
+ Mat dst = _dst.getMat();
const Mat* arrays[] = {&src, &dst, 0};
uchar* ptrs[2];
}
else
{
-#if defined(HAVE_IPP)
- CV_IPP_CHECK()
- {
- if (src.isContinuous() && dst.isContinuous())
- {
- IppStatus status = depth == CV_32F ?
- ippsPowx_32f_A21(src.ptr<Ipp32f>(), (Ipp32f)power, dst.ptr<Ipp32f>(), (Ipp32s)(src.total() * cn)) :
- ippsPowx_64f_A50(src.ptr<Ipp64f>(), power, dst.ptr<Ipp64f>(), (Ipp32s)(src.total() * cn));
-
- if (status >= 0)
- {
- CV_IMPL_ADD(CV_IMPL_IPP);
- return;
- }
- setIppErrorStatus();
- }
- }
-#endif
-
int j, k, blockSize = std::min(len, ((BLOCK_SIZE + cn-1)/cn)*cn);
size_t esz1 = src.elemSize1();
+ AutoBuffer<uchar> buf;
+ Cv32suf inf32, nan32;
+ Cv64suf inf64, nan64;
+ float* fbuf = 0;
+ double* dbuf = 0;
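+ // IEEE-754 bit patterns: exponent all ones with zero mantissa is +infinity,
+ // with nonzero mantissa a quiet NaN; used to patch the x <= 0 cases below.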
+ inf32.i = 0x7f800000;
+ nan32.i = 0x7fffffff;
+ inf64.i = CV_BIG_INT(0x7FF0000000000000);
+ nan64.i = CV_BIG_INT(0x7FFFFFFFFFFFFFFF);
+
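+ // pow(x, power) is evaluated as exp(power * log(x)), which is only valid
+ // for x > 0. When src and dst share memory the inputs are overwritten
+ // before the fixup loops run, so each block is first copied into buf.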
+ if( src.ptr() == dst.ptr() )
+ {
+ buf.allocate(blockSize*esz1);
+ fbuf = (float*)(uchar*)buf;
+ dbuf = (double*)(uchar*)buf;
+ }
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
for( j = 0; j < len; j += blockSize )
{
int bsz = std::min(len - j, blockSize);
+
+ #if defined(HAVE_IPP)
+ CV_IPP_CHECK()
+ {
+ IppStatus status = depth == CV_32F ?
+ ippsPowx_32f_A21((const float*)ptrs[0], (float)power, (float*)ptrs[1], bsz) :
+ ippsPowx_64f_A50((const double*)ptrs[0], (double)power, (double*)ptrs[1], bsz);
+
+ if (status >= 0)
+ {
+ CV_IMPL_ADD(CV_IMPL_IPP);
+ ptrs[0] += bsz*esz1;
+ ptrs[1] += bsz*esz1;
+ continue;
+ }
+ setIppErrorStatus();
+ }
+ #endif
+
if( depth == CV_32F )
{
- const float* x = (const float*)ptrs[0];
+ float* x0 = (float*)ptrs[0];
+ float* x = fbuf ? fbuf : x0;
float* y = (float*)ptrs[1];
+ if( x != x0 )
+ memcpy(x, x0, bsz*esz1);
+
Log_32f(x, y, bsz);
for( k = 0; k < bsz; k++ )
y[k] = (float)(y[k]*power);
Exp_32f(y, y, bsz);
+ for( k = 0; k < bsz; k++ )
+ {
+ if( x[k] <= 0 )
+ {
+ if( x[k] == 0.f )
+ {
+ if( power < 0 )
+ y[k] = inf32.f;
+ }
+ else
+ y[k] = nan32.f;
+ }
+ }
}
else
{
- const double* x = (const double*)ptrs[0];
+ double* x0 = (double*)ptrs[0];
+ double* x = dbuf ? dbuf : x0;
double* y = (double*)ptrs[1];
+ if( x != x0 )
+ memcpy(x, x0, bsz*esz1);
+
Log_64f(x, y, bsz);
for( k = 0; k < bsz; k++ )
y[k] *= power;
Exp_64f(y, y, bsz);
+
+ for( k = 0; k < bsz; k++ )
+ {
+ if( x[k] <= 0 )
+ {
+ if( x[k] == 0. )
+ {
+ if( power < 0 )
+ y[k] = inf64.f;
+ }
+ else
+ y[k] = nan64.f;
+ }
+ }
}
ptrs[0] += bsz*esz1;
ptrs[1] += bsz*esz1;