#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/openvx/ovx_defs.hpp"
CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() );
-#if CV_SSE2
- bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-#endif
-
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
const float* h1 = it.planes[0].ptr<float>();
}
else if( method == CV_COMP_CORREL )
{
- #if CV_SSE2
- if (haveSIMD)
+#if CV_SIMD_64F
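+ // Accumulate sum(a), sum(b), sum(a*a), sum(a*b) and sum(b*b) in f64 vector lanes;
+ // the scalar correlation is computed from these partial sums after the loop.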
+ v_float64 v_s1 = vx_setzero_f64();
+ v_float64 v_s2 = vx_setzero_f64();
+ v_float64 v_s11 = vx_setzero_f64();
+ v_float64 v_s12 = vx_setzero_f64();
+ v_float64 v_s22 = vx_setzero_f64();
+ for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
{
- __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1;
- __m128d v_s11 = v_s1, v_s22 = v_s1, v_s12 = v_s1;
-
- for ( ; j <= len - 4; j += 4)
- {
- __m128 v_a = _mm_loadu_ps(h1 + j);
- __m128 v_b = _mm_loadu_ps(h2 + j);
-
- // 0-1
- __m128d v_ad = _mm_cvtps_pd(v_a);
- __m128d v_bd = _mm_cvtps_pd(v_b);
- v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
- v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
- v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
- v_s1 = _mm_add_pd(v_s1, v_ad);
- v_s2 = _mm_add_pd(v_s2, v_bd);
-
- // 2-3
- v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
- v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
- v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
- v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
- v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
- v_s1 = _mm_add_pd(v_s1, v_ad);
- v_s2 = _mm_add_pd(v_s2, v_bd);
- }
-
- double CV_DECL_ALIGNED(16) ar[10];
- _mm_store_pd(ar, v_s12);
- _mm_store_pd(ar + 2, v_s11);
- _mm_store_pd(ar + 4, v_s22);
- _mm_store_pd(ar + 6, v_s1);
- _mm_store_pd(ar + 8, v_s2);
-
- s12 += ar[0] + ar[1];
- s11 += ar[2] + ar[3];
- s22 += ar[4] + ar[5];
- s1 += ar[6] + ar[7];
- s2 += ar[8] + ar[9];
+ v_float32 v_a = vx_load(h1 + j);
+ v_float32 v_b = vx_load(h2 + j);
+
+ // low half: widen f32 lanes to f64
+ v_float64 v_ad = v_cvt_f64(v_a);
+ v_float64 v_bd = v_cvt_f64(v_b);
+ v_s12 = v_muladd(v_ad, v_bd, v_s12);
+ v_s11 = v_muladd(v_ad, v_ad, v_s11);
+ v_s22 = v_muladd(v_bd, v_bd, v_s22);
+ v_s1 += v_ad;
+ v_s2 += v_bd;
+
+ // high half: widen the remaining f32 lanes to f64
+ v_ad = v_cvt_f64_high(v_a);
+ v_bd = v_cvt_f64_high(v_b);
+ v_s12 = v_muladd(v_ad, v_bd, v_s12);
+ v_s11 = v_muladd(v_ad, v_ad, v_s11);
+ v_s22 = v_muladd(v_bd, v_bd, v_s22);
+ v_s1 += v_ad;
+ v_s2 += v_bd;
}
- #endif
+ s12 += v_reduce_sum(v_s12);
+ s11 += v_reduce_sum(v_s11);
+ s22 += v_reduce_sum(v_s22);
+ s1 += v_reduce_sum(v_s1);
+ s2 += v_reduce_sum(v_s2);
+#elif CV_SIMD && 0 // Vectorization of CV_COMP_CORREL is disabled when f64 is unsupported: f32 accumulation is not precise enough
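+ // f32 variant of the accumulation above (same structure, no widening to f64).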
+ v_float32 v_s1 = vx_setzero_f32();
+ v_float32 v_s2 = vx_setzero_f32();
+ v_float32 v_s11 = vx_setzero_f32();
+ v_float32 v_s12 = vx_setzero_f32();
+ v_float32 v_s22 = vx_setzero_f32();
+ for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+ {
+ v_float32 v_a = vx_load(h1 + j);
+ v_float32 v_b = vx_load(h2 + j);
+
+ v_s12 = v_muladd(v_a, v_b, v_s12);
+ v_s11 = v_muladd(v_a, v_a, v_s11);
+ v_s22 = v_muladd(v_b, v_b, v_s22);
+ v_s1 += v_a;
+ v_s2 += v_b;
+ }
+ s12 += v_reduce_sum(v_s12);
+ s11 += v_reduce_sum(v_s11);
+ s22 += v_reduce_sum(v_s22);
+ s1 += v_reduce_sum(v_s1);
+ s2 += v_reduce_sum(v_s2);
+#endif
for( ; j < len; j++ )
{
double a = h1[j];
}
else if( method == CV_COMP_INTERSECT )
{
- #if CV_NEON
- float32x4_t v_result = vdupq_n_f32(0.0f);
- for( ; j <= len - 4; j += 4 )
- v_result = vaddq_f32(v_result, vminq_f32(vld1q_f32(h1 + j), vld1q_f32(h2 + j)));
- float CV_DECL_ALIGNED(16) ar[4];
- vst1q_f32(ar, v_result);
- result += ar[0] + ar[1] + ar[2] + ar[3];
- #elif CV_SSE2
- if (haveSIMD)
+#if CV_SIMD_64F
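+ // Accumulate sum(min(a, b)): the vector of f32 minima is widened to f64
+ // (low and high halves) before being added to the accumulator.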
+ v_float64 v_result = vx_setzero_f64();
+ for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
{
- __m128d v_result = _mm_setzero_pd();
- for ( ; j <= len - 4; j += 4)
- {
- __m128 v_src = _mm_min_ps(_mm_loadu_ps(h1 + j),
- _mm_loadu_ps(h2 + j));
- v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
- v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
- v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
- }
-
- double CV_DECL_ALIGNED(16) ar[2];
- _mm_store_pd(ar, v_result);
- result += ar[0] + ar[1];
+ v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
+ v_result += v_cvt_f64(v_src) + v_cvt_f64_high(v_src);
+ }
+ result += v_reduce_sum(v_result);
+#elif CV_SIMD
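+ // f32 fallback when f64 vectors are unavailable; unlike the CORREL and
+ // BHATTACHARYYA paths, this one stays enabled.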
+ v_float32 v_result = vx_setzero_f32();
+ for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+ {
+ v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
+ v_result += v_src;
}
- #endif
+ result += v_reduce_sum(v_result);
+#endif
for( ; j < len; j++ )
result += std::min(h1[j], h2[j]);
}
else if( method == CV_COMP_BHATTACHARYYA )
{
- #if CV_SSE2
- if (haveSIMD)
+#if CV_SIMD_64F
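+ // Accumulate sum(a), sum(b) and sum(sqrt(a*b)) in f64 vector lanes;
+ // the Bhattacharyya distance is derived from these sums after the loop.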
+ v_float64 v_s1 = vx_setzero_f64();
+ v_float64 v_s2 = vx_setzero_f64();
+ v_float64 v_result = vx_setzero_f64();
+ for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
{
- __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1, v_result = v_s1;
- for ( ; j <= len - 4; j += 4)
- {
- __m128 v_a = _mm_loadu_ps(h1 + j);
- __m128 v_b = _mm_loadu_ps(h2 + j);
-
- __m128d v_ad = _mm_cvtps_pd(v_a);
- __m128d v_bd = _mm_cvtps_pd(v_b);
- v_s1 = _mm_add_pd(v_s1, v_ad);
- v_s2 = _mm_add_pd(v_s2, v_bd);
- v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
-
- v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
- v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
- v_s1 = _mm_add_pd(v_s1, v_ad);
- v_s2 = _mm_add_pd(v_s2, v_bd);
- v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
- }
-
- double CV_DECL_ALIGNED(16) ar[6];
- _mm_store_pd(ar, v_s1);
- _mm_store_pd(ar + 2, v_s2);
- _mm_store_pd(ar + 4, v_result);
- s1 += ar[0] + ar[1];
- s2 += ar[2] + ar[3];
- result += ar[4] + ar[5];
+ v_float32 v_a = vx_load(h1 + j);
+ v_float32 v_b = vx_load(h2 + j);
+
+ v_float64 v_ad = v_cvt_f64(v_a);
+ v_float64 v_bd = v_cvt_f64(v_b);
+ v_s1 += v_ad;
+ v_s2 += v_bd;
+ v_result += v_sqrt(v_ad * v_bd);
+
+ v_ad = v_cvt_f64_high(v_a);
+ v_bd = v_cvt_f64_high(v_b);
+ v_s1 += v_ad;
+ v_s2 += v_bd;
+ v_result += v_sqrt(v_ad * v_bd);
}
- #endif
+ s1 += v_reduce_sum(v_s1);
+ s2 += v_reduce_sum(v_s2);
+ result += v_reduce_sum(v_result);
+#elif CV_SIMD && 0 // Vectorization of CV_COMP_BHATTACHARYYA is disabled when f64 is unsupported: f32 accumulation is not precise enough
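+ // f32 variant of the accumulation above, compiled out by the "&& 0".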
+ v_float32 v_s1 = vx_setzero_f32();
+ v_float32 v_s2 = vx_setzero_f32();
+ v_float32 v_result = vx_setzero_f32();
+ for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+ {
+ v_float32 v_a = vx_load(h1 + j);
+ v_float32 v_b = vx_load(h2 + j);
+ v_s1 += v_a;
+ v_s2 += v_b;
+ v_result += v_sqrt(v_a * v_b);
+ }
+ s1 += v_reduce_sum(v_s1);
+ s2 += v_reduce_sum(v_s2);
+ result += v_reduce_sum(v_result);
+#endif
for( ; j < len; j++ )
{
double a = h1[j];