From 33fb253a66275abaa5060ef318c9a5cc87c5fd6e Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy"
Date: Tue, 20 Aug 2019 11:26:38 -0500
Subject: [PATCH] core: vectorize dotProd_32s

Use two FMA accumulator chains (four int32 elements per iteration) to
sum on SIMD128 FP64 targets. On x86 this showed about a 1.4x
improvement.

For PPC, do a full multiply (32x32->64b), convert the products to
double, then accumulate. This may be slightly less precise for some
inputs, but it is about 1.5x faster than the FMA variant above, which
is itself roughly 1.5x faster than the scalar code, for an overall
speedup of about 2.5x.
---
 .../core/include/opencv2/core/hal/intrin_vsx.hpp |  9 +++++++
 modules/core/src/matmul.simd.hpp                 | 29 ++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index a4d2c29..85fe0d0 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -1051,6 +1051,15 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 { return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
 
+// The altivec intrinsic is missing for this ISA 2.06 instruction
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+    vec_double2 out;
+
+    __asm__ ("xvcvsxddp %x0,%x1" : "=wa"(out) : "wa"(a.val));
+    return v_float64x2(out);
+}
+
 ////////////// Lookup table access ////////////////////
 
 inline v_int8x16 v_lut(const schar* tab, const int* idx)
diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp
index bb6b6c5..fcc88a1 100644
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -2493,7 +2493,36 @@ double dotProd_16s(const short* src1, const short* src2, int len)
 
 double dotProd_32s(const int* src1, const int* src2, int len)
 {
+#if CV_SIMD128_64F
+    double r = 0.0;
+    int i = 0;
+    int lenAligned = len & -v_int32x4::nlanes;
+    v_float64x2 a(0.0, 0.0);
+    v_float64x2 b(0.0, 0.0);
+
+    for( i = 0; i < lenAligned; i += v_int32x4::nlanes )
+    {
+        v_int32x4 s1 = v_load(src1);
+        v_int32x4 s2 = v_load(src2);
+
+#if CV_VSX
+        // Do 32x32->64 multiplies, convert/round to double, then accumulate.
+        // Potentially less precise than FMA, but 1.5x faster than the FMA path below.
+        a += v_cvt_f64(v_int64(vec_mule(s1.val, s2.val)));
+        b += v_cvt_f64(v_int64(vec_mulo(s1.val, s2.val)));
+#else
+        a = v_fma(v_cvt_f64(s1), v_cvt_f64(s2), a);
+        b = v_fma(v_cvt_f64_high(s1), v_cvt_f64_high(s2), b);
+#endif
+        src1 += v_int32x4::nlanes;
+        src2 += v_int32x4::nlanes;
+    }
+    a += b;
+    r = v_reduce_sum(a);
+    return r + dotProd_(src1, src2, len - i);
+#else
     return dotProd_(src1, src2, len);
+#endif
 }
 
 double dotProd_32f(const float* src1, const float* src2, int len)
-- 
2.7.4
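
Note (not part of the patch): the standalone scalar sketch below models the two
accumulation strategies, to illustrate why the VSX path "may be slightly less
precise". It is a simplification with made-up names (dot_fma_model,
dot_vsx_model), two scalar chains instead of two 2-lane vector chains, and it
assumes len is a multiple of two.

#include <cmath>
#include <cstdint>

// Model of the generic path: each int32 converts exactly to double, and fma
// rounds only once, when the product is folded into the accumulator.
static double dot_fma_model(const int32_t* s1, const int32_t* s2, int len)
{
    double a = 0.0, b = 0.0;  // two accumulator chains, as in the patch
    for (int i = 0; i < len; i += 2)
    {
        a = std::fma((double)s1[i],     (double)s2[i],     a);
        b = std::fma((double)s1[i + 1], (double)s2[i + 1], b);
    }
    return a + b;
}

// Model of the VSX path: form the exact 64-bit product (vec_mule/vec_mulo),
// round it to double (xvcvsxddp), then add, so there is one extra rounding
// per product compared to the fma model.
static double dot_vsx_model(const int32_t* s1, const int32_t* s2, int len)
{
    double a = 0.0, b = 0.0;
    for (int i = 0; i < len; i += 2)
    {
        a += (double)((int64_t)s1[i]     * s2[i]);      // even lanes
        b += (double)((int64_t)s1[i + 1] * s2[i + 1]);  // odd lanes
    }
    return a + b;
}

The extra rounding matters because a 32x32-bit product can need up to 62
significant bits, more than double's 53-bit mantissa, so the int64-to-double
conversion may already lose low-order bits before the accumulation happens.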