fast_math: implement optimized PPC routines

author Paul E. Murphy <pmur@users.noreply.github.com>

Mon, 22 Jul 2019 19:23:56 +0000 (14:23 -0500)

committer Paul E. Murphy <pmur@users.noreply.github.com>

Wed, 7 Aug 2019 20:01:18 +0000 (15:01 -0500)
author Paul E. Murphy <pmur@users.noreply.github.com>
Mon, 22 Jul 2019 19:23:56 +0000 (14:23 -0500)
committer Paul E. Murphy <pmur@users.noreply.github.com>
Wed, 7 Aug 2019 20:01:18 +0000 (15:01 -0500)
diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp

index 6eb6f1f..b1e8c42 100644 (file)
--- a/modules/core/include/opencv2/core/fast_math.hpp
+++ b/modules/core/include/opencv2/core/fast_math.hpp
@@ -74,7 +74,15 @@
  #  include "tegra_round.hpp"
  #endif
  
-#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__)
+#if defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__)
+#  include <altivec.h>
+#endif
+
+#if ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
+        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
+    #define CV_INLINE_ROUND_DBL(value) TEGRA_ROUND_DBL(value);
+    #define CV_INLINE_ROUND_FLT(value) TEGRA_ROUND_FLT(value);
+#elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__)
      // 1. general scheme
      #define ARM_ROUND(_value, _asm_string) \
          int res; \
@@ -84,12 +92,32 @@
          return res
      // 2. version for double
      #ifdef __clang__
-        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
+        #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
      #else
-        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
+        #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
      #endif
      // 3. version for float
-    #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
+    #define CV_INLINE_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
+#elif defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__)
+    // P8 and newer machines can convert fp32/64 to int quickly.
+    #define CV_INLINE_ROUND_DBL(value) \
+        int out; \
+        double temp; \
+        __asm__( "fctiw %[temp],%[in]\n\tmffprwz %[out],%[temp]\n\t" : [out] "=r" (out), [temp] "=d" (temp) : [in] "d" ((double)(value)) : ); \
+        return out;
+
+    // FP32 reuses the FP64 routine above: the argument is cast to double first
+    #define CV_INLINE_ROUND_FLT(value) CV_INLINE_ROUND_DBL(value)
+
+    #ifdef _ARCH_PWR9
+        #define CV_INLINE_ISINF_DBL(value) return scalar_test_data_class(value, 0x30);
+        #define CV_INLINE_ISNAN_DBL(value) return scalar_test_data_class(value, 0x40);
+        #define CV_INLINE_ISINF_FLT(value) CV_INLINE_ISINF_DBL(value)
+        #define CV_INLINE_ISNAN_FLT(value) CV_INLINE_ISNAN_DBL(value)
+    #endif
+#elif defined CV_ICC || defined __GNUC__
+    #define CV_INLINE_ROUND_DBL(value) return (int)(lrint(value));
+    #define CV_INLINE_ROUND_FLT(value) return (int)(lrintf(value));
  #endif
  
  #if defined __PPC64__ && !defined OPENCV_USE_FASTMATH_GCC_BUILTINS
@@ -105,6 +133,16 @@
      #define _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS
  #endif
  
+/* Allow overrides for some functions which may benefit from tuning. Note that
+   no isinf-based override is provided, because isinf's return value is signed. */
+#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_DBL
+    #define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value);
+#endif
+
+#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_FLT
+    #define CV_INLINE_ISNAN_FLT(value) return __builtin_isnanf(value);
+#endif
+
  /** @brief Rounds floating-point number to the nearest integer
  
   @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
@@ -125,15 +163,8 @@ cvRound( double value )
          fistp t;
      }
      return t;
-#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
-        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    TEGRA_ROUND_DBL(value);
-#elif defined CV_ICC || defined __GNUC__
-# if defined ARM_ROUND_DBL
-    ARM_ROUND_DBL(value);
-# else
-    return (int)lrint(value);
-# endif
+#elif defined CV_INLINE_ROUND_DBL
+    CV_INLINE_ROUND_DBL(value);
  #else
      /* it's ok if round does not comply with IEEE754 standard;
         the tests should allow +/-1 difference when the tested functions use round */
@@ -184,10 +215,14 @@ CV_INLINE int cvCeil( double value )
   otherwise. */
  CV_INLINE int cvIsNaN( double value )
  {
+#if defined CV_INLINE_ISNAN_DBL
+    CV_INLINE_ISNAN_DBL(value);
+#else
      Cv64suf ieee754;
      ieee754.f = value;
      return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
             ((unsigned)ieee754.u != 0) > 0x7ff00000;
+#endif
  }
  
  /** @brief Determines if the argument is Infinity.
@@ -198,10 +233,14 @@ CV_INLINE int cvIsNaN( double value )
   and 0 otherwise. */
  CV_INLINE int cvIsInf( double value )
  {
+#if defined CV_INLINE_ISINF_DBL
+    CV_INLINE_ISINF_DBL(value);
+#else
      Cv64suf ieee754;
      ieee754.f = value;
      return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
              (unsigned)ieee754.u == 0;
+#endif
  }
  
  #ifdef __cplusplus
@@ -221,15 +260,8 @@ CV_INLINE int cvRound(float value)
          fistp t;
      }
      return t;
-#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
-        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    TEGRA_ROUND_FLT(value);
-#elif defined CV_ICC || defined __GNUC__
-# if defined ARM_ROUND_FLT
-    ARM_ROUND_FLT(value);
-# else
-    return (int)lrintf(value);
-# endif
+#elif defined CV_INLINE_ROUND_FLT
+    CV_INLINE_ROUND_FLT(value);
  #else
      /* it's ok if round does not comply with IEEE754 standard;
       the tests should allow +/-1 difference when the tested functions use round */
@@ -280,17 +312,25 @@ CV_INLINE int cvCeil( int value )
  /** @overload */
  CV_INLINE int cvIsNaN( float value )
  {
+#if defined CV_INLINE_ISNAN_FLT
+    CV_INLINE_ISNAN_FLT(value);
+#else
      Cv32suf ieee754;
      ieee754.f = value;
      return (ieee754.u & 0x7fffffff) > 0x7f800000;
+#endif
  }
  
  /** @overload */
  CV_INLINE int cvIsInf( float value )
  {
+#if defined CV_INLINE_ISINF_FLT
+    CV_INLINE_ISINF_FLT(value);
+#else
      Cv32suf ieee754;
      ieee754.f = value;
      return (ieee754.u & 0x7fffffff) == 0x7f800000;
+#endif
  }
  
  #endif // __cplusplus
author	Paul E. Murphy <pmur@users.noreply.github.com>
	Mon, 22 Jul 2019 19:23:56 +0000 (14:23 -0500)
committer	Paul E. Murphy <pmur@users.noreply.github.com>
	Wed, 7 Aug 2019 20:01:18 +0000 (15:01 -0500)