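Improve daxpy performance: add a v_muladd_f32 (multiply-add, a*b + c) SIMD intrinsic and use it in the x86_64 daxpy kernel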

author Qiyu8 <fangchunlin@huawei.com>

Tue, 22 Sep 2020 08:52:15 +0000 (16:52 +0800)

committer Qiyu8 <fangchunlin@huawei.com>

Tue, 22 Sep 2020 08:52:15 +0000 (16:52 +0800)
author Qiyu8 <fangchunlin@huawei.com>
Tue, 22 Sep 2020 08:52:15 +0000 (16:52 +0800)
committer Qiyu8 <fangchunlin@huawei.com>
Tue, 22 Sep 2020 08:52:15 +0000 (16:52 +0800)
diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h

index ef599f0..5997bb6 100644 (file)
--- a/kernel/simd/intrin.h
+++ b/kernel/simd/intrin.h
@@ -1,6 +1,26 @@
  #ifndef _INTRIN_H_
  #define _INTRIN_H_
  
+#if defined(_MSC_VER)
+#define BLAS_INLINE __inline
+#elif defined(__GNUC__)
+#if defined(__STRICT_ANSI__)
+#define BLAS_INLINE __inline__
+#else
+#define BLAS_INLINE inline
+#endif
+#else
+#define BLAS_INLINE
+#endif
+
+#ifdef _MSC_VER
+#define BLAS_FINLINE static __forceinline
+#elif defined(__GNUC__)
+#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
+#else
+#define BLAS_FINLINE static
+#endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h

index 7262544..f6257ae 100644 (file)
--- a/kernel/simd/intrin_avx.h
+++ b/kernel/simd/intrin_avx.h
@@ -10,6 +10,16 @@ arithmetic
  */
  #define v_add_f32 _mm256_add_ps
  #define v_mul_f32 _mm256_mul_ps
+
+#ifdef HAVE_FMA3
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm256_fmadd_ps
+#else
+    // multiply and add, a*b + c
+    BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_add_f32(v_mul_f32(a, b), c); }
+#endif // !HAVE_FMA3
+
  /*
  memory
  */
diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h

index 775fe7a..cb116a9 100644 (file)
--- a/kernel/simd/intrin_avx512.h
+++ b/kernel/simd/intrin_avx512.h
@@ -10,10 +10,12 @@ arithmetic
  */
  #define v_add_f32 _mm512_add_ps
  #define v_mul_f32 _mm512_mul_ps
+// multiply and add, a*b + c
+#define v_muladd_f32 _mm512_fmadd_ps
  /*
  memory
  */
  // unaligned load
  #define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
-#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR))
+#define v_storeu_f32 _mm512_storeu_ps
  #define v_setall_f32(VAL) _mm512_set1_ps(VAL)
diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h

index 0cc159a..2601120 100644 (file)
--- a/kernel/simd/intrin_sse.h
+++ b/kernel/simd/intrin_sse.h
@@ -10,6 +10,17 @@ arithmetic
  */
  #define v_add_f32 _mm_add_ps
  #define v_mul_f32 _mm_mul_ps
+#ifdef HAVE_FMA3
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm_fmadd_ps
+#elif defined(HAVE_FMA4)
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm_macc_ps
+#else
+    // multiply and add, a*b + c
+    BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_add_f32(v_mul_f32(a, b), c); }
+#endif // HAVE_FMA3
  /*
  memory
  */
diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c

index 9836fac..b62e3dc 100644 (file)
--- a/kernel/x86_64/daxpy.c
+++ b/kernel/x86_64/daxpy.c
@@ -48,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  #ifndef HAVE_KERNEL_8
  #include"../simd/intrin.h"
  
-void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
  {
         BLASLONG register i = 0;
         FLOAT a = *alpha;
@@ -57,7 +57,7 @@ void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
         __alpha =  v_setall_f32(*alpha);
         const int vstep = v_nlanes_f32;
         for (; i < n; i += vstep) {
-               tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i )));
+               tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i));
                 v_storeu_f32(y + i, tmp);
         }
  #else
author	Qiyu8 <fangchunlin@huawei.com>
	Tue, 22 Sep 2020 08:52:15 +0000 (16:52 +0800)
committer	Qiyu8 <fangchunlin@huawei.com>
	Tue, 22 Sep 2020 08:52:15 +0000 (16:52 +0800)
kernel/simd/intrin.h		patch | blob | history
kernel/simd/intrin_avx.h		patch | blob | history
kernel/simd/intrin_avx512.h		patch | blob | history
kernel/simd/intrin_sse.h		patch | blob | history
kernel/x86_64/daxpy.c		patch | blob | history