From f6b50057e259761f8b2662e64272861cabb66607 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 19 Oct 2013 10:52:20 +0200 Subject: [PATCH] corrected and tested FMA3 Code --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 23 ++++++++++++-------- kernel/x86_64/dgemm_kernel_16x2_haswell.S | 29 +++++++++++++++++++++++-- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 35 ++++++++++++++++++++++++------- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 24 ++++++++++++--------- 4 files changed, 83 insertions(+), 28 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index 0561c0f..bac7739 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -37,6 +37,11 @@ /*********************************************************************/ /********************************************************************* +* 2013/10/19 Saar +* BLASTEST : +* CTEST : OK +* TEST : OK +* * 2013/08/16 Saar * Parameter: * CGEMM_DEFAULT_UNROLL_N 2 @@ -139,7 +144,7 @@ #endif -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) @@ -188,41 +193,41 @@ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) .macro VFMADDPS_R y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm .macro VFMADDPS_I y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) .macro VFMADDPS_R y0,y1,y2 - vfnmadd231ps \y0,\y1,\y2 + vfnmadd231ps \y1,\y2,\y0 .endm .macro VFMADDPS_I y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) .macro VFMADDPS_R y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm .macro VFMADDPS_I y0,y1,y2 - vfnmadd231ps \y0,\y1,\y2 + vfnmadd231ps \y1,\y2,\y0 .endm #else .macro VFMADDPS_R y0,y1,y2 - vfnmadd231ps \y0,\y1,\y2 + vfnmadd231ps \y1,\y2,\y0 .endm .macro 
VFMADDPS_I y0,y1,y2 - vfnmadd231ps \y0,\y1,\y2 + vfnmadd231ps \y1,\y2,\y0 .endm #endif diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S index 67a7ed3..e015bbd 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_haswell.S +++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S @@ -36,6 +36,31 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +/********************************************************************* +* 2013/10/19 Saar +* BLASTEST : +* CTEST : OK +* TEST : OK + +* +* +* 2013/08/15 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 384 +* DGEMM_DEFAULT_Q 168 +* +* BLASTEST: OK +* +* Performance: +* 1 thread: 2.31 times faster than sandybridge +* 4 threads: 2.26 times faster than sandybridge +* +* Compile for FMA3: OK +* +*********************************************************************/ + #define ASSEMBLER #include "common.h" @@ -130,11 +155,11 @@ #else .macro VFMADD231PD_ y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd \y2,\y1,\y0 .endm .macro VFMADD231SD_ x0,x1,x2 - vfmadd231sd \x0,\x1,\x2 + vfmadd231sd \x2,\x1,\x0 .endm #endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index 9220961..c648927 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -36,6 +36,28 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ +/********************************************************************* +* 2013/10/19 Saar +* BLASTEST : +* CTEST : OK +* TEST : OK +* +* 2013/08/15 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 168 +* +* BLASTEST: OK +* +* Performance: +* 1 thread: 2.22 times faster than sandybridge +* 4 threads: 2.26 times faster than sandybridge +* +* Compile for FMA3: OK +* +*********************************************************************/ #define ASSEMBLER #include "common.h" @@ -60,7 +82,6 @@ #define SP %rbx #define BO1 %rdi -#define BO2 %r15 #define CO2 %rdx #ifndef WINDOWS_ABI @@ -131,11 +152,11 @@ #else .macro VFMADD231PS_ y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm .macro VFMADD231SS_ x0,x1,x2 - vfmadd231ss \x0,\x1,\x2 + vfmadd231ss \x1,\x2,\x0 .endm #endif @@ -791,7 +812,7 @@ movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 + vmovsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 @@ -836,8 +857,8 @@ #ifdef TRMMKERNEL - vmovss %xmm12, OFFSET - vmovss %xmm12, KK + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK #ifndef LEFT negq KK #endif @@ -1629,7 +1650,7 @@ .L4_60: #if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK + addq $4, KK #endif decq J // j -- diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index d189b51..f4b8142 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -37,6 +37,11 @@ /*********************************************************************/ /********************************************************************* +* 2013/10/19 Saar +* BLASTEST : +* CTEST : OK +* TEST : OK +* * 2013/08/16 Saar * Parameter: * ZGEMM_DEFAULT_UNROLL_N 2 @@ -44,7 +49,6 @@ * ZGEMM_DEFAULT_P 112 * ZGEMM_DEFAULT_Q 224 * -* BLASTEST: OK * * Performance: * 1 thread: 1.80 times faster 
than sandybridge @@ -138,7 +142,7 @@ #endif -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) @@ -187,41 +191,41 @@ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) .macro VFMADDPD_R y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd \y1,\y2,\y0 .endm .macro VFMADDPD_I y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd \y1,\y2,\y0 .endm #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) .macro VFMADDPD_R y0,y1,y2 - vfnmadd231pd \y0,\y1,\y2 + vfnmadd231pd \y1,\y2,\y0 .endm .macro VFMADDPD_I y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd \y1,\y2,\y0 .endm #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) .macro VFMADDPD_R y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd \y1,\y2,\y0 .endm .macro VFMADDPD_I y0,y1,y2 - vfnmadd231pd \y0,\y1,\y2 + vfnmadd231pd \y1,\y2,\y0 .endm #else .macro VFMADDPD_R y0,y1,y2 - vfnmadd231pd \y0,\y1,\y2 + vfnmadd231pd \y1,\y2,\y0 .endm .macro VFMADDPD_I y0,y1,y2 - vfnmadd231pd \y0,\y1,\y2 + vfnmadd231pd \y1,\y2,\y0 .endm #endif -- 2.7.4