/*********************************************************************/\r
\r
/*********************************************************************\r
+* 2013/10/19 Saar\r
+* BLASTEST : \r
+* CTEST : OK\r
+* TEST : OK\r
+*\r
* 2013/08/16 Saar\r
* Parameter:\r
* CGEMM_DEFAULT_UNROLL_N 2\r
#endif\r
\r
\r
-#if defined(BULLDOZER) || defined(PILEDRIVER)\r
+#if defined(BULLDOZER)\r
\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
\r
/* VFMADDPS_R, NN/NT/TN/TT case: expands to one vfmadd231ps (packed-single FMA). */\r
/* The '+' line passes the macro arguments as \y1,\y2,\y0 so that \y0 is the last */\r
/* operand; the '-' line passed them as \y0,\y1,\y2, leaving \y2 last. */\r
/* NOTE(review): in AT&T operand order the last operand is the accumulated */\r
/* destination, i.e. \y0 += \y1 * \y2 - confirm this file is assembled as AT&T. */\r
.macro VFMADDPS_R y0,y1,y2\r
- vfmadd231ps \y0,\y1,\y2\r
+ vfmadd231ps \y1,\y2,\y0\r
.endm\r
\r
/* VFMADDPS_I, NN/NT/TN/TT case: same expansion as VFMADDPS_R in this branch */\r
/* (vfmadd231ps with \y0 moved to the last operand by the '+' line). */\r
.macro VFMADDPS_I y0,y1,y2\r
- vfmadd231ps \y0,\y1,\y2\r
+ vfmadd231ps \y1,\y2,\y0\r
.endm\r
\r
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)\r
\r
/* VFMADDPS_R, RN/RT/CN/CT case: expands to vfnmadd231ps (negated-product FMA), */\r
/* unlike the NN/NT/TN/TT branch which uses vfmadd231ps. */\r
/* The '+' line reorders the arguments to \y1,\y2,\y0 so \y0 is the last operand. */\r
/* NOTE(review): presumably the negation implements the sign flip for a */\r
/* conjugated operand - confirm against the kernel body. */\r
.macro VFMADDPS_R y0,y1,y2\r
- vfnmadd231ps \y0,\y1,\y2\r
+ vfnmadd231ps \y1,\y2,\y0\r
.endm\r
\r
/* VFMADDPS_I, RN/RT/CN/CT case: plain vfmadd231ps (no negation), with the */\r
/* same \y1,\y2,\y0 argument order on the '+' line. */\r
.macro VFMADDPS_I y0,y1,y2\r
- vfmadd231ps \y0,\y1,\y2\r
+ vfmadd231ps \y1,\y2,\y0\r
.endm\r
\r
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
\r
/* VFMADDPS_R, NR/NC/TR/TC case: plain vfmadd231ps (no negation). */\r
/* The '+' line reorders the arguments to \y1,\y2,\y0 so \y0 is the last operand. */\r
.macro VFMADDPS_R y0,y1,y2\r
- vfmadd231ps \y0,\y1,\y2\r
+ vfmadd231ps \y1,\y2,\y0\r
.endm\r
\r
/* VFMADDPS_I, NR/NC/TR/TC case: vfnmadd231ps (negated-product FMA); this is the */\r
/* mirror image of the RN/RT/CN/CT branch, where _R is the negated one. */\r
.macro VFMADDPS_I y0,y1,y2\r
- vfnmadd231ps \y0,\y1,\y2\r
+ vfnmadd231ps \y1,\y2,\y0\r
.endm\r
\r
#else\r
\r
/* VFMADDPS_R, fallback (#else) case - none of the NN..TC groups tested above */\r
/* is defined: expands to vfnmadd231ps (negated-product FMA). */\r
/* The '+' line reorders the arguments to \y1,\y2,\y0 so \y0 is the last operand. */\r
.macro VFMADDPS_R y0,y1,y2\r
- vfnmadd231ps \y0,\y1,\y2\r
+ vfnmadd231ps \y1,\y2,\y0\r
.endm\r
\r
/* VFMADDPS_I, fallback (#else) case: also vfnmadd231ps, so both the _R and _I */\r
/* macros negate the product in this branch. */\r
.macro VFMADDPS_I y0,y1,y2\r
- vfnmadd231ps \y0,\y1,\y2\r
+ vfnmadd231ps \y1,\y2,\y0\r
.endm\r
\r
#endif\r
/* or implied, of The University of Texas at Austin. */\r
/*********************************************************************/\r
\r
+/*********************************************************************\r
+* 2013/10/19 Saar\r
+* BLASTEST : \r
+* CTEST : OK\r
+* TEST : OK\r
+\r
+*\r
+*\r
+* 2013/08/15 Saar\r
+* Parameter:\r
+* SGEMM_DEFAULT_UNROLL_N 2\r
+* SGEMM_DEFAULT_UNROLL_M 16\r
+* SGEMM_DEFAULT_P 384\r
+* SGEMM_DEFAULT_Q 168\r
+*\r
+* BLASTEST: OK\r
+*\r
+* Performance:\r
+* 1 thread: 2.31 times faster than sandybridge\r
+* 4 threads: 2.26 times faster than sandybridge\r
+*\r
+* Compile for FMA3: OK\r
+*\r
+*********************************************************************/\r
+\r
\r
#define ASSEMBLER\r
#include "common.h"\r
#else\r
\r
/* VFMADD231PD_: three-argument wrapper around vfmadd231pd (packed-double FMA). */\r
/* The '+' line reverses the argument order to \y2,\y1,\y0, making \y0 the last */\r
/* operand; the '-' line forwarded them unchanged as \y0,\y1,\y2. */\r
/* NOTE(review): with AT&T operand order the last operand is the accumulated */\r
/* destination (\y0 += \y1 * \y2) - confirm the file's syntax mode. */\r
.macro VFMADD231PD_ y0,y1,y2\r
- vfmadd231pd \y0,\y1,\y2\r
+ vfmadd231pd \y2,\y1,\y0\r
.endm\r
\r
/* VFMADD231SD_: scalar-double counterpart (vfmadd231sd) with the same */\r
/* reversed \x2,\x1,\x0 argument order on the '+' line. */\r
.macro VFMADD231SD_ x0,x1,x2\r
- vfmadd231sd \x0,\x1,\x2\r
+ vfmadd231sd \x2,\x1,\x0\r
.endm\r
\r
#endif\r
/* or implied, of The University of Texas at Austin. */\r
/*********************************************************************/\r
\r
+/*********************************************************************\r
+* 2013/10/19 Saar\r
+* BLASTEST : \r
+* CTEST : OK\r
+* TEST : OK\r
+*\r
+* 2013/08/15 Saar\r
+* Parameter:\r
+* SGEMM_DEFAULT_UNROLL_N 4\r
+* SGEMM_DEFAULT_UNROLL_M 16\r
+* SGEMM_DEFAULT_P 768\r
+* SGEMM_DEFAULT_Q 168\r
+* \r
+* BLASTEST: OK\r
+* \r
+* Performance:\r
+* 1 thread: 2.22 times faster than sandybridge\r
+* 4 threads: 2.26 times faster than sandybridge\r
+*\r
+* Compile for FMA3: OK\r
+*\r
+*********************************************************************/\r
\r
#define ASSEMBLER\r
#include "common.h"\r
#define SP %rbx\r
\r
#define BO1 %rdi\r
-#define BO2 %r15\r
#define CO2 %rdx\r
\r
#ifndef WINDOWS_ABI\r
#else\r
\r
/* VFMADD231PS_: three-argument wrapper around vfmadd231ps (packed-single FMA). */\r
/* The '+' line passes the arguments as \y1,\y2,\y0 so that \y0 is the last */\r
/* operand; the '-' line forwarded them unchanged as \y0,\y1,\y2. */\r
/* The register operands elsewhere in this patch (%xmm12, "movq OLD_C, C") use */\r
/* AT&T source-first order, in which the last operand is the destination. */\r
.macro VFMADD231PS_ y0,y1,y2\r
- vfmadd231ps \y0,\y1,\y2\r
+ vfmadd231ps \y1,\y2,\y0\r
.endm\r
\r
/* VFMADD231SS_: scalar-single counterpart (vfmadd231ss) with the same */\r
/* \x1,\x2,\x0 argument order on the '+' line. */\r
.macro VFMADD231SS_ x0,x1,x2\r
- vfmadd231ss \x0,\x1,\x2\r
+ vfmadd231ss \x1,\x2,\x0\r
.endm\r
\r
#endif\r
movq OLD_C, C\r
movq OLD_LDC, LDC\r
#ifdef TRMMKERNEL\r
- movsd OLD_OFFSET, %xmm12\r
+ vmovsd OLD_OFFSET, %xmm12\r
#endif\r
vmovaps %xmm3, %xmm0\r
\r
\r
\r
#ifdef TRMMKERNEL\r
- vmovss %xmm12, OFFSET\r
- vmovss %xmm12, KK\r
+ vmovsd %xmm12, OFFSET\r
+ vmovsd %xmm12, KK\r
#ifndef LEFT\r
negq KK\r
#endif \r
\r
.L4_60:\r
#if defined(TRMMKERNEL) && !defined(LEFT)\r
- addq $2, KK\r
+ addq $4, KK\r
#endif\r
\r
decq J // j --\r
/*********************************************************************/\r
\r
/*********************************************************************\r
+* 2013/10/19 Saar\r
+* BLASTEST : \r
+* CTEST : OK\r
+* TEST : OK\r
+*\r
* 2013/08/16 Saar\r
* Parameter:\r
* ZGEMM_DEFAULT_UNROLL_N 2\r
* ZGEMM_DEFAULT_P 112\r
* ZGEMM_DEFAULT_Q 224\r
*\r
-* BLASTEST: OK\r
*\r
* Performance:\r
* 1 thread: 1.80 times faster than sandybridge\r
#endif\r
\r
\r
-#if defined(BULLDOZER) || defined(PILEDRIVER)\r
+#if defined(BULLDOZER) \r
\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
\r
/* VFMADDPD_R, NN/NT/TN/TT case: expands to one vfmadd231pd (packed-double FMA). */\r
/* The '+' line passes the macro arguments as \y1,\y2,\y0 so that \y0 is the last */\r
/* operand; the '-' line passed them as \y0,\y1,\y2, leaving \y2 last. */\r
/* NOTE(review): in AT&T operand order the last operand is the accumulated */\r
/* destination, i.e. \y0 += \y1 * \y2 - confirm this file is assembled as AT&T. */\r
.macro VFMADDPD_R y0,y1,y2\r
- vfmadd231pd \y0,\y1,\y2\r
+ vfmadd231pd \y1,\y2,\y0\r
.endm\r
\r
/* VFMADDPD_I, NN/NT/TN/TT case: same expansion as VFMADDPD_R in this branch */\r
/* (vfmadd231pd with \y0 moved to the last operand by the '+' line). */\r
.macro VFMADDPD_I y0,y1,y2\r
- vfmadd231pd \y0,\y1,\y2\r
+ vfmadd231pd \y1,\y2,\y0\r
.endm\r
\r
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)\r
\r
/* VFMADDPD_R, RN/RT/CN/CT case: expands to vfnmadd231pd (negated-product FMA), */\r
/* unlike the NN/NT/TN/TT branch which uses vfmadd231pd. */\r
/* The '+' line reorders the arguments to \y1,\y2,\y0 so \y0 is the last operand. */\r
/* NOTE(review): presumably the negation implements the sign flip for a */\r
/* conjugated operand - confirm against the kernel body. */\r
.macro VFMADDPD_R y0,y1,y2\r
- vfnmadd231pd \y0,\y1,\y2\r
+ vfnmadd231pd \y1,\y2,\y0\r
.endm\r
\r
/* VFMADDPD_I, RN/RT/CN/CT case: plain vfmadd231pd (no negation), with the */\r
/* same \y1,\y2,\y0 argument order on the '+' line. */\r
.macro VFMADDPD_I y0,y1,y2\r
- vfmadd231pd \y0,\y1,\y2\r
+ vfmadd231pd \y1,\y2,\y0\r
.endm\r
\r
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
\r
/* VFMADDPD_R, NR/NC/TR/TC case: plain vfmadd231pd (no negation). */\r
/* The '+' line reorders the arguments to \y1,\y2,\y0 so \y0 is the last operand. */\r
.macro VFMADDPD_R y0,y1,y2\r
- vfmadd231pd \y0,\y1,\y2\r
+ vfmadd231pd \y1,\y2,\y0\r
.endm\r
\r
/* VFMADDPD_I, NR/NC/TR/TC case: vfnmadd231pd (negated-product FMA); this is the */\r
/* mirror image of the RN/RT/CN/CT branch, where _R is the negated one. */\r
.macro VFMADDPD_I y0,y1,y2\r
- vfnmadd231pd \y0,\y1,\y2\r
+ vfnmadd231pd \y1,\y2,\y0\r
.endm\r
\r
#else\r
\r
/* VFMADDPD_R, fallback (#else) case - none of the NN..TC groups tested above */\r
/* is defined: expands to vfnmadd231pd (negated-product FMA). */\r
/* The '+' line reorders the arguments to \y1,\y2,\y0 so \y0 is the last operand. */\r
.macro VFMADDPD_R y0,y1,y2\r
- vfnmadd231pd \y0,\y1,\y2\r
+ vfnmadd231pd \y1,\y2,\y0\r
.endm\r
\r
/* VFMADDPD_I, fallback (#else) case: also vfnmadd231pd, so both the _R and _I */\r
/* macros negate the product in this branch. */\r
.macro VFMADDPD_I y0,y1,y2\r
- vfnmadd231pd \y0,\y1,\y2\r
+ vfnmadd231pd \y1,\y2,\y0\r
.endm\r
\r
#endif\r