/* or implied, of The University of Texas at Austin. */\r
/*********************************************************************/\r
\r
+/*********************************************************************\r
+* 2013/08/16 Saar\r
+* Parameters:
+* ZGEMM_DEFAULT_UNROLL_N 2\r
+* ZGEMM_DEFAULT_UNROLL_M 4 \r
+* ZGEMM_DEFAULT_P 112\r
+* ZGEMM_DEFAULT_Q 224\r
+*\r
+* BLASTEST: OK\r
+*\r
+* Performance:\r
+* 1 thread:  1.80 times faster than the sandybridge kernel
+* 4 threads: 1.74 times faster than the sandybridge kernel
+*\r
+* Compile for FMA3: OK\r
+*\r
+*********************************************************************/\r
+\r
\r
#define ASSEMBLER\r
#include "common.h"\r
#define STACK_TOUCH\r
#endif\r
\r
-#if defined(BULLDOZER)\r
-\r
-.macro VFMADD231PD_ y0,y1,y2\r
- vfmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
-\r
-.macro VFMADD231SD_ x0,x1,x2\r
- vfmaddsd \x0,\x1,\x2,\x0\r
-.endm\r
-\r
-#else\r
-\r
-.macro VFMADD231PD_ y0,y1,y2\r
- vfmadd231pd \y0,\y1,\y2\r
-.endm\r
-\r
-.macro VFMADD231SD_ x0,x1,x2\r
- vfmadd231sd \x0,\x1,\x2\r
-.endm\r
-\r
-#endif\r
\r
-#if defined(BULLDOZER)\r
+#if defined(BULLDOZER) || defined(PILEDRIVER)\r
\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
\r
.macro VFMADDPD_R y0,y1,y2
	vfmaddpd \y0,\y1,\y2,\y0
.endm

.macro VFMADDPD_I y0,y1,y2
	vfmaddpd \y0,\y1,\y2,\y0
.endm
\r
-#define VFMADD_R vfmaddpd\r
-#define VFMADD_I vfmaddpd\r
-\r
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)\r
\r
.macro VFMADDPD_R y0,y1,y2
	vfnmaddpd \y0,\y1,\y2,\y0
.endm

.macro VFMADDPD_I y0,y1,y2
	vfmaddpd \y0,\y1,\y2,\y0
.endm
\r
-#define VFMADD_R vfnmaddpd\r
-#define VFMADD_I vfmaddpd\r
-\r
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
\r
.macro VFMADDPD_R y0,y1,y2
	vfmaddpd \y0,\y1,\y2,\y0
.endm

.macro VFMADDPD_I y0,y1,y2
	vfnmaddpd \y0,\y1,\y2,\y0
.endm
\r
-#define VFMADD_R vfmaddpd\r
-#define VFMADD_I vfnmaddpd\r
-\r
#else\r
\r
.macro VFMADDPD_R y0,y1,y2
	vfnmaddpd \y0,\y1,\y2,\y0
.endm

.macro VFMADDPD_I y0,y1,y2
	vfnmaddpd \y0,\y1,\y2,\y0
.endm
\r
-#define VFMADD_R vfnmaddpd\r
-#define VFMADD_I vfnmaddpd\r
-\r
#endif\r
\r
#else\r

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)

.macro VFMADDPD_R y0,y1,y2
	vfmadd231pd \y1,\y2,\y0
.endm

.macro VFMADDPD_I y0,y1,y2
	vfmadd231pd \y1,\y2,\y0
.endm
\r
-#define VFMADD_R vfmadd231pd\r
-#define VFMADD_I vfmadd231pd\r
-\r
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)\r
\r
.macro VFMADDPD_R y0,y1,y2
	vfnmadd231pd \y1,\y2,\y0
.endm

.macro VFMADDPD_I y0,y1,y2
	vfmadd231pd \y1,\y2,\y0
.endm
\r
-#define VFMADD_R vfnmadd231pd\r
-#define VFMADD_I vfmadd231pd\r
-\r
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
\r
.macro VFMADDPD_R y0,y1,y2
	vfmadd231pd \y1,\y2,\y0
.endm

.macro VFMADDPD_I y0,y1,y2
	vfnmadd231pd \y1,\y2,\y0
.endm
\r
-#define VFMADD_R vfmadd231pd\r
-#define VFMADD_I vfnmadd231pd\r
-\r
#else\r
\r
.macro VFMADDPD_R y0,y1,y2
	vfnmadd231pd \y1,\y2,\y0
.endm

.macro VFMADDPD_I y0,y1,y2
	vfnmadd231pd \y1,\y2,\y0
.endm
\r
-#define VFMADD_R vfnmadd231pd\r
-#define VFMADD_I vfnmadd231pd\r
-\r
#endif\r
\r
#endif\r
.endm\r
\r
/***************************************************************************************************/\r
-#define KERNEL2x2_1(xx) \\r
- prefetcht0 A_PR1(AO,%rax,SIZE) ;\\r
- vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
- vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\\r
- vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
- VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\\r
-\r
-#define KERNEL2x2_2(xx) \\r
- vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
- vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\\r
- vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
- VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\\r
-\r
-#define KERNEL2x2_3(xx) \\r
- prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\\r
- vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
- vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\\r
- vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
- VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\\r
-\r
-#define KERNEL2x2_4(xx) \\r
- vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
- vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\\r
- vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
- VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\\r
- addq $16, BI ;\\r
- addq $16, %rax ;\\r
-\r
-\r
-#define KERNEL2x2_SUB(xx) \\r
- vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
- vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\\r
- vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
- VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\\r
- addq $4, BI ;\\r
- addq $4, %rax ;\\r
+\r
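+// one k update of the 2x2 micro-kernel: xmm0/xmm1 hold two complex elements
+// of A, xmm4-xmm7 broadcast the real and imaginary parts of two complex
+// elements of B; real products accumulate in xmm8/10/12/14, imaginary
+// products in xmm9/11/13/15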
+.macro KERNEL2x2_SUB\r
+ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0\r
+ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4\r
+ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1\r
+ VFMADDPD_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPD_R %xmm12,%xmm4,%xmm1\r
+ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5\r
+ VFMADDPD_I %xmm9,%xmm5,%xmm0\r
+ VFMADDPD_I %xmm13,%xmm5,%xmm1\r
+ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6\r
+ VFMADDPD_R %xmm10,%xmm6,%xmm0\r
+ VFMADDPD_R %xmm14,%xmm6,%xmm1\r
+ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7\r
+ VFMADDPD_I %xmm11,%xmm7,%xmm0\r
+ VFMADDPD_I %xmm15,%xmm7,%xmm1\r
+ addq $4, BI \r
+ addq $4, %rax \r
+.endm\r
+\r
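+// store a 2x2 block of C: the accumulators hold elementwise a*b_re
+// (xmm8/10/12/14) and a*b_im (xmm9/11/13/15); swapping the 64-bit halves of
+// the imaginary accumulators and combining with vaddsubpd (sign pattern
+// chosen per conjugation case) gives the complex products, which are then
+// scaled by ALPHA_R/ALPHA_I, added to C unless TRMMKERNEL, and stored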
+.macro SAVE2x2\r
+\r
+ vmovddup ALPHA_R, %xmm0\r
+ vmovddup ALPHA_I, %xmm1\r
+\r
+	// swap the high and low 64-bit halves
+ vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
+ vshufpd $0x01, %xmm11, %xmm11, %xmm11\r
+ vshufpd $0x01, %xmm13, %xmm13, %xmm13\r
+ vshufpd $0x01, %xmm15, %xmm15, %xmm15\r
+\r
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \\r
+ defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
+\r
+ vaddsubpd %xmm9, %xmm8 , %xmm8\r
+ vaddsubpd %xmm11,%xmm10, %xmm10\r
+ vaddsubpd %xmm13,%xmm12, %xmm12\r
+ vaddsubpd %xmm15,%xmm14, %xmm14\r
+\r
+ vshufpd $0x01, %xmm8 , %xmm8, %xmm9\r
+ vshufpd $0x01, %xmm10, %xmm10, %xmm11\r
+ vshufpd $0x01, %xmm12, %xmm12, %xmm13\r
+ vshufpd $0x01, %xmm14, %xmm14, %xmm15\r
+\r
+#else\r
+ vaddsubpd %xmm8, %xmm9 ,%xmm9\r
+ vaddsubpd %xmm10, %xmm11,%xmm11\r
+ vaddsubpd %xmm12, %xmm13,%xmm13\r
+ vaddsubpd %xmm14, %xmm15,%xmm15\r
+\r
+ vmovapd %xmm9, %xmm8\r
+ vmovapd %xmm11, %xmm10\r
+ vmovapd %xmm13, %xmm12\r
+ vmovapd %xmm15, %xmm14\r
+\r
+	// swap the high and low 64-bit halves
+ vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
+ vshufpd $0x01, %xmm11, %xmm11, %xmm11\r
+ vshufpd $0x01, %xmm13, %xmm13, %xmm13\r
+ vshufpd $0x01, %xmm15, %xmm15, %xmm15\r
+\r
+#endif\r
+\r
+	// multiply by ALPHA_R
+ vmulpd %xmm8 , %xmm0, %xmm8\r
+ vmulpd %xmm10, %xmm0, %xmm10\r
+ vmulpd %xmm12, %xmm0, %xmm12\r
+ vmulpd %xmm14, %xmm0, %xmm14\r
+\r
+	// multiply by ALPHA_I
+ vmulpd %xmm9 , %xmm1, %xmm9\r
+ vmulpd %xmm11, %xmm1, %xmm11\r
+ vmulpd %xmm13, %xmm1, %xmm13\r
+ vmulpd %xmm15, %xmm1, %xmm15\r
+\r
+ vaddsubpd %xmm9, %xmm8 , %xmm8\r
+ vaddsubpd %xmm11,%xmm10, %xmm10\r
+ vaddsubpd %xmm13,%xmm12, %xmm12\r
+ vaddsubpd %xmm15,%xmm14, %xmm14\r
+\r
+\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ vaddpd (CO1), %xmm8 , %xmm8\r
+ vaddpd 2 * SIZE(CO1), %xmm12, %xmm12\r
+\r
+ vaddpd (CO1, LDC), %xmm10, %xmm10\r
+ vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14\r
+\r
+#endif\r
+\r
+ vmovups %xmm8 , (CO1)\r
+ vmovups %xmm12 , 2 * SIZE(CO1)\r
+\r
+ vmovups %xmm10 , (CO1, LDC)\r
+ vmovups %xmm14 , 2 * SIZE(CO1, LDC)\r
+\r
+.endm\r
\r
/************************************************************************************************/\r
\r
/************************************************************************************************/\r
\r
-#define KERNEL1x2_1(xx) \\r
- prefetcht0 A_PR1(AO,%rax,SIZE) ;\\r
- vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
-\r
-#define KERNEL1x2_2(xx) \\r
- vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
-\r
-#define KERNEL1x2_3(xx) \\r
- vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
-\r
-#define KERNEL1x2_4(xx) \\r
- vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
- addq $16, BI ;\\r
- addq $8 , %rax ;\\r
-\r
-\r
-#define KERNEL1x2_SUB(xx) \\r
- vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\\r
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\\r
- vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\\r
- VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\\r
- addq $4, BI ;\\r
- addq $2, %rax ;\\r
+\r
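+// one k update of the 1x2 micro-kernel: one complex element of A (xmm0)
+// times two complex elements of B; real products accumulate in xmm8/xmm10,
+// imaginary products in xmm9/xmm11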
+.macro KERNEL1x2_SUB\r
+ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0\r
+ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4\r
+ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5\r
+ VFMADDPD_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPD_I %xmm9,%xmm5,%xmm0\r
+ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6\r
+ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7\r
+ VFMADDPD_R %xmm10,%xmm6,%xmm0\r
+ VFMADDPD_I %xmm11,%xmm7,%xmm0\r
+ addq $4, BI \r
+ addq $2, %rax \r
+.endm\r
+\r
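+// store a 1x2 block of C (one element per column); same swap/vaddsubpd and
+// alpha-scaling scheme as SAVE2x2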
+.macro SAVE1x2\r
+\r
+ vmovddup ALPHA_R, %xmm0\r
+ vmovddup ALPHA_I, %xmm1\r
+\r
+	// swap the high and low 64-bit halves
+ vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
+ vshufpd $0x01, %xmm11, %xmm11, %xmm11\r
+\r
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \\r
+ defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
+\r
+ vaddsubpd %xmm9, %xmm8 , %xmm8\r
+ vaddsubpd %xmm11,%xmm10, %xmm10\r
+\r
+ vshufpd $0x01, %xmm8 , %xmm8, %xmm9\r
+ vshufpd $0x01, %xmm10, %xmm10, %xmm11\r
+\r
+#else\r
+ vaddsubpd %xmm8, %xmm9, %xmm9\r
+ vaddsubpd %xmm10,%xmm11, %xmm11\r
+\r
+ vmovapd %xmm9, %xmm8\r
+ vmovapd %xmm11, %xmm10\r
+\r
+	// swap the high and low 64-bit halves
+ vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
+ vshufpd $0x01, %xmm11, %xmm11, %xmm11\r
+\r
+#endif\r
+\r
+	// multiply by ALPHA_R
+ vmulpd %xmm8 , %xmm0, %xmm8\r
+ vmulpd %xmm10, %xmm0, %xmm10\r
+\r
+	// multiply by ALPHA_I
+ vmulpd %xmm9 , %xmm1, %xmm9\r
+ vmulpd %xmm11, %xmm1, %xmm11\r
+\r
+ vaddsubpd %xmm9, %xmm8 , %xmm8\r
+ vaddsubpd %xmm11,%xmm10, %xmm10\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ vaddpd (CO1), %xmm8 , %xmm8\r
+ vaddpd (CO1, LDC), %xmm10, %xmm10\r
+\r
+#endif\r
+\r
+ vmovups %xmm8 , (CO1)\r
+ vmovups %xmm10 , (CO1, LDC)\r
+\r
+.endm\r
+\r
\r
/************************************************************************************************/\r
\r
\r
/************************************************************************************************/\r
\r
-#define KERNEL2x1_1(xx) \\r
- prefetcht0 A_PR1(AO,%rax,SIZE) ;\\r
- vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
-\r
-#define KERNEL2x1_2(xx) \\r
- vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
-\r
-#define KERNEL2x1_3(xx) \\r
- prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\\r
- vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
-\r
-#define KERNEL2x1_4(xx) \\r
- vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
- addq $8, BI ;\\r
- addq $16, %rax ;\\r
-\r
-\r
-#define KERNEL2x1_SUB(xx) \\r
- vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\\r
- VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\\r
- vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\\r
- addq $2, BI ;\\r
- addq $4, %rax ;\\r
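+// one k update of the 2x1 micro-kernel: two complex elements of A
+// (xmm0/xmm1) times one complex element of B; real products accumulate in
+// xmm8/xmm12, imaginary products in xmm9/xmm13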
+.macro KERNEL2x1_SUB\r
+ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0\r
+ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4\r
+ VFMADDPD_R %xmm8,%xmm4,%xmm0\r
+ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1\r
+ VFMADDPD_R %xmm12,%xmm4,%xmm1\r
+ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5\r
+ VFMADDPD_I %xmm9,%xmm5,%xmm0\r
+ VFMADDPD_I %xmm13,%xmm5,%xmm1\r
+ addq $2, BI \r
+ addq $4, %rax \r
+.endm\r
+\r
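+// store a 2x1 block of C (two elements in one column); same swap/vaddsubpd
+// and alpha-scaling scheme as SAVE2x2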
+.macro SAVE2x1\r
+\r
+ vmovddup ALPHA_R, %xmm0\r
+ vmovddup ALPHA_I, %xmm1\r
+\r
+	// swap the high and low 64-bit halves
+ vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
+ vshufpd $0x01, %xmm13, %xmm13, %xmm13\r
+\r
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \\r
+ defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
+\r
+ vaddsubpd %xmm9, %xmm8 , %xmm8\r
+ vaddsubpd %xmm13,%xmm12 , %xmm12\r
+\r
+ vshufpd $0x01, %xmm8 , %xmm8, %xmm9\r
+ vshufpd $0x01, %xmm12, %xmm12, %xmm13\r
+\r
+#else\r
+ vaddsubpd %xmm8, %xmm9 , %xmm9\r
+ vaddsubpd %xmm12,%xmm13, %xmm13\r
+\r
+ vmovapd %xmm9, %xmm8\r
+ vmovapd %xmm13, %xmm12\r
+\r
+	// swap the high and low 64-bit halves
+ vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
+ vshufpd $0x01, %xmm13, %xmm13, %xmm13\r
+\r
+#endif\r
+\r
+	// multiply by ALPHA_R
+ vmulpd %xmm8 , %xmm0, %xmm8\r
+ vmulpd %xmm12, %xmm0, %xmm12\r
+\r
+	// multiply by ALPHA_I
+ vmulpd %xmm9 , %xmm1, %xmm9\r
+ vmulpd %xmm13, %xmm1, %xmm13\r
+\r
+ vaddsubpd %xmm9, %xmm8 , %xmm8\r
+ vaddsubpd %xmm13, %xmm12, %xmm12\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ vaddpd (CO1), %xmm8 , %xmm8\r
+ vaddpd 2 * SIZE(CO1), %xmm12, %xmm12\r
+\r
+#endif\r
+\r
+ vmovups %xmm8 , (CO1)\r
+ vmovups %xmm12 , 2 * SIZE(CO1)\r
+\r
+.endm\r
\r
\r
/************************************************************************************************/\r
\r
-#define KERNEL1x1_1(xx) \\r
- prefetcht0 A_PR1(AO,%rax,SIZE) ;\\r
- vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
-\r
-#define KERNEL1x1_2(xx) \\r
- vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
-\r
-#define KERNEL1x1_3(xx) \\r
- vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
-\r
-#define KERNEL1x1_4(xx) \\r
- vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- addq $8, BI ;\\r
- addq $8, %rax ;\\r
-\r
-\r
-#define KERNEL1x1_SUB(xx) \\r
- vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
- vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\\r
- VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\\r
- vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\\r
- VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\\r
- addq $2, BI ;\\r
- addq $2, %rax ;\\r
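+// one k update of the 1x1 micro-kernel: one complex element of A times one
+// complex element of B; real product accumulates in xmm8, imaginary in xmm9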
+.macro KERNEL1x1_SUB\r
+ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0\r
+ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4\r
+ VFMADDPD_R %xmm8,%xmm4,%xmm0\r
+ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5\r
+ VFMADDPD_I %xmm9,%xmm5,%xmm0\r
+ addq $2, BI \r
+ addq $2, %rax \r
+.endm\r
+\r
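+// store a single element of C; same swap/vaddsubpd and alpha-scaling scheme
+// as SAVE2x2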
+.macro SAVE1x1\r
+\r
+ vmovddup ALPHA_R, %xmm0\r
+ vmovddup ALPHA_I, %xmm1\r
+\r
+	// swap the high and low 64-bit halves
+ vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
+\r
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \\r
+ defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
+\r
+ vaddsubpd %xmm9, %xmm8, %xmm8\r
+\r
+ vshufpd $0x01, %xmm8 , %xmm8, %xmm9\r
+\r
+#else\r
+ vaddsubpd %xmm8, %xmm9, %xmm9\r
+\r
+ vmovapd %xmm9, %xmm8\r
+\r
+	// swap the high and low 64-bit halves
+ vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
+\r
+#endif\r
+\r
+	// multiply by ALPHA_R
+ vmulpd %xmm8 , %xmm0, %xmm8\r
+\r
+	// multiply by ALPHA_I
+ vmulpd %xmm9 , %xmm1, %xmm9\r
+\r
+ vaddsubpd %xmm9 ,%xmm8, %xmm8\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ vaddpd (CO1), %xmm8 , %xmm8\r
+\r
+#endif\r
+\r
+ vmovups %xmm8 , (CO1)\r
+\r
+.endm\r
\r
\r
/************************************************************************************************/\r
\r
.L2_2_12:\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ prefetcht0 B_PR1(BO,BI,SIZE)\r
+ KERNEL2x2_SUB\r
+ KERNEL2x2_SUB\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL2x2_1(xxx)\r
- KERNEL2x2_2(xxx)\r
- prefetcht0 B_PR1+64(BO,BI,SIZE)\r
- KERNEL2x2_3(xxx)\r
- KERNEL2x2_4(xxx)\r
+ KERNEL2x2_SUB\r
+ KERNEL2x2_SUB\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL2x2_1(xxx)\r
- KERNEL2x2_2(xxx)\r
- prefetcht0 B_PR1+64(BO,BI,SIZE)\r
- KERNEL2x2_3(xxx)\r
- KERNEL2x2_4(xxx)\r
+ KERNEL2x2_SUB\r
+ KERNEL2x2_SUB\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ prefetcht0 B_PR1(BO,BI,SIZE)\r
+ KERNEL2x2_SUB\r
+ KERNEL2x2_SUB\r
\r
je .L2_2_16\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ prefetcht0 B_PR1(BO,BI,SIZE)\r
+ KERNEL2x2_SUB\r
+ KERNEL2x2_SUB\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL2x2_1(xxx)\r
- KERNEL2x2_2(xxx)\r
- prefetcht0 B_PR1+64(BO,BI,SIZE)\r
- KERNEL2x2_3(xxx)\r
- KERNEL2x2_4(xxx)\r
+ KERNEL2x2_SUB\r
+ KERNEL2x2_SUB\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL2x2_1(xxx)\r
- KERNEL2x2_2(xxx)\r
- prefetcht0 B_PR1+64(BO,BI,SIZE)\r
- KERNEL2x2_3(xxx)\r
- KERNEL2x2_4(xxx)\r
+ KERNEL2x2_SUB\r
+ KERNEL2x2_SUB\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ prefetcht0 B_PR1(BO,BI,SIZE)\r
+ KERNEL2x2_SUB\r
+ KERNEL2x2_SUB\r
\r
je .L2_2_16\r
\r
\r
.L2_2_17:\r
\r
- KERNEL2x2_SUB(xxx)\r
+ KERNEL2x2_SUB\r
+\r
jl .L2_2_17\r
ALIGN_4\r
\r
\r
.L2_2_19:\r
\r
- vmovddup ALPHA_R, %xmm0\r
- vmovddup ALPHA_I, %xmm1\r
-\r
- // swap high and low 64 bytes\r
- vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
- vshufpd $0x01, %xmm11, %xmm11, %xmm11\r
- vshufpd $0x01, %xmm13, %xmm13, %xmm13\r
- vshufpd $0x01, %xmm15, %xmm15, %xmm15\r
-\r
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \\r
- defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
-\r
- vaddsubpd %xmm9, %xmm8 , %xmm8\r
- vaddsubpd %xmm11,%xmm10, %xmm10\r
- vaddsubpd %xmm13,%xmm12, %xmm12\r
- vaddsubpd %xmm15,%xmm14, %xmm14\r
-\r
- vshufpd $0x01, %xmm8 , %xmm8, %xmm9\r
- vshufpd $0x01, %xmm10, %xmm10, %xmm11\r
- vshufpd $0x01, %xmm12, %xmm12, %xmm13\r
- vshufpd $0x01, %xmm14, %xmm14, %xmm15\r
-\r
-#else\r
- vaddsubpd %xmm8, %xmm9 ,%xmm9\r
- vaddsubpd %xmm10, %xmm11,%xmm11\r
- vaddsubpd %xmm12, %xmm13,%xmm13\r
- vaddsubpd %xmm14, %xmm15,%xmm15\r
-\r
- vmovapd %xmm9, %xmm8\r
- vmovapd %xmm11, %xmm10\r
- vmovapd %xmm13, %xmm12\r
- vmovapd %xmm15, %xmm14\r
-\r
- // swap high and low 64 bytes\r
- vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
- vshufpd $0x01, %xmm11, %xmm11, %xmm11\r
- vshufpd $0x01, %xmm13, %xmm13, %xmm13\r
- vshufpd $0x01, %xmm15, %xmm15, %xmm15\r
-\r
-#endif\r
-\r
- // multiply with ALPHA_R\r
- vmulpd %xmm8 , %xmm0, %xmm8\r
- vmulpd %xmm10, %xmm0, %xmm10\r
- vmulpd %xmm12, %xmm0, %xmm12\r
- vmulpd %xmm14, %xmm0, %xmm14\r
-\r
- // multiply with ALPHA_I\r
- vmulpd %xmm9 , %xmm1, %xmm9\r
- vmulpd %xmm11, %xmm1, %xmm11\r
- vmulpd %xmm13, %xmm1, %xmm13\r
- vmulpd %xmm15, %xmm1, %xmm15\r
-\r
- vaddsubpd %xmm9, %xmm8 , %xmm8\r
- vaddsubpd %xmm11,%xmm10, %xmm10\r
- vaddsubpd %xmm13,%xmm12, %xmm12\r
- vaddsubpd %xmm15,%xmm14, %xmm14\r
-\r
-\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- vaddpd (CO1), %xmm8 , %xmm8\r
- vaddpd 2 * SIZE(CO1), %xmm12, %xmm12\r
-\r
- vaddpd (CO1, LDC), %xmm10, %xmm10\r
- vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14\r
-\r
-#endif\r
-\r
- vmovups %xmm8 , (CO1)\r
- vmovups %xmm12 , 2 * SIZE(CO1)\r
-\r
- vmovups %xmm10 , (CO1, LDC)\r
- vmovups %xmm14 , 2 * SIZE(CO1, LDC)\r
+ SAVE2x2\r
\r
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
\r
.L2_2_42:\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL1x2_1(xxx)\r
- KERNEL1x2_2(xxx)\r
- prefetcht0 B_PR1+64(BO,BI,SIZE)\r
- KERNEL1x2_3(xxx)\r
- KERNEL1x2_4(xxx)\r
+ KERNEL1x2_SUB\r
+ KERNEL1x2_SUB\r
+ prefetcht0 B_PR1(BO,BI,SIZE)\r
+ KERNEL1x2_SUB\r
+ KERNEL1x2_SUB\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ prefetcht0 B_PR1(BO,BI,SIZE)\r
+ KERNEL1x2_SUB\r
+ KERNEL1x2_SUB\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL1x2_1(xxx)\r
- KERNEL1x2_2(xxx)\r
- prefetcht0 B_PR1+64(BO,BI,SIZE)\r
- KERNEL1x2_3(xxx)\r
- KERNEL1x2_4(xxx)\r
+ KERNEL1x2_SUB\r
+ KERNEL1x2_SUB\r
\r
je .L2_2_46\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL1x2_1(xxx)\r
- KERNEL1x2_2(xxx)\r
- prefetcht0 B_PR1+64(BO,BI,SIZE)\r
- KERNEL1x2_3(xxx)\r
- KERNEL1x2_4(xxx)\r
+ KERNEL1x2_SUB\r
+ KERNEL1x2_SUB\r
+ prefetcht0 B_PR1(BO,BI,SIZE)\r
+ KERNEL1x2_SUB\r
+ KERNEL1x2_SUB\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ prefetcht0 B_PR1(BO,BI,SIZE)\r
+ KERNEL1x2_SUB\r
+ KERNEL1x2_SUB\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL1x2_1(xxx)\r
- KERNEL1x2_2(xxx)\r
- prefetcht0 B_PR1+64(BO,BI,SIZE)\r
- KERNEL1x2_3(xxx)\r
- KERNEL1x2_4(xxx)\r
+ KERNEL1x2_SUB\r
+ KERNEL1x2_SUB\r
\r
je .L2_2_46\r
\r
\r
.L2_2_47:\r
\r
- KERNEL1x2_SUB(xxx)\r
+ KERNEL1x2_SUB\r
+\r
jl .L2_2_47\r
ALIGN_4\r
\r
\r
.L2_2_49:\r
\r
- vmovddup ALPHA_R, %xmm0\r
- vmovddup ALPHA_I, %xmm1\r
-\r
- // swap high and low 64 bytes\r
- vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
- vshufpd $0x01, %xmm11, %xmm11, %xmm11\r
-\r
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \\r
- defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
-\r
- vaddsubpd %xmm9, %xmm8 , %xmm8\r
- vaddsubpd %xmm11,%xmm10, %xmm10\r
-\r
- vshufpd $0x01, %xmm8 , %xmm8, %xmm9\r
- vshufpd $0x01, %xmm10, %xmm10, %xmm11\r
-\r
-#else\r
- vaddsubpd %xmm8, %xmm9, %xmm9\r
- vaddsubpd %xmm10,%xmm11, %xmm11\r
-\r
- vmovapd %xmm9, %xmm8\r
- vmovapd %xmm11, %xmm10\r
-\r
- // swap high and low 64 bytes\r
- vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
- vshufpd $0x01, %xmm11, %xmm11, %xmm11\r
-\r
-#endif\r
-\r
- // multiply with ALPHA_R\r
- vmulpd %xmm8 , %xmm0, %xmm8\r
- vmulpd %xmm10, %xmm0, %xmm10\r
-\r
- // multiply with ALPHA_I\r
- vmulpd %xmm9 , %xmm1, %xmm9\r
- vmulpd %xmm11, %xmm1, %xmm11\r
-\r
- vaddsubpd %xmm9, %xmm8 , %xmm8\r
- vaddsubpd %xmm11,%xmm10, %xmm10\r
-\r
-\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- vaddpd (CO1), %xmm8 , %xmm8\r
- vaddpd (CO1, LDC), %xmm10, %xmm10\r
-\r
-#endif\r
-\r
- vmovups %xmm8 , (CO1)\r
- vmovups %xmm10 , (CO1, LDC)\r
+ SAVE1x2\r
\r
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
\r
.L1_2_12:\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL2x1_1(xxx)\r
- KERNEL2x1_2(xxx)\r
- KERNEL2x1_3(xxx)\r
- KERNEL2x1_4(xxx)\r
+ KERNEL2x1_SUB\r
+ KERNEL2x1_SUB\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ KERNEL2x1_SUB\r
+ KERNEL2x1_SUB\r
\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL2x1_1(xxx)\r
- KERNEL2x1_2(xxx)\r
- KERNEL2x1_3(xxx)\r
- KERNEL2x1_4(xxx)\r
+ KERNEL2x1_SUB\r
+ KERNEL2x1_SUB\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ KERNEL2x1_SUB\r
+ KERNEL2x1_SUB\r
\r
je .L1_2_16\r
\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL2x1_1(xxx)\r
- KERNEL2x1_2(xxx)\r
- KERNEL2x1_3(xxx)\r
- KERNEL2x1_4(xxx)\r
+ KERNEL2x1_SUB\r
+ KERNEL2x1_SUB\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ KERNEL2x1_SUB\r
+ KERNEL2x1_SUB\r
\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL2x1_1(xxx)\r
- KERNEL2x1_2(xxx)\r
- KERNEL2x1_3(xxx)\r
- KERNEL2x1_4(xxx)\r
+ KERNEL2x1_SUB\r
+ KERNEL2x1_SUB\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
+ KERNEL2x1_SUB\r
+ KERNEL2x1_SUB\r
\r
je .L1_2_16\r
\r
\r
.L1_2_17:\r
\r
- KERNEL2x1_SUB(xxx)\r
+ KERNEL2x1_SUB\r
+\r
jl .L1_2_17\r
ALIGN_4\r
\r
\r
.L1_2_19:\r
\r
- vmovddup ALPHA_R, %xmm0\r
- vmovddup ALPHA_I, %xmm1\r
-\r
- // swap high and low 64 bytes\r
- vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
- vshufpd $0x01, %xmm13, %xmm13, %xmm13\r
-\r
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \\r
- defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
-\r
- vaddsubpd %xmm9, %xmm8 , %xmm8\r
- vaddsubpd %xmm13,%xmm12 , %xmm12\r
-\r
- vshufpd $0x01, %xmm8 , %xmm8, %xmm9\r
- vshufpd $0x01, %xmm12, %xmm12, %xmm13\r
-\r
-#else\r
- vaddsubpd %xmm8, %xmm9 , %xmm9\r
- vaddsubpd %xmm12,%xmm13, %xmm13\r
-\r
- vmovapd %xmm9, %xmm8\r
- vmovapd %xmm13, %xmm12\r
-\r
- // swap high and low 64 bytes\r
- vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
- vshufpd $0x01, %xmm13, %xmm13, %xmm13\r
-\r
-#endif\r
-\r
- // multiply with ALPHA_R\r
- vmulpd %xmm8 , %xmm0, %xmm8\r
- vmulpd %xmm12, %xmm0, %xmm12\r
-\r
- // multiply with ALPHA_I\r
- vmulpd %xmm9 , %xmm1, %xmm9\r
- vmulpd %xmm13, %xmm1, %xmm13\r
-\r
- vaddsubpd %xmm9, %xmm8 , %xmm8\r
- vaddsubpd %xmm13, %xmm12, %xmm12\r
-\r
-\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- vaddpd (CO1), %xmm8 , %xmm8\r
- vaddpd 2 * SIZE(CO1), %xmm12, %xmm12\r
-\r
-#endif\r
-\r
- vmovups %xmm8 , (CO1)\r
- vmovups %xmm12 , 2 * SIZE(CO1)\r
-\r
+ SAVE2x1\r
\r
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
\r
.L1_2_42:\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL1x1_1(xxx)\r
- KERNEL1x1_2(xxx)\r
- KERNEL1x1_3(xxx)\r
- KERNEL1x1_4(xxx)\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL1x1_1(xxx)\r
- KERNEL1x1_2(xxx)\r
- KERNEL1x1_3(xxx)\r
- KERNEL1x1_4(xxx)\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
\r
je .L1_2_46\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL1x1_1(xxx)\r
- KERNEL1x1_2(xxx)\r
- KERNEL1x1_3(xxx)\r
- KERNEL1x1_4(xxx)\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
\r
+ prefetcht0 A_PR1(AO,%rax,SIZE)\r
prefetcht0 B_PR1(BO,BI,SIZE)\r
- KERNEL1x1_1(xxx)\r
- KERNEL1x1_2(xxx)\r
- KERNEL1x1_3(xxx)\r
- KERNEL1x1_4(xxx)\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
+ KERNEL1x1_SUB\r
\r
je .L1_2_46\r
\r
\r
.L1_2_47:\r
\r
- KERNEL1x1_SUB(xxx)\r
+ KERNEL1x1_SUB\r
+\r
jl .L1_2_47\r
ALIGN_4\r
\r
\r
.L1_2_49:\r
\r
- vmovddup ALPHA_R, %xmm0\r
- vmovddup ALPHA_I, %xmm1\r
-\r
- // swap high and low 64 bytes\r
- vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
-\r
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \\r
- defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
-\r
- vaddsubpd %xmm9, %xmm8, %xmm8\r
-\r
- vshufpd $0x01, %xmm8 , %xmm8, %xmm9\r
-\r
-#else\r
- vaddsubpd %xmm8, %xmm9, %xmm9\r
-\r
- vmovapd %xmm9, %xmm8\r
-\r
- // swap high and low 64 bytes\r
- vshufpd $0x01, %xmm9 , %xmm9, %xmm9\r
-\r
-#endif\r
-\r
- // multiply with ALPHA_R\r
- vmulpd %xmm8 , %xmm0, %xmm8\r
-\r
- // multiply with ALPHA_I\r
- vmulpd %xmm9 , %xmm1, %xmm9\r
-\r
- vaddsubpd %xmm9 ,%xmm8, %xmm8\r
-\r
-\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- vaddpd (CO1), %xmm8 , %xmm8\r
-\r
-#endif\r
-\r
- vmovups %xmm8 , (CO1)\r
+ SAVE1x1\r
\r
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r