#define MMXSTORE movd
#endif
-#if defined(SANDYBRIDGE) || defined(HASWELL)
-//Enable some optimazation for nehalem.
-#define NEHALEM_OPTIMIZATION
-#endif
-
#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#define PROFCODE
#endif
+
+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
+
+#define EPILOGUE \
+ .size REALNAME, .-REALNAME; \
+ .section .note.GNU-stack,"",@progbits
+
+#else
+
#define EPILOGUE \
- .size REALNAME, .-REALNAME; \
+ .size REALNAME, .-REALNAME; \
.section .note.GNU-stack,"",%progbits
#endif
+
+
+#endif
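
/*
 * Editorial sketch, not part of the patch: with this hunk applied, a kernel
 * that ends in EPILOGUE expands along one of two paths.  Assuming REALNAME
 * is the routine's symbol name (a macro supplied elsewhere in the OpenBLAS
 * headers), a hypothetical symbol "cgemm_kernel" would give:
 *
 *   C_PATHSCALE or OS_DARWIN:
 *       .size cgemm_kernel, .-cgemm_kernel
 *       .section .note.GNU-stack,"",@progbits
 *
 *   otherwise:
 *       .size cgemm_kernel, .-cgemm_kernel
 *       .section .note.GNU-stack,"",%progbits
 *
 * The two branches differ only in the @progbits vs. %progbits spelling of
 * the section type, since assemblers disagree on which form they accept.
 */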
+
#ifdef XDOUBLE
#define FLD fldt
#define FST fstpt
#ifdef ASSEMBLER
-#if defined(SANDYBRIDGE) || defined(HASWELL)
-//Enable some optimazation for nehalem.
-#define NEHALEM_OPTIMIZATION
-#endif
-
-
#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#define PROFCODE
#endif
+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
+
+#define EPILOGUE \
+ .size REALNAME, .-REALNAME; \
+ .section .note.GNU-stack,"",@progbits
+
+#else
+
#define EPILOGUE \
.size REALNAME, .-REALNAME; \
.section .note.GNU-stack,"",%progbits
+#endif
+
#endif
**********************************************************************************/\r
\r
/*********************************************************************\r
-* 2013/10/28 Saar\r
+* 2013/11/13 Saar\r
* BLASTEST : OK\r
* CTEST : OK\r
* TEST : OK\r
\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
\r
-.macro VFMADDPS_R y0,y1,y2\r
- vfmaddps \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0\r
\r
-.macro VFMADDPS_I y0,y1,y2\r
- vfmaddps \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0\r
\r
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)\r
\r
-.macro VFMADDPS_R y0,y1,y2\r
- vfnmaddps \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0\r
\r
-.macro VFMADDPS_I y0,y1,y2\r
- vfmaddps \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0\r
\r
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
\r
-.macro VFMADDPS_R y0,y1,y2\r
- vfmaddps \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0\r
\r
-.macro VFMADDPS_I y0,y1,y2\r
- vfnmaddps \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0\r
\r
#else\r
\r
-.macro VFMADDPS_R y0,y1,y2\r
- vfnmaddps \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0\r
\r
-.macro VFMADDPS_I y0,y1,y2\r
- vfnmaddps \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0\r
\r
#endif\r
\r
\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
\r
-.macro VFMADDPS_R y0,y1,y2\r
- vfmadd231ps \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0\r
\r
-.macro VFMADDPS_I y0,y1,y2\r
- vfmadd231ps \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0\r
\r
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)\r
\r
-.macro VFMADDPS_R y0,y1,y2\r
- vfnmadd231ps \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0\r
\r
-.macro VFMADDPS_I y0,y1,y2\r
- vfmadd231ps \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0\r
\r
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
\r
-.macro VFMADDPS_R y0,y1,y2\r
- vfmadd231ps \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0\r
\r
-.macro VFMADDPS_I y0,y1,y2\r
- vfnmadd231ps \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0\r
\r
#else\r
\r
-.macro VFMADDPS_R y0,y1,y2\r
- vfnmadd231ps \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0\r
\r
-.macro VFMADDPS_I y0,y1,y2\r
- vfnmadd231ps \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0\r
\r
#endif\r
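
/*
 * Editorial sketch, not part of the patch: both branches above expand to the
 * same arithmetic; only the mnemonic family differs.  Taking the first real
 * update of the ymm kernel below as the example (AT&T operand order):
 *
 *   FMA4 build (vfmaddps):
 *       VFMADDPS_R( %ymm8,%ymm4,%ymm0 )  ->  vfmaddps %ymm8,%ymm4,%ymm0,%ymm8
 *   FMA3 build (vfmadd231ps):
 *       VFMADDPS_R( %ymm8,%ymm4,%ymm0 )  ->  vfmadd231ps %ymm4,%ymm0,%ymm8
 *
 * Either way ymm8 += ymm0 * ymm4.  The four #if branches merely flip the sign
 * (the vfnmadd forms) of the real or imaginary contribution per
 * transpose/conjugation case.  Because cpp expands these #defines before the
 * assembler runs, the kernels no longer rely on GAS .macro/.endm with named
 * \parameters, which presumably is what the targeted assemblers lack.
 */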
\r
\r
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0\r
vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4\r
- VFMADDPS_R %ymm8,%ymm4,%ymm0\r
+ VFMADDPS_R( %ymm8,%ymm4,%ymm0 )\r
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1\r
- VFMADDPS_R %ymm12,%ymm4,%ymm1\r
+ VFMADDPS_R( %ymm12,%ymm4,%ymm1 )\r
vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5\r
- VFMADDPS_I %ymm9,%ymm5,%ymm0\r
- VFMADDPS_I %ymm13,%ymm5,%ymm1\r
+ VFMADDPS_I( %ymm9,%ymm5,%ymm0 )\r
+ VFMADDPS_I( %ymm13,%ymm5,%ymm1 )\r
vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6\r
- VFMADDPS_R %ymm10,%ymm6,%ymm0\r
- VFMADDPS_R %ymm14,%ymm6,%ymm1\r
+ VFMADDPS_R( %ymm10,%ymm6,%ymm0 )\r
+ VFMADDPS_R( %ymm14,%ymm6,%ymm1 )\r
vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7\r
- VFMADDPS_I %ymm11,%ymm7,%ymm0\r
- VFMADDPS_I %ymm15,%ymm7,%ymm1\r
+ VFMADDPS_I( %ymm11,%ymm7,%ymm0 )\r
+ VFMADDPS_I( %ymm15,%ymm7,%ymm1 )\r
addq $4 , BI \r
addq $16, %rax \r
.endm\r
.macro KERNEL4x2_SUB\r
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4\r
- VFMADDPS_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPS_R( %xmm8,%xmm4,%xmm0 )\r
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1\r
- VFMADDPS_R %xmm12,%xmm4,%xmm1\r
+ VFMADDPS_R( %xmm12,%xmm4,%xmm1 )\r
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPS_I %xmm9,%xmm5,%xmm0\r
- VFMADDPS_I %xmm13,%xmm5,%xmm1\r
+ VFMADDPS_I( %xmm9,%xmm5,%xmm0 )\r
+ VFMADDPS_I( %xmm13,%xmm5,%xmm1 )\r
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6\r
- VFMADDPS_R %xmm10,%xmm6,%xmm0\r
- VFMADDPS_R %xmm14,%xmm6,%xmm1\r
+ VFMADDPS_R( %xmm10,%xmm6,%xmm0 )\r
+ VFMADDPS_R( %xmm14,%xmm6,%xmm1 )\r
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7\r
- VFMADDPS_I %xmm11,%xmm7,%xmm0\r
- VFMADDPS_I %xmm15,%xmm7,%xmm1\r
+ VFMADDPS_I( %xmm11,%xmm7,%xmm0 )\r
+ VFMADDPS_I( %xmm15,%xmm7,%xmm1 )\r
addq $4, BI \r
addq $8, %rax \r
.endm\r
.macro KERNEL2x2_SUB\r
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4\r
- VFMADDPS_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPS_R( %xmm8,%xmm4,%xmm0 )\r
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPS_I %xmm9,%xmm5,%xmm0\r
+ VFMADDPS_I( %xmm9,%xmm5,%xmm0 )\r
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6\r
- VFMADDPS_R %xmm10,%xmm6,%xmm0\r
+ VFMADDPS_R( %xmm10,%xmm6,%xmm0 )\r
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7\r
- VFMADDPS_I %xmm11,%xmm7,%xmm0\r
+ VFMADDPS_I( %xmm11,%xmm7,%xmm0 )\r
addq $4, BI \r
addq $4, %rax \r
.endm\r
.macro KERNEL1x2_SUB\r
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4\r
- VFMADDPS_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPS_R( %xmm8,%xmm4,%xmm0 )\r
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPS_I %xmm9,%xmm5,%xmm0\r
+ VFMADDPS_I( %xmm9,%xmm5,%xmm0 )\r
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6\r
- VFMADDPS_R %xmm10,%xmm6,%xmm0\r
+ VFMADDPS_R( %xmm10,%xmm6,%xmm0 )\r
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7\r
- VFMADDPS_I %xmm11,%xmm7,%xmm0\r
+ VFMADDPS_I( %xmm11,%xmm7,%xmm0 )\r
addq $4, BI \r
addq $2, %rax \r
.endm\r
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0\r
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4\r
- VFMADDPS_R %ymm8,%ymm4,%ymm0\r
- VFMADDPS_R %ymm12,%ymm4,%ymm1\r
+ VFMADDPS_R( %ymm8,%ymm4,%ymm0 )\r
+ VFMADDPS_R( %ymm12,%ymm4,%ymm1 )\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5\r
- VFMADDPS_I %ymm9,%ymm5,%ymm0\r
- VFMADDPS_I %ymm13,%ymm5,%ymm1\r
+ VFMADDPS_I( %ymm9,%ymm5,%ymm0 )\r
+ VFMADDPS_I( %ymm13,%ymm5,%ymm1 )\r
addq $2 , BI \r
addq $16, %rax \r
.endm\r
.macro KERNEL4x1_SUB\r
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4\r
- VFMADDPS_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPS_R( %xmm8,%xmm4,%xmm0 )\r
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1\r
- VFMADDPS_R %xmm12,%xmm4,%xmm1\r
+ VFMADDPS_R( %xmm12,%xmm4,%xmm1 )\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPS_I %xmm9,%xmm5,%xmm0\r
- VFMADDPS_I %xmm13,%xmm5,%xmm1\r
+ VFMADDPS_I( %xmm9,%xmm5,%xmm0 )\r
+ VFMADDPS_I( %xmm13,%xmm5,%xmm1 )\r
addq $2, BI \r
addq $8, %rax \r
.endm\r
.macro KERNEL2x1_SUB\r
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4\r
- VFMADDPS_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPS_R( %xmm8,%xmm4,%xmm0 )\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPS_I %xmm9,%xmm5,%xmm0\r
+ VFMADDPS_I( %xmm9,%xmm5,%xmm0 )\r
addq $2, BI \r
addq $4, %rax \r
.endm\r
.macro KERNEL1x1_SUB\r
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4\r
- VFMADDPS_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPS_R( %xmm8,%xmm4,%xmm0 )\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPS_I %xmm9,%xmm5,%xmm0\r
+ VFMADDPS_I( %xmm9,%xmm5,%xmm0 )\r
addq $2, BI \r
addq $2, %rax \r
.endm\r
**********************************************************************************/\r
\r
/*********************************************************************\r
-* 2013/10/28 Saar\r
+* 2013/11/13 Saar\r
* BLASTEST : OK\r
* CTEST : OK\r
* TEST : OK\r
\r
#if defined(BULLDOZER)\r
\r
-.macro VFMADD231PS_ y0,y1,y2\r
- vfmaddps \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0\r
\r
-.macro VFMADD231SS_ x0,x1,x2\r
- vfmaddss \x0,\x1,\x2,\x0\r
-.endm\r
+#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0\r
\r
#else\r
\r
-.macro VFMADD231PS_ y0,y1,y2\r
- vfmadd231ps \y1,\y2,\y0\r
-.endm\r
+#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0\r
\r
-.macro VFMADD231SS_ x0,x1,x2\r
- vfmadd231ss \x1,\x2,\x0\r
-.endm\r
+#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0\r
\r
#endif\r
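
/*
 * Editorial sketch, not part of the patch: the sgemm macros carry no sign
 * variants, so one #define per operand width is enough.  For the scalar tail
 * (AT&T operand order):
 *
 *   BULLDOZER (FMA4):
 *       VFMADD231SS_( %xmm4,%xmm2,%xmm0 )  ->  vfmaddss %xmm4,%xmm2,%xmm0,%xmm4
 *   otherwise (FMA3):
 *       VFMADD231SS_( %xmm4,%xmm2,%xmm0 )  ->  vfmadd231ss %xmm2,%xmm0,%xmm4
 *
 * Both compute xmm4 += xmm0 * xmm2 in the low single-precision lane; the
 * packed VFMADD231PS_ form behaves identically on xmm/ymm vectors.
 */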
\r
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3\r
- VFMADD231PS_ %ymm4,%ymm2,%ymm0\r
- VFMADD231PS_ %ymm5,%ymm2,%ymm1\r
- VFMADD231PS_ %ymm6,%ymm3,%ymm0\r
- VFMADD231PS_ %ymm7,%ymm3,%ymm1\r
+ VFMADD231PS_( %ymm4,%ymm2,%ymm0 )\r
+ VFMADD231PS_( %ymm5,%ymm2,%ymm1 )\r
+ VFMADD231PS_( %ymm6,%ymm3,%ymm0 )\r
+ VFMADD231PS_( %ymm7,%ymm3,%ymm1 )\r
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2\r
vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3\r
- VFMADD231PS_ %ymm8,%ymm2,%ymm0\r
- VFMADD231PS_ %ymm9,%ymm2,%ymm1\r
- VFMADD231PS_ %ymm10,%ymm3,%ymm0\r
- VFMADD231PS_ %ymm11,%ymm3,%ymm1\r
+ VFMADD231PS_( %ymm8,%ymm2,%ymm0 )\r
+ VFMADD231PS_( %ymm9,%ymm2,%ymm1 )\r
+ VFMADD231PS_( %ymm10,%ymm3,%ymm0 )\r
+ VFMADD231PS_( %ymm11,%ymm3,%ymm1 )\r
addq $4 , BI \r
addq $16, %rax \r
.endm\r
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3\r
- VFMADD231PS_ %ymm4,%ymm2,%ymm0\r
- VFMADD231PS_ %ymm6,%ymm3,%ymm0\r
+ VFMADD231PS_( %ymm4,%ymm2,%ymm0 )\r
+ VFMADD231PS_( %ymm6,%ymm3,%ymm0 )\r
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2\r
vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3\r
- VFMADD231PS_ %ymm8,%ymm2,%ymm0\r
- VFMADD231PS_ %ymm10,%ymm3,%ymm0\r
+ VFMADD231PS_( %ymm8,%ymm2,%ymm0 )\r
+ VFMADD231PS_( %ymm10,%ymm3,%ymm0 )\r
addq $4 , BI \r
addq $8 , %rax \r
.endm\r
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3\r
- VFMADD231PS_ %xmm4,%xmm2,%xmm0\r
- VFMADD231PS_ %xmm6,%xmm3,%xmm0\r
+ VFMADD231PS_( %xmm4,%xmm2,%xmm0 )\r
+ VFMADD231PS_( %xmm6,%xmm3,%xmm0 )\r
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2\r
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3\r
- VFMADD231PS_ %xmm8,%xmm2,%xmm0\r
- VFMADD231PS_ %xmm10,%xmm3,%xmm0\r
+ VFMADD231PS_( %xmm8,%xmm2,%xmm0 )\r
+ VFMADD231PS_( %xmm10,%xmm3,%xmm0 )\r
addq $4 , BI \r
addq $4 , %rax \r
.endm\r
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1\r
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2\r
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3\r
- VFMADD231SS_ %xmm4,%xmm2,%xmm0\r
- VFMADD231SS_ %xmm5,%xmm2,%xmm1\r
- VFMADD231SS_ %xmm6,%xmm3,%xmm0\r
- VFMADD231SS_ %xmm7,%xmm3,%xmm1\r
+ VFMADD231SS_( %xmm4,%xmm2,%xmm0 )\r
+ VFMADD231SS_( %xmm5,%xmm2,%xmm1 )\r
+ VFMADD231SS_( %xmm6,%xmm3,%xmm0 )\r
+ VFMADD231SS_( %xmm7,%xmm3,%xmm1 )\r
vmovss -2 * SIZE(BO, BI, SIZE), %xmm2\r
vmovss -1 * SIZE(BO, BI, SIZE), %xmm3\r
- VFMADD231SS_ %xmm8,%xmm2,%xmm0\r
- VFMADD231SS_ %xmm9,%xmm2,%xmm1\r
- VFMADD231SS_ %xmm10,%xmm3,%xmm0\r
- VFMADD231SS_ %xmm11,%xmm3,%xmm1\r
+ VFMADD231SS_( %xmm8,%xmm2,%xmm0 )\r
+ VFMADD231SS_( %xmm9,%xmm2,%xmm1 )\r
+ VFMADD231SS_( %xmm10,%xmm3,%xmm0 )\r
+ VFMADD231SS_( %xmm11,%xmm3,%xmm1 )\r
addq $4 , BI \r
addq $2, %rax \r
.endm\r
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2\r
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3\r
- VFMADD231SS_ %xmm4,%xmm2,%xmm0\r
- VFMADD231SS_ %xmm6,%xmm3,%xmm0\r
+ VFMADD231SS_( %xmm4,%xmm2,%xmm0 )\r
+ VFMADD231SS_( %xmm6,%xmm3,%xmm0 )\r
vmovss -2 * SIZE(BO, BI, SIZE), %xmm2\r
vmovss -1 * SIZE(BO, BI, SIZE), %xmm3\r
- VFMADD231SS_ %xmm8,%xmm2,%xmm0\r
- VFMADD231SS_ %xmm10,%xmm3,%xmm0\r
+ VFMADD231SS_( %xmm8,%xmm2,%xmm0 )\r
+ VFMADD231SS_( %xmm10,%xmm3,%xmm0 )\r
addq $4 , BI \r
addq $1, %rax \r
.endm\r
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3\r
- VFMADD231PS_ %ymm4,%ymm2,%ymm0\r
- VFMADD231PS_ %ymm5,%ymm2,%ymm1\r
- VFMADD231PS_ %ymm6,%ymm3,%ymm0\r
- VFMADD231PS_ %ymm7,%ymm3,%ymm1\r
+ VFMADD231PS_( %ymm4,%ymm2,%ymm0 )\r
+ VFMADD231PS_( %ymm5,%ymm2,%ymm1 )\r
+ VFMADD231PS_( %ymm6,%ymm3,%ymm0 )\r
+ VFMADD231PS_( %ymm7,%ymm3,%ymm1 )\r
addq $2 , BI \r
addq $16, %rax \r
.endm\r
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3\r
- VFMADD231PS_ %ymm4,%ymm2,%ymm0\r
- VFMADD231PS_ %ymm6,%ymm3,%ymm0\r
+ VFMADD231PS_( %ymm4,%ymm2,%ymm0 )\r
+ VFMADD231PS_( %ymm6,%ymm3,%ymm0 )\r
addq $2 , BI \r
addq $8 , %rax \r
.endm\r
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2\r
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3\r
- VFMADD231PS_ %xmm4,%xmm2,%xmm0\r
- VFMADD231PS_ %xmm6,%xmm3,%xmm0\r
+ VFMADD231PS_( %xmm4,%xmm2,%xmm0 )\r
+ VFMADD231PS_( %xmm6,%xmm3,%xmm0 )\r
addq $2 , BI \r
addq $4 , %rax \r
.endm\r
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1\r
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2\r
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3\r
- VFMADD231SS_ %xmm4,%xmm2,%xmm0\r
- VFMADD231SS_ %xmm5,%xmm2,%xmm1\r
- VFMADD231SS_ %xmm6,%xmm3,%xmm0\r
- VFMADD231SS_ %xmm7,%xmm3,%xmm1\r
+ VFMADD231SS_( %xmm4,%xmm2,%xmm0 )\r
+ VFMADD231SS_( %xmm5,%xmm2,%xmm1 )\r
+ VFMADD231SS_( %xmm6,%xmm3,%xmm0 )\r
+ VFMADD231SS_( %xmm7,%xmm3,%xmm1 )\r
addq $2 , BI \r
addq $2, %rax \r
.endm\r
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2\r
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3\r
- VFMADD231SS_ %xmm4,%xmm2,%xmm0\r
- VFMADD231SS_ %xmm6,%xmm3,%xmm0\r
+ VFMADD231SS_( %xmm4,%xmm2,%xmm0 )\r
+ VFMADD231SS_( %xmm6,%xmm3,%xmm0 )\r
addq $2 , BI \r
addq $1, %rax \r
.endm\r
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0\r
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2\r
- VFMADD231PS_ %ymm4,%ymm2,%ymm0\r
- VFMADD231PS_ %ymm5,%ymm2,%ymm1\r
+ VFMADD231PS_( %ymm4,%ymm2,%ymm0 )\r
+ VFMADD231PS_( %ymm5,%ymm2,%ymm1 )\r
addq $1 , BI \r
addq $16, %rax \r
.endm\r
.macro KERNEL8x1_SUB\r
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2\r
- VFMADD231PS_ %ymm4,%ymm2,%ymm0\r
+ VFMADD231PS_( %ymm4,%ymm2,%ymm0 )\r
addq $1 , BI \r
addq $8 , %rax \r
.endm\r
.macro KERNEL4x1_SUB\r
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2\r
- VFMADD231PS_ %xmm4,%xmm2,%xmm0\r
+ VFMADD231PS_( %xmm4,%xmm2,%xmm0 )\r
addq $1 , BI \r
addq $4 , %rax \r
.endm\r
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1\r
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2\r
- VFMADD231SS_ %xmm4,%xmm2,%xmm0\r
- VFMADD231SS_ %xmm5,%xmm2,%xmm1\r
+ VFMADD231SS_( %xmm4,%xmm2,%xmm0 )\r
+ VFMADD231SS_( %xmm5,%xmm2,%xmm1 )\r
addq $1 , BI \r
addq $2, %rax \r
.endm\r
.macro KERNEL1x1_SUB\r
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0\r
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2\r
- VFMADD231SS_ %xmm4,%xmm2,%xmm0\r
+ VFMADD231SS_( %xmm4,%xmm2,%xmm0 )\r
addq $1 , BI \r
addq $1, %rax \r
.endm\r
**********************************************************************************/\r
\r
/********************************************************************************\r
-* 2013/10/28 Saar\r
+* 2013/11/13 Saar\r
* BLASTEST : OK\r
* CTEST : OK\r
* TEST : OK\r
\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
\r
-.macro VFMADDPD_R y0,y1,y2\r
- vfmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0\r
\r
-.macro VFMADDPD_I y0,y1,y2\r
- vfmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0\r
\r
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)\r
\r
-.macro VFMADDPD_R y0,y1,y2\r
- vfnmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0\r
\r
-.macro VFMADDPD_I y0,y1,y2\r
- vfmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0\r
\r
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
\r
-.macro VFMADDPD_R y0,y1,y2\r
- vfmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0\r
\r
-.macro VFMADDPD_I y0,y1,y2\r
- vfnmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0\r
\r
#else\r
\r
-.macro VFMADDPD_R y0,y1,y2\r
- vfnmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0\r
\r
-.macro VFMADDPD_I y0,y1,y2\r
- vfnmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0\r
\r
#endif\r
\r
\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
\r
-.macro VFMADDPD_R y0,y1,y2\r
- vfmadd231pd \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0\r
\r
-.macro VFMADDPD_I y0,y1,y2\r
- vfmadd231pd \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0\r
\r
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)\r
\r
-.macro VFMADDPD_R y0,y1,y2\r
- vfnmadd231pd \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0\r
\r
-.macro VFMADDPD_I y0,y1,y2\r
- vfmadd231pd \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0\r
\r
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)\r
\r
-.macro VFMADDPD_R y0,y1,y2\r
- vfmadd231pd \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0\r
\r
-.macro VFMADDPD_I y0,y1,y2\r
- vfnmadd231pd \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0\r
\r
#else\r
\r
-.macro VFMADDPD_R y0,y1,y2\r
- vfnmadd231pd \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0\r
\r
-.macro VFMADDPD_I y0,y1,y2\r
- vfnmadd231pd \y1,\y2,\y0\r
-.endm\r
+#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0\r
\r
#endif\r
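
/*
 * Editorial sketch, not part of the patch: the double-precision complex
 * macros follow the single-precision pattern one-for-one, with vbroadcastsd
 * (ymm) or vmovddup (xmm) supplying the broadcast of the B element.  Note
 * that cpp tokenizes the argument list, so the uneven spacing in calls such
 * as VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) below is harmless; in the NN/NT/TN/TT
 * case it expands to vfmadd231pd %ymm4,%ymm0,%ymm8 (or the vfmaddpd FMA4
 * form), i.e. ymm8 += ymm0 * ymm4.
 */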
\r
\r
vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4\r
vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5\r
- VFMADDPD_R %ymm8 ,%ymm4,%ymm0\r
- VFMADDPD_R %ymm12,%ymm4,%ymm1\r
+ VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )\r
+ VFMADDPD_R( %ymm12,%ymm4,%ymm1 )\r
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6\r
- VFMADDPD_I %ymm9 ,%ymm5,%ymm0\r
- VFMADDPD_I %ymm13,%ymm5,%ymm1\r
+ VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )\r
+ VFMADDPD_I( %ymm13,%ymm5,%ymm1 )\r
vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7\r
- VFMADDPD_R %ymm10,%ymm6,%ymm0\r
- VFMADDPD_R %ymm14,%ymm6,%ymm1\r
- VFMADDPD_I %ymm11,%ymm7,%ymm0\r
- VFMADDPD_I %ymm15,%ymm7,%ymm1\r
+ VFMADDPD_R( %ymm10,%ymm6,%ymm0 )\r
+ VFMADDPD_R( %ymm14,%ymm6,%ymm1 )\r
+ VFMADDPD_I( %ymm11,%ymm7,%ymm0 )\r
+ VFMADDPD_I( %ymm15,%ymm7,%ymm1 )\r
\r
addq $4, BI \r
addq $8, %rax \r
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0\r
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4\r
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1\r
- VFMADDPD_R %xmm8,%xmm4,%xmm0\r
- VFMADDPD_R %xmm12,%xmm4,%xmm1\r
+ VFMADDPD_R( %xmm8,%xmm4,%xmm0 )\r
+ VFMADDPD_R( %xmm12,%xmm4,%xmm1 )\r
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPD_I %xmm9,%xmm5,%xmm0\r
- VFMADDPD_I %xmm13,%xmm5,%xmm1\r
+ VFMADDPD_I( %xmm9,%xmm5,%xmm0 )\r
+ VFMADDPD_I( %xmm13,%xmm5,%xmm1 )\r
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6\r
- VFMADDPD_R %xmm10,%xmm6,%xmm0\r
- VFMADDPD_R %xmm14,%xmm6,%xmm1\r
+ VFMADDPD_R( %xmm10,%xmm6,%xmm0 )\r
+ VFMADDPD_R( %xmm14,%xmm6,%xmm1 )\r
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7\r
- VFMADDPD_I %xmm11,%xmm7,%xmm0\r
- VFMADDPD_I %xmm15,%xmm7,%xmm1\r
+ VFMADDPD_I( %xmm11,%xmm7,%xmm0 )\r
+ VFMADDPD_I( %xmm15,%xmm7,%xmm1 )\r
addq $4, BI \r
addq $4, %rax \r
.endm\r
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0\r
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4\r
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPD_R %xmm8,%xmm4,%xmm0\r
- VFMADDPD_I %xmm9,%xmm5,%xmm0\r
+ VFMADDPD_R( %xmm8,%xmm4,%xmm0 )\r
+ VFMADDPD_I( %xmm9,%xmm5,%xmm0 )\r
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6\r
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7\r
- VFMADDPD_R %xmm10,%xmm6,%xmm0\r
- VFMADDPD_I %xmm11,%xmm7,%xmm0\r
+ VFMADDPD_R( %xmm10,%xmm6,%xmm0 )\r
+ VFMADDPD_I( %xmm11,%xmm7,%xmm0 )\r
addq $4, BI \r
addq $2, %rax \r
.endm\r
vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1\r
vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4\r
vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5\r
- VFMADDPD_R %ymm8 ,%ymm4,%ymm0\r
- VFMADDPD_R %ymm12,%ymm4,%ymm1\r
- VFMADDPD_I %ymm9 ,%ymm5,%ymm0\r
- VFMADDPD_I %ymm13,%ymm5,%ymm1\r
+ VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )\r
+ VFMADDPD_R( %ymm12,%ymm4,%ymm1 )\r
+ VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )\r
+ VFMADDPD_I( %ymm13,%ymm5,%ymm1 )\r
\r
addq $2, BI \r
addq $8, %rax \r
.macro KERNEL2x1_SUB\r
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0\r
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4\r
- VFMADDPD_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPD_R( %xmm8,%xmm4,%xmm0 )\r
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1\r
- VFMADDPD_R %xmm12,%xmm4,%xmm1\r
+ VFMADDPD_R( %xmm12,%xmm4,%xmm1 )\r
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPD_I %xmm9,%xmm5,%xmm0\r
- VFMADDPD_I %xmm13,%xmm5,%xmm1\r
+ VFMADDPD_I( %xmm9,%xmm5,%xmm0 )\r
+ VFMADDPD_I( %xmm13,%xmm5,%xmm1 )\r
addq $2, BI \r
addq $4, %rax \r
.endm\r
.macro KERNEL1x1_SUB\r
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0\r
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4\r
- VFMADDPD_R %xmm8,%xmm4,%xmm0\r
+ VFMADDPD_R( %xmm8,%xmm4,%xmm0 )\r
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5\r
- VFMADDPD_I %xmm9,%xmm5,%xmm0\r
+ VFMADDPD_I( %xmm9,%xmm5,%xmm0 )\r
addq $2, BI \r
addq $2, %rax \r
.endm\r