Changes for compatibility with the PathScale compiler
author    wernsaar <wernsaar@googlemail.com>
          Wed, 13 Nov 2013 16:39:13 +0000 (17:39 +0100)
committer wernsaar <wernsaar@googlemail.com>
          Wed, 13 Nov 2013 16:39:13 +0000 (17:39 +0100)
common_x86.h
common_x86_64.h
kernel/x86_64/dgemm_kernel_8x2_piledriver.S

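The common_x86.h and common_x86_64.h hunks below only guard the EPILOGUE macro so that the type of the .note.GNU-stack section is spelled @progbits instead of %progbits when C_PATHSCALE or OS_DARWIN is defined; presumably the assembler used by the PathScale toolchain only accepts the @ spelling, while the existing % spelling is kept for the default (GNU as) branch. A minimal sketch of the intended expansion, using a hypothetical symbol name foo (illustration only, not part of the patch):

    #if defined(C_PATHSCALE) || defined(OS_DARWIN)
    #define EPILOGUE \
            .size    foo, .-foo; \
            .section .note.GNU-stack,"",@progbits
    #else
    #define EPILOGUE \
            .size    foo, .-foo; \
            .section .note.GNU-stack,"",%progbits
    #endif

            .text
            .globl  foo
    foo:
            ret
            EPILOGUE        /* expands to the .size/.section pair selected above */
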
diff --git a/common_x86.h b/common_x86.h
index 48517d9..8245f70 100644
@@ -301,12 +301,25 @@ REALNAME:
 #define PROFCODE
 #endif
 
+
+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
+
+#define EPILOGUE \
+        .size    REALNAME, .-REALNAME; \
+        .section .note.GNU-stack,"",@progbits
+
+#else
+
 #define EPILOGUE \
-        .size   REALNAME, .-REALNAME; \
+        .size    REALNAME, .-REALNAME; \
         .section .note.GNU-stack,"",%progbits
 
 #endif
 
+
+
+#endif
+
 #ifdef XDOUBLE
 #define FLD    fldt
 #define FST    fstpt
diff --git a/common_x86_64.h b/common_x86_64.h
index 1889038..4fe2344 100644
@@ -372,10 +372,20 @@ REALNAME:
 #define PROFCODE
 #endif
 
+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
+
+#define EPILOGUE \
+        .size   REALNAME, .-REALNAME; \
+        .section .note.GNU-stack,"",@progbits
+
+#else
+
 #define EPILOGUE \
         .size   REALNAME, .-REALNAME; \
         .section .note.GNU-stack,"",%progbits
 
+#endif
+
 
 #endif
 
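The dgemm_kernel_8x2_piledriver.S hunks that follow are mechanical: the GAS .macro/\arg wrappers around the fused multiply-add instructions are replaced by C-preprocessor function-like macros (the file is run through cpp anyway, as the surrounding #define/#if blocks show), every call site gains parentheses, and the BULLDOZER1 guard becomes BULLDOZER; presumably the PathScale toolchain does not cope with the GAS .macro form. Both macro styles are meant to emit the same instruction. A minimal sketch of the new form and its expansion, with the register operands taken from the first call site below (illustration only, not part of the patch):

    #if defined(BULLDOZER)
    #define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0       /* FMA4 encoding */
    #else
    #define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0       /* FMA3 encoding */
    #endif

    /* VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) expands to                        */
    /*   FMA3 case:  vfmadd231pd %xmm0,%xmm1,%xmm4    (xmm4 += xmm1*xmm0)  */
    /*   FMA4 case:  vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 (same accumulation)  */
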
diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S
index cc0ebef..8585d45 100644
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 \r
 /*********************************************************************\r
 *\r
-* 2013/10/31 Saar\r
+* 2013/11/13 Saar\r
 *        BLASTEST               : OK\r
 *        CTEST                  : OK\r
 *        TEST                   : OK\r
@@ -144,25 +144,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define STACK_TOUCH\r
 #endif\r
 \r
-#if defined(BULLDOZER1)\r
+#if defined(BULLDOZER)\r
 \r
-.macro VFMADD231PD_ y1,y2,y0\r
-        vfmaddpd \y0,\y1,\y2,\y0\r
-.endm\r
+#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0\r
 \r
-.macro VFMADD231SD_ x1,x2,x0\r
-        vfmaddsd \x0,\x1,\x2,\x0\r
-.endm\r
+#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0\r
 \r
 #else\r
 \r
-.macro VFMADD231PD_ y1,y2,y0\r
-        vfmadd231pd \y2,\y1,\y0\r
-.endm\r
+#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0\r
 \r
-.macro VFMADD231SD_ x1,x2,x0\r
-        vfmadd231sd \x2,\x1,\x0\r
-.endm\r
+#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0\r
 \r
 #endif\r
 \r
@@ -218,46 +210,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL8x3_M1 \r
        vmovups         -16 * SIZE(AO), %xmm0\r
        prefetcht0      A_PR1(AO)\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups         -14 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups         -12 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups         -10 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        vmovddup        -12 * SIZE(BO), %xmm1\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        vmovddup        -11 * SIZE(BO), %xmm2\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
 .endm\r
 \r
 .macro KERNEL8x3_M2 \r
        vmovups          -8 * SIZE(AO), %xmm0\r
        prefetcht0      A_PR1+64(AO)\r
        vmovddup        -10 * SIZE(BO), %xmm3\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups          -6 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups          -4 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups          -2 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        vmovddup         -9 * SIZE(BO), %xmm1\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        vmovddup         -8 * SIZE(BO), %xmm2\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
 .endm\r
 \r
 \r
@@ -265,93 +257,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        vmovups           0 * SIZE(AO), %xmm0\r
        prefetcht0      A_PR1+128(AO)\r
        vmovddup         -7 * SIZE(BO), %xmm3\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups           2 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups           4 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups           6 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        vmovddup         -6 * SIZE(BO), %xmm1\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        vmovddup         -5 * SIZE(BO), %xmm2\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
 .endm\r
 \r
 .macro KERNEL8x3_M4 \r
        vmovups           8 * SIZE(AO), %xmm0\r
        prefetcht0      A_PR1+192(AO)\r
        vmovddup         -4 * SIZE(BO), %xmm3\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups          10 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups          12 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups          14 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        vmovddup         -3 * SIZE(BO), %xmm1\r
        addq            $32 * SIZE, AO\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        vmovddup         -2 * SIZE(BO), %xmm2\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
 .endm\r
 \r
 .macro KERNEL8x3_M5 \r
        vmovups         -16 * SIZE(AO), %xmm0\r
        prefetcht0      A_PR1(AO)\r
        vmovddup         -1 * SIZE(BO), %xmm3\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups         -14 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups         -12 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups         -10 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        vmovddup          0 * SIZE(BO), %xmm1\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        vmovddup          1 * SIZE(BO), %xmm2\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
 .endm\r
 \r
 .macro KERNEL8x3_M6 \r
        vmovups          -8 * SIZE(AO), %xmm0\r
        prefetcht0      A_PR1+64(AO)\r
        vmovddup          2 * SIZE(BO), %xmm3\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups          -6 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups          -4 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups          -2 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        vmovddup          3 * SIZE(BO), %xmm1\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        vmovddup          4 * SIZE(BO), %xmm2\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
 .endm\r
 \r
 \r
@@ -359,46 +351,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        vmovups           0 * SIZE(AO), %xmm0\r
        prefetcht0      A_PR1+128(AO)\r
        vmovddup          5 * SIZE(BO), %xmm3\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups           2 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups           4 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups           6 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        vmovddup          6 * SIZE(BO), %xmm1\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        vmovddup          7 * SIZE(BO), %xmm2\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
 .endm\r
 \r
 .macro KERNEL8x3_M8 \r
        vmovups           8 * SIZE(AO), %xmm0\r
        prefetcht0      A_PR1+192(AO)\r
        vmovddup          8 * SIZE(BO), %xmm3\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups          10 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups          12 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups          14 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        vmovddup          9 * SIZE(BO), %xmm1\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        vmovddup         10 * SIZE(BO), %xmm2\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
        vmovddup         11 * SIZE(BO), %xmm3\r
        addq            $32 * SIZE, AO\r
        addq            $24 * SIZE, BO\r
@@ -409,47 +401,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        vmovups           8 * SIZE(AO), %xmm0\r
        prefetcht0      A_PR1+192(AO)\r
        vmovddup          8 * SIZE(BO), %xmm3\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups          10 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups          12 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups          14 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        addq            $32*SIZE, AO\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        addq            $21*SIZE, BO\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
 .endm\r
 \r
 .macro KERNEL8x3_SUBN \r
        vmovddup        -12 * SIZE(BO), %xmm1\r
        vmovups         -16 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm4\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )\r
        vmovddup        -11 * SIZE(BO), %xmm2\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm5\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )\r
        vmovddup        -10 * SIZE(BO), %xmm3\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm6\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )\r
        vmovups         -14 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm7\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm8\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm9\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )\r
        vmovups         -12 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm10\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm11\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm12\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )\r
        vmovups         -10 * SIZE(AO), %xmm0\r
-       VFMADD231PD_    %xmm1,%xmm0,%xmm13\r
+       VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )\r
        addq            $3*SIZE, BO\r
-       VFMADD231PD_    %xmm2,%xmm0,%xmm14\r
+       VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )\r
        addq            $8*SIZE, AO\r
-       VFMADD231PD_    %xmm3,%xmm0,%xmm15\r
+       VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )\r
 .endm\r
 \r
 .macro SAVE8x3\r