Use .p2align instead of .align for compatibility on Sandybridge as well
author Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Sat, 24 Feb 2018 18:43:15 +0000 (19:43 +0100)
committer GitHub <noreply@github.com>
Sat, 24 Feb 2018 18:43:15 +0000 (19:43 +0100)
18 files changed:
kernel/x86_64/caxpy_microk_sandy-2.c
kernel/x86_64/cdot_microk_sandy-2.c
kernel/x86_64/daxpy_microk_sandy-2.c
kernel/x86_64/ddot_microk_sandy-2.c
kernel/x86_64/dger_microk_sandy-2.c
kernel/x86_64/dscal_microk_sandy-2.c
kernel/x86_64/dsymv_L_microk_sandy-2.c
kernel/x86_64/dsymv_U_microk_sandy-2.c
kernel/x86_64/saxpy_microk_sandy-2.c
kernel/x86_64/sdot_microk_sandy-2.c
kernel/x86_64/sgemv_n_microk_sandy-4.c
kernel/x86_64/sgemv_t_microk_sandy-4.c
kernel/x86_64/sger_microk_sandy-2.c
kernel/x86_64/ssymv_L_microk_sandy-2.c
kernel/x86_64/ssymv_U_microk_sandy-2.c
kernel/x86_64/zaxpy_microk_sandy-2.c
kernel/x86_64/zdot_microk_sandy-2.c
kernel/x86_64/zgemv_n_microk_sandy-4.c

index dbfce20..a798fd9 100644 (file)
@@ -50,11 +50,11 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vmulps         (%5), %%ymm0 , %%ymm0               \n\t"
 #endif
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmovups        (%2,%0,4), %%ymm5                   \n\t" // 4 complex values from x
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups      32(%2,%0,4), %%ymm7                   \n\t" // 4 complex values from x
        "vmovups      64(%2,%0,4), %%ymm9                   \n\t" // 4 complex values from x
        "vmovups      96(%2,%0,4), %%ymm11                  \n\t" // 4 complex values from x
@@ -85,7 +85,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vaddps         %%ymm10, %%ymm11, %%ymm11           \n\t"
 
        "vmovups        %%ymm5 ,   (%3,%0,4)                \n\t"
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups        %%ymm7 , 32(%3,%0,4)                \n\t"
        "vmovups        %%ymm9 , 64(%3,%0,4)                \n\t"
        "vmovups        %%ymm11, 96(%3,%0,4)                \n\t"
index 22cd79e..0181691 100644 (file)
@@ -46,7 +46,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
        "vxorps         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorps         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                          \n\t"
         "vmovups                  (%2,%0,4), %%ymm8          \n\t"  // 2 * x
         "vmovups                32(%2,%0,4), %%ymm9          \n\t"  // 2 * x
index 522e084..85e038c 100644 (file)
@@ -50,7 +50,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "subq           $16, %1                              \n\t"              
        "jz             2f                           \n\t"
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmulpd         %%ymm4, %%ymm0, %%ymm4          \n\t"
index e2e6701..160f956 100644 (file)
@@ -41,7 +41,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
        "vxorpd         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorpd         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                           \n\t"
+       ".p2align 4                                          \n\t"
        "1:                                          \n\t"
         "vmovups                  (%2,%0,8), %%ymm12         \n\t"  // 2 * x
         "vmovups                32(%2,%0,8), %%ymm13         \n\t"  // 2 * x
index 564f135..2bf966a 100644 (file)
@@ -53,7 +53,7 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "subq           $8, %1                               \n\t"              
        "jz             2f                           \n\t"
 
-       ".align 8                                           \n\t"
+       ".p2align 3                                         \n\t"
        "1:                                         \n\t"
 
        "vmulpd         %%xmm4, %%xmm0, %%xmm4          \n\t"
index f5bf593..8d85507 100644 (file)
@@ -58,7 +58,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
        "subq           $1 , %0                             \n\t"               
        "jz             2f                                  \n\t"
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                                 \n\t"
        "prefetcht0     640(%1)                             \n\t" 
 
@@ -156,7 +156,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
        "cmpq   $0, %0                                      \n\t"
        "je     2f                                          \n\t" 
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                                 \n\t"
 
        "vmovups        %%xmm0  ,-128(%1)                   \n\t"
index c870849..b4e6ab3 100644 (file)
@@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
        "vbroadcastsd 16(%8),    %%ymm6              \n\t"      // temp1[1]
        "vbroadcastsd 24(%8),    %%ymm7              \n\t"      // temp1[1]
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,8), %%ymm9                  \n\t"  // 2 * y
index 212d4cf..1ef6fba 100644 (file)
@@ -46,7 +46,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
        "vbroadcastsd 24(%8),    %%ymm7              \n\t"      // temp1[1]
        "xorq           %0,%0                        \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,8), %%ymm9                  \n\t"  // 2 * y
index 159a231..0a6bef0 100644 (file)
@@ -50,7 +50,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "subq           $32, %1                              \n\t"              
        "jz             2f                           \n\t"
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmulps         %%ymm4, %%ymm0, %%ymm4          \n\t"
index e265d16..ca13536 100644 (file)
@@ -41,7 +41,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
        "vxorps         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorps         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                           \n\t"
+       ".p2align 4                                          \n\t"
        "1:                                          \n\t"
         "vmovups                  (%2,%0,4), %%ymm12         \n\t"  // 2 * x
         "vmovups                32(%2,%0,4), %%ymm13         \n\t"  // 2 * x
index f617ccd..b35daa3 100644 (file)
@@ -129,7 +129,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
         "je             4f                 \n\t"
 
 
-       ".align 16                               \n\t"
+       ".p2align 4                              \n\t"
        "1:                              \n\t"
        "vxorps   %%ymm4 , %%ymm4 , %%ymm4        \n\t"
        "vxorps   %%ymm5 , %%ymm5 , %%ymm5        \n\t"
@@ -299,7 +299,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
         "je             4f                 \n\t"
 
 
-       ".align 16                               \n\t"
+       ".p2align 4                              \n\t"
        "1:                              \n\t"
        "vxorps   %%ymm4 , %%ymm4 , %%ymm4        \n\t"
        "vxorps   %%ymm5 , %%ymm5 , %%ymm5        \n\t"
index ca49fe7..34ffec9 100644 (file)
@@ -85,7 +85,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
         "je             4f                      \n\t"
 
 
-       ".align 16                               \n\t"
+       ".p2align 4                              \n\t"
        "1:                              \n\t"
        "prefetcht0      384(%2,%0,4)                  \n\t"
         "vmovups        (%2,%0,4), %%ymm12       \n\t"  // 8 * x
index 51c3bef..79180b9 100644 (file)
@@ -53,7 +53,7 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "subq           $16, %1                              \n\t"              
        "jz             2f                           \n\t"
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmulps         %%xmm4, %%xmm0, %%xmm4          \n\t"
index 07293a9..093ca80 100644 (file)
@@ -45,7 +45,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
        "vbroadcastss  8(%8),    %%xmm6              \n\t"      // temp1[1]
        "vbroadcastss 12(%8),    %%xmm7              \n\t"      // temp1[1]
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,4), %%xmm9                  \n\t"  // 2 * y
@@ -143,7 +143,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
        "vbroadcastss  8(%8),    %%ymm6              \n\t"      // temp1[1]
        "vbroadcastss 12(%8),    %%ymm7              \n\t"      // temp1[1]
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,4), %%ymm9                  \n\t"  // 2 * y
index 4b699af..e865065 100644 (file)
@@ -46,7 +46,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
        "vbroadcastss 12(%8),    %%ymm7              \n\t"      // temp1[1]
        "xorq           %0,%0                        \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,4), %%ymm9                  \n\t"  // 2 * y
index 8b0a7ed..233af14 100644 (file)
@@ -54,11 +54,11 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vmulpd         (%5), %%ymm0 , %%ymm0               \n\t"
 #endif
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmovups        (%2,%0,8), %%ymm5                   \n\t" // 4 complex values from x
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups      32(%2,%0,8), %%ymm7                   \n\t" // 4 complex values from x
        "vmovups      64(%2,%0,8), %%ymm9                   \n\t" // 4 complex values from x
        "vmovups      96(%2,%0,8), %%ymm11                  \n\t" // 4 complex values from x
@@ -89,7 +89,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vaddpd         %%ymm10, %%ymm11, %%ymm11           \n\t"
 
        "vmovups        %%ymm5 ,   (%3,%0,8)                \n\t"
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups        %%ymm7 , 32(%3,%0,8)                \n\t"
        "vmovups        %%ymm9 , 64(%3,%0,8)                \n\t"
        "vmovups        %%ymm11, 96(%3,%0,8)                \n\t"
@@ -127,13 +127,13 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vmulpd         (%5), %%ymm0 , %%ymm0               \n\t"
 #endif
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "prefetcht0     512(%2,%0,8)                        \n\t"
        "prefetcht0     576(%2,%0,8)                        \n\t"
        "vmovups        (%2,%0,8), %%ymm5                   \n\t" // 4 complex values from x
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups      32(%2,%0,8), %%ymm7                   \n\t" // 4 complex values from x
        "vmovups      64(%2,%0,8), %%ymm9                   \n\t" // 4 complex values from x
        "vmovups      96(%2,%0,8), %%ymm11                  \n\t" // 4 complex values from x
@@ -166,7 +166,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vaddpd         %%ymm10, %%ymm11, %%ymm11           \n\t"
 
        "vmovups        %%ymm5 ,   (%3,%0,8)                \n\t"
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups        %%ymm7 , 32(%3,%0,8)                \n\t"
        "vmovups        %%ymm9 , 64(%3,%0,8)                \n\t"
        "vmovups        %%ymm11, 96(%3,%0,8)                \n\t"
index fd06612..87c5b03 100644 (file)
@@ -49,7 +49,7 @@ if ( n < 1280 )
        "vxorpd         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorpd         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                          \n\t"
         "vmovups                  (%2,%0,8), %%ymm8          \n\t"  // 2 * x
         "vmovups                32(%2,%0,8), %%ymm9          \n\t"  // 2 * x
@@ -137,7 +137,7 @@ if ( n < 1280 )
        "vxorpd         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorpd         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                          \n\t"
        "prefetcht0     512(%2,%0,8)                 \n\t"
         "vmovups                  (%2,%0,8), %%ymm8          \n\t"  // 2 * x
index 245f45d..696ed0e 100644 (file)
@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
        "vbroadcastsd   56(%2), %%ymm7                  \n\t"  // imag part x3
 
 
-       ".align 16                                      \n\t"
+       ".p2align 4                                     \n\t"
        "1:                                     \n\t"
 
         //"prefetcht0      256(%4,%0,8)                   \n\t"
@@ -164,7 +164,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
        "vbroadcastsd   16(%2), %%ymm2                  \n\t"  // real part x1
        "vbroadcastsd   24(%2), %%ymm3                  \n\t"  // imag part x1
 
-       ".align 16                                      \n\t"
+       ".p2align 4                                     \n\t"
        "1:                                     \n\t"
 
         // "prefetcht0      256(%4,%0,8)                   \n\t"
@@ -253,7 +253,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
        "vbroadcastsd     (%2), %%ymm0                  \n\t"  // real part x0
        "vbroadcastsd    8(%2), %%ymm1                  \n\t"  // imag part x0
 
-       ".align 16                                      \n\t"
+       ".p2align 4                                     \n\t"
        "1:                                     \n\t"
 
         // "prefetcht0      256(%4,%0,8)                   \n\t"
@@ -355,7 +355,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
        "vbroadcastsd     (%4), %%ymm0                  \n\t"  // alpha_r
        "vbroadcastsd     (%5), %%ymm1                  \n\t"  // alpha_i
 
-       ".align 16                                      \n\t"
+       ".p2align 4                                     \n\t"
        "1:                                     \n\t"
        // "prefetcht0      192(%2,%0,8)                        \n\t"
        "vmovups        (%2,%0,8), %%ymm8               \n\t" // 2 complex values from src