Use .p2align instead of .align for compatibility on Sandybridge as well
author Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Sat, 24 Feb 2018 18:43:15 +0000 (19:43 +0100)
committer GitHub <noreply@github.com>
Sat, 24 Feb 2018 18:43:15 +0000 (19:43 +0100)
18 files changed:
kernel/x86_64/caxpy_microk_sandy-2.c
kernel/x86_64/cdot_microk_sandy-2.c
kernel/x86_64/daxpy_microk_sandy-2.c
kernel/x86_64/ddot_microk_sandy-2.c
kernel/x86_64/dger_microk_sandy-2.c
kernel/x86_64/dscal_microk_sandy-2.c
kernel/x86_64/dsymv_L_microk_sandy-2.c
kernel/x86_64/dsymv_U_microk_sandy-2.c
kernel/x86_64/saxpy_microk_sandy-2.c
kernel/x86_64/sdot_microk_sandy-2.c
kernel/x86_64/sgemv_n_microk_sandy-4.c
kernel/x86_64/sgemv_t_microk_sandy-4.c
kernel/x86_64/sger_microk_sandy-2.c
kernel/x86_64/ssymv_L_microk_sandy-2.c
kernel/x86_64/ssymv_U_microk_sandy-2.c
kernel/x86_64/zaxpy_microk_sandy-2.c
kernel/x86_64/zdot_microk_sandy-2.c
kernel/x86_64/zgemv_n_microk_sandy-4.c

index dbfce208f5f242f31ff9f9e8279529cdb7ee3698..a798fd977914c11f7f84d7a24a04ecf63a1f811d 100644 (file)
@@ -50,11 +50,11 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vmulps         (%5), %%ymm0 , %%ymm0               \n\t"
 #endif
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmovups        (%2,%0,4), %%ymm5                   \n\t" // 4 complex values from x
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups      32(%2,%0,4), %%ymm7                   \n\t" // 4 complex values from x
        "vmovups      64(%2,%0,4), %%ymm9                   \n\t" // 4 complex values from x
        "vmovups      96(%2,%0,4), %%ymm11                  \n\t" // 4 complex values from x
@@ -85,7 +85,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vaddps         %%ymm10, %%ymm11, %%ymm11           \n\t"
 
        "vmovups        %%ymm5 ,   (%3,%0,4)                \n\t"
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups        %%ymm7 , 32(%3,%0,4)                \n\t"
        "vmovups        %%ymm9 , 64(%3,%0,4)                \n\t"
        "vmovups        %%ymm11, 96(%3,%0,4)                \n\t"
index 22cd79e2ed284d117d0ecc409ab04fa899600bca..01816917d253b58490c04193e7edd420c7e0cc60 100644 (file)
@@ -46,7 +46,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
        "vxorps         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorps         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                          \n\t"
         "vmovups                  (%2,%0,4), %%ymm8          \n\t"  // 2 * x
         "vmovups                32(%2,%0,4), %%ymm9          \n\t"  // 2 * x
index 522e084dcd8e7ffb10a6948a30b46cc2592d22cd..85e038cef19b29839390180bf2c56eaa6b627a09 100644 (file)
@@ -50,7 +50,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "subq           $16, %1                              \n\t"              
        "jz             2f                           \n\t"
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmulpd         %%ymm4, %%ymm0, %%ymm4          \n\t"
index e2e6701c75cd7b96a9d78b6f0d62f9445f190448..160f956048a13500dde57e6bb6af47cae682ab6b 100644 (file)
@@ -41,7 +41,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
        "vxorpd         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorpd         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                           \n\t"
+       ".p2align 4                                          \n\t"
        "1:                                          \n\t"
         "vmovups                  (%2,%0,8), %%ymm12         \n\t"  // 2 * x
         "vmovups                32(%2,%0,8), %%ymm13         \n\t"  // 2 * x
index 564f1356d645aa10a9218f3fa5d51844077679c2..2bf966a5f467b757ae5ef1f80cf7ca69d3878b96 100644 (file)
@@ -53,7 +53,7 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "subq           $8, %1                               \n\t"              
        "jz             2f                           \n\t"
 
-       ".align 8                                           \n\t"
+       ".p2align 3                                         \n\t"
        "1:                                         \n\t"
 
        "vmulpd         %%xmm4, %%xmm0, %%xmm4          \n\t"
index f5bf5932f333d83b4dcab1ec1cdcf73469e71656..8d855072b4ec3136d1c87b55579a1ceab656828d 100644 (file)
@@ -58,7 +58,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
        "subq           $1 , %0                             \n\t"               
        "jz             2f                                  \n\t"
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                                 \n\t"
        "prefetcht0     640(%1)                             \n\t" 
 
@@ -156,7 +156,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
        "cmpq   $0, %0                                      \n\t"
        "je     2f                                          \n\t" 
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                                 \n\t"
 
        "vmovups        %%xmm0  ,-128(%1)                   \n\t"
index c87084915bbaa8a9ca211886fee5fc84ff799432..b4e6ab3692774214013908db53618721f619444c 100644 (file)
@@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
        "vbroadcastsd 16(%8),    %%ymm6              \n\t"      // temp1[1]
        "vbroadcastsd 24(%8),    %%ymm7              \n\t"      // temp1[1]
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,8), %%ymm9                  \n\t"  // 2 * y
index 212d4cf7b3e76161e831f4593ce4a0f24a529cbe..1ef6fbafdc824bce2dc2dfbbe2c7ba270dd1ee6d 100644 (file)
@@ -46,7 +46,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
        "vbroadcastsd 24(%8),    %%ymm7              \n\t"      // temp1[1]
        "xorq           %0,%0                        \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,8), %%ymm9                  \n\t"  // 2 * y
index 159a231755e2f6cd7032fbbc8df7d91d4123ddf4..0a6bef046660ddbb00c6802472c5abc0ec0db6fb 100644 (file)
@@ -50,7 +50,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "subq           $32, %1                              \n\t"              
        "jz             2f                           \n\t"
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmulps         %%ymm4, %%ymm0, %%ymm4          \n\t"
index e265d16bd22d420931f14575e08c9060cdc9f669..ca13536f26bd51960acba031d969f429c36c73a8 100644 (file)
@@ -41,7 +41,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
        "vxorps         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorps         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                           \n\t"
+       ".p2align 4                                          \n\t"
        "1:                                          \n\t"
         "vmovups                  (%2,%0,4), %%ymm12         \n\t"  // 2 * x
         "vmovups                32(%2,%0,4), %%ymm13         \n\t"  // 2 * x
index f617ccd5a0bef3cdc5651dc8828c4580f2022e9f..b35daa35b04e157f34f4966e33f540fad6e081d0 100644 (file)
@@ -129,7 +129,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
         "je             4f                 \n\t"
 
 
-       ".align 16                               \n\t"
+       ".p2align 4                              \n\t"
        "1:                              \n\t"
        "vxorps   %%ymm4 , %%ymm4 , %%ymm4        \n\t"
        "vxorps   %%ymm5 , %%ymm5 , %%ymm5        \n\t"
@@ -299,7 +299,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
         "je             4f                 \n\t"
 
 
-       ".align 16                               \n\t"
+       ".p2align 4                              \n\t"
        "1:                              \n\t"
        "vxorps   %%ymm4 , %%ymm4 , %%ymm4        \n\t"
        "vxorps   %%ymm5 , %%ymm5 , %%ymm5        \n\t"
index ca49fe7e6f2bfee601165045213c1a836125f89c..34ffec944a1e9880eb7599307f725482963c7331 100644 (file)
@@ -85,7 +85,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
         "je             4f                      \n\t"
 
 
-       ".align 16                               \n\t"
+       ".p2align 4                              \n\t"
        "1:                              \n\t"
        "prefetcht0      384(%2,%0,4)                  \n\t"
         "vmovups        (%2,%0,4), %%ymm12       \n\t"  // 8 * x
index 51c3bef3e9a4debf96287e663c07442340a7f621..79180b991eb0b58c4c1636a312020651d0aba9a8 100644 (file)
@@ -53,7 +53,7 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "subq           $16, %1                              \n\t"              
        "jz             2f                           \n\t"
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmulps         %%xmm4, %%xmm0, %%xmm4          \n\t"
index 07293a964ddb15ad61d0f9e1eadddb6f91ff8a01..093ca8073c430e294b62ef8ac9933fa9f1fc337b 100644 (file)
@@ -45,7 +45,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
        "vbroadcastss  8(%8),    %%xmm6              \n\t"      // temp1[1]
        "vbroadcastss 12(%8),    %%xmm7              \n\t"      // temp1[1]
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,4), %%xmm9                  \n\t"  // 2 * y
@@ -143,7 +143,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
        "vbroadcastss  8(%8),    %%ymm6              \n\t"      // temp1[1]
        "vbroadcastss 12(%8),    %%ymm7              \n\t"      // temp1[1]
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,4), %%ymm9                  \n\t"  // 2 * y
index 4b699af50411badb4ff175d60526de3f43075014..e8650650cdaaf7e9b846de5fa3ab90295dcb1b8b 100644 (file)
@@ -46,7 +46,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
        "vbroadcastss 12(%8),    %%ymm7              \n\t"      // temp1[1]
        "xorq           %0,%0                        \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                  \n\t"
 
        "vmovups        (%3,%0,4), %%ymm9                  \n\t"  // 2 * y
index 8b0a7ed05a0e177d12e312ab97b2f33fcb3b8115..233af143adc56845e1910b0c02ca764e887f672e 100644 (file)
@@ -54,11 +54,11 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vmulpd         (%5), %%ymm0 , %%ymm0               \n\t"
 #endif
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "vmovups        (%2,%0,8), %%ymm5                   \n\t" // 4 complex values from x
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups      32(%2,%0,8), %%ymm7                   \n\t" // 4 complex values from x
        "vmovups      64(%2,%0,8), %%ymm9                   \n\t" // 4 complex values from x
        "vmovups      96(%2,%0,8), %%ymm11                  \n\t" // 4 complex values from x
@@ -89,7 +89,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vaddpd         %%ymm10, %%ymm11, %%ymm11           \n\t"
 
        "vmovups        %%ymm5 ,   (%3,%0,8)                \n\t"
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups        %%ymm7 , 32(%3,%0,8)                \n\t"
        "vmovups        %%ymm9 , 64(%3,%0,8)                \n\t"
        "vmovups        %%ymm11, 96(%3,%0,8)                \n\t"
@@ -127,13 +127,13 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vmulpd         (%5), %%ymm0 , %%ymm0               \n\t"
 #endif
 
-       ".align 16                                          \n\t"
+       ".p2align 4                                         \n\t"
        "1:                                         \n\t"
 
        "prefetcht0     512(%2,%0,8)                        \n\t"
        "prefetcht0     576(%2,%0,8)                        \n\t"
        "vmovups        (%2,%0,8), %%ymm5                   \n\t" // 4 complex values from x
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups      32(%2,%0,8), %%ymm7                   \n\t" // 4 complex values from x
        "vmovups      64(%2,%0,8), %%ymm9                   \n\t" // 4 complex values from x
        "vmovups      96(%2,%0,8), %%ymm11                  \n\t" // 4 complex values from x
@@ -166,7 +166,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
        "vaddpd         %%ymm10, %%ymm11, %%ymm11           \n\t"
 
        "vmovups        %%ymm5 ,   (%3,%0,8)                \n\t"
-       ".align 2                                           \n\t"
+       ".p2align 1                                         \n\t"
        "vmovups        %%ymm7 , 32(%3,%0,8)                \n\t"
        "vmovups        %%ymm9 , 64(%3,%0,8)                \n\t"
        "vmovups        %%ymm11, 96(%3,%0,8)                \n\t"
index fd06612e6b104c6b53c255f24ed86d009a5b065f..87c5b034027671b63c3d6e37fd73e0009b17f49e 100644 (file)
@@ -49,7 +49,7 @@ if ( n < 1280 )
        "vxorpd         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorpd         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                          \n\t"
         "vmovups                  (%2,%0,8), %%ymm8          \n\t"  // 2 * x
         "vmovups                32(%2,%0,8), %%ymm9          \n\t"  // 2 * x
@@ -137,7 +137,7 @@ if ( n < 1280 )
        "vxorpd         %%ymm6, %%ymm6, %%ymm6               \n\t"
        "vxorpd         %%ymm7, %%ymm7, %%ymm7               \n\t"
 
-       ".align 16                                   \n\t"
+       ".p2align 4                                  \n\t"
        "1:                                          \n\t"
        "prefetcht0     512(%2,%0,8)                 \n\t"
         "vmovups                  (%2,%0,8), %%ymm8          \n\t"  // 2 * x
index 245f45d052bccc5c41af958b5c6634d19adfe683..696ed0eeee06d46473484d5c3fd3e3129c6f4371 100644 (file)
@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
        "vbroadcastsd   56(%2), %%ymm7                  \n\t"  // imag part x3
 
 
-       ".align 16                                      \n\t"
+       ".p2align 4                                     \n\t"
        "1:                                     \n\t"
 
         //"prefetcht0      256(%4,%0,8)                   \n\t"
@@ -164,7 +164,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
        "vbroadcastsd   16(%2), %%ymm2                  \n\t"  // real part x1
        "vbroadcastsd   24(%2), %%ymm3                  \n\t"  // imag part x1
 
-       ".align 16                                      \n\t"
+       ".p2align 4                                     \n\t"
        "1:                                     \n\t"
 
         // "prefetcht0      256(%4,%0,8)                   \n\t"
@@ -253,7 +253,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
        "vbroadcastsd     (%2), %%ymm0                  \n\t"  // real part x0
        "vbroadcastsd    8(%2), %%ymm1                  \n\t"  // imag part x0
 
-       ".align 16                                      \n\t"
+       ".p2align 4                                     \n\t"
        "1:                                     \n\t"
 
         // "prefetcht0      256(%4,%0,8)                   \n\t"
@@ -355,7 +355,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
        "vbroadcastsd     (%4), %%ymm0                  \n\t"  // alpha_r
        "vbroadcastsd     (%5), %%ymm1                  \n\t"  // alpha_i
 
-       ".align 16                                      \n\t"
+       ".p2align 4                                     \n\t"
        "1:                                     \n\t"
        // "prefetcht0      192(%2,%0,8)                        \n\t"
        "vmovups        (%2,%0,8), %%ymm8               \n\t" // 2 complex values from src