POWER10: Improving dasum performance
author     Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
           Wed, 11 Aug 2021 03:06:04 +0000 (22:06 -0500)
committer  Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
           Wed, 11 Aug 2021 03:06:04 +0000 (22:06 -0500)
Unroll the main loop in the dasum microkernel from 16 to 32 elements
per iteration to improve POWER10 performance.

kernel/power/dasum.c
kernel/power/dasum_microk_power10.c
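The change doubles the kernel's unroll factor from 16 to 32 doubles per
iteration, spreading the absolute-value sums over eight additional VSX
accumulators so the dependent floating-point add chains can execute in
parallel. A minimal scalar sketch of the idea (the name dasum_sketch is
hypothetical; the real kernel uses lxvp/xvabsdp/xvadddp on pairs of
vector registers, as in the diff below):

#include <math.h>

/* Sketch only: assumes n is a multiple of 32.  The eight independent
 * partial sums s[0..7] model the eight groups of vector accumulators
 * in the assembly; keeping them separate breaks the serial dependency
 * that a single running sum would create. */
static double dasum_sketch(long n, const double *x)
{
    double s[8] = {0};
    for (long i = 0; i < n; i += 32)          /* 32 doubles per iteration */
        for (int k = 0; k < 8; k++)           /* 8 accumulators ...       */
            for (int j = 0; j < 4; j++)       /* ... x 4 doubles each     */
                s[k] += fabs(x[i + 4*k + j]);
    return ((s[0] + s[1]) + (s[2] + s[3]))
         + ((s[4] + s[5]) + (s[6] + s[7]));
}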

diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c
index 7507621..35390dd 100644
@@ -115,14 +115,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
        {
 
 #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-               if ( n >= 16 )
+               if ( n >= 32)
                {
                        BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
                        for (i = 0; i < align; i++) {
                                sumf += ABS(x[i]);
                        }
                }
-               n1 = (n-i) & -16;
+               n1 = (n-i) & -32;
                if ( n1 > 0 )
                {
                        sumf += dasum_kernel_16(n1, &x[i]);
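The prologue in the hunk above peels off up to three leading elements so
that the pointer handed to the vector kernel is 32-byte aligned for the
paired lxvp loads. A worked C sketch of that arithmetic (the helper name
peel_count is illustrative only):

#include <stdint.h>

/* Number of leading doubles to process scalarly so that &x[count] is
 * 32-byte aligned, assuming x itself is naturally 8-byte aligned.
 * A double is 8 bytes, so at most 3 elements are peeled; the final
 * "& 0x3" folds the already-aligned case, where (32 - 0) >> 3 would
 * give 4, back to 0. */
static long peel_count(const double *x)
{
    uintptr_t off = (uintptr_t)x & (uintptr_t)0x1F; /* byte offset within a 32-byte block */
    return (long)(((32 - off) >> 3) & 0x3);
}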
diff --git a/kernel/power/dasum_microk_power10.c b/kernel/power/dasum_microk_power10.c
index d1a21b4..110627f 100644
@@ -34,6 +34,19 @@ static double dasum_kernel_16 (long n, double *x)
   __vector double t1;
   __vector double t2;
   __vector double t3;
+  __vector double t4;
+  __vector double t5;
+  __vector double t6;
+  __vector double t7;
+  __vector double a0;
+  __vector double a1;
+  __vector double a2;
+  __vector double a3;
+  __vector double a4;
+  __vector double a5;
+  __vector double a6;
+  __vector double a7;
+
 
   __asm__
     (
@@ -48,14 +61,27 @@ static double dasum_kernel_16 (long n, double *x)
        "xxlxor         38, 38, 38      \n\t"
        "xxlxor         39, 39, 39      \n\t"
 
+       "xxlxor         %x11, %x11, %x11        \n\t"
+       "xxlxor         %x12, %x12, %x12        \n\t"
+       "xxlxor         %x13, %x13, %x13        \n\t"
+       "xxlxor         %x14, %x14, %x14        \n\t"
+       "xxlxor         %x15, %x15, %x15        \n\t"
+       "xxlxor         %x16, %x16, %x16        \n\t"
+       "xxlxor         %x17, %x17, %x17        \n\t"
+       "xxlxor         %x18, %x18, %x18        \n\t"
+
        "lxvp            40, 0(%2)       \n\t"
        "lxvp            42, 32(%2)      \n\t"
        "lxvp            44, 64(%2)      \n\t"
        "lxvp            46, 96(%2)      \n\t"
+       "lxvp            52, 128(%2)    \n\t"
+       "lxvp            54, 160(%2)    \n\t"
+       "lxvp            56, 192(%2)    \n\t"
+       "lxvp            58, 224(%2)    \n\t"
 
-       "addi           %2, %2, 128     \n\t"
+       "addi           %2, %2, 256     \n\t"
 
-       "addic.         %1, %1, -16     \n\t"
+       "addic.         %1, %1, -32     \n\t"
        "ble            two%=           \n\t"
 
        ".align 5               \n"
@@ -65,33 +91,52 @@ static double dasum_kernel_16 (long n, double *x)
        "xvabsdp                49, 41          \n\t"
        "xvabsdp                50, 42          \n\t"
        "xvabsdp                51, 43          \n\t"
-       "lxvp            40, 0(%2)       \n\t"
-
 
        "xvabsdp                %x3, 44         \n\t"
        "xvabsdp                %x4, 45         \n\t"
-       "lxvp            42, 32(%2)      \n\t"
-
-
        "xvabsdp                %x5, 46         \n\t"
        "xvabsdp                %x6, 47         \n\t"
-       "lxvp            44, 64(%2)      \n\t"
-
 
        "xvadddp                32, 32, 48      \n\t"
        "xvadddp                33, 33, 49      \n\t"
-
-       "lxvp            46, 96(%2)      \n\t"
-
        "xvadddp                34, 34, 50      \n\t"
        "xvadddp                35, 35, 51      \n\t"
-       "addi           %2, %2, 128     \n\t"
+       "lxvp            40, 0(%2)       \n\t"
+       "lxvp            42, 32(%2)      \n\t"
+       "lxvp            44, 64(%2)      \n\t"
+       "lxvp            46, 96(%2)      \n\t"
+
        "xvadddp                36, 36, %x3     \n\t"
        "xvadddp                37, 37, %x4     \n\t"
-       "addic.         %1, %1, -16     \n\t"
        "xvadddp                38, 38, %x5     \n\t"
        "xvadddp                39, 39, %x6     \n\t"
 
+       "xvabsdp                60, 52          \n\t"
+       "xvabsdp                61, 53          \n\t"
+       "xvabsdp                62, 54          \n\t"
+       "xvabsdp                63, 55          \n\t"
+
+       "xvabsdp                %x7, 56         \n\t"
+       "xvabsdp                %x8, 57         \n\t"
+       "xvabsdp                %x9, 58         \n\t"
+       "xvabsdp                %x10, 59        \n\t"
+
+       "xvadddp                %x11, %x11, 60  \n\t"
+       "xvadddp                %x12, %x12, 61  \n\t"
+       "xvadddp                %x13, %x13, 62  \n\t"
+       "xvadddp                %x14, %x14, 63  \n\t"
+
+       "lxvp           52, 128(%2)     \n\t"
+       "lxvp           54, 160(%2)     \n\t"
+       "lxvp           56, 192(%2)     \n\t"
+       "lxvp           58, 224(%2)     \n\t"
+       "xvadddp                %x15, %x15, %x7 \n\t"
+       "xvadddp                %x16, %x16, %x8 \n\t"
+       "xvadddp                %x17, %x17, %x9 \n\t"
+       "xvadddp                %x18, %x18, %x10        \n\t"
+       "addi           %2, %2, 256     \n\t"
+       "addic.         %1, %1, -32     \n\t"
+
        "bgt            one%=           \n"
 
      "two%=:                           \n\t"
@@ -114,6 +159,25 @@ static double dasum_kernel_16 (long n, double *x)
        "xvadddp                38, 38, %x5     \n\t"
        "xvadddp                39, 39, %x6     \n\t"
 
+       "xvabsdp                60, 52          \n\t"
+       "xvabsdp                61, 53          \n\t"
+       "xvabsdp                62, 54          \n\t"
+       "xvabsdp                63, 55          \n\t"
+
+       "xvabsdp                %x7, 56         \n\t"
+       "xvabsdp                %x8, 57         \n\t"
+       "xvabsdp                %x9, 58         \n\t"
+       "xvabsdp                %x10, 59        \n\t"
+       "xvadddp                %x11, %x11, 60  \n\t"
+       "xvadddp                %x12, %x12, 61  \n\t"
+       "xvadddp                %x13, %x13, 62  \n\t"
+       "xvadddp                %x14, %x14, 63  \n\t"
+
+       "xvadddp                %x15, %x15, %x7 \n\t"
+       "xvadddp                %x16, %x16, %x8 \n\t"
+       "xvadddp                %x17, %x17, %x9 \n\t"
+       "xvadddp                %x18, %x18, %x10        \n\t"
+
        "xvadddp                32, 32, 33      \n\t"
        "xvadddp                34, 34, 35      \n\t"
        "xvadddp                36, 36, 37      \n\t"
@@ -122,7 +186,18 @@ static double dasum_kernel_16 (long n, double *x)
        "xvadddp                32, 32, 34      \n\t"
        "xvadddp                36, 36, 38      \n\t"
 
+       "xvadddp                %x11, %x11, %x12        \n\t"
+       "xvadddp                %x13, %x13, %x14        \n\t"
+       "xvadddp                %x15, %x15, %x16        \n\t"
+       "xvadddp                %x17, %x17, %x18        \n\t"
+
+       "xvadddp                %x11, %x11, %x13        \n\t"
+       "xvadddp                %x15, %x15, %x17        \n\t"
+
+       "xvadddp                %x11, %x11, %x15        \n\t"
+
        "xvadddp                32, 32, 36      \n\t"
+       "xvadddp                32, 32, %x11    \n\t"
 
        XXSWAPD_S(33,32)
        "xsadddp                %x0, 32, 33     \n"
@@ -136,14 +211,27 @@ static double dasum_kernel_16 (long n, double *x)
        "=wa" (t0),     // 3
        "=wa" (t1),     // 4
        "=wa" (t2),     // 5
-       "=wa" (t3)      // 6
+       "=wa" (t3),     // 6
+       "=wa" (t4),     // 7
+       "=wa" (t5),     // 8
+       "=wa" (t6),     // 9
+       "=wa" (t7),     // 10
+       "=wa" (a0),     // 11
+       "=wa" (a1),     // 12
+       "=wa" (a2),     // 13
+       "=wa" (a3),     // 14
+       "=wa" (a4),     // 15
+       "=wa" (a5),     // 16
+       "=wa" (a6),     // 17
+       "=wa" (a7)      // 18
      :
        "m" (*x)
      :
        "cr0",
        "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
        "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
-       "vs48","vs49","vs50","vs51"
+       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
+       "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
      );
 
   return sum;
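At the end of the kernel the sixteen accumulators are folded pairwise
and the two 64-bit lanes of the final vector are added by XXSWAPD plus
xsadddp. A rough C analogue of that closing reduction (the names v2d,
vadd, and reduce16 are illustrative only):

/* Each v2d models one VSX register holding two doubles. */
typedef struct { double lo, hi; } v2d;

static v2d vadd(v2d a, v2d b)              /* xvadddp */
{
    return (v2d){ a.lo + b.lo, a.hi + b.hi };
}

static double reduce16(v2d v[16])
{
    /* pairwise binary tree, equivalent to the xvadddp sequence above */
    for (int stride = 1; stride < 16; stride <<= 1)
        for (int i = 0; i + stride < 16; i += 2 * stride)
            v[i] = vadd(v[i], v[i + stride]);
    return v[0].lo + v[0].hi;              /* XXSWAPD + xsadddp */
}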