POWER10: Improve copy performance
author: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Sun, 13 Dec 2020 16:41:45 +0000 (10:41 -0600)
committer: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Sun, 13 Dec 2020 16:41:45 +0000 (10:41 -0600)
This patch aligns the stores to a 32-byte boundary for scopy and dcopy
before entering the vector-pair loop. For ccopy, the store instructions
are changed to stxv to improve the performance of unaligned cases.

kernel/power/ccopy_microk_power10.c [new file with mode: 0644]
kernel/power/ccopy_power10.c
kernel/power/copy_microk_power10.c
kernel/power/dcopy_power10.c
kernel/power/scopy_power10.c

diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c
new file mode 100644 (file)
index 0000000..6c80f9c
--- /dev/null
@@ -0,0 +1,115 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL 1
+
+static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)  /* Copies from x to y in 256-byte blocks, consuming 32 counts of n per block. */
+{  /* The ccopy caller in this patch passes n = (n & -32) and only calls when that is > 0; the asm always moves at least one block. */
+  __asm__
+    (
+       "lxvp           32, 0(%2)       \n\t"  /* prime: eight paired loads fill vs32..vs47 with the first 256 bytes of x */
+       "lxvp           34, 32(%2)      \n\t"
+       "lxvp           36, 64(%2)      \n\t"
+       "lxvp           38, 96(%2)      \n\t"
+       "lxvp           40, 128(%2)     \n\t"
+       "lxvp           42, 160(%2)     \n\t"
+       "lxvp           44, 192(%2)     \n\t"
+       "lxvp           46, 224(%2)     \n\t"
+
+       "addi           %2, %2, 256     \n\t"  /* advance x past the primed block */
+       "addic.         %1, %1, -32     \n\t"  /* n -= 32; the record form sets cr0 for the branch below */
+       "ble            two%=           \n\t"  /* only one block requested: skip the loop and just drain */
+
+       ".align 5               \n"
+     "one%=:                           \n\t"  /* main loop: store the block held in registers, then load the next one */
+
+       "stxv           33, 0(%3)       \n\t"  /* each pair is written as two 16-byte stxv stores, odd register first; presumably this matches the little-endian layout of lxvp - confirm against the ISA */
+       "stxv           32, 16(%3)      \n\t"
+       "stxv           35, 32(%3)      \n\t"
+       "stxv           34, 48(%3)      \n\t"
+       "stxv           37, 64(%3)      \n\t"
+       "stxv           36, 80(%3)      \n\t"
+       "stxv           39, 96(%3)      \n\t"
+       "stxv           38, 112(%3)     \n\t"
+       "lxvp           32, 0(%2)       \n\t"  /* vs32..vs39 have been stored, so refill them with the next 128 bytes of x */
+       "lxvp           34, 32(%2)      \n\t"
+       "lxvp           36, 64(%2)      \n\t"
+       "lxvp           38, 96(%2)      \n\t"
+
+       "stxv           41, 128(%3)     \n\t"  /* second half of the block: same store-then-reload pattern for vs40..vs47 */
+       "stxv           40, 144(%3)     \n\t"
+       "stxv           43, 160(%3)     \n\t"
+       "stxv           42, 176(%3)     \n\t"
+       "stxv           45, 192(%3)     \n\t"
+       "stxv           44, 208(%3)     \n\t"
+       "stxv           47, 224(%3)     \n\t"
+       "stxv           46, 240(%3)     \n\t"
+       "lxvp           40, 128(%2)     \n\t"
+       "lxvp           42, 160(%2)     \n\t"
+       "lxvp           44, 192(%2)     \n\t"
+       "lxvp           46, 224(%2)     \n\t"
+
+
+       "addi           %3, %3, 256     \n\t"  /* advance y past the block just stored */
+       "addi           %2, %2, 256     \n\t"  /* advance x past the block just loaded */
+
+       "addic.         %1, %1, -32     \n\t"  /* n -= 32 and update cr0 */
+       "bgt            one%=           \n"  /* keep looping while blocks remain after the one now held in registers */
+
+     "two%=:                           \n\t"  /* drain: store the last loaded block; no further loads are issued, so x is not read past the requested range */
+
+       "stxv           33, 0(%3)       \n\t"
+       "stxv           32, 16(%3)      \n\t"
+       "stxv           35, 32(%3)      \n\t"
+       "stxv           34, 48(%3)      \n\t"
+       "stxv           37, 64(%3)      \n\t"
+       "stxv           36, 80(%3)      \n\t"
+       "stxv           39, 96(%3)      \n\t"
+       "stxv           38, 112(%3)     \n\t"
+       "stxv           41, 128(%3)     \n\t"
+       "stxv           40, 144(%3)     \n\t"
+       "stxv           43, 160(%3)     \n\t"
+       "stxv           42, 176(%3)     \n\t"
+       "stxv           45, 192(%3)     \n\t"
+       "stxv           44, 208(%3)     \n\t"
+       "stxv           47, 224(%3)     \n\t"
+       "stxv           46, 240(%3)     \n\t"
+
+     "#n=%1 x=%4=%2 y=%0=%3"  /* operand-mapping annotation; it is the only place operands 0 and 4 are referenced in the template */
+     :
+       "=m" (*y),  // 0: NOTE(review): names only the first element of y although the stores through operand 3 cover the whole range; confirm whether a memory clobber is needed
+       "+r" (n),       // 1
+       "+b" (x),       // 2
+       "+b" (y)        // 3
+     :
+       "m" (*x)  // 4: NOTE(review): likewise names only the first element of x
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+     );
+}
index a5877cd..41c5104 100644 (file)
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if defined(__VEC__) || defined(__ALTIVEC__)
-#include "copy_microk_power10.c"
+#include "ccopy_microk_power10.c"
 #endif
 
 #ifndef HAVE_KERNEL
@@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
        if ( (inc_x == 1) && (inc_y == 1 ))
        {
 
-               BLASLONG n1 = n & -64;
+               BLASLONG n1 = n & -32;
                if ( n1 > 0 )
                {
                        copy_kernel(n1, x, y);
index c90dc37..8bca1a1 100644 (file)
@@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
      "one%=:                           \n\t"
 
        "stxvp          32, 0(%3)       \n\t"
-       "lxvp           32, 0(%2)       \n\t"
        "stxvp          34, 32(%3)      \n\t"
-       "lxvp           34, 32(%2)      \n\t"
        "stxvp          36, 64(%3)      \n\t"
-       "lxvp           36, 64(%2)      \n\t"
        "stxvp          38, 96(%3)      \n\t"
+       "lxvp           32, 0(%2)       \n\t"
+       "lxvp           34, 32(%2)      \n\t"
+       "lxvp           36, 64(%2)      \n\t"
        "lxvp           38, 96(%2)      \n\t"
 
        "stxvp          40, 128(%3)     \n\t"
-       "lxvp           40, 128(%2)     \n\t"
        "stxvp          42, 160(%3)     \n\t"
-       "lxvp           42, 160(%2)     \n\t"
        "stxvp          44, 192(%3)     \n\t"
-       "lxvp           44, 192(%2)     \n\t"
        "stxvp          46, 224(%3)     \n\t"
+       "lxvp           40, 128(%2)     \n\t"
+       "lxvp           42, 160(%2)     \n\t"
+       "lxvp           44, 192(%2)     \n\t"
        "lxvp           46, 224(%2)     \n\t"
 
        "stxvp          48, 256(%3)     \n\t"
-       "lxvp           48, 256(%2)     \n\t"
        "stxvp          50, 288(%3)     \n\t"
-       "lxvp           50, 288(%2)     \n\t"
        "stxvp          52, 320(%3)     \n\t"
-       "lxvp           52, 320(%2)     \n\t"
        "stxvp          54, 352(%3)     \n\t"
+       "lxvp           48, 256(%2)     \n\t"
+       "lxvp           50, 288(%2)     \n\t"
+       "lxvp           52, 320(%2)     \n\t"
        "lxvp           54, 352(%2)     \n\t"
+
        "stxvp          56, 384(%3)     \n\t"
-       "lxvp           56, 384(%2)     \n\t"
        "stxvp          58, 416(%3)     \n\t"
-       "lxvp           58, 416(%2)     \n\t"
        "stxvp          60, 448(%3)     \n\t"
-       "lxvp           60, 448(%2)     \n\t"
        "stxvp          62, 480(%3)     \n\t"
+       "lxvp           56, 384(%2)     \n\t"
+       "lxvp           58, 416(%2)     \n\t"
+       "lxvp           60, 448(%2)     \n\t"
        "lxvp           62, 480(%2)     \n\t"
 
        "addi           %3, %3, 512     \n\t"
index cd10b71..6c5eb4d 100644 (file)
@@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 
        if ( (inc_x == 1) && (inc_y == 1 ))
        {
-
-               BLASLONG n1 = n & -64;
-               if ( n1 > 0 )
+               if ( n >= 64 )
+               {
+                       BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
+                       for (i = 0; i < align; i++) {
+                               y[i] = x[i] ;
+                       }
+               }
+               BLASLONG n1 = (n-i) & -64;
+               if ( n1 )
                {
-                       copy_kernel(n1, x, y);
-                       i=n1;
+                       copy_kernel(n1, &x[i], &y[i]);
+                       i += n1;
                }
 
                while(i < n)
index 298a899..3398ce8 100644 (file)
@@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
        if ( (inc_x == 1) && (inc_y == 1 ))
        {
 
-               BLASLONG n1 = n & -128;
-               if ( n1 > 0 )
+               if ( n >= 128 )
                {
-                       copy_kernel (n1, x, y);
-                       i=n1;
+                       BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
+                       for (i = 0; i < align; i++) {
+                               y[i] = x[i] ;
+                       }
+               }
+               BLASLONG n1 = (n-i) & -128;
+               if ( n1 )
+               {
+                       copy_kernel(n1, &x[i], &y[i]);
+                       i += n1;
                }
 
                while(i < n)