Optimise sscal for POWER9
author Matt Brown <matthew.brown.dev@gmail.com>
Wed, 14 Jun 2017 06:47:56 +0000 (16:47 +1000)
committer Matt Brown <matthew.brown.dev@gmail.com>
Wed, 14 Jun 2017 07:02:46 +0000 (17:02 +1000)
Use the lxvd2x instruction instead of lxvw4x, and the matching stxvd2x store instead of stxvw4x.
lxvd2x performs far better than lxvw4x on the new POWER architecture.

kernel/power/sscal_microk_power8.c

index 49862a3..058ff33 100644 (file)
@@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
        "xscvdpspn      %x3, %x3        \n\t"
        "xxspltw                %x3, %x3, 0     \n\t"
 
-       "lxvw4x         32, 0, %2       \n\t"
-       "lxvw4x         33, %4, %2      \n\t"
-       "lxvw4x         34, %5, %2      \n\t"
-       "lxvw4x         35, %6, %2      \n\t"
-       "lxvw4x         36, %7, %2      \n\t"
-       "lxvw4x         37, %8, %2      \n\t"
-       "lxvw4x         38, %9, %2      \n\t"
-       "lxvw4x         39, %10, %2     \n\t"
+       "lxvd2x         32, 0, %2       \n\t"
+       "lxvd2x         33, %4, %2      \n\t"
+       "lxvd2x         34, %5, %2      \n\t"
+       "lxvd2x         35, %6, %2      \n\t"
+       "lxvd2x         36, %7, %2      \n\t"
+       "lxvd2x         37, %8, %2      \n\t"
+       "lxvd2x         38, %9, %2      \n\t"
+       "lxvd2x         39, %10, %2     \n\t"
 
        "addi           %2, %2, 128     \n\t"
 
@@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
 
        "xvmulsp                40, 32, %x3     \n\t"
        "xvmulsp                41, 33, %x3     \n\t"
-       "lxvw4x         32, 0, %2       \n\t"
-       "lxvw4x         33, %4, %2      \n\t"
+       "lxvd2x         32, 0, %2       \n\t"
+       "lxvd2x         33, %4, %2      \n\t"
        "xvmulsp                42, 34, %x3     \n\t"
        "xvmulsp                43, 35, %x3     \n\t"
-       "lxvw4x         34, %5, %2      \n\t"
-       "lxvw4x         35, %6, %2      \n\t"
+       "lxvd2x         34, %5, %2      \n\t"
+       "lxvd2x         35, %6, %2      \n\t"
        "xvmulsp                44, 36, %x3     \n\t"
        "xvmulsp                45, 37, %x3     \n\t"
-       "lxvw4x         36, %7, %2      \n\t"
-       "lxvw4x         37, %8, %2      \n\t"
+       "lxvd2x         36, %7, %2      \n\t"
+       "lxvd2x         37, %8, %2      \n\t"
        "xvmulsp                46, 38, %x3     \n\t"
        "xvmulsp                47, 39, %x3     \n\t"
-       "lxvw4x         38, %9, %2      \n\t"
-       "lxvw4x         39, %10, %2     \n\t"
+       "lxvd2x         38, %9, %2      \n\t"
+       "lxvd2x         39, %10, %2     \n\t"
 
        "addi           %2, %2, -128    \n\t"
 
-       "stxvw4x                40, 0, %2       \n\t"
-       "stxvw4x                41, %4, %2      \n\t"
-       "stxvw4x                42, %5, %2      \n\t"
-       "stxvw4x                43, %6, %2      \n\t"
-       "stxvw4x                44, %7, %2      \n\t"
-       "stxvw4x                45, %8, %2      \n\t"
-       "stxvw4x                46, %9, %2      \n\t"
-       "stxvw4x                47, %10, %2     \n\t"
+       "stxvd2x                40, 0, %2       \n\t"
+       "stxvd2x                41, %4, %2      \n\t"
+       "stxvd2x                42, %5, %2      \n\t"
+       "stxvd2x                43, %6, %2      \n\t"
+       "stxvd2x                44, %7, %2      \n\t"
+       "stxvd2x                45, %8, %2      \n\t"
+       "stxvd2x                46, %9, %2      \n\t"
+       "stxvd2x                47, %10, %2     \n\t"
 
        "addi           %2, %2, 256     \n\t"
 
@@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
        "xvmulsp                46, 38, %x3     \n\t"
        "xvmulsp                47, 39, %x3     \n\t"
 
-       "stxvw4x                40, 0, %2       \n\t"
-       "stxvw4x                41, %4, %2      \n\t"
-       "stxvw4x                42, %5, %2      \n\t"
-       "stxvw4x                43, %6, %2      \n\t"
-       "stxvw4x                44, %7, %2      \n\t"
-       "stxvw4x                45, %8, %2      \n\t"
-       "stxvw4x                46, %9, %2      \n\t"
-       "stxvw4x                47, %10, %2     \n"
+       "stxvd2x                40, 0, %2       \n\t"
+       "stxvd2x                41, %4, %2      \n\t"
+       "stxvd2x                42, %5, %2      \n\t"
+       "stxvd2x                43, %6, %2      \n\t"
+       "stxvd2x                44, %7, %2      \n\t"
+       "stxvd2x                45, %8, %2      \n\t"
+       "stxvd2x                46, %9, %2      \n\t"
+       "stxvd2x                47, %10, %2     \n"
 
      "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
      :
@@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
        ".p2align       5               \n"
      "1:                               \n\t"
 
-       "stxvw4x                %x3, 0, %2      \n\t"
-       "stxvw4x                %x3, %4, %2     \n\t"
-       "stxvw4x                %x3, %5, %2     \n\t"
-       "stxvw4x                %x3, %6, %2     \n\t"
-       "stxvw4x                %x3, %7, %2     \n\t"
-       "stxvw4x                %x3, %8, %2     \n\t"
-       "stxvw4x                %x3, %9, %2     \n\t"
-       "stxvw4x                %x3, %10, %2    \n\t"
+       "stxvd2x                %x3, 0, %2      \n\t"
+       "stxvd2x                %x3, %4, %2     \n\t"
+       "stxvd2x                %x3, %5, %2     \n\t"
+       "stxvd2x                %x3, %6, %2     \n\t"
+       "stxvd2x                %x3, %7, %2     \n\t"
+       "stxvd2x                %x3, %8, %2     \n\t"
+       "stxvd2x                %x3, %9, %2     \n\t"
+       "stxvd2x                %x3, %10, %2    \n\t"
 
        "addi           %2, %2, 128     \n\t"