Optimise scopy for POWER9
author Matt Brown <matthew.brown.dev@gmail.com>
Wed, 14 Jun 2017 04:58:00 +0000 (14:58 +1000)
committer Matt Brown <matthew.brown.dev@gmail.com>
Wed, 14 Jun 2017 06:59:13 +0000 (16:59 +1000)
Use the lxvd2x/stxvd2x instructions instead of lxvw4x/stxvw4x.
lxvd2x performs far better than lxvw4x on the new POWER architecture (POWER9).

kernel/power/scopy_microk_power8.c

index 444a6d4d566b4775bd6b6a45e4ea3ab7fb441b01..7a54d5e1eb52276f58922f0814b2e52fbda60919 100644 (file)
@@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
 {
   __asm__
     (
-       "lxvw4x         40, 0, %2       \n\t"
-       "lxvw4x         41, %5, %2      \n\t"
-       "lxvw4x         42, %6, %2      \n\t"
-       "lxvw4x         43, %7, %2      \n\t"
-       "lxvw4x         44, %8, %2      \n\t"
-       "lxvw4x         45, %9, %2      \n\t"
-       "lxvw4x         46, %10, %2     \n\t"
-       "lxvw4x         47, %11, %2     \n\t"
+       "lxvd2x         40, 0, %2       \n\t"
+       "lxvd2x         41, %5, %2      \n\t"
+       "lxvd2x         42, %6, %2      \n\t"
+       "lxvd2x         43, %7, %2      \n\t"
+       "lxvd2x         44, %8, %2      \n\t"
+       "lxvd2x         45, %9, %2      \n\t"
+       "lxvd2x         46, %10, %2     \n\t"
+       "lxvd2x         47, %11, %2     \n\t"
 
        "addi           %2, %2, 128     \n\t"
 
@@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y)
        ".p2align       5               \n"
      "1:                               \n\t"
 
-       "stxvw4x                40, 0, %3       \n\t"
-       "stxvw4x                41, %5, %3      \n\t"
-       "lxvw4x         40, 0, %2       \n\t"
-       "lxvw4x         41, %5, %2      \n\t"
-       "stxvw4x                42, %6, %3      \n\t"
-       "stxvw4x                43, %7, %3      \n\t"
-       "lxvw4x         42, %6, %2      \n\t"
-       "lxvw4x         43, %7, %2      \n\t"
-       "stxvw4x                44, %8, %3      \n\t"
-       "stxvw4x                45, %9, %3      \n\t"
-       "lxvw4x         44, %8, %2      \n\t"
-       "lxvw4x         45, %9, %2      \n\t"
-       "stxvw4x                46, %10, %3     \n\t"
-       "stxvw4x                47, %11, %3     \n\t"
-       "lxvw4x         46, %10, %2     \n\t"
-       "lxvw4x         47, %11, %2     \n\t"
+       "stxvd2x                40, 0, %3       \n\t"
+       "stxvd2x                41, %5, %3      \n\t"
+       "lxvd2x         40, 0, %2       \n\t"
+       "lxvd2x         41, %5, %2      \n\t"
+       "stxvd2x                42, %6, %3      \n\t"
+       "stxvd2x                43, %7, %3      \n\t"
+       "lxvd2x         42, %6, %2      \n\t"
+       "lxvd2x         43, %7, %2      \n\t"
+       "stxvd2x                44, %8, %3      \n\t"
+       "stxvd2x                45, %9, %3      \n\t"
+       "lxvd2x         44, %8, %2      \n\t"
+       "lxvd2x         45, %9, %2      \n\t"
+       "stxvd2x                46, %10, %3     \n\t"
+       "stxvd2x                47, %11, %3     \n\t"
+       "lxvd2x         46, %10, %2     \n\t"
+       "lxvd2x         47, %11, %2     \n\t"
 
        "addi           %3, %3, 128     \n\t"
        "addi           %2, %2, 128     \n\t"
@@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
 
      "2:                               \n\t"
 
-       "stxvw4x                40, 0, %3       \n\t"
-       "stxvw4x                41, %5, %3      \n\t"
-       "stxvw4x                42, %6, %3      \n\t"
-       "stxvw4x                43, %7, %3      \n\t"
-       "stxvw4x                44, %8, %3      \n\t"
-       "stxvw4x                45, %9, %3      \n\t"
-       "stxvw4x                46, %10, %3     \n\t"
-       "stxvw4x                47, %11, %3     \n"
+       "stxvd2x                40, 0, %3       \n\t"
+       "stxvd2x                41, %5, %3      \n\t"
+       "stxvd2x                42, %6, %3      \n\t"
+       "stxvd2x                43, %7, %3      \n\t"
+       "stxvd2x                44, %8, %3      \n\t"
+       "stxvd2x                45, %9, %3      \n\t"
+       "stxvd2x                46, %10, %3     \n\t"
+       "stxvd2x                47, %11, %3     \n"
 
      "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
      :