From be55f96cbdc919c7c7da2da2f7a2c6c47336a9f6 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 14:58:00 +1000 Subject: [PATCH] Optimise scopy for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/scopy_microk_power8.c | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index 444a6d4d..7a54d5e1 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" @@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : -- 2.34.1