From c5b1fbcb2ec74dfd2c7cae4a838dda425c3e0af5 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 4 Apr 2016 09:12:08 +0200 Subject: [PATCH] updated optimized cgemm- and ctrmm-kernel for POWER8 --- kernel/power/cgemm_kernel_8x4_power8.S | 10 +- kernel/power/cgemm_logic_8x4_power8.S | 145 +- kernel/power/cgemm_macros_8x4_power8.S | 1327 +++---- kernel/power/ctrmm_kernel_8x4_power8.S | 4 +- kernel/power/ctrmm_logic_8x4_power8.S | 16 +- kernel/power/ctrmm_macros_8x4_power8.S | 6794 ++++++++++++++++++++++++++++++++ param.h | 2 +- 7 files changed, 7393 insertions(+), 905 deletions(-) create mode 100644 kernel/power/ctrmm_macros_8x4_power8.S diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index a7e7066..f90069e 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/03 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK @@ -137,12 +137,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha_si vs31 -#define NOTUSED r14 +#define BBUFFER r14 #define L r15 #define o12 r16 #define o4 r17 #define T2 r19 -#define KK r20 +#define BBO r20 #define o8 r21 #define I r22 #define J r23 @@ -290,6 +290,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o32 , 32 li o48 , 48 + li T1, 256 + slwi T1, T1, 9 // 131072 + sub BBUFFER, A, T1 // temp buffer for B unrolled + #ifdef __64BIT__ addi T1 , SP, 296 diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S index 851a09a..db2a57f 100644 --- a/kernel/power/cgemm_logic_8x4_power8.S +++ b/kernel/power/cgemm_logic_8x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/03 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK @@ -38,6 +38,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. CGEMM_L4_BEGIN: + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 3 + +CGEMM_L4_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L4_COPYB + + mr CO, C mr AO, A slwi T1, LDC , 2 @@ -48,7 +81,7 @@ CGEMM_L4_BEGIN: CGEMM_L4x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. 
L, K, 3 ble CGEMM_L4x8_SUB0 cmpwi cr0, L, 1 @@ -59,18 +92,25 @@ CGEMM_L4x8_LOOP_START: dcbt AO, PRE dcbt BO, PRE LOAD4x8_1 + dcbt BO, PRE KERNEL4x8_I1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 - dcbt AO, PRE dcbt BO, PRE + dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 @@ -81,18 +121,25 @@ CGEMM_L4x8_LOOP_START: CGEMM_L4x8_LOOP: + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 - dcbt AO, PRE dcbt BO, PRE + dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 @@ -101,7 +148,9 @@ CGEMM_L4x8_LOOP: CGEMM_L4x8_LOOP_END: + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 @@ -168,7 +217,7 @@ CGEMM_L4x4_BEGIN: andi. T1, M, 4 ble CGEMM_L4x4_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x4_SUB0 cmpwi cr0, L, 1 @@ -268,7 +317,7 @@ CGEMM_L4x2_BEGIN: andi. T1, M, 2 ble CGEMM_L4x2_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x2_SUB0 cmpwi cr0, L, 1 @@ -368,7 +417,7 @@ CGEMM_L4x1_BEGIN: andi. T1, M, 1 ble CGEMM_L4x1_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x1_SUB0 cmpwi cr0, L, 1 @@ -482,6 +531,39 @@ L999_H1: CGEMM_L2_BEGIN: + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 2 + +CGEMM_L2_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L2_COPYB + + andi. T1, N, 2 ble CGEMM_L2_END mr CO, C @@ -494,7 +576,7 @@ CGEMM_L2_BEGIN: CGEMM_L2x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x8_SUB0 cmpwi cr0, L, 1 @@ -611,7 +693,7 @@ CGEMM_L2x4_BEGIN: andi. T1, M, 4 ble CGEMM_L2x4_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x4_SUB0 cmpwi cr0, L, 1 @@ -711,7 +793,7 @@ CGEMM_L2x2_BEGIN: andi. T1, M, 2 ble CGEMM_L2x2_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x2_SUB0 cmpwi cr0, L, 1 @@ -811,7 +893,7 @@ CGEMM_L2x1_BEGIN: andi. T1, M, 1 ble CGEMM_L2x1_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x1_SUB0 cmpwi cr0, L, 1 @@ -919,6 +1001,39 @@ L999_H2: CGEMM_L1_BEGIN: + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +CGEMM_L1_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L1_COPYB + + andi. T1, N, 1 ble CGEMM_L1_END mr CO, C @@ -929,7 +1044,7 @@ CGEMM_L1_BEGIN: CGEMM_L1x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x8_SUB0 cmpwi cr0, L, 1 @@ -1046,7 +1161,7 @@ CGEMM_L1x4_BEGIN: andi. 
T1, M, 4 ble CGEMM_L1x4_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x4_SUB0 cmpwi cr0, L, 1 @@ -1146,7 +1261,7 @@ CGEMM_L1x2_BEGIN: andi. T1, M, 2 ble CGEMM_L1x2_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x2_SUB0 cmpwi cr0, L, 1 @@ -1246,7 +1361,7 @@ CGEMM_L1x1_BEGIN: andi. T1, M, 1 ble CGEMM_L1x1_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x1_SUB0 cmpwi cr0, L, 1 diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S index 48a2125..9a18cb1 100644 --- a/kernel/power/cgemm_macros_8x4_power8.S +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -86,66 +86,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD4x8_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - .endm .macro KERNEL4x8_I1 lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -190,33 +178,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -261,33 +243,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -374,33 +350,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -445,33 +415,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -515,6 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x8 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -571,7 +536,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -637,7 +601,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -703,7 +666,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -769,7 +731,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -841,7 +802,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -907,7 +867,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -973,7 +932,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1039,7 +997,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1111,7 +1068,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1177,7 +1133,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1243,7 +1198,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1309,7 +1263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1381,7 +1334,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1447,7 +1399,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1513,7 +1464,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1579,7 +1529,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1607,57 +1556,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD4x4_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - .endm .macro KERNEL4x4_I1 lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1687,28 +1628,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1738,29 +1675,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -1815,28 +1748,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1866,28 +1795,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1916,6 +1841,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE4x4 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -1972,7 +1898,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2038,7 +1963,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2110,7 +2034,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2176,7 +2099,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2248,7 +2170,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2314,7 +2235,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2386,7 +2306,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2452,7 +2371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2481,25 +2399,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 .endm @@ -2508,25 +2423,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2549,25 +2461,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2590,26 +2499,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -2649,25 +2555,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2690,25 +2593,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2729,6 +2629,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x2 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -2785,7 +2686,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2857,7 +2757,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2929,7 +2828,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -3001,7 +2899,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -3033,27 +2930,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 - mr T1, BO + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + addi BO, BO, 64 - addi T1, T1,8 + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs12, o0, T1 // load b2_r - lxsspx vs13, o4, T1 // load b2_i - - addi T1, T1,8 - - lxsspx vs14, o0, T1 // load b3_r - lxsspx vs15, o4, T1 // load b3_i - - addi BO, BO, 32 .endm @@ -3065,27 +2955,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs18, o0, T1 // load b1_r - lxsspx vs19, o4, T1 // load b1_i + lxsspx vs20, o0, BO // load b2_r + lxsspx vs21, o16, BO // load b2_i + lxsspx vs22, o32, BO // load b3_r + lxsspx vs23, o48, BO // load b3_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs20, o0, T1 // load b2_r - lxsspx vs21, o4, T1 // load b2_i - - addi T1, T1,8 - - lxsspx vs22, o0, T1 // load b3_r - lxsspx vs23, o4, T1 // load b3_i - - addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -3119,27 +3002,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i - - addi T1, T1,8 + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i - lxsspx vs18, o0, T1 // load b1_r - lxsspx vs19, o4, T1 // load b1_i + addi BO, BO, 64 - addi T1, T1,8 + lxsspx vs20, o0, BO // load b2_r + lxsspx vs21, o16, BO // load b2_i + lxsspx vs22, o32, BO // load b3_r + lxsspx vs23, o48, BO // load b3_i - lxsspx vs20, o0, T1 // load b2_r - lxsspx vs21, o4, T1 // load b2_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs22, o0, T1 // load b3_r - lxsspx vs23, o4, T1 // load b3_i - - addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -3173,27 +3049,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi T1, T1,8 - - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs12, o0, T1 // load b2_r - lxsspx vs13, o4, T1 // load b2_i + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs14, o0, T1 // load b3_r - lxsspx vs15, o4, T1 // load b3_i - - addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -3253,27 +3122,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi T1, T1,8 - - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i - - addi T1, T1,8 + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs12, o0, T1 // load b2_r - lxsspx vs13, o4, T1 // load b2_i + addi BO, BO, 64 - addi T1, T1,8 + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i - lxsspx vs14, o0, T1 // load b3_r - lxsspx vs15, o4, T1 // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -3307,27 +3169,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi T1, T1,8 + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i + addi BO, BO, 64 - addi T1, T1,8 + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i - lxsspx vs12, o0, T1 // load b2_r - lxsspx vs13, o4, T1 // load b2_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs14, o0, T1 // load b3_r - lxsspx vs15, o4, T1 // load b3_i - - addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -3356,6 +3211,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x1 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -3536,25 +3392,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x8_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 .endm @@ -3562,25 +3412,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3608,26 +3452,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -3654,25 +3492,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3724,25 +3556,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3770,26 +3596,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -3815,6 +3635,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x8 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -3871,7 +3692,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -3937,7 +3757,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4003,7 +3822,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4069,7 +3887,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4141,7 +3958,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4207,7 +4023,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4273,7 +4088,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4339,7 +4153,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4367,44 +4180,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x4_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - .endm .macro KERNEL2x4_I1 lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4423,22 +4228,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4457,22 +4258,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4507,22 +4304,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4541,22 +4334,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4574,6 +4363,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x4 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -4630,7 +4420,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4696,7 +4485,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4768,7 +4556,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4834,7 +4621,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4863,18 +4649,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 .endm @@ -4883,19 +4666,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4911,18 +4691,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4939,18 +4716,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4979,19 +4753,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -5007,18 +4778,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5033,6 +4801,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x2 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -5089,7 +4858,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5161,7 +4929,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5193,17 +4960,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i - - addi BO, BO, 16 .endm @@ -5215,17 +4978,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs18, o0, T1 // load b1_r - lxsspx vs19, o4, T1 // load b1_i - - addi BO, BO, 16 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -5249,17 +5008,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i - - addi T1, T1,8 + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i - lxsspx vs18, o0, T1 // load b1_r - lxsspx vs19, o4, T1 // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -5283,17 +5038,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i - - addi BO, BO, 16 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -5333,17 +5084,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i - - addi BO, BO, 16 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -5367,17 +5114,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi T1, T1,8 + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -5396,6 +5139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x1 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -5492,27 +5236,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x8_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -5520,27 +5253,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5559,27 +5281,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5598,27 +5309,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5652,27 +5352,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5691,27 +5380,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5729,6 +5407,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x8 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -5785,7 +5464,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5851,7 +5529,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5917,7 +5594,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5983,7 +5659,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -6011,23 +5686,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x4_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6035,23 +5701,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6066,23 +5723,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6097,23 +5745,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6139,23 +5778,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6170,23 +5800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6200,6 +5821,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x4 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -6256,7 +5878,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -6322,7 +5943,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -6351,20 +5971,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6373,20 +5985,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6400,20 +6004,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6427,20 +6023,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6463,20 +6051,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6490,20 +6070,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6515,6 +6087,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x2 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -6571,7 +6144,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -6603,12 +6175,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6620,12 +6190,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -6644,12 +6212,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i - - addi BO, BO, 8 + addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -6668,12 +6234,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -6703,12 +6267,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -6727,12 +6289,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi BO, BO, 8 + addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -6746,6 +6306,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x1 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index b202114..460a387 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/03 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK @@ -275,7 +275,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#include "cgemm_macros_8x4_power8.S" +#include "ctrmm_macros_8x4_power8.S" cmpwi cr0, M, 0 ble L999_H1 diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S index 3e50646..9ab2585 100644 --- a/kernel/power/ctrmm_logic_8x4_power8.S +++ b/kernel/power/ctrmm_logic_8x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/03 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK @@ -83,15 +83,22 @@ CTRMM_L4x8_BEGIN: CTRMM_L4x8_LOOP_START: + dcbt AO, PRE + dcbt BO, PRE LOAD4x8_1 KERNEL4x8_I1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE + dcbt BO, PRE KERNEL4x8_2 addic. 
L, L, -2 @@ -102,13 +109,18 @@ CTRMM_L4x8_LOOP_START: CTRMM_L4x8_LOOP: KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE + dcbt BO, PRE KERNEL4x8_2 addic. L, L, -1 @@ -117,8 +129,10 @@ CTRMM_L4x8_LOOP: CTRMM_L4x8_LOOP_END: KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S new file mode 100644 index 0000000..48a2125 --- /dev/null +++ b/kernel/power/ctrmm_macros_8x4_power8.S @@ -0,0 +1,6794 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvaddsp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvaddsp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvsubsp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvsubsp + +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, 
vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, 
a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, 
a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, 
a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, 
vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add 
a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // 
add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw 
vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw 
vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs48, 0 + xxspltw vs9, vs48, 1 + xxspltw vs10, vs48, 2 + xxspltw vs11, vs48, 3 + + + xxspltw vs12, vs49, 0 + xxspltw vs13, vs49, 1 + xxspltw vs14, vs49, 2 + xxspltw vs15, vs49, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs50, 0 + xxspltw vs9, 
vs50, 1 + xxspltw vs10, vs50, 2 + xxspltw vs11, vs50, 3 + + + xxspltw vs12, vs51, 0 + xxspltw vs13, vs51, 1 + xxspltw vs14, vs51, 2 + xxspltw vs15, vs51, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs52, 0 + xxspltw vs9, vs52, 1 + xxspltw vs10, vs52, 2 + xxspltw vs11, vs52, 3 + + + xxspltw vs12, vs53, 0 + xxspltw vs13, vs53, 1 + xxspltw vs14, vs53, 2 + xxspltw vs15, vs53, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 
+#endif + + + xxspltw vs8, vs54, 0 + xxspltw vs9, vs54, 1 + xxspltw vs10, vs54, 2 + xxspltw vs11, vs54, 3 + + + xxspltw vs12, vs55, 0 + xxspltw vs13, vs55, 1 + xxspltw vs14, vs55, 2 + xxspltw vs15, vs55, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs56, 0 + xxspltw vs9, vs56, 1 + xxspltw vs10, vs56, 2 + xxspltw vs11, vs56, 3 + + + xxspltw vs12, vs57, 0 + xxspltw vs13, vs57, 1 + xxspltw vs14, vs57, 2 + xxspltw vs15, vs57, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, 
vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs58, 0 + xxspltw vs9, vs58, 1 + xxspltw vs10, vs58, 2 + xxspltw vs11, vs58, 3 + + + xxspltw vs12, vs59, 0 + xxspltw vs13, vs59, 1 + xxspltw vs14, vs59, 2 + xxspltw vs15, vs59, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs60, 0 + xxspltw vs9, vs60, 1 + xxspltw vs10, vs60, 2 + xxspltw vs11, vs60, 3 + + + xxspltw vs12, vs61, 0 + xxspltw vs13, vs61, 1 + xxspltw vs14, vs61, 2 + xxspltw vs15, vs61, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor 
vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs62, 0 + xxspltw vs9, vs62, 1 + xxspltw vs10, vs62, 2 + xxspltw vs11, vs62, 3 + + + xxspltw vs12, vs63, 0 + xxspltw vs13, vs63, 1 + xxspltw vs14, vs63, 2 + xxspltw vs15, vs63, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // 
a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, 
a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, 
a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * 
b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * 
b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * 
b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 
1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + 
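+// vs8..vs15 above hold each b component (b0_r, b0_i, ..., b3_r, b3_i) replicated
+// across all four lanes, while vs0 keeps the interleaved pair a0_r, a0_i, a1_r, a1_i.
+// Each xvmaddasp in this block therefore accumulates one real-valued partial-product
+// column per B column: vs32/vs33 collect the b0_r/b0_i products, vs34/vs35 the b1
+// products, vs36/vs37 the b2 products and vs38/vs39 the b3 products.
+// The complex recombination is deferred to SAVE4x2, which per element combines
+// a_r*b_r with a_i*b_i into res_r and a_r*b_i with a_i*b_r into res_i (the
+// XVFADD_R*/XVFADD_I* wrappers supply the signs for the conjugated variants)
+// and then applies alpha as c_r += res_r*alpha_r - res_i*alpha_i and
+// c_i += res_r*alpha_i + res_i*alpha_r, matching the xvsubsp/xvaddsp comments
+// in the SAVE macros above.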
xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // 
a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i 
* alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, 
alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp 
vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 
// load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * 
alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + 
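+// vs24 holds the two complex b values b0, b1 for this iteration; the xxspltw
+// sequence replicates each of their four words into vs16..vs19 while the
+// xvmaddasp block below still consumes vs0..vs3 and vs8..vs11 loaded by the
+// previous step, and KERNEL2x8_2 swaps the two register sets back, so the
+// loads and splats overlap with the arithmetic.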
xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, 
a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro 
SAVE2x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, 
r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 
// r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi 
vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i 
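The SAVE blocks in this file all reduce their accumulators the same way: each 32-bit lane is splatted out with xxspltw, the partial products are folded together by the XVFADD_R1/R2 and XVFADD_I1/I2 helpers (defined elsewhere in the file, which is where the conjugation-dependent signs live), the result is scaled by the complex alpha in alpha_sr/alpha_si, and the sum is added to the value loaded from C unless TRMMKERNEL is defined. A minimal C sketch of that update for one complex element, assuming the plain non-conjugated case and with names invented for the sketch:

    /* Illustrative C model of one SAVE update, not the kernel code itself. */
    #include <complex.h>

    static void save_one_element(float complex *c,          /* element of C being updated        */
                                 float ab_rr, float ab_ir,  /* lanes of vs32: a_r*b_r, a_i*b_r   */
                                 float ab_ri, float ab_ii,  /* lanes of vs33: a_r*b_i, a_i*b_i   */
                                 float alpha_r, float alpha_i)
    {
        /* XVFADD_R1/R2 and XVFADD_I1/I2: an ordinary complex multiply in the
           non-conjugated case (other variants flip the signs of ab_ii / ab_ir). */
        float r_r = ab_rr - ab_ii;
        float r_i = ab_ri + ab_ir;

        /* scale by complex alpha, then accumulate into C (the #ifndef TRMMKERNEL path) */
        float out_r = r_r * alpha_r - r_i * alpha_i;
        float out_i = r_r * alpha_i + r_i * alpha_r;
        *c += out_r + out_i * I;
    }

Keeping the b_r and b_i products in separate accumulators during the kernel is what lets the same KERNEL macros serve every conjugation variant; the sign decision is deferred to this save path.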
+ xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 // 
a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, 
alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * 
alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // 
a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // 
r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + 
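For the M=1 tails the kernel drops to scalar VSX: a and b are loaded element-wise with lxsspx and the four products a_r*b_r, a_i*b_i, a_r*b_i and a_i*b_r are kept in separate accumulators per column (vs32..vs35 for b0, vs36..vs39 for b1) using the double-precision FMA xsmaddadp, so the conjugation-dependent signs are again applied only in SAVE2x1. A rough C model of one accumulation step, with the struct and names invented for the sketch:

    /* Four sign-free partial sums per output column, combined only at save time. */
    typedef struct { double rr, ii, ri, ir; } cacc_t;  /* a_r*b_r, a_i*b_i, a_r*b_i, a_i*b_r */

    static void kernel2x1_step(cacc_t *acc0, cacc_t *acc1,
                               float a_r, float a_i,
                               float b0_r, float b0_i,
                               float b1_r, float b1_i)
    {
        acc0->rr += (double)a_r * b0_r;   /* xsmaddadp vs32 */
        acc0->ii += (double)a_i * b0_i;   /* xsmaddadp vs33 */
        acc0->ri += (double)a_r * b0_i;   /* xsmaddadp vs34 */
        acc0->ir += (double)a_i * b0_r;   /* xsmaddadp vs35 */

        acc1->rr += (double)a_r * b1_r;   /* xsmaddadp vs36 */
        acc1->ii += (double)a_i * b1_i;   /* xsmaddadp vs37 */
        acc1->ri += (double)a_r * b1_i;   /* xsmaddadp vs38 */
        acc1->ir += (double)a_i * b1_r;   /* xsmaddadp vs39 */
    }

The _1/_2 pairs of these macros alternate between the vs0/vs8 and vs4/vs16 register sets so that the loads for one step overlap the multiplies of the previous one.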
lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx 
vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 
+ xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 
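All of the vector KERNEL macros here share the load pattern seen above: A streams through lxvw4x as interleaved {real, imag} pairs, while a single lxvw4x from BO is expanded with xxspltw so that b_r and b_i each fill a whole register; the FMAs that follow keep the b_r products and the b_i products in separate accumulators (vs32/vs33 and so on). A minimal C sketch of one such step for a single VSX register of A, with names invented for the sketch:

    /* Models xvmaddasp on one register of A = {a0_r, a0_i, a1_r, a1_i}. */
    static void kernel_fma_step(float acc_br[4], float acc_bi[4],  /* e.g. vs32, vs33       */
                                const float a[4],                  /* one lxvw4x of AO      */
                                float b_r, float b_i)              /* xxspltw lanes of vs24 */
    {
        for (int k = 0; k < 4; k++) {
            acc_br[k] += a[k] * b_r;   /* xvmaddasp vs32, vs0, vs8 */
            acc_bi[k] += a[k] * b_i;   /* xvmaddasp vs33, vs0, vs9 */
        }
    }

The _I1 and _SUBI1 variants seed the accumulators with xvmulsp instead of adding, and the _1/_2 variants ping-pong between the vs0..vs3/vs8.. and vs4..vs7/vs16.. register sets so the next loads can issue while the current FMAs execute.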
+ + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp 
vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 
// r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x 
vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 
// add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, 
AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + 
addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + diff --git a/param.h b/param.h index fb344cd..d01c992 100644 --- a/param.h +++ b/param.h @@ -1979,7 +1979,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 960 #define DGEMM_DEFAULT_P 480 -#define CGEMM_DEFAULT_P 480 +#define CGEMM_DEFAULT_P 720 #define ZGEMM_DEFAULT_P 240 #define SGEMM_DEFAULT_Q 720 -- 2.7.4
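
Note (not part of the patch itself): every new SAVE1x* macro above ends with the same per-element complex update. The XVFADD_*/XSFADD_* helpers fold the four partial products accumulated by the KERNEL macros (a_r*b_r, a_i*b_i, a_r*b_i, a_i*b_r) into the real and imaginary parts of a*b, the result is scaled by the complex alpha held in alpha_sr/alpha_si (alpha_dr/alpha_di in the scalar 1x1 tail), and, unless TRMMKERNEL is defined, added to the value loaded from C. The C sketch below models that arithmetic for one element; it assumes the plain, non-conjugated definitions of the *FADD helpers (they are defined elsewhere in this file and flip signs for the conjugated variants), and the struct/function names are illustrative only, not part of OpenBLAS.

#include <stdio.h>

typedef struct { float r, i; } cfloat;

/* scalar model of one SAVE1x1-style update: c += alpha * (a*b),
 * where a*b arrives as four separately accumulated partial products */
static void save1x1_model(cfloat *c, cfloat alpha,
                          float acc_rr,   /* sum of a_r*b_r (vs32) */
                          float acc_ii,   /* sum of a_i*b_i (vs33) */
                          float acc_ri,   /* sum of a_r*b_i (vs34) */
                          float acc_ir)   /* sum of a_i*b_r (vs35) */
{
    /* XSFADD_R1/R2 and XSFADD_I1/I2, assuming the non-conjugated case */
    float r_r = acc_rr - acc_ii;
    float r_i = acc_ir + acc_ri;

    /* scale by alpha: (r_r + i*r_i) * (alpha_r + i*alpha_i) */
    float out_r = r_r * alpha.r - r_i * alpha.i;
    float out_i = r_r * alpha.i + r_i * alpha.r;

    /* non-TRMMKERNEL path: accumulate into the C entry that was loaded */
    c->r += out_r;
    c->i += out_i;
}

int main(void)
{
    cfloat c = { 1.0f, 2.0f }, alpha = { 0.5f, -1.0f };
    /* partial products for a = 3+4i, b = 5+6i */
    save1x1_model(&c, alpha, 3.0f*5.0f, 4.0f*6.0f, 3.0f*6.0f, 4.0f*5.0f);
    printf("%g %g\n", c.r, c.i);
    return 0;
}

Compiled with any C compiler, this prints "34.5 30": the value of c after c += alpha * (a*b) with a = 3+4i, b = 5+6i, alpha = 0.5-1i and c = 1+2i, which is the same result the vector SAVE paths produce per lane.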