--- /dev/null
+/*********************************************************************
+* Macros for N=4, M=16 *
+*********************************************************************/
+
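+/* Note (inferred from the macros below, names in the sketch are illustrative):
+ * LOAD<N>x<M>_1 preloads one iteration of A (lxvd2x, two doubles per VSX
+ * register) and B (lxvdsx broadcasts); KERNEL<N>x<M>_I1 starts the
+ * accumulators with xvmuldp; KERNEL<N>x<M>_1 / _2 ping-pong between the
+ * vs0..vs15 and vs24..vs31 register sets so the loads for the next
+ * iteration overlap the FMAs of the current one; KERNEL<N>x<M>_E2 drains
+ * the pipeline; the _SUBI1 / _SUB1 variants handle single leftover
+ * iterations.  For the 4x16 tile the accumulators vs32..vs63 hold,
+ * roughly (C-like sketch, not part of the build):
+ *
+ *   for (j = 0; j < 4; j++)
+ *     for (i = 0; i < 16; i++)
+ *       acc[j][i] += A[k][i] * B[k][j];
+ */
+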
+.macro LOAD4x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_I1
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+ addi AO, AO, 64
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
+
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ xvmaddadp vs60, vs12, vs31
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ xvmaddadp vs63, vs15, vs31
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+ xvmaddadp vs60, vs12, vs31
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ xvmaddadp vs63, vs15, vs31
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+.endm
+
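+/* SAVE4x16 writes the 16x4 tile back to C.  T1/T2 walk each 16-element
+ * column in two 64-byte halves and advance by LDC per column.  In the
+ * plain GEMM path (#ifndef TRMMKERNEL) the existing C values are loaded
+ * and updated as C += alpha * acc (xvmaddadp); in the TRMM path C is
+ * overwritten with alpha * acc (xvmuldp) without reading the destination.
+ * The smaller SAVE macros below follow the same pattern.
+ */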
+.macro SAVE4x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+ xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+ xvmaddadp vs2, vs50, alpha_r
+ xvmaddadp vs3, vs51, alpha_r
+ xvmaddadp vs4, vs52, alpha_r
+ xvmaddadp vs5, vs53, alpha_r
+ xvmaddadp vs6, vs54, alpha_r
+ xvmaddadp vs7, vs55, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+ xvmuldp vs2, vs50, alpha_r
+ xvmuldp vs3, vs51, alpha_r
+ xvmuldp vs4, vs52, alpha_r
+ xvmuldp vs5, vs53, alpha_r
+ xvmuldp vs6, vs54, alpha_r
+ xvmuldp vs7, vs55, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+ xvmaddadp vs10, vs58, alpha_r
+ xvmaddadp vs11, vs59, alpha_r
+ xvmaddadp vs12, vs60, alpha_r
+ xvmaddadp vs13, vs61, alpha_r
+ xvmaddadp vs14, vs62, alpha_r
+ xvmaddadp vs15, vs63, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+ xvmuldp vs10, vs58, alpha_r
+ xvmuldp vs11, vs59, alpha_r
+ xvmuldp vs12, vs60, alpha_r
+ xvmuldp vs13, vs61, alpha_r
+ xvmuldp vs14, vs62, alpha_r
+ xvmuldp vs15, vs63, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=8 *
+*********************************************************************/
+
+.macro LOAD4x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_I1
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+.endm
+
+.macro SAVE4x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+ xvmaddadp vs2, vs50, alpha_r
+ xvmaddadp vs3, vs51, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+ xvmuldp vs2, vs50, alpha_r
+ xvmuldp vs3, vs51, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+ xvmaddadp vs10, vs58, alpha_r
+ xvmaddadp vs11, vs59, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+ xvmuldp vs10, vs58, alpha_r
+ xvmuldp vs11, vs59, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=4 *
+*********************************************************************/
+
+.macro LOAD4x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro SAVE4x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=2 *
+*********************************************************************/
+
+.macro LOAD4x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=1 *
+*********************************************************************/
+
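+/* The M=1 kernels operate on a single double per iteration, so they use
+ * the scalar VSX forms (lxsdx / xsmuldp / xsmaddadp / stxsdx) instead of
+ * the 2-wide vector instructions used above.
+ */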
+.macro LOAD4x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs48, alpha_r
+#else
+ xsmuldp vs0, vs48, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs56, alpha_r
+#else
+ xsmuldp vs8, vs56, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=16 *
+*********************************************************************/
+
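+/* The N=2 kernels use the same software-pipelining scheme as the N=4
+ * kernels above, but broadcast only two B values per iteration
+ * (vs24/vs25, then vs28/vs29), advance BO by 16 bytes, and keep the
+ * results in accumulators vs32..vs47.
+ */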
+.macro LOAD2x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL2x16_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro SAVE2x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+ xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=8 *
+*********************************************************************/
+
+.macro LOAD2x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro SAVE2x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=4 *
+*********************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro SAVE2x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=2 *
+*********************************************************************/
+
+.macro LOAD2x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=1 *
+*********************************************************************/
+
+.macro LOAD2x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=16 *
+*********************************************************************/
+
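+/* The N=1 kernels broadcast a single B value per iteration (vs24, then
+ * vs28), advance BO by 8 bytes, and keep the results in vs32..vs39.
+ */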
+.macro LOAD1x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL1x16_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro SAVE1x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=8 *
+*********************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=4 *
+*********************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=2 *
+*********************************************************************/
+
+.macro LOAD1x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=1 *
+*********************************************************************/
+
+.macro LOAD1x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
--- /dev/null
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xsadddp
+
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xsadddp
+
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xssubdp
+
+#else // CC || CR || RC || RR
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xssubdp
+
+#endif
+
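+/* The XSFADD_* aliases select add or subtract for the four scalar partial
+ * products of a complex multiply, so that each conjugation-variant group
+ * gets the right signs when the two accumulators per C element are folded
+ * together (presumably in the SAVE macros further down, not shown here).
+ * For the plain NN/NT/TN/TT case the product is
+ *
+ *   (ar*br - ai*bi) + i*(ai*br + ar*bi)
+ *
+ * while conjugating A and/or B flips the sign of the corresponding ai or
+ * bi terms, which is what the add/sub selection above encodes.
+ */
+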
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
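+/* In the complex kernels each lxvd2x of A loads one complex double
+ * (real, imag) into a VSX register, while the real and imaginary parts of
+ * each B element are broadcast separately with lxvdsx.  Every C element
+ * therefore uses two accumulators, one per B part, which are presumably
+ * combined via the XSFADD_* operations in the save path.  Rough sketch
+ * (not part of the build):
+ *
+ *   acc_re_b[i][j] += (ar, ai) * br;   // vs32, vs34, ...
+ *   acc_im_b[i][j] += (ar, ai) * bi;   // vs33, vs35, ...
+ */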
+.macro LOAD2x8_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL2x8_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmuldp vs48, vs0, vs18 // real*real, imag*real
+ xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs50, vs1, vs18 // real*real, imag*real
+ xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs52, vs2, vs18 // real*real, imag*real
+ xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs54, vs3, vs18 // real*real, imag*real
+ xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmuldp vs56, vs4, vs18 // real*real, imag*real
+ xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmuldp vs58, vs5, vs18 // real*real, imag*real
+ xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmuldp vs60, vs6, vs18 // real*real, imag*real
+ xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmuldp vs62, vs7, vs18 // real*real, imag*real
+ xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_1
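+
+// Accumulates the same products as KERNEL2x8_I1 (xvmaddadp instead of
+// xvmuldp), with the loads of the next A/B set (vs8-vs15, vs20-vs23)
+// interleaved between the FMAs; the paired KERNEL2x8_2 computes on that set
+// while reloading vs0-vs7 and vs16-vs19.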
+
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+
+ addi AO, AO, 64
+
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
+
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs50, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+
+ xvmaddadp vs52, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs54, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
+
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ xvmaddadp vs56, vs12, vs22 // real*real, imag*real
+ xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
+ xvmaddadp vs58, vs13, vs22 // real*real, imag*real
+ xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
+
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ xvmaddadp vs60, vs14, vs22 // real*real, imag*real
+ xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
+ xvmaddadp vs62, vs15, vs22 // real*real, imag*real
+ xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs50, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs52, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs54, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
+ xvmaddadp vs56, vs12, vs22 // real*real, imag*real
+ xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
+ xvmaddadp vs58, vs13, vs22 // real*real, imag*real
+ xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
+ xvmaddadp vs60, vs14, vs22 // real*real, imag*real
+ xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
+ xvmaddadp vs62, vs15, vs22 // real*real, imag*real
+ xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmuldp vs48, vs0, vs18 // real*real, imag*real
+ xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs50, vs1, vs18 // real*real, imag*real
+ xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs52, vs2, vs18 // real*real, imag*real
+ xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs54, vs3, vs18 // real*real, imag*real
+ xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmuldp vs56, vs4, vs18 // real*real, imag*real
+ xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmuldp vs58, vs5, vs18 // real*real, imag*real
+ xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmuldp vs60, vs6, vs18 // real*real, imag*real
+ xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmuldp vs62, vs7, vs18 // real*real, imag*real
+ xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x8
+
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
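+// Each block below reduces one accumulator pair to a single complex value:
+// the xxswapd/XSFADD_* sequence sums the real-part products into vs0 and the
+// cross products into vs1 (with the signs chosen by the defines at the top of
+// the file), then alpha is applied as
+//   c_real = vs0*alpha_r - vs1*alpha_i,  c_imag = vs0*alpha_i + vs1*alpha_r
+// and xxpermdi re-packs the pair for the vector store. The same pattern
+// repeats for every accumulator here and in the smaller SAVE macros.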
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs33 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs35 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs37 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs39 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs41 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs43 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs45 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs47 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs48 // realA*realB
+ XSFADD_R2 vs0, vs0, vs49 // imagA*imagB
+
+ xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs48 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs49 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs50 // realA*realB
+ XSFADD_R2 vs0, vs0, vs51 // imagA*imagB
+
+ xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs50 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs51 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs52 // realA*realB
+ XSFADD_R2 vs0, vs0, vs53 // imagA*imagB
+
+ xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs52 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs53 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs54 // realA*realB
+ XSFADD_R2 vs0, vs0, vs55 // imagA*imagB
+
+ xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs54 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs55 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs56 // realA*realB
+ XSFADD_R2 vs0, vs0, vs57 // imagA*imagB
+
+ xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs56 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs57 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs58 // realA*realB
+ XSFADD_R2 vs0, vs0, vs59 // imagA*imagB
+
+ xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs58 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs59 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs60 // realA*realB
+ XSFADD_R2 vs0, vs0, vs61 // imagA*imagB
+
+ xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs60 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs61 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs62 // realA*realB
+ XSFADD_R2 vs0, vs0, vs63 // imagA*imagB
+
+ xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs62 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs63 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+ addi CO, CO, 128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL2x4_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmuldp vs40, vs0, vs18 // real*real, imag*real
+ xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs42, vs1, vs18 // real*real, imag*real
+ xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs44, vs2, vs18 // real*real, imag*real
+ xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs46, vs3, vs18 // real*real, imag*real
+ xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs42, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs44, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs46, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs42, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs44, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs46, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmuldp vs40, vs0, vs18 // real*real, imag*real
+ xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs42, vs1, vs18 // real*real, imag*real
+ xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs44, vs2, vs18 // real*real, imag*real
+ xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs46, vs3, vs18 // real*real, imag*real
+ xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x4
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs33 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs35 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs37 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs39 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs41 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs43 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs45 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs47 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro LOAD2x2_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+
+.endm
+
+.macro KERNEL2x2_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmuldp vs36, vs0, vs18 // real*real, imag*real
+ xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs38, vs1, vs18 // real*real, imag*real
+ xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs38, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs38, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmuldp vs36, vs0, vs18 // real*real, imag*real
+ xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs38, vs1, vs18 // real*real, imag*real
+ xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x2
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs33 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs35 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs37 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs39 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+
+.endm
+
+.macro KERNEL2x1_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmuldp vs34, vs0, vs18 // real*real, imag*real
+ xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmuldp vs34, vs0, vs18 // real*real, imag*real
+ xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x1
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs33 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // imagA*realB
+ XSFADD_I2 vs1, vs1, vs35 // realA*imagB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
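+
+/* Pipeline sketch for the 1x8 kernels (as coded below): LOAD1x8_1 preloads
+* a[0..7] into vs0-vs7 and the splatted b.real/b.imag into vs16/vs17.
+* KERNEL1x8_I1 starts the accumulators with xvmuldp while prefetching the next
+* A set into vs8-vs15 and the next B into vs20/vs21; KERNEL1x8_1 and
+* KERNEL1x8_2 then ping-pong between the two register sets, and KERNEL1x8_E2
+* drains the last prefetched set without loading more.  KERNEL1x8_SUBI1 and
+* KERNEL1x8_SUB1 are the unpipelined single-step variants for the K remainder.
+* Per element the accumulators hold
+*
+*   vs32, vs34, ... = a[i] * b.real   // {realA*realB, imagA*realB}
+*   vs33, vs35, ... = a[i] * b.imag   // {realA*imagB, imagA*imagB}
+*
+* which SAVE1x8 reduces as described above.
+*/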
+
+.macro LOAD1x8_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x8
+
+
+ mr T1, CO
+ addi T2, T1, 64 // T2 -> second half of the 8 C elements (T1 + 64 bytes)
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+ addi CO, CO, 128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
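+
+/* Same double-buffered scheme as the 1x8 kernels above, with four A vectors
+* (vs0-vs3 / vs8-vs11) and accumulators vs32-vs39. */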
+
+.macro LOAD1x4_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x4
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
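+
+/* Same double-buffered scheme as the 1x8 kernels above, with two A vectors
+* (vs0-vs1 / vs8-vs9) and accumulators vs32-vs35. */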
+
+.macro LOAD1x2_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+
+.endm
+
+.macro KERNEL1x2_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x2
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
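+
+/* Same double-buffered scheme as the 1x8 kernels above, with a single A vector
+* (vs0 / vs8) and accumulators vs32-vs33. */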
+
+.macro LOAD1x1_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+
+.endm
+
+.macro KERNEL1x1_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x1
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 16
+
+.endm
+