LDGEMM_L4_BEGIN:
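+	// 4-column panel of C: CO walks the current panel while C is advanced by 4*LDC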
- mr CO, C
+ li T1, 128
+ li T2, 256
mr AO, A
- slwi T1, LDC , 2
- add C, C, T1
+
+ mr CO, C
+ slwi T3, LDC , 2
+ add C, C, T3
+
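+	// touch (dcbt) the cache lines at A+128 and A+256 so the next lines of A are already on their way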
+ dcbt A, T1
+ dcbt A, T2
+
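+	// I = M / 16: number of 16-row tiles in this 4-column panel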
srawi. I, M, 4
ble LDGEMM_L4x16_END
.align 4
+LDGEMM_L4x16_BEGIN_FIRST:
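+	// first 16x4 tile of the panel: same kernel as LDGEMM_L4x16_BEGIN below,
+	// but the four C rows written later by SAVE4x16 are prefetched up front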
+
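+	// round the four C row addresses down to a 128-byte boundary (L = -128 is the alignment mask)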
+ li L, -128
+
+ mr T1, CO
+ add T2, T1, LDC
+ add T3, T2, LDC
+ add T4, T3, LDC
+
+ and T1, T1, L
+ and T2, T2, L
+ and T3, T3, L
+ and T4, T4, L
+
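+	// touch the first cache line of each of the four C rows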
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
+
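+	// BO walks the B panel; L = K / 4 is the trip count of the unrolled main loop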
+ mr BO, B
+ srawi. L, K, 2
+
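+	// step 128 bytes ahead and touch the second cache line of each C row
+	// (one 16-wide row of the tile is 16 doubles = 128 bytes)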
+ addi T1, T1, 128
+ addi T2, T2, 128
+ addi T3, T3, 128
+ addi T4, T4, 128
+
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
+
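+	// K/4 == 0: only the K mod 4 tail; K/4 == 1: a single unrolled pass, no main loop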
+ ble LDGEMM_L4x16_SUB0_FIRST
+ cmpwi cr0, L, 1
+ ble LDGEMM_L4x16_SUB4_FIRST
+
+ .align 4
+LDGEMM_L4x16_LOOP_START_FIRST:
+
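+	// T2 = 512 is the prefetch distance into BO; o40/o56 hold the constant byte offsets 40 and 56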
+ li T2, 512
+ li o40, 40
+ li o56, 56
+
+ dcbt AO, PRE
+ dcbt BO, T2
+ LOAD4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_I1
+ dcbt AO, PRE
+ addic. L, L, -2
+ KERNEL4x16_L2
+
+ dcbt AO, PRE
+ KERNEL4x16_L1
+ dcbt AO, PRE
+ dcbt BO, T2
+ KERNEL4x16_L2
+
+ ble LDGEMM_L4x16_LOOP_END_FIRST
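+	// remaining iterations go into the count register so the loop can close with bdnz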
+ mtctr L
+
+ .align 4
+
+LDGEMM_L4x16_LOOP_FIRST:
+
+ dcbt AO, PRE
+ KERNEL4x16_L1
+ dcbt AO, PRE
+ KERNEL4x16_L2
+
+ dcbt AO, PRE
+ KERNEL4x16_L1
+ dcbt AO, PRE
+ dcbt BO, T2
+ KERNEL4x16_L2
+
+ bdnz LDGEMM_L4x16_LOOP_FIRST
+
+ .align 4
+
+LDGEMM_L4x16_LOOP_END_FIRST:
+
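+	// drain the pipelined loop: one more L1/L2 pair, then the closing KERNEL4x16_1 / KERNEL4x16_E2 pair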
+ KERNEL4x16_L1
+ KERNEL4x16_L2
+
+ KERNEL4x16_1
+ KERNEL4x16_E2
+
+ b LDGEMM_L4x16_SUB1_FIRST
+
+LDGEMM_L4x16_SUB4_FIRST:
+
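+	// K/4 == 1: exactly one unrolled pass of four k-steps, no ctr loop needed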
+ KERNEL4x16_SUBI1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+
+ b LDGEMM_L4x16_SUB1_FIRST
+
+LDGEMM_L4x16_SUB0_FIRST:
+
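+	// K < 4: no unrolled pass; KERNEL4x16_SUBI1 performs the first k-step, the rest go through SUB2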
+ andi. L, K, 3
+
+ KERNEL4x16_SUBI1
+
+ addic. L, L, -1
+ ble LDGEMM_L4x16_SAVE_FIRST
+ b LDGEMM_L4x16_SUB2_FIRST
+
+LDGEMM_L4x16_SUB1_FIRST:
+
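+	// common tail: process the K mod 4 leftover k-steps one at a time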
+ andi. L, K, 3
+ ble LDGEMM_L4x16_SAVE_FIRST
+
+LDGEMM_L4x16_SUB2_FIRST:
+
+ KERNEL4x16_SUB1
+
+ addic. L, L, -1
+ bgt LDGEMM_L4x16_SUB2_FIRST
+
+ .align 4
+LDGEMM_L4x16_SAVE_FIRST:
+
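+	// write back the finished 16x4 tile, then fall through to the generic tile loop below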
+ SAVE4x16
+
+ addic. I, I, -1
+ ble LDGEMM_L4x16_END
+
+LDGEMM_L4x16_END_FIRST:
+
+ .align 4
LDGEMM_L4x16_BEGIN:
li L, -128
dcbt T3, r0
dcbt T4, r0
- ble LDGEMM_L4x16_SUB0
+ ble- LDGEMM_L4x16_SUB0
cmpwi cr0, L, 1
- ble LDGEMM_L4x16_SUB4
+ ble- LDGEMM_L4x16_SUB4
.align 4
LDGEMM_L4x16_LOOP_START:
addic. L, L, -2
KERNEL4x16_L2
- ble LDGEMM_L4x16_LOOP_END
+ ble- LDGEMM_L4x16_LOOP_END
+ mtctr L
.align 4
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
- addic. L, L, -1
+	// addic. L, L, -1   (decrement now handled by the count register / bdnz)
KERNEL4x16_L2
- bgt LDGEMM_L4x16_LOOP
+ bdnz+ LDGEMM_L4x16_LOOP
.align 4
SAVE4x16
addic. I, I, -1
- bgt LDGEMM_L4x16_BEGIN
+ bgt+ LDGEMM_L4x16_BEGIN
LDGEMM_L4x16_END:
.macro SAVE4x16
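+	// C += alpha * (A*B) for one 16x4 tile: loads of C, the xvmaddadp updates and the
+	// stores are interleaved, and T2/T3/T4 are computed on the fly, so address setup
+	// and loads overlap the arithmetic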
- mr T1, CO
- add T2, T1, LDC
- add T3, T2, LDC
- add T4, T3, LDC
+ add T2, CO, LDC
lxvd2x vs0, 0, CO
lxvd2x vs1, o16, CO
lxvd2x vs3, o48, CO
lxvd2x vs4, o64, CO
lxvd2x vs5, o80, CO
+ add T3, T2, LDC
lxvd2x vs6, o96, CO
lxvd2x vs7, o112, CO
lxvd2x vs11, o48, T2
lxvd2x vs12, o64, T2
lxvd2x vs13, o80, T2
+ add T4, T3, LDC
lxvd2x vs14, o96, T2
lxvd2x vs15, o112, T2
lxvd2x vs31, o112, T3
xvmaddadp vs0, vs32, alpha_r
- xvmaddadp vs1, vs33, alpha_r
- xvmaddadp vs2, vs34, alpha_r
- xvmaddadp vs3, vs35, alpha_r
- xvmaddadp vs4, vs36, alpha_r
- xvmaddadp vs5, vs37, alpha_r
- xvmaddadp vs6, vs38, alpha_r
- xvmaddadp vs7, vs39, alpha_r
-
lxvd2x vs32, 0, T4
+ xvmaddadp vs1, vs33, alpha_r
lxvd2x vs33, o16, T4
+ xvmaddadp vs2, vs34, alpha_r
lxvd2x vs34, o32, T4
+ xvmaddadp vs3, vs35, alpha_r
lxvd2x vs35, o48, T4
+ xvmaddadp vs4, vs36, alpha_r
lxvd2x vs36, o64, T4
+ xvmaddadp vs5, vs37, alpha_r
lxvd2x vs37, o80, T4
+ xvmaddadp vs6, vs38, alpha_r
lxvd2x vs38, o96, T4
+ xvmaddadp vs7, vs39, alpha_r
lxvd2x vs39, o112, T4
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
- stxvd2x vs2, o32, T1
- stxvd2x vs3, o48, T1
-
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
- stxvd2x vs4, o64, T1
- stxvd2x vs5, o80, T1
- stxvd2x vs6, o96, T1
- stxvd2x vs7, o112, T1
-
xvmaddadp vs24, vs48, alpha_r
xvmaddadp vs25, vs49, alpha_r
xvmaddadp vs26, vs50, alpha_r
xvmaddadp vs27, vs51, alpha_r
- stxvd2x vs8, o0, T2
- stxvd2x vs9, o16, T2
- stxvd2x vs10, o32, T2
- stxvd2x vs11, o48, T2
-
xvmaddadp vs28, vs52, alpha_r
xvmaddadp vs29, vs53, alpha_r
xvmaddadp vs30, vs54, alpha_r
xvmaddadp vs31, vs55, alpha_r
- stxvd2x vs12, o64, T2
- stxvd2x vs13, o80, T2
- stxvd2x vs14, o96, T2
- stxvd2x vs15, o112, T2
+ stxvd2x vs0, 0, CO
+ stxvd2x vs1, o16, CO
+ stxvd2x vs2, o32, CO
+ stxvd2x vs3, o48, CO
+
+ stxvd2x vs4, o64, CO
+ stxvd2x vs5, o80, CO
+ stxvd2x vs6, o96, CO
+ stxvd2x vs7, o112, CO
xvmaddadp vs32, vs56, alpha_r
xvmaddadp vs33, vs57, alpha_r
xvmaddadp vs34, vs58, alpha_r
xvmaddadp vs35, vs59, alpha_r
- stxvd2x vs24, 0, T3
- stxvd2x vs25, o16, T3
- stxvd2x vs26, o32, T3
- stxvd2x vs27, o48, T3
-
xvmaddadp vs36, vs60, alpha_r
xvmaddadp vs37, vs61, alpha_r
xvmaddadp vs38, vs62, alpha_r
xvmaddadp vs39, vs63, alpha_r
+ addi CO, CO, 128
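+	// advance CO past the 16 doubles (128 bytes) just stored for the first row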
+
+ stxvd2x vs8, o0, T2
+ stxvd2x vs9, o16, T2
+ stxvd2x vs10, o32, T2
+ stxvd2x vs11, o48, T2
+
+ stxvd2x vs12, o64, T2
+ stxvd2x vs13, o80, T2
+ stxvd2x vs14, o96, T2
+ stxvd2x vs15, o112, T2
+
+ stxvd2x vs24, 0, T3
+ stxvd2x vs25, o16, T3
stxvd2x vs28, o64, T3
stxvd2x vs29, o80, T3
+
+ stxvd2x vs26, o32, T3
+ stxvd2x vs27, o48, T3
stxvd2x vs30, o96, T3
stxvd2x vs31, o112, T3
stxvd2x vs34, o32, T4
stxvd2x vs35, o48, T4
- addi CO, CO, 128
-
stxvd2x vs36, o64, T4
stxvd2x vs37, o80, T4
stxvd2x vs38, o96, T4
#define DNUMOPT 8
#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 4096
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#define GEMM_DEFAULT_OFFSET_B 65536
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
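+// 64 KiB buffer offset and alignment mask (0x0ffff) instead of the previous 16 KiB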
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
#define ZGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_Q 640
-#define DGEMM_DEFAULT_Q 640
+#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640