From 14f81da375232998e8c1f149ab61db43bfb300af Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 23 Jun 2011 10:46:58 +0000 Subject: [PATCH] Change prefetch length of A and B, the performance is 2.1G now. --- kernel/mips64/zgemm_kernel_loongson3a.S | 359 ++++++++++++++++++-------------- 1 file changed, 200 insertions(+), 159 deletions(-) diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a.S index 0b0d731..4960367 100644 --- a/kernel/mips64/zgemm_kernel_loongson3a.S +++ b/kernel/mips64/zgemm_kernel_loongson3a.S @@ -6,6 +6,7 @@ #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + #define STACKSIZE 160 #define M $4 #define N $5 @@ -109,12 +110,18 @@ #define ALPHA_R $f15 #define ALPHA_I $f16 -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +####if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB -#endif +###endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD @@ -166,25 +173,28 @@ sdc1 $f23,112($sp) #endif - dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + dsra J, N, 1 # J=N/2 ST ALPHA_R, 128($sp) # store alpha_r & alpha_i - dsra J, N, 1 # J=N/2 + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 ST ALPHA_I, 136($sp) - dsll PREB, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 - blez J, .L20 - dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 .align 5 .L10: daddiu J, J, -1 - move CO1, C # Fix pointer Cx + dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + move CO1, C # Fix pointer Cx daddu CO2, C, LDC + move AO, A # Reset AO + daddu PREB, PREB, B # PREA=A+panel size - dsra I, M, 1 # I=M/2 blez I, .L30 daddu PREA, PREA, A # PREA=A+panel size @@ -192,41 +202,32 @@ dsra L, K, 2 # Unroll K 4 times move BO, B - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c21, c11 MOV c22, c11 - - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MOV c23, c11 MOV c24, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - FETCH $0, 0 * SIZE(PREA) # LOAD 32 Byte 4 double - daddu PREB, PREB, B # PREA=A+panel size - - FETCH $0, 0 * SIZE(CO1) MOV c31, c11 MOV c32, c11 - - FETCH $0, 0 * SIZE(CO2) + MOV c33, c11 MOV c34, c11 - - FETCH $0, 0 * SIZE(PREB) - MOV c41, c11 - FETCH $0, 4 * SIZE(CO1) + MOV c41, c11 MOV c42, c11 + MOV c43, c11 - - FETCH $0, 4 * SIZE(CO2) blez L, .L15 MOV c44, c11 @@ -234,26 +235,26 @@ .L12: gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F13, F12, 2) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - gsLQC1(R13, F13, F12, 2) + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - gsLQC1(R12, F11, F10, 3) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - gsLQC1(R13, F16, F15, 3) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 @@ -262,27 +263,27 @@ MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 - gsLQC1(R12, F1, F0, 4) # Unroll K=2 + gsLQC1(R12, F1, F0, 4) # unroll k=2 + gsLQC1(R13, F5, F4, 4) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd - gsLQC1(R13, F5, F4, 4) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd gsLQC1(R12, F3, F2, 5) + gsLQC1(R13, F7, F6, 5) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 - gsLQC1(R13, F7, F6, 5) MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 - FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 @@ -292,61 +293,61 @@ MADD4 c44, c44, a8, b8 gsLQC1(R12, F9, F8, 6) # Unroll K=3 + gsLQC1(R13, F13, F12, 6) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - daddiu L, L, -1 - gsLQC1(R13, F13, F12, 6) + gsLQC1(R13, F16, F15, 7) + gsLQC1(R12, F11, F10, 7) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - gsLQC1(R12, F11, F10, 7) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx - gsLQC1(R13, F16, F15, 7) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 - daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx FETCH $0, 12 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + daddiu L, L, -1 FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 - daddu PREA, PREA, 16 * SIZE MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE daddu PREB, PREB, 16 * SIZE MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 - FETCH $0, 0 * SIZE(PREB) MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 @@ -362,46 +363,52 @@ .L15: andi L, K, 3 LD ALPHA_R, 128($sp) - NOP blez L, .L18 LD ALPHA_I, 136($sp) .align 5 .L16: - daddiu L, L, -1 - daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx - daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx - + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 + FETCH $0, 0 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 + FETCH $0, 0 * SIZE(PREB) MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - bgtz L, .L16 NOP .L18: + ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 @@ -426,170 +433,196 @@ MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 - MADD a3, a3, ALPHA_R, c31 - MADD a4, a4, ALPHA_R, c32 - MADD b3, b3, ALPHA_R, c41 - MADD b4, b4, ALPHA_R, c42 - NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 - ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 - ST b1, 2 * SIZE(CO1) - ST b2, 3 * SIZE(CO1) + ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO1) + FETCH $0, 12 * SIZE(CO2) + FETCH $0, 12 * SIZE(CO1) + FETCH $0, 16 * SIZE(CO2) + FETCH $0, 16 * SIZE(CO1) + daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 daddiu CO2,CO2, 4 * SIZE + .L30: andi I, M, 1 daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size blez I, .L19 daddu C, C, LDC # Change C to next panel dsra L, K, 2 # Unroll K 4 times move BO, B + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MOV c31, c11 MOV c32, c11 - MOV c33, c11 - MOV c34, c11 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + FETCH $0, 0 * SIZE(PREB) + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + MOV c33, c11 blez L, .L35 - NOP + MOV c34, c11 - .align 3 + .align 5 .L32: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F13, F12, 2) - gsLQC1(R13, F16, F15, 3) - MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + NOP MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 + NOP gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R13, F5, F4, 4) - gsLQC1(R13, F7, F6, 5) - MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd + + gsLQC1(R13, F7, F6, 5) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd + NOP MADD1 c31, c31, a3, b7 # A1xB2 MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a4, b7 MADD4 c34, c34, a4, b8 - daddiu L, L, -1 + gsLQC1(R12, F11, F10, 3) gsLQC1(R13, F13, F12, 6) - gsLQC1(R13, F16, F15, 7) - MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd + + gsLQC1(R13, F16, F15, 7) MADD2 c12, c12, a6, b1 # bxc MADD4 c14, c14, a6, b2 # bxd - - daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx MADD1 c31, c31, a5, b3 # A1xB2 MADD3 c33, c33, a5, b4 + + FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a6, b3 MADD4 c34, c34, a6, b4 + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a8, b5 # bxc MADD4 c14, c14, a8, b6 # bxd + daddiu PREB, PREB, 16 * SIZE MADD1 c31, c31, a7, b7 # A1xB2 MADD3 c33, c33, a7, b8 - MADD2 c32, c32, a8, b7 - MADD4 c34, c34, a8, b8 + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a8, b7 bgtz L, .L32 - NOP + MADD4 c34, c34, a8, b8 - .align 3 .L35: andi L, K, 3 LD ALPHA_R, 128($sp) - LD ALPHA_I, 136($sp) - blez L, .L38 NOP - .align 3 + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 .L36: - daddiu L, L, -1 - daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx - daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + NOP bgtz L, .L36 - NOP + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 .L38: ADD c11, c14, c11 - ADD c12, c13, c12 - - ADD c31, c34, c31 - ADD c32, c33, c32 - LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) + ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) MADD a1, a1, ALPHA_R, c11 @@ -613,43 +646,48 @@ daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE - .align 3 + .align 5 .L19: bgtz J, .L10 move B, BO - .align 3 + .align 5 .L20: andi J, N, 1 blez J, .L999 - NOP + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + dsra I, M, 1 # I=M/2 move CO1, C - move AO, A # Reset AO - dsra I, M, 1 # I=M/2 + move AO, A # Reset AO blez I, .L29 - NOP + daddu PREA, PREA, A .L21: dsra L, K, 2 # Unroll K 4 times move BO, B + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c21, c11 MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) blez L, .L25 NOP @@ -658,110 +696,116 @@ .L22: gsLQC1(R12, F9, F8, 2) # Unroll K=1 - gsLQC1(R12, F11, F10, 3) - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + gsLQC1(R12, F11, F10, 3) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 gsLQC1(R12, F1, F0, 4) # Unroll K=2 - gsLQC1(R12, F3, F2, 5) - gsLQC1(R13, F13, F12, 2) - MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd + + gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a6, b3 # bxc MADD4 c14, c14, a6, b4 # bxd + gsLQC1(R12, F3, F2, 5) MADD1 c21, c21, a7, b3 # A2xB1 MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) MADD2 c22, c22, a8, b3 MADD4 c24, c24, a8, b4 - - daddiu L, L, -1 - gsLQC1(R12, F9, F8, 6) # Unroll K=3 - gsLQC1(R12, F11, F10, 7) - gsLQC1(R13, F16, F15, 3) + gsLQC1(R12, F9, F8, 6) # Unroll K=3 MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd + + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b5 # bxc MADD4 c14, c14, a2, b6 # bxd - daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx - daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx - + gsLQC1(R12, F11, F10, 7) MADD1 c21, c21, a3, b5 # A2xB1 MADD3 c23, c23, a3, b6 + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd + daddiu PREA, PREA, 16 * SIZE + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a6, b7 # bxc MADD4 c14, c14, a6, b8 # bxd + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MADD1 c21, c21, a7, b7 # A2xB1 MADD3 c23, c23, a7, b8 - MADD2 c22, c22, a8, b7 - MADD4 c24, c24, a8, b8 + FETCH $0, 0 * SIZE(PREA) + MADD2 c22, c22, a8, b7 bgtz L, .L22 - NOP + MADD4 c24, c24, a8, b8 - .align 3 .L25: andi L, K, 3 LD ALPHA_R, 128($sp) - LD ALPHA_I, 136($sp) + blez L, .L28 - NOP + LD ALPHA_I, 136($sp) .align 3 .L26: - daddiu L, L, -1 - daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx - daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx + daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 bgtz L, .L26 - NOP + FETCH $0, 0 * SIZE(PREA) .L28: ADD c11, c14, c11 - ADD c12, c13, c12 - ADD c21, c24, c21 - ADD c22, c23, c22 - LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) daddiu I, I, -1 @@ -792,15 +836,16 @@ dsra L, K, 2 # Unroll K 4 times move BO, B + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 - MOV c13, c11 - MOV c14, c11 - - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) blez L, .L45 NOP @@ -808,53 +853,49 @@ .L42: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd gsLQC1(R12, F9, F8, 2) # Unroll K=1 - gsLQC1(R13, F13, F12, 2) - MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd + + gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd - daddiu L, L, -1 - gsLQC1(R12, F11, F10, 3) - gsLQC1(R13, F16, F15, 3) + gsLQC1(R12, F11, F10, 3) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd - daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx - daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a8, b7 # bxc + bgtz L, .L42 MADD4 c14, c14, a8, b8 # bxd - bgtz L, .L42 - NOP - .align 3 + .align 5 .L45: andi L, K, 3 LD ALPHA_R, 128($sp) - LD ALPHA_I, 136($sp) blez L, .L48 - NOP - .align 3 + LD ALPHA_I, 136($sp) .L46: daddiu L, L, -1 @@ -892,7 +933,7 @@ - .align 3 + .align 5 .L999: LDARG $16, 0($sp) -- 2.7.4