#define ASSEMBLER
#include "common.h"
+
+
#define M $4
#define N $5
#define K $6
.L15: # N=4 M=4 K=2
#ifndef TRMMKERNEL
- and K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP, 2
#endif
.L14_M2:
- and M,MCO,2 # Remainder M = 2
+ andi M,MCO,2 # Remainder M = 2
beqz M,.L14_M1
nop
.L25: # N=4 M=2 K=2
#ifndef TRMMKERNEL
- and K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # k = KCO&2
#else
- and K,TEMP,2
+ andi K,TEMP,2
#endif
beqz K,.L28
nop
.L28: # N=4, M=2, K=1
#ifndef TRMMKERNEL
- and K,KCO,1
+ andi K,KCO,1
#else
- and K,TEMP,1
+ andi K,TEMP,1
#endif
beqz K,.L29 #
LD ALPHA,152($sp) # Get ALPHA
MADD t24,c24,t24,ALPHA
ST t13,0(CO3)
- move B,BO # Reset B
ST t23,1*SIZE(CO3)
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
.L14_M1:
- and M,MCO,1 # Remainder M = 1
+ andi M,MCO,1 # Remainder M = 1
beqz M,.L0_N4_Loop # M = 0, finishing one panel B
nop
daddu B,BO,TEMP
#endif
- gsLQC1(R8,F1,F0,0)
+ LD a0, 0 * SIZE(A)
+# gsLQC1(R8,F1,F0,0)
gsLQC1(R9,F9,F8,0) #b0,b1
MTC $0,t11
gsLQC1(R9,F11,F10,1) #b2,b3
beqz K,.L35
MOV t14,t11
-#else
+#else
+ # gemm
move B,BO
- gsLQC1(R8,F1,F0,0)
+ LD a0, 0 * SIZE(A)
+# gsLQC1(R8,F1,F0,0)
dsra K,KCO,2 # K=KCO>>2 (KCO/4): trip count of the 4x-unrolled K loop
gsLQC1(R9,F9,F8,0) #b0,b1
MTC $0,t11
#endif
.L31: # N=4 M=1 K=4
- gsLQC1(R8,F3,F2,1)
+# gsLQC1(R8,F3,F2,1)
+ LD a1, 1*SIZE(A)
gsLQC1(R9,F13,F12,2) # R9=B
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
-
+
+ LD a2, 2*SIZE(A)
gsLQC1(R9,F9,F8,4)
MADD t11,t11,a1,b4
MADD t12,t12,a1,b5
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
daddiu K,K,-1
-
+
+ LD a3, 3*SIZE(A)
gsLQC1(R9,F13,F12,6)
MADD t11,t11,a2,b0
MADD t12,t12,a2,b1
- daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=8*SIZE
gsLQC1(R9,F15,F14,7)
MADD t13,t13,a2,b2
MADD t14,t14,a2,b3
+
+ daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=4*SIZE
daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE
- gsLQC1(R8,F1,F0,0)
+# gsLQC1(R8,F1,F0,0)
+ LD a0, 0*SIZE(A)
gsLQC1(R9,F9,F8,0)
MADD t11,t11,a3,b4
MADD t12,t12,a3,b5
.L35: # N=4 M=1 K=2
#ifndef TRMMKERNEL
- and K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # k = KCO&2
#else
- and K,TEMP,2
+ andi K,TEMP,2
#endif
beqz K,.L38
nop
.L36:
+ LD a1,1*SIZE(A)
gsLQC1(R9,F13,F12,2) # R9=B
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
.L37:
LD a0,0(A)
-
gsLQC1(R9,F9,F8,0)
MADD t11,t11,a1,b4
MADD t12,t12,a1,b5
.L38: # N=4, M=1, K=1
#ifndef TRMMKERNEL
- and K,KCO,1
+ andi K,KCO,1
#else
andi K,TEMP,1
#endif
.align 5
.L0_N2:
- and N,NCO,2 # Remainder N = 2
+ andi N,NCO,2 # Remainder N = 2
beqz N,.L0_N1 # N=0,NCO<2
nop
.L45: # N=2 M=4 K=2
#ifndef TRMMKERNEL
- and K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
.L48: # N=2, M=4, K=1
#ifndef TRMMKERNEL
- and K,KCO,1
+ andi K,KCO,1
#else
andi K,TEMP,1
#endif
#endif
.L12_M2:
- and M,MCO,2 # Remainder M = 2
+ andi M,MCO,2 # Remainder M = 2
beqz M,.L12_M1
nop
.L55: # N=2 M=2 K=2
#ifndef TRMMKERNEL
- and K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
.L58: # N=2, M=2, K=1
#ifndef TRMMKERNEL
- and K,KCO,1
+ andi K,KCO,1
#else
- and K, TEMP, 1
+ andi K, TEMP, 1
#endif
beqz K,.L59 #
LD ALPHA,152($sp) # Get ALPHA
.L12_M1:
- and M,MCO,1 # Remainder M = 1
+ andi M,MCO,1 # Remainder M = 1
beqz M,.L0_N2_Loop # M = 0, finishing one panel B
nop
daddu B, BO, TEMP
#endif
MTC $0,t11
- gsLQC1(R8,F4,F0,0)
-
+#gsLQC1(R8,F4,F0,0)
+ LD a0, 0*SIZE(A)
MOV t21,t11
MOV t12,t11
gsLQC1(R9,F9,F8,0) #b0,b1
dsra K,KCO,2 # K=KCO>>2 (KCO/4): trip count of the 4x-unrolled K loop
MTC $0,t11
move B,BO # Reset B
- gsLQC1(R8,F4,F0,0)
-
+# gsLQC1(R8,F4,F0,0)
+ LD a0,0*SIZE(A)
MOV t21,t11
MOV t12,t11
gsLQC1(R9,F9,F8,0) #b0,b1
#endif
.L61: # N=2 M=1 K=4
+ LD a4, 1*SIZE(A)
gsLQC1(R9,F13,F12,1) # R9=B
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
+ LD a2, 2*SIZE(A)
gsLQC1(R9,F11,F10,2)
MADD t11,t11,a4,b4
MADD t12,t12,a4,b5
- daddiu K,K,-1
- gsLQC1(R8,F6,F2,1)
+# gsLQC1(R8,F6,F2,1)
+ LD a6, 3*SIZE(A)
MADD t11,t11,a2,b2
+ MADD t12,t12,a2,b3
+ daddiu K,K,-1
gsLQC1(R9,F15,F14,3)
- MADD t12,t12,a2,b3
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
+# gsLQC1(R8,F4,F0,0)
- gsLQC1(R8,F4,F0,0)
+ LD a0, 0*SIZE(A)
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
gsLQC1(R9,F9,F8,0)
.L65: # N=2 M=1 K=2
#ifndef TRMMKERNEL
- and K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # k = KCO&2
#else
- and K,TEMP,2
+ andi K,TEMP,2
#endif
beqz K,.L68
nop
.L66:
- gsLQC1(R9,F13,F12,1) # R9=B
+ LD a4, 1*SIZE(A)
MADD t11,t11,a0,b0
+
+ gsLQC1(R9,F13,F12,1) # R9=B
MADD t12,t12,a0,b1
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
daddu B,B,4*SIZE
.L68: # N=2, M=1, K=1
#ifndef TRMMKERNEL
- and K,KCO,1
+ andi K,KCO,1
#else
- and K,TEMP,1
+ andi K,TEMP,1
#endif
beqz K,.L69 #
LD ALPHA,152($sp) # Get ALPHA
.align 5
.L0_N1:
- and N,NCO,1 # Remainder N = 1
+ andi N,NCO,1 # Remainder N = 1
beqz N,.L999 # N=0,NCO<1
nop
daddu A, A, K
daddu B, BO, TEMP
#endif
- gsLQC1(R9,F12,F8,0)
+# gsLQC1(R9,F12,F8,0)
+ LD b0, 0*SIZE(B)
MTC $0,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
#else
move B, BO
dsra K,KCO,2 # K=KCO>>2 (KCO/4): trip count of the 4x-unrolled K loop
- gsLQC1(R9,F12,F8,0)
+# gsLQC1(R9,F12,F8,0)
+ LD b0, 0*SIZE(B)
MTC $0,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
+ LD b4, 1*SIZE(B)
FETCH $0,(PREA)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
.L72:
- gsLQC1(R9,F14,F10,1)
+# gsLQC1(R9,F14,F10,1)
gsLQC1(R8,F1,F0,4)
gsLQC1(R8,F3,F2,5)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
+ LD b2, 2*SIZE(B)
FETCH $0,4*SIZE(PREA)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
gsLQC1(R8,F5,F4,6)
gsLQC1(R8,F7,F6,7)
MADD t11,t11,a0,b2
+
+ LD b6, 3*SIZE(B)
MADD t21,t21,a1,b2
- daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+ daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
FETCH $0,8*SIZE(PREA)
MADD t31,t31,a2,b2
MADD t41,t41,a3,b2
- daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
.L74:
- gsLQC1(R9,F12,F8,0)
+# gsLQC1(R9,F12,F8,0)
gsLQC1(R8,F1,F0,0)
daddu PREA,PREA,16*SIZE
gsLQC1(R8,F3,F2,1)
MADD t11,t11,a4,b6
MADD t21,t21,a5,b6
+
+ LD b0, 0*SIZE(B)
daddiu K,K,-1
-
FETCH $0,-32(PREA)
+
MADD t31,t31,a6,b6
bnez K,.L71
MADD t41,t41,a7,b6
.L75: # N=1 M=4 K=2
#ifndef TRMMKERNEL
- and K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # k = KCO&2
#else
- and K,TEMP,2
+ andi K,TEMP,2
#endif
beqz K,.L78
nop
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
+ daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+ LD b4, 1*SIZE(B)
FETCH $0,0(PREA)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
- daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
.L77:
- LD b0,0(B)
gsLQC1(R8,F1,F0,0)
gsLQC1(R8,F3,F2,1)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
+ LD b0,0(B)
FETCH $0,4*SIZE(PREA)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
.L78: # N=1, M=4, K=1
#ifndef TRMMKERNEL
- and K,KCO,1
+ andi K,KCO,1
#else
- and K,TEMP,1
+ andi K,TEMP,1
#endif
beqz K,.L79 #
LD ALPHA,152($sp) # Get ALPHA
.L11_M2:
- and M,MCO,2 # Remainder M = 2
+ andi M,MCO,2 # Remainder M = 2
beqz M,.L11_M1
nop
daddu B, BO, TEMP
#endif
- gsLQC1(R9,F12,F8,0)
+# gsLQC1(R9,F12,F8,0)
+ LD b0, 0*SIZE(B)
MTC $0,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
#else
move B, BO
dsra K,KCO,2 # K=KCO>>2 (KCO/4): trip count of the 4x-unrolled K loop
- gsLQC1(R9,F12,F8,0)
+# gsLQC1(R9,F12,F8,0)
+ LD b0, 0*SIZE(B)
MTC $0,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
#endif
.L81: # N=1,M=2,K=4
+ LD b4, 1*SIZE(B)
gsLQC1(R8,F5,F4,1) # R8=A
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
+ LD b2, 2*SIZE(B)
gsLQC1(R8,F3,F2,2)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
-
- gsLQC1(R9,F14,F10,1)
- daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+# gsLQC1(R9,F14,F10,1)
+
+ LD b6, 3*SIZE(B)
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b2
+
MADD t21,t21,a3,b2
daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
- gsLQC1(R9,F12,F8,0)
- daddiu K,K,-1
-
+# gsLQC1(R9,F12,F8,0)
gsLQC1(R8,F1,F0,0)
+ daddiu K,K,-1
MADD t11,t11,a6,b6
+
+ LD b0, 0*SIZE(B)
bnez K,.L81
MADD t21,t21,a7,b6
.L85: # N=1 M=2 K=2
#ifndef TRMMKERNEL
- and K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
.L86:
gsLQC1(R8,F5,F4,1) # R8=A
+ LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
+
MADD t21,t21,a1,b0
- daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
-
- LD b0,0(B)
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
gsLQC1(R8,F1,F0,0)
+ LD b0,0(B)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
.L88: # N=1, M=2, K=1
#ifndef TRMMKERNEL
- and K,KCO,1
+ andi K,KCO,1
#else
andi K,TEMP,1
#endif
.L11_M1:
- and M,MCO,1 # Remainder M = 1
+ andi M,MCO,1 # Remainder M = 1
beqz M,.L999 # M = 0, End
nop
daddu A, A, K
daddu B, BO, TEMP
#endif
- gsLQC1(R8,F4,F0,0)
+# gsLQC1(R8,F4,F0,0)
MTC $0,t11
- gsLQC1(R9,F12,F8,0)
+# gsLQC1(R9,F12,F8,0)
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
#else
move B, BO
dsra K,KCO,2 # K=KCO>>2 (KCO/4): trip count of the 4x-unrolled K loop
- gsLQC1(R8,F4,F0,0)
- gsLQC1(R9,F12,F8,0)
+# gsLQC1(R8,F4,F0,0)
+# gsLQC1(R9,F12,F8,0)
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
beqz K,.L95
MTC $0,t11
#endif
.L91: # N=1,M=1,K=4
- gsLQC1(R8,F6,F2,1)
+# gsLQC1(R8,F6,F2,1)
+ LD a4, 1*SIZE(A)
+ LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
- gsLQC1(R9,F14,F10,1)
+# gsLQC1(R9,F14,F10,1)
+ LD a2, 2*SIZE(A)
+ LD b2, 2*SIZE(B)
MADD t11,t11,a4,b4
- daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
- gsLQC1(R8,F4,F0,0)
+# gsLQC1(R8,F4,F0,0)
+ LD a6, 3*SIZE(A)
+ LD b6, 3*SIZE(B)
MADD t11,t11,a2,b2
- daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
- gsLQC1(R9,F12,F8,0)
+ daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+# gsLQC1(R9,F12,F8,0)
MADD t11,t11,a6,b6
+
daddiu K,K,-1
bnez K,.L91
nop
.L95: # N=1 M=1 K=2
#ifndef TRMMKERNEL
- and K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
nop
.L96:
+ LD a4, 1*SIZE(A)
+ LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
- MADD t11,t11,a4,b4
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
LD b0,0(B)
LD a0,0(A)
+ MADD t11,t11,a4,b4
+
.L98: # N=1, M=1, K=1
#ifndef TRMMKERNEL
- and K,KCO,1
+ andi K,KCO,1
#else
andi K,TEMP,1
#endif