-#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
-#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
-#define FETCH ld
-
#define REALNAME ASMNAME
-
#define ASSEMBLER
#include "common.h"
-
-
+#define FETCH ld
+#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
+#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define M $4
#define N $5
ST $f23,144($sp)
- .align 5 # BACKUP
-.L0_N4: # Loop N
- ST ALPHA,152($sp) # Backup ALPHA
-
- move MCO,M # Backup M
+ .align 5
+.L0_N4: # Loop N
+ ST ALPHA,152($sp) # Backup ALPHA
+ move MCO,M # Backup M
- move NCO,N # Backup N
- move KCO,K # Backup K
+ move NCO,N # Backup N
+ move KCO,K # Backup K
- move AO,A # Backup A_addr
- dsra N,NCO,2 # N=NCO/2
+ move AO,A # Backup A_addr
+ dsra N,NCO,2 # N=NCO/4 (arithmetic shift right by 2)
dsll LDC,LDC,BASE_SHIFT # LDC*8Byte
- dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*NR(4)*8Byte=KC*2^5
+ dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5
- move BO,B # Backup B_addr
-
#if defined(TRMMKERNEL)
- LDARG OFFSET,160($sp) #
+ LDARG OFFSET,160($sp) # OFFSET is related to the data part
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
- neg KK,OFFSET # right
+ neg KK,OFFSET
#endif
- beq N,$0,.L0_N2 # N=0,NCO<4
- dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*4mr*8Byte
+ move BO,B # Backup B_addr
+ beq N,$0,.L0_N2 # N=0,NCO<4
+ dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte
-.L0_N4_Lb:
- move CO1,C # Set C
- dsra M,MCO,2 # M=MCO/2
+.L0_N4_Lb: # mr=4,nr=4
+ move CO1,C
+ dsra M,MCO,2 # M=MCO/4 (arithmetic shift right by 2)
- move A,AO # Reset A
- daddu CO2,CO1,LDC
+ move A,AO # Reset A
+ daddu CO2,C,LDC
+ daddu PREB,BO,SPANB # PreB point next panelB
daddu CO3,CO2,LDC
- daddu PREB,BO,SPANB # PreB point next panelB
- daddu CO4,CO3,LDC
daddu PREA,AO,SPANA
+ daddu CO4,CO3,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
- move KK,OFFSET # left
+ move KK,OFFSET
#endif
-
beqz M,.L14_M2
- daddu C,CO4,LDC
+ daddu C,CO4,LDC # move C to next panel Cj
.L10:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- move B,BO
+ move B,BO # (SIDE=L and UPLO=L) or (SIDE=R and UPLO=U)
#else
- dsll K,KK,2 + BASE_SHIFT # KK no data part
+ dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part
dsll TEMP,KK,2 + BASE_SHIFT
- daddu A,A,K # move A B to data part
+ daddu A,A,K # move A B to data part
daddu B,BO,TEMP
#endif
MTC $0,t11
MOV t21,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
+ gsLQC1(R8,F1,F0,0) # a0,a1
MOV t31,t11
MOV t41,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
+ gsLQC1(R9,F9,F8,0) # b0,b1
MOV t12,t11
MOV t22,t11
- gsLQC1(R8,F3,F2,1) #a2,a3
+ gsLQC1(R8,F3,F2,1) # a2,a3
MOV t32,t11
MOV t42,t11
- gsLQC1(R9,F11,F10,1) #b2,b3
+ gsLQC1(R9,F11,F10,1) # b2,b3
MOV t13,t11
MOV t23,t11
MOV t14,t11
MOV t24,t11
- MOV t34,t11
- MOV t44,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
- dsubu TEMP,KCO,KK # temp = kco - kk
+ dsubu TEMP,KCO,KK # temp is the length of the data part
#elif defined(LEFT)
- daddiu TEMP, KK, 4
+ daddiu TEMP, KK, 4 # S=L,U=L
#else
- daddiu TEMP, KK, 4
+ daddiu TEMP, KK, 4 # S=R,U=U; in these two cases KK is the length of the data part
#endif
-
- dsra K,TEMP,2 # K=KCO/2
+ dsra K,TEMP,2 # K=TEMP/4 (count of kr=4 iterations)
+ MOV t34,t11
beqz K,.L15
- nop
+ MOV t44,t11
#else
- MTC $0,t11 # gemm part
- move B,BO
- MOV t21,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
+ move B,BO # Reset B
+ MTC $0,t11 # GEMM part NR=4,MR=4
+ gsLQC1(R8,F1,F0,0) # a0,a1
+ MOV t21,t11
MOV t31,t11
- MOV t41,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
+ gsLQC1(R9,F9,F8,0) # b0,b1
+ MOV t41,t11
MOV t12,t11
- MOV t22,t11
- gsLQC1(R8,F3,F2,1) #a2,a3
+ gsLQC1(R8,F3,F2,1) # a2,a3
+ MOV t22,t11
MOV t32,t11
- MOV t42,t11
- gsLQC1(R9,F11,F10,1) #b2,b3
+ gsLQC1(R9,F11,F10,1) # b2,b3
- dsra K,KCO,2 # K=KCO/2
- MOV t13,t11
+ MOV t42,t11
+ dsra K,KCO,2 # K=KCO/4 (count of kr=4 iterations)
+ MOV t13,t11
MOV t23,t11
+
MOV t33,t11
-
MOV t43,t11
+
MOV t14,t11
-
MOV t24,t11
- MOV t34,t11
- MOV t44,t11
+ MOV t34,t11
beqz K,.L15
- nop
+ MOV t44,t11 # clear 16 results registers
#endif
.align 5
-.L11: # N=M=K=4
- gsLQC1(R8,F5,F4,2) # R8=A
+.L11: # kr=4
+ gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,2) # R9=B
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
- #load2 comp1
+
.L12:
gsLQC1(R8,F1,F0,4)
MADD t11,t11,a4,b4
gsLQC1(R9,F15,F14,7)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
- daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
+ daddu A,A,16*SIZE # 4mr*4kr
FETCH $0,8*SIZE(PREB)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
- daddu B,B,16*SIZE
+ daddu B,B,16*SIZE # 4nr*4kr
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
MADD t44,t44,a3,b3
.L14:
- gsLQC1(R8,F1,F0,0)
+ gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
+ FETCH $0,12*SIZE(PREA)
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
- FETCH $0,12*SIZE(PREA)
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
daddu PREB,PREB,16*SIZE
MADD t34,t34,a6,b7
- daddu PREA,PREA,16*SIZE
- bnez K,.L11
MADD t44,t44,a7,b7
+ bnez K,.L11
+ daddu PREA,PREA,16*SIZE
-.L15: # N=4 M=4 K=2
+.L15: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2 # k = KCO&2
+ andi K,KCO,2
#else
andi K,TEMP, 2
#endif
- nop
-
beqz K,.L18
nop
.L16:
- gsLQC1(R8,F5,F4,2) # R8=A
+ gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,2) # R9=B
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R9,F15,F14,3)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
- daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+ daddu A,A,8*SIZE # 4mr*2kr
FETCH $0,0(PREB)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
- daddu B,B,8*SIZE
+ daddu B,B,8*SIZE # 4nr*2kr
+ FETCH $0,0(PREA)
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
- FETCH $0,0(PREA)
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
+ FETCH $0,4*SIZE(PREA)
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
+ daddu PREB,PREB,8*SIZE
- FETCH $0,4*SIZE(PREA)
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
- daddu PREB,PREB,8*SIZE
+ daddu PREA,PREA,8*SIZE
MADD t34,t34,a6,b7
MADD t44,t44,a7,b7
- daddu PREA,PREA,8*SIZE
-.L18: # N=4, M=4, K=1
+.L18: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
- andi K,TEMP, 1
+ andi K,TEMP,1
#endif
- NOP
-
- beqz K,.L19 #
- LD ALPHA,152($sp) # Get ALPHA
+ beqz K,.L19
+ LD ALPHA,152($sp) # Get ALPHA
FETCH $0,0(PREB)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
+ daddu A,A,4*SIZE # 4mr*kr
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
- daddu B,B,4*SIZE
+ daddu B,B,4*SIZE # 4nr*kr
FETCH $0,0(PREA)
MADD t31,t31,a2,b0
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
-.L19: # Write Back
+.L19: # Write Back to C
#ifndef TRMMKERNEL
- LD c11,0(CO1) # gemm write part Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c11,0(CO1) # GEMM write part
+ LD c21,1*SIZE(CO1) # get 16 C
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
MADD t34,c34,t34,ALPHA
ST t41,3*SIZE(CO1)
MADD t44,c44,t44,ALPHA
- daddiu M,M,-1 # M--
+ daddiu M,M,-1 # M--
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
FETCH $0,8*SIZE(CO4)
ST t14,0(CO4)
- daddu CO1,CO1,4*SIZE # COx += 4*8Byte
+ daddu CO1,CO1,4*SIZE # COi += 4
ST t24,1*SIZE(CO4)
daddu CO2,CO2,4*SIZE
ST t34,2*SIZE(CO4)
daddu CO3,CO3,4*SIZE
ST t44,3*SIZE(CO4)
daddu PREB,BO,SPANB
- bnez M,.L10 # M!=0
+
+ bnez M,.L10
daddu CO4,CO4,4*SIZE
#else
- MUL t11, ALPHA, t11
+ MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
ST t11, 0 * SIZE(CO1)
- ST t21, 1 * SIZE(CO1)
- ST t31, 2 * SIZE(CO1)
- ST t41, 3 * SIZE(CO1)
-
MUL t12, ALPHA, t12
+ ST t21, 1 * SIZE(CO1)
MUL t22, ALPHA, t22
+ ST t31, 2 * SIZE(CO1)
MUL t32, ALPHA, t32
+ ST t41, 3 * SIZE(CO1)
MUL t42, ALPHA, t42
ST t12, 0 * SIZE(CO2)
- ST t22, 1 * SIZE(CO2)
- ST t32, 2 * SIZE(CO2)
- ST t42, 3 * SIZE(CO2)
-
MUL t13, ALPHA, t13
+ ST t22, 1 * SIZE(CO2)
MUL t23, ALPHA, t23
+ ST t32, 2 * SIZE(CO2)
MUL t33, ALPHA, t33
+ ST t42, 3 * SIZE(CO2)
MUL t43, ALPHA, t43
ST t13, 0 * SIZE(CO3)
- ST t23, 1 * SIZE(CO3)
- ST t33, 2 * SIZE(CO3)
- ST t43, 3 * SIZE(CO3)
-
MUL t14, ALPHA, t14
+ ST t23, 1 * SIZE(CO3)
MUL t24, ALPHA, t24
+ ST t33, 2 * SIZE(CO3)
MUL t34, ALPHA, t34
+ ST t43, 3 * SIZE(CO3)
MUL t44, ALPHA, t44
ST t14, 0 * SIZE(CO4)
+ daddiu M,M,-1 # M--
ST t24, 1 * SIZE(CO4)
ST t34, 2 * SIZE(CO4)
ST t44, 3 * SIZE(CO4)
+ daddiu CO1,CO1, 4 * SIZE
+ daddiu CO2,CO2, 4 * SIZE
+ daddiu CO3,CO3, 4 * SIZE
+ daddiu CO4,CO4, 4 * SIZE
- daddiu M,M,-1 # M--
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,4*SIZE(CO3)
+ FETCH $0,4*SIZE(CO4)
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
- daddiu CO4,CO4, 4 * SIZE # trmm part write back
- daddiu CO3,CO3, 4 * SIZE
- daddiu CO2,CO2, 4 * SIZE
- daddiu CO1,CO1, 4 * SIZE
-
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- dsubu TEMP,KCO,KK
+ dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP, -4
#else
daddiu TEMP,TEMP, -4
#endif
-
dsll K,TEMP,2 + BASE_SHIFT
dsll TEMP,TEMP,2 + BASE_SHIFT
-
- daddu A,A,K # mov A to the end of panel Ai
- daddu B,B,TEMP # mov B to the end of panel Bj
+ daddu A,A,K # mov A to the end of panel Ai
+ daddu B,B,TEMP # mov B to the end of panel Bj
#endif
-#ifdef LEFT # right control by N loop
+#ifdef LEFT
daddiu KK, KK,4
#endif
- bnez M,.L10 # M!=0
+ bnez M,.L10
nop
#endif
-
+ .align 3
.L14_M2:
- andi M,MCO,2 # Remainder M = 2
+ andi M, MCO, 2 # nr=4,mr=2
beqz M,.L14_M1
nop
.L20:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- move B,BO
+ move B,BO # Reset B
#else
- dsll K,KK,1 + BASE_SHIFT #mr=2 so KK*2
- dsll TEMP,KK,2 + BASE_SHIFT
-
+ dsll K,KK,1 + BASE_SHIFT # mr=2
+ dsll TEMP,KK,2 + BASE_SHIFT # nr=4
daddu A,A,K
daddu B,BO,TEMP
#endif
MTC $0,t11
MOV t21,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
+ gsLQC1(R8,F1,F0,0) # a0,a1
MOV t12,t11
MOV t22,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
+ gsLQC1(R9,F9,F8,0) # b0,b1
MOV t13,t11
- gsLQC1(R9,F11,F10,1) #b2,b3
-
MOV t23,t11
- MOV t14,t11
- MOV t24,t11
+ gsLQC1(R9,F11,F10,1) # b2,b3
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
- daddiu TEMP,KK,2
+ daddiu TEMP,KK,2 # left part,controlled by mr, mr=2
#else
- daddiu TEMP,KK,4 # not sure
+ daddiu TEMP,KK,4 # right part,controlled by nr,nr=4
#endif
dsra K,TEMP,2
+ MOV t14,t11
beqz K,.L25
- nop
+ MOV t24,t11 # clear 2*4=8 results registers
#else
- move B,BO # gemm part
+ move B,BO # Reset B
MTC $0,t11
+ gsLQC1(R8,F1,F0,0)
+
MOV t21,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
-
MOV t12,t11
+ gsLQC1(R9,F9,F8,0)
+
MOV t22,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
+ dsra K,KCO,2
+ gsLQC1(R9,F11,F10,1)
- dsra K,KCO,2 # K=KCO/2
MOV t13,t11
- gsLQC1(R9,F11,F10,1) #b2,b3
-
MOV t23,t11
- MOV t14,t11
- MOV t24,t11
+ MOV t14,t11
beqz K,.L25
- nop
+ MOV t24,t11
#endif
-.L21: # N=4 m=2,=K=4
- gsLQC1(R8,F5,F4,1) # R8=A
+.L21: # nr=4,mr=2,kr=4
+ gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,2) # R9=B
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
- gsLQC1(R8,F3,F2,2)
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
- gsLQC1(R9,F9,F8,4)
+ gsLQC1(R8,F3,F2,2)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
- gsLQC1(R9,F11,F10,5)
+ gsLQC1(R9,F9,F8,4)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
- gsLQC1(R8,F7,F6,3)
+ gsLQC1(R9,F11,F10,5)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
+ daddiu K,K,-1
- gsLQC1(R9,F13,F12,6)
+ gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b0
MADD t21,t21,a3,b0
- daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
- gsLQC1(R9,F15,F14,7)
+ gsLQC1(R9,F13,F12,6)
MADD t12,t12,a2,b1
MADD t22,t22,a3,b1
- daddiu K,K,-1
- gsLQC1(R8,F1,F0,0)
+ gsLQC1(R9,F15,F14,7)
MADD t13,t13,a2,b2
MADD t23,t23,a3,b2
- daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE
+ daddu A,A,8*SIZE # 2mr*4kr
MADD t14,t14,a2,b3
MADD t24,t24,a3,b3
+ daddu B,B,16*SIZE # 4nr*4kr
- gsLQC1(R9,F9,F8,0)
+ gsLQC1(R8,F1,F0,0)
MADD t11,t11,a6,b4
MADD t21,t21,a7,b4
- gsLQC1(R9,F11,F10,1)
+ gsLQC1(R9,F9,F8,0)
MADD t12,t12,a6,b5
MADD t22,t22,a7,b5
+ gsLQC1(R9,F11,F10,1)
MADD t13,t13,a6,b6
MADD t23,t23,a7,b6
bnez K,.L21
MADD t24,t24,a7,b7
-.L25: # N=4 M=2 K=2
+.L25:
#ifndef TRMMKERNEL
- andi K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # kr=2
#else
- andi K,TEMP,2
+ andi K,TEMP,2
#endif
beqz K,.L28
nop
.L26:
- gsLQC1(R8,F5,F4,1) # R8=A
+ gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,2) # R9=B
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
- daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
- daddu B,B,8*SIZE
+ daddu A,A,4*SIZE # 2mr*2kr
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
+ daddu B,B,8*SIZE # 4nr*2kr
.L27:
gsLQC1(R8,F1,F0,0)
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
-.L28: # N=4, M=2, K=1
+.L28: # kr=1
#ifndef TRMMKERNEL
- andi K,KCO,1
+ andi K,KCO,1
#else
- andi K,TEMP,1
+ andi K,TEMP,1
#endif
- beqz K,.L29 #
- LD ALPHA,152($sp) # Get ALPHA
+ beqz K,.L29
+ LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
- daddu B,B,4*SIZE
+ daddu A,A,2*SIZE # 2mr*kr
+ daddu B,B,4*SIZE # 4nr*kr
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
-.L29: # Write Back
+.L29: # Write Back to C
#ifndef TRMMKERNEL
- LD c11,0(CO1) # gemm write back part Fetch 16 C
+ LD c11,0(CO1) # GEMM write back part
LD c21,1*SIZE(CO1)
LD c12,0(CO2)
MADD t24,c24,t24,ALPHA
ST t13,0(CO3)
+ daddu CO1,CO1,2*SIZE # COi += 2
ST t23,1*SIZE(CO3)
- daddu CO1,CO1,2*SIZE # COx += 2*8Byte
-
- FETCH $0,0(CO1)
- FETCH $0,2*SIZE(CO2)
- FETCH $0,2*SIZE(CO3)
- FETCH $0,2*SIZE(CO4)
+ daddu CO2,CO2,2*SIZE
ST t14,0(CO4)
- daddu CO2,CO2,2*SIZE
- ST t24,1*SIZE(CO4)
daddu CO3,CO3,2*SIZE
+ ST t24,1*SIZE(CO4)
daddu CO4,CO4,2*SIZE
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
#else
- MUL t11, ALPHA, t11
+ MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
ST t11, 0 * SIZE(CO1)
- ST t21, 1 * SIZE(CO1)
-
MUL t12, ALPHA, t12
+ ST t21, 1 * SIZE(CO1)
MUL t22, ALPHA, t22
ST t12, 0 * SIZE(CO2)
- ST t22, 1 * SIZE(CO2)
-
MUL t13, ALPHA, t13
+ ST t22, 1 * SIZE(CO2)
MUL t23, ALPHA, t23
ST t13, 0 * SIZE(CO3)
- ST t23, 1 * SIZE(CO3)
-
MUL t14, ALPHA, t14
+ ST t23, 1 * SIZE(CO3)
MUL t24, ALPHA, t24
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
-
+
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
daddiu CO3,CO3, 2 * SIZE
daddiu CO4,CO4, 2 * SIZE
-#if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP,-2
#else
daddiu TEMP,TEMP,-4
#endif
-
dsll K,TEMP,1 + BASE_SHIFT
dsll TEMP,TEMP,2 + BASE_SHIFT
- daddu A,A,K
- daddu B,B,TEMP
+ daddu A,A,K # move A to next panel Ai
+ daddu B,B,TEMP # move B to next panel Bj
#endif
#ifdef LEFT
#endif
+ .align 3
.L14_M1:
- andi M,MCO,1 # Remainder M = 1
- beqz M,.L0_N4_Loop # M = 0, finishing one panel B
+ andi M,MCO,1 # mr=1
+ beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
nop
.L30:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- move B,BO
+ move B,BO # Reset B
#else
dsll K,KK, 0 + BASE_SHIFT
dsll TEMP,KK,2 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
#endif
-
- LD a0, 0 * SIZE(A)
-# gsLQC1(R8,F1,F0,0)
- gsLQC1(R9,F9,F8,0) #b0,b1
MTC $0,t11
- gsLQC1(R9,F11,F10,1) #b2,b3
MOV t12,t11
+ LD a0, 0 * SIZE(A) # a0
+
MOV t13,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
+ MOV t14,t11 # clear result registers
+ gsLQC1(R9,F11,F10,1) # b2,b3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
daddiu TEMP, KK, 4
#endif
dsra K,TEMP, 2
-
+ nop
beqz K,.L35
- MOV t14,t11
+ nop
+
#else
- # gemm
- move B,BO
- LD a0, 0 * SIZE(A)
-# gsLQC1(R8,F1,F0,0)
- dsra K,KCO,2 # K=KCO/2
- gsLQC1(R9,F9,F8,0) #b0,b1
+ move B,BO # Reset B, GEMM part
+ dsra K,KCO,2 # K=KCO/4
+ LD a0, 0 * SIZE(A) # a0
+
MTC $0,t11
- gsLQC1(R9,F11,F10,1) #b2,b3
MOV t12,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
MOV t13,t11
- dsra K,KCO,2
- beqz K,.L35
MOV t14,t11
+ gsLQC1(R9,F11,F10,1) # b2,b3
+
+ beqz K,.L35
+ nop
#endif
-.L31: # N=4 m=1,=K=4
-# gsLQC1(R8,F3,F2,1)
- LD a1, 1*SIZE(A)
- gsLQC1(R9,F13,F12,2) # R9=B
+.L31: # nr=4,mr=1,kr=4
+ LD a1, 1*SIZE(A) # load a1
MADD t11,t11,a0,b0
+
+ gsLQC1(R9,F13,F12,2) # b4,b5
MADD t12,t12,a0,b1
- gsLQC1(R9,F15,F14,3)
+ gsLQC1(R9,F15,F14,3) # b6,b7
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
- LD a2, 2*SIZE(A)
- gsLQC1(R9,F9,F8,4)
+ LD a2, 2*SIZE(A) # a2
MADD t11,t11,a1,b4
+
+ gsLQC1(R9,F9,F8,4)
MADD t12,t12,a1,b5
gsLQC1(R9,F11,F10,5)
MADD t14,t14,a1,b7
daddiu K,K,-1
- LD a3, 3*SIZE(A)
- gsLQC1(R9,F13,F12,6)
+ LD a3, 3*SIZE(A) # a3
MADD t11,t11,a2,b0
+
+ gsLQC1(R9,F13,F12,6)
MADD t12,t12,a2,b1
+ daddu A,A,4*SIZE # 1mr*4kr
gsLQC1(R9,F15,F14,7)
MADD t13,t13,a2,b2
MADD t14,t14,a2,b3
-
- daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=8*SIZE
- daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE
+ daddu B,B,16*SIZE # 4nr*4kr
-# gsLQC1(R8,F1,F0,0)
- LD a0, 0*SIZE(A)
- gsLQC1(R9,F9,F8,0)
+ LD a0, 0*SIZE(A) # a0
MADD t11,t11,a3,b4
+
+ gsLQC1(R9,F9,F8,0)
MADD t12,t12,a3,b5
gsLQC1(R9,F11,F10,1)
bnez K,.L31
MADD t14,t14,a3,b7
-.L35: # N=4 M=1 K=2
+.L35: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2 # k = KCO&2
+ andi K,KCO,2
#else
- andi K,TEMP,2
+ andi K,TEMP,2
#endif
beqz K,.L38
nop
.L36:
- LD a1,1*SIZE(A)
- gsLQC1(R9,F13,F12,2) # R9=B
+ LD a1,1*SIZE(A) # load a1
MADD t11,t11,a0,b0
+
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
- daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32
+ daddu A,A,2*SIZE # mr*2kr
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
- daddu B,B,8*SIZE
+ daddu B,B,8*SIZE # 4nr*2kr
.L37:
LD a0,0(A)
- gsLQC1(R9,F9,F8,0)
MADD t11,t11,a1,b4
+
+ gsLQC1(R9,F9,F8,0)
MADD t12,t12,a1,b5
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
-.L38: # N=4, M=1, K=1
+.L38: # kr=1
#ifndef TRMMKERNEL
- andi K,KCO,1
+ andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L39 #
- LD ALPHA,152($sp) # Get ALPHA
+ beqz K,.L39
+ LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
- daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
+ daddu A,A,1*SIZE
daddu B,B,4*SIZE
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
-.L39: # Write Back
+.L39: # Write Back
#ifndef TRMMKERNEL
- LD c11,0(CO1) # Fetch 16 C
+ LD c11,0(CO1)
LD c12,0(CO2)
LD c13,0(CO3)
LD c14,0(CO4)
ST t13, 0 * SIZE(CO3)
ST t14, 0 * SIZE(CO4)
-#if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#endif
-.L0_N4_Loop:
- daddiu N,N,-1 # N--
+ .align 3
+.L0_N4_Loop: # mc finished
+ daddiu N,N,-1 # N--
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK,4
#endif
- bnez N,.L0_N4_Lb # N!=0
- move BO,B # Set B
-
-
+ bnez N,.L0_N4_Lb
+ move BO,B # Set BO point to next panel Bj
.align 5
.L0_N2:
- andi N,NCO,2 # Remainder N = 2
- beqz N,.L0_N1 # N=0,NCO<2
+ andi N,NCO,2 # nr = 2
+ beqz N,.L0_N1
nop
.L0_N2_Lb:
- move CO1,C # Set C
- dsra M,MCO,2 # M=MCO/2
+ move CO1,C
+ daddu CO2,C,LDC
+
+ dsra M,MCO,2
+ move A,AO # Reset A
+
+ daddu PREA,AO,SPANA
+ daddu C,CO2,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
-
- dsll SPANB,KCO,1+BASE_SHIFT # SPANB=KC*NR(2)*8Byte=KC*16=KC*2^4
- move A,AO # Reset A
-
- daddu CO2,CO1,LDC
- daddu PREA,AO,SPANA
beqz M,.L12_M2
- daddu C,CO2,LDC
+ nop
.L40:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- move B,BO
+ move B,BO # Reset B
#else
- dsll K,KK, 2 + BASE_SHIFT # mr=4
- dsll TEMP, KK,1 + BASE_SHIFT # nr=2
+ dsll K,KK, 2 + BASE_SHIFT
+ dsll TEMP, KK,1 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
#endif
MTC $0,t11
MOV t21,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
+ gsLQC1(R8,F1,F0,0) # a0,a1
MOV t31,t11
MOV t41,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
+ gsLQC1(R9,F9,F8,0) # b0,b1
MOV t12,t11
- gsLQC1(R8,F3,F2,1) #a2,a3
-
MOV t22,t11
- MOV t32,t11
+ gsLQC1(R8,F3,F2,1) # a2,a3
- MOV t42,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2 # K=KCO/2
+ dsra K,TEMP,2
+ MOV t32,t11
beqz K,.L45
- nop
+ MOV t42,t11
+
#else
- move B,BO
- MTC $0,t11 # gemm part
- MOV t21,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
+ move B,BO # Reset B
+ MTC $0,t11 # gemm part
+ gsLQC1(R8,F1,F0,0) # a0,a1
+ MOV t21,t11
MOV t31,t11
- MOV t41,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
+ gsLQC1(R9,F9,F8,0) # b0,b1
- dsra K,KCO,2 # K=KCO/2
- MOV t12,t11
- gsLQC1(R8,F3,F2,1) #a2,a3
+ MOV t41,t11
+ dsra K,KCO,2 # K=KCO/4
+ gsLQC1(R8,F3,F2,1) # a2,a3
+ MOV t12,t11
MOV t22,t11
- MOV t32,t11
- MOV t42,t11
+ MOV t32,t11
beqz K,.L45
- nop
+ MOV t42,t11
#endif
-.L41: # N=2,M=K=4
- gsLQC1(R8,F5,F4,2) # R8=A
+.L41: # nr=2,mr=kr=4
+ gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,1) # R9=B
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,7)
MADD t31,t31,a2,b2
MADD t41,t41,a3,b2
- daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
+ daddu B,B,8*SIZE # 2nr*4kr
FETCH $0,8*SIZE(PREA)
MADD t32,t32,a2,b3
MADD t42,t42,a3,b3
- daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
+ daddu A,A,16*SIZE # 4mr*4kr
.L44:
gsLQC1(R8,F1,F0,0)
MADD t42,t42,a7,b7
-.L45: # N=2 M=4 K=2
+.L45: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2 # k = KCO&2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
nop
.L46:
- gsLQC1(R8,F5,F4,2) # R8=A
+ gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,1) # R9=B
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
- daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
+ daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
FETCH $0,0(PREA)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
- daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+ daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
.L47:
gsLQC1(R8,F1,F0,0)
daddu PREA,PREA,8*SIZE
-.L48: # N=2, M=4, K=1
+.L48: # kr=1
#ifndef TRMMKERNEL
- andi K,KCO,1
+ andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L49 #
- LD ALPHA,152($sp) # Get ALPHA
+ beqz K,.L49
+ LD ALPHA,152($sp) # Get ALPHA
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
+ daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
-.L49: # Write Back
+.L49: # Write Back
#ifndef TRMMKERNEL
- LD c11,0(CO1) # gemm write back part Fetch 16 C
+ LD c11,0(CO1) # gemm write back part Fetch 16 C
LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
MADD t32,c32,t32,ALPHA
ST t41,3*SIZE(CO1)
MADD t42,c42,t42,ALPHA
- daddiu M,M,-1 # M--
+ daddiu M,M,-1
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
-
FETCH $0,8*SIZE(CO1)
FETCH $0,8*SIZE(CO2)
- daddu CO1,CO1,4*SIZE # COx += 4*8Byte
- bnez M,.L40 # M!=0
+ daddu CO1,CO1,4*SIZE
+ bnez M,.L40
daddu CO2,CO2,4*SIZE
-#else
- daddiu M,M,-1
-
- daddiu CO1,CO1, 4*SIZE
- daddiu CO2,CO2, 4*SIZE
+#else
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
MUL t12, ALPHA, t12
+ ST t11, 0 * SIZE(CO1)
MUL t22, ALPHA, t22
+ ST t21, 1 * SIZE(CO1)
MUL t32, ALPHA, t32
+ ST t31, 2 * SIZE(CO1)
MUL t42, ALPHA, t42
-
- ST t11, -4 * SIZE(CO1)
- ST t21, -3 * SIZE(CO1)
- ST t31, -2 * SIZE(CO1)
- ST t41, -1 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
- ST t12, -4 * SIZE(CO2)
- ST t22, -3 * SIZE(CO2)
- ST t32, -2 * SIZE(CO2)
- ST t42, -1 * SIZE(CO2)
+ ST t12, 0 * SIZE(CO2)
+ daddiu M,M,-1
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ daddiu CO1,CO1, 4*SIZE
+ daddiu CO2,CO2, 4*SIZE
+
+ FETCH $0,0(CO1) # touch the C rows just advanced to (CO1/CO2 were bumped by 4*SIZE above)
+ FETCH $0,0(CO2)
+ FETCH $0,4*SIZE(CO1) # offset is in elements (4*SIZE), as in the GEMM path; a bare 4 is a misaligned address for ld
+ FETCH $0,4*SIZE(CO2)
-#if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -4
#else
daddiu TEMP, TEMP, -2
#endif
-
dsll K,TEMP, 2 + BASE_SHIFT
dsll TEMP, TEMP, 1 + BASE_SHIFT
#ifdef LEFT
daddiu KK, KK, 4
#endif
-
bnez M,.L40
nop
#endif
+
+ .align 3
.L12_M2:
- andi M,MCO,2 # Remainder M = 2
+ andi M,MCO,2 # mr = 2
beqz M,.L12_M1
nop
daddu B, BO, TEMP
#endif
MTC $0,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
+ gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
- MOV t12,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
-
- MOV t22,t11
+ gsLQC1(R9,F9,F8,0) #b0,b1
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2 # K=KCO/2
+ dsra K,TEMP,2
+ MOV t12,t11
beqz K,.L55
- nop
+ MOV t22,t11
#else
move B,BO
- dsra K,KCO,2 # K=KCO/2
- MTC $0,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
+ dsra K,KCO,2 # K=KCO/4
+ gsLQC1(R8,F1,F0,0) #a0,a1
+ MTC $0,t11
MOV t21,t11
- MOV t12,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
+ gsLQC1(R9,F9,F8,0) #b0,b1
- MOV t22,t11
+ MOV t12,t11
beqz K,.L55
- nop
+ MOV t22,t11
#endif
-.L51: # N=2 m=2,=K=4
- gsLQC1(R8,F5,F4,1) # R8=A
+.L51: # nr=2 mr=2,kr=4
+ gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,1) # R9=B
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b2
MADD t21,t21,a3,b2
- daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
+ daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
gsLQC1(R9,F15,F14,3)
MADD t12,t12,a2,b3
MADD t22,t22,a3,b3
- daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE
+ daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a6,b6
bnez K,.L51
MADD t22,t22,a7,b7
-.L55: # N=2 M=2 K=2
+.L55: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2 # k = KCO&2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
- NOP
beqz K,.L58
nop
.L56:
- gsLQC1(R8,F5,F4,1) # R8=A
+ gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
+ daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
- gsLQC1(R9,F13,F12,1) # R9=B
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
- daddu B,B,4*SIZE
+ daddu B,B,4*SIZE # 2nr*2kr
.L57:
gsLQC1(R8,F1,F0,0)
MADD t22,t22,a5,b5
-.L58: # N=2, M=2, K=1
+.L58: # kr=1
#ifndef TRMMKERNEL
- andi K,KCO,1
+ andi K,KCO,1
#else
- andi K, TEMP, 1
+ andi K,TEMP, 1
#endif
- beqz K,.L59 #
- LD ALPHA,152($sp) # Get ALPHA
+ beqz K,.L59
+ LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
- daddu B,B,2*SIZE
+ daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
+ daddu B,B,2*SIZE # 2nr*kr
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
-.L59: # Write Back
+.L59: # Write Back
#ifndef TRMMKERNEL
- LD c11,0(CO1) # write gemm part back Fetch 16 C
+ LD c11,0(CO1) # write gemm part back Fetch 16 C
LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
- daddu CO1,CO1,2*SIZE # COx += 2*8Byte
+ daddu CO1,CO1,2*SIZE
daddu CO2,CO2,2*SIZE
FETCH $0,0(CO1)
FETCH $0,0(CO2)
#else
daddiu M, M, -1
-
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
-
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
MUL t12, ALPHA, t12
ST t12, -2 * SIZE(CO2)
ST t22, -1 * SIZE(CO2)
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
#ifdef LEFT
daddiu KK, KK, 2
#endif
- FETCH $0,0(CO1)
- FETCH $0,0(CO2)
-
#endif
+ .align 3
.L12_M1:
- andi M,MCO,1 # Remainder M = 1
- beqz M,.L0_N2_Loop # M = 0, finishing one panel B
+ andi M,MCO,1 # mr = 1
+ beqz M,.L0_N2_Loop
nop
.L60:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- move B,BO
+ move B,BO # Reset B
#else
dsll K, KK, 0 + BASE_SHIFT
dsll TEMP, KK, 1 + BASE_SHIFT
daddu B, BO, TEMP
#endif
MTC $0,t11
-#gsLQC1(R8,F4,F0,0)
- LD a0, 0*SIZE(A)
+ LD a0, 0*SIZE(A) # a0
+
MOV t21,t11
- MOV t12,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
+ gsLQC1(R9,F9,F8,0) # b0,b1
- MOV t22,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2 # K=KCO/2
+ dsra K,TEMP,2
+ MOV t12,t11
beqz K,.L65
- nop
+ MOV t22,t11
#else
- dsra K,KCO,2 # K=KCO/2
- MTC $0,t11
- move B,BO # Reset B
-# gsLQC1(R8,F4,F0,0)
+ dsra K,KCO,2
+ move B,BO # Reset B
LD a0,0*SIZE(A)
+
+ MTC $0,t11
MOV t21,t11
+ gsLQC1(R9,F9,F8,0)
+
MOV t12,t11
- gsLQC1(R9,F9,F8,0) #b0,b1
-
- MOV t22,t11
beqz K,.L65
- nop
+ MOV t22,t11
#endif
-.L61: # N=2 m=1,=K=4
- LD a4, 1*SIZE(A)
- gsLQC1(R9,F13,F12,1) # R9=B
+.L61: # nr=2,mr=1,kr=4
+ LD a4, 1*SIZE(A) # a4 <- A[1]
MADD t11,t11,a0,b0
+
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
- LD a2, 2*SIZE(A)
- gsLQC1(R9,F11,F10,2)
+ LD a2, 2*SIZE(A) # a2 <- A[2]
MADD t11,t11,a4,b4
+
+ gsLQC1(R9,F11,F10,2)
MADD t12,t12,a4,b5
-# gsLQC1(R8,F6,F2,1)
- LD a6, 3*SIZE(A)
+ LD a6, 3*SIZE(A) # a6 <- A[3]
MADD t11,t11,a2,b2
- MADD t12,t12,a2,b3
daddiu K,K,-1
gsLQC1(R9,F15,F14,3)
+ MADD t12,t12,a2,b3
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
-# gsLQC1(R8,F4,F0,0)
LD a0, 0*SIZE(A)
+ MADD t11,t11,a6,b6
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
- gsLQC1(R9,F9,F8,0)
- MADD t11,t11,a6,b6
+ gsLQC1(R9,F9,F8,0) # b0,b1
bnez K,.L61
MADD t12,t12,a6,b7
-.L65: # N=2 M=1 K=2
+.L65: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2 # k = KCO&2
+ andi K,KCO,2
#else
- andi K,TEMP,2
+ andi K,TEMP,2
#endif
beqz K,.L68
nop
.L66:
- LD a4, 1*SIZE(A)
+ LD a4, 1*SIZE(A) # a1
MADD t11,t11,a0,b0
+ daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
- gsLQC1(R9,F13,F12,1) # R9=B
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
- daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
daddu B,B,4*SIZE
.L67:
- LD a0,0(A)
- gsLQC1(R9,F9,F8,0)
+ LD a0,0(A) # a0
MADD t11,t11,a4,b4
+
+ gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
-.L68: # N=2, M=1, K=1
+.L68: # kr=1
#ifndef TRMMKERNEL
- andi K,KCO,1
+ andi K,KCO,1
#else
- andi K,TEMP,1
+ andi K,TEMP,1
#endif
- beqz K,.L69 #
- LD ALPHA,152($sp) # Get ALPHA
+ beqz K,.L69
+ LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu B,B,2*SIZE
-.L69: # Write Back
+.L69: # Write Back
#ifndef TRMMKERNEL
- LD c11,0(CO1) # Fetch 16 C
+ LD c11,0(CO1) # Fetch 16 C
LD c12,0(CO2)
MADD t11,c11,t11,ALPHA
ST t11,0(CO1)
ST t12,0(CO2)
- daddu CO1,CO1,1*SIZE # COx += 2*8Byte
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
- FETCH $0,0(CO1)
- FETCH $0,0(CO2)
#else
MUL t11, ALPHA, t11
MUL t12, ALPHA, t12
ST t11, 0 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
- daddu CO1,CO1,1*SIZE # COx += 2*8Byte
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
.align 5
.L0_N1:
- andi N,NCO,1 # Remainder N = 1
- beqz N,.L999 # N=0,NCO<1
+ andi N,NCO,1 # nr = 1
+ beqz N,.L999
nop
- move CO1,C # Set C
- dsra M,MCO,2 # M=MCO/2
-
+ move CO1,C
+ dsra M,MCO,2
+
+ move A,AO # Reset A
+ daddu PREA,AO,SPANA
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
- move A,AO # Reset A
beqz M,.L11_M2
- daddu PREA,AO,SPANA
-
+ daddu C,CO1,LDC
.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- move B, BO
+ move B, BO # B = BO: restart at the head of the B panel
#else
dsll K, KK, 2 + BASE_SHIFT
dsll TEMP, KK, 0 + BASE_SHIFT
daddu A, A, K
daddu B, BO, TEMP
#endif
-# gsLQC1(R9,F12,F8,0)
- LD b0, 0*SIZE(B)
+
MTC $0,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
+ LD b0, 0*SIZE(B) # b0 = B[0], operand of the first k-step
+
MOV t21,t11
- gsLQC1(R8,F3,F2,1) #a2,a3
+ gsLQC1(R8,F1,F0,0) # a0,a1 (quad-word load, slot 0)
+
MOV t31,t11
+ gsLQC1(R8,F3,F2,1) # a2,a3 (quad-word load, slot 1)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
#else
daddiu TEMP, KK, 1
#endif
- dsra K,TEMP,2 # K=KCO/2
- beqz K,.L75
+ dsra K,TEMP,2 # K = TEMP/4: trip count of the 4x-unrolled k loop
MOV t41,t11
+ beqz K,.L75 # fewer than 4 k-steps: skip the unrolled loop
+ nop # filler after the branch (the MOV t41 above was moved ahead of it)
#else
- move B, BO
- dsra K,KCO,2 # K=KCO/2
-# gsLQC1(R9,F12,F8,0)
+ move B, BO # B = BO: restart at the head of the B panel
+ dsra K,KCO,2 # K = KCO/4: trip count of the 4x-unrolled k loop
LD b0, 0*SIZE(B)
+
MTC $0,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
- gsLQC1(R8,F3,F2,1) #a2,a3
+ gsLQC1(R8,F1,F0,0) # a0,a1 (quad-word load, slot 0)
+
MOV t31,t11
- beqz K,.L75
MOV t41,t11
+ gsLQC1(R8,F3,F2,1) # a2,a3 (quad-word load, slot 1)
+
+ beqz K,.L75 # fewer than 4 k-steps: skip the unrolled loop
+ nop # filler after the branch (the MOV t41 above was moved ahead of it)
#endif
-
-.L71: # N=1,M=K=4
- gsLQC1(R8,F5,F4,2) # R8=A
- gsLQC1(R8,F7,F6,3)
+.L71: # nr=1, mr=4, k unrolled by 4: .L71/.L72/.L73/.L74 each perform one k-step
+ LD b4, 1*SIZE(B) # b4 = B[1], consumed by the next k-step (.L72)
MADD t11,t11,a0,b0
+
+ gsLQC1(R8,F5,F4,2) # quad-word slot 2; presumably a4,a5 by analogy with the a0,a1 load - confirm
MADD t21,t21,a1,b0
- LD b4, 1*SIZE(B)
+ gsLQC1(R8,F7,F6,3) # quad-word slot 3; presumably a6,a7 - confirm
FETCH $0,(PREA)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
.L72:
-# gsLQC1(R9,F14,F10,1)
- gsLQC1(R8,F1,F0,4)
- gsLQC1(R8,F3,F2,5)
+ LD b2, 2*SIZE(B) # b2 = B[2], consumed by the next k-step (.L73)
MADD t11,t11,a4,b4
+ gsLQC1(R8,F1,F0,4) # reload a0,a1 from quad-word slot 4 for .L73
MADD t21,t21,a5,b4
- LD b2, 2*SIZE(B)
+ gsLQC1(R8,F3,F2,5) # reload a2,a3 from quad-word slot 5 for .L73
FETCH $0,4*SIZE(PREA)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
.L73:
- gsLQC1(R8,F5,F4,6)
- gsLQC1(R8,F7,F6,7)
- MADD t11,t11,a0,b2
-
LD b6, 3*SIZE(B)
+ MADD t11,t11,a0,b2
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32; done after the b6 load above, which still uses the old B
+
+ gsLQC1(R8,F5,F4,6) # quad-word slot 6, operands for .L74
MADD t21,t21,a1,b2
- daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
-
FETCH $0,8*SIZE(PREA)
+
+ gsLQC1(R8,F7,F6,7) # quad-word slot 7, operands for .L74
MADD t31,t31,a2,b2
MADD t41,t41,a3,b2
- daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+ daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE; the slot 0/1 loads in .L74 then address the next tile
.L74:
-# gsLQC1(R9,F12,F8,0)
- gsLQC1(R8,F1,F0,0)
- daddu PREA,PREA,16*SIZE
- gsLQC1(R8,F3,F2,1)
+ LD b0, 0*SIZE(B) # b0 = B[0] of the next iteration (B was already advanced in .L73)
MADD t11,t11,a4,b6
- MADD t21,t21,a5,b6
+ daddu PREA,PREA,16*SIZE # PREA += 16 elements, same stride as A
- LD b0, 0*SIZE(B)
+ gsLQC1(R8,F1,F0,0) # a0,a1 for the next iteration
+ MADD t21,t21,a5,b6
daddiu K,K,-1
FETCH $0,-32(PREA)
+ gsLQC1(R8,F3,F2,1) # a2,a3 for the next iteration
MADD t31,t31,a6,b6
bnez K,.L71
MADD t41,t41,a7,b6
-.L75: # N=2 M=4 K=2
+.L75: # k remainder, kr=2: run .L76/.L77 only if bit 1 of the k count is set
#ifndef TRMMKERNEL
- andi K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # K = KCO & 2
#else
- andi K,TEMP,2
+ andi K,TEMP,2 # K = TEMP & 2 (TEMP holds the TRMM k count computed in .L70)
#endif
beqz K,.L78
nop
.L76:
- gsLQC1(R8,F5,F4,2) # R8=A
- gsLQC1(R8,F7,F6,3)
+ LD b4, 1*SIZE(B) # b4 = B[1], consumed by the second k-step (.L77)
MADD t11,t11,a0,b0
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
+
+ gsLQC1(R8,F5,F4,2) # quad-word slot 2, operands for .L77
MADD t21,t21,a1,b0
- daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
-
- LD b4, 1*SIZE(B)
FETCH $0,0(PREA)
+
+ gsLQC1(R8,F7,F6,3) # quad-word slot 3, operands for .L77
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
- daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
+ daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
.L77:
- gsLQC1(R8,F1,F0,0)
- gsLQC1(R8,F3,F2,1)
+ LD b0,0(B) # b0 = next B element (B was already advanced in .L76)
MADD t11,t11,a4,b4
+
+ gsLQC1(R8,F1,F0,0) # a0,a1 for the kr=1 step (A was already advanced in .L76)
MADD t21,t21,a5,b4
-
- LD b0,0(B)
FETCH $0,4*SIZE(PREA)
+
+ gsLQC1(R8,F3,F2,1) # a2,a3 for the kr=1 step
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
daddu PREA,PREA,8*SIZE
-
-.L78: # N=2, M=4, K=1
+.L78: # k remainder, kr=1: one last k-step if bit 0 of the k count is set
#ifndef TRMMKERNEL
- andi K,KCO,1
+ andi K,KCO,1 # K = KCO & 1
#else
- andi K,TEMP,1
+ andi K,TEMP,1 # K = TEMP & 1
#endif
- beqz K,.L79 #
- LD ALPHA,152($sp) # Get ALPHA
+ beqz K,.L79 # even count: nothing left, go write back
+ LD ALPHA,152($sp) # reload ALPHA from its stack slot; NOTE(review): sits right after the branch, so it is also executed when the branch is taken only if delay slots are exposed - confirm
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
daddu PREA,PREA,4*SIZE
-.L79: # Write Back
+.L79: # Write back the 4x1 tile t11..t41 to CO1[0..3]
#ifndef TRMMKERNEL
- LD c11,0(CO1) # Fetch 16 C
+ LD c11,0(CO1) # Load the 4 old C values of this tile (c11, c21, c31, c41)
LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
ST t21,1*SIZE(CO1)
ST t31,2*SIZE(CO1)
ST t41,3*SIZE(CO1)
- daddiu M,M,-1 # M--
+ daddiu M,M,-1 # M--: one 4-row tile finished
FETCH $0,4*SIZE(CO1)
+ FETCH $0,8*SIZE(CO1) # touch C two tiles ahead (FETCH is ld into $0, result discarded)
+ bnez M,.L70 # loop to .L70 while 4-row tiles remain; the CO1 update on the next line takes the place of the old nop after the branch
daddu CO1,CO1,4*SIZE # COx += 4*8Byte
- bnez M,.L70 # M!=0
- nop
#else
- daddiu M,M,-1 # M--
+ daddiu M,M,-1 # M--: one 4-row tile finished
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
ST t31,2*SIZE(CO1)
ST t41,3*SIZE(CO1)
- daddu CO1,CO1,4*SIZE # COx += 4*8Byte
-#if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
+ FETCH $0,4*SIZE(CO1) # touch the next tile of C (FETCH is ld into $0, result discarded)
+ FETCH $0,8*SIZE(CO1) # and the tile after that
+
+ daddu CO1,CO1,4*SIZE # CO1 += 4 elements
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -4
#ifdef LEFT
daddiu KK, KK, 4
#endif
- bnez M,.L70 # M!=0
+ bnez M,.L70 # loop to .L70 while 4-row tiles remain
nop
#endif
-
+ .align 3 # align .L11_M2 to a 2^3 = 8-byte boundary
.L11_M2:
- andi M,MCO,2 # Remainder M = 2
+ andi M,MCO,2 # M = MCO & 2: is there a leftover 2-row tile (mr = 2)?
beqz M,.L11_M1
nop
daddu B, BO, TEMP
#endif
-# gsLQC1(R9,F12,F8,0)
LD b0, 0*SIZE(B)
MTC $0,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
+
+ gsLQC1(R8,F1,F0,0) # a0,a1 (quad-word load, slot 0)
MOV t21,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
nop
#else
move B, BO
- dsra K,KCO,2 # K=KCO/2
-# gsLQC1(R9,F12,F8,0)
+ dsra K,KCO,2 # K = KCO/4: trip count of the 4x-unrolled k loop
LD b0, 0*SIZE(B)
+
MTC $0,t11
- gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
+ gsLQC1(R8,F1,F0,0) # a0,a1 (quad-word load, slot 0)
+
beqz K,.L85
nop
#endif
-.L81: # N=1,M=2,K=4
+.L81: # nr=1, mr=2, k unrolled by 4
LD b4, 1*SIZE(B)
- gsLQC1(R8,F5,F4,1) # R8=A
+ gsLQC1(R8,F5,F4,1) # quad-word slot 1; presumably a4,a5 by analogy with the a0,a1 load - confirm
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
-# gsLQC1(R9,F14,F10,1)
-
LD b6, 3*SIZE(B)
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b2
-
MADD t21,t21,a3,b2
+
daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
-# gsLQC1(R9,F12,F8,0)
+ LD b0, 0*SIZE(B) # b0 = B[0] of the next iteration (B was advanced just above)
gsLQC1(R8,F1,F0,0)
- daddiu K,K,-1
MADD t11,t11,a6,b6
+ MADD t21,t21,a7,b6 # last MADD of the iteration; previously sat in the slot after bnez
- LD b0, 0*SIZE(B)
+ daddiu K,K,-1 # K--
bnez K,.L81
- MADD t21,t21,a7,b6
-
+ nop # filler after the branch, replacing the MADD that used to be here
-.L85: # N=2 M=4 K=2
+.L85: # k remainder, kr=2: run .L86 only if bit 1 of the k count is set
#ifndef TRMMKERNEL
- andi K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # K = KCO & 2
#else
andi K,TEMP,2
#endif
-
beqz K,.L88
nop
.L86:
- gsLQC1(R8,F5,F4,1) # R8=A
+ gsLQC1(R8,F5,F4,1) # quad-word slot 1; presumably a4,a5 for the second k-step - confirm
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
-
MADD t21,t21,a1,b0
+
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
MADD t21,t21,a5,b4
-.L88: # N=2, M=4, K=1
+.L88: # k remainder, kr=1: one last k-step if bit 0 of the k count is set
#ifndef TRMMKERNEL
- andi K,KCO,1
+ andi K,KCO,1 # K = KCO & 1
#else
andi K,TEMP,1
#endif
-
- beqz K,.L89 #
- LD ALPHA,152($sp) # Get ALPHA
+ beqz K,.L89 # even count: nothing left, go write back
+ LD ALPHA,152($sp) # reload ALPHA from its stack slot; NOTE(review): sits right after the branch, so it is also executed when the branch is taken only if delay slots are exposed - confirm
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu B,B,1*SIZE
-.L89: # Write Back
+.L89: # Write back the 2x1 tile t11/t21 to CO1[0..1]
#ifndef TRMMKERNEL
- LD c11,0(CO1) # Fetch 16 C
+ LD c11,0(CO1) # Load the 2 old C values of this tile (c11, c21)
LD c21,1*SIZE(CO1)
MADD t11,c11,t11,ALPHA
FETCH $0,2*SIZE(CO1)
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
+
#else
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
+ FETCH $0,0(CO1) # touch the next C location; CO1 was already advanced, hence the negative store offsets below
ST t11, -2 * SIZE(CO1)
ST t21, -1 * SIZE(CO1)
-#if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#endif
-
+ .align 3 # align .L11_M1 to a 2^3 = 8-byte boundary
.L11_M1:
- andi M,MCO,1 # Remainder M = 1
- beqz M,.L999 # M = 0, End
+ andi M,MCO,1 # M = MCO & 1: is there a leftover single row (mr = 1)?
+ beqz M,.L999 # no leftover row: go to .L999
nop
.L90:
daddu A, A, K
daddu B, BO, TEMP
#endif
-# gsLQC1(R8,F4,F0,0)
- MTC $0,t11
-# gsLQC1(R9,F12,F8,0)
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
+ MTC $0,t11 # clear the accumulator t11 (moves $0 into it), now issued after the two loads
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
#else
move B, BO
- dsra K,KCO,2 # K=KCO/2
-# gsLQC1(R8,F4,F0,0)
-# gsLQC1(R9,F12,F8,0)
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
+ dsra K,KCO,2 # K = KCO/4: trip count of the 4x-unrolled k loop
beqz K,.L95
MTC $0,t11
#endif
-.L91: # N=1,M=1,K=4
-# gsLQC1(R8,F6,F2,1)
+.L91: # nr=1, mr=1, k unrolled by 4: scalar LD/MADD pairs accumulate into t11
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
-# gsLQC1(R9,F14,F10,1)
+
LD a2, 2*SIZE(A)
LD b2, 2*SIZE(B)
MADD t11,t11,a4,b4
-
-# gsLQC1(R8,F4,F0,0)
LD a6, 3*SIZE(A)
LD b6, 3*SIZE(B)
MADD t11,t11,a2,b2
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
-# gsLQC1(R9,F12,F8,0)
MADD t11,t11,a6,b6
daddiu K,K,-1
bnez K,.L91
nop
-.L95: # N=2 M=4 K=2
+.L95: # k remainder, kr=2: two more k-steps if bit 1 of the k count is set
#ifndef TRMMKERNEL
- andi K,KCO,2 # k = KCO&2
+ andi K,KCO,2 # K = KCO & 2
#else
andi K,TEMP,2
#endif
LD b0,0(B)
LD a0,0(A)
MADD t11,t11,a4,b4
-
-
-.L98: # N=2, M=4, K=1
+.L98: # k remainder, kr=1: one last k-step if bit 0 of the k count is set
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L99 #
- LD ALPHA,152($sp) # Get ALPHA
+ beqz K,.L99 # even count: nothing left, go write back
+ LD ALPHA,152($sp) # reload ALPHA from its stack slot; NOTE(review): sits right after the branch, so it is also executed when the branch is taken only if delay slots are exposed - confirm
+
MADD t11,t11,a0,b0
-.L99: # Write Back
+.L99: # Write back the single element accumulated in t11
#ifndef TRMMKERNEL
- LD c11,0(CO1) # Fetch 16 C
+ LD c11,0(CO1) # Load the one old C value of this 1x1 tile
MADD t11,c11,t11,ALPHA
ST t11,0(CO1)
+
#else
MUL t11, ALPHA, t11