--- /dev/null
+#define REALNAME ASMNAME
+#define ASSEMBLER
+#include "common.h"
+#define FETCH ld
+/* Hand-encoded Loongson 128-bit FP quad load/store (gsLQC1/gsSQC1),
+   emitted via .word because older binutils reject the mnemonics.
+   Not used in the visible code; kept for optional vector variants. */
+#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
+#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
+
+/* Incoming arguments (n64 ABI argument registers $4..$11). */
+#define M $4
+#define N $5
+#define K $6
+#define A $8
+#define B $9
+#define C $10
+#define LDC $11
+
+/* Current panel base pointers (A/B walk through the packed panels). */
+#define AO $12
+#define BO $13
+
+/* Up to four C column pointers for the nr=4 micro-tile. */
+#define CO1 $14
+#define CO2 $15
+#define CO3 $16
+#define CO4 $17
+
+/* Backups of K/M/N — the argument registers above are reused as counters. */
+#define KCO $18
+#define MCO $19
+#define NCO $20
+
+/* Prefetch cursors and panel spans in bytes. */
+#define SPANB $21
+#define PREB $23
+#define PREA $24
+#define SPANA $25
+
+#define ALPHA $f15
+
+#if defined(TRMMKERNEL)
+#define OFFSET $2
+#define KK $3
+#define TEMP $7
+#endif
+
+/* Raw GPR numbers for the gsLQC1/gsSQC1 encoders. */
+#define R8 8
+#define R9 9
+#define R14 14
+#define R15 15
+#define R16 16
+#define R17 17
+
+/* t11..t44: the 4x4 accumulator tile (tRC = row R, column C). */
+#define t11 $f30
+#define t21 $f31
+#define t31 $f28
+#define t41 $f29
+
+#define t12 $f26
+#define t22 $f27
+#define t32 $f24
+#define t42 $f25
+
+#define t13 $f22
+#define t23 $f23
+#define t33 $f20
+#define t43 $f21
+
+#define t14 $f18
+#define t24 $f19
+#define t34 $f16
+#define t44 $f17
+
+/* c11..c44: C values loaded for the alpha update.  They deliberately
+   alias the a0..a7/b0..b7 operand registers below; this is safe because
+   the operand names are dead once write-back starts (e.g. c44 = $f0 is
+   only read after a0/c11 = $f0 is no longer needed). */
+#define c11 $f0
+#define c21 $f1
+#define c31 $f2
+#define c41 $f3
+
+#define c12 $f4
+#define c22 $f5
+#define c32 $f6
+#define c42 $f7
+
+#define c13 $f8
+#define c23 $f9
+#define c33 $f10
+#define c43 $f11
+
+#define c14 $f12
+#define c24 $f13
+#define c34 $f14
+#define c44 $f0
+
+/* a0..a7 / b0..b7: double-buffered A- and B-panel operands. */
+#define a0 $f0
+#define a1 $f1
+#define a2 $f2
+#define a3 $f3
+#define a4 $f4
+#define a5 $f5
+#define a6 $f6
+#define a7 $f7
+#define b0 $f8
+#define b1 $f9
+#define b2 $f10
+#define b3 $f11
+#define b4 $f12
+#define b5 $f13
+#define b6 $f14
+#define b7 $f15
+
+/* Raw FPR numbers for the quad load/store encoders. */
+#define F31 31
+#define F30 30
+#define F29 29
+#define F28 28
+#define F27 27
+#define F26 26
+#define F25 25
+#define F24 24
+#define F23 23
+#define F22 22
+#define F21 21
+#define F20 20
+#define F19 19
+#define F18 18
+#define F17 17
+#define F16 16
+#define F15 15
+#define F14 14
+#define F13 13
+#define F12 12
+#define F11 11
+#define F10 10
+#define F9 9
+#define F8 8
+#define F7 7
+#define F6 6
+#define F5 5
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
+#define F0 0
+
+ PROLOGUE
+
+# Allocate a 160-byte frame and save the n64 callee-saved registers this
+# kernel clobbers: GPRs $16-$25 and FPRs $f20-$f28.  ($22 is saved too
+# although it is not visibly used below — TODO(review) confirm.)
+# Slot 152($sp) is scratch used to spill/reload ALPHA.
+ daddiu $sp, $sp, -160
+ sd $16, 0($sp)
+ sd $17, 8($sp)
+ sd $18, 16($sp)
+ sd $19, 24($sp)
+ sd $20, 32($sp)
+ sd $21, 40($sp)
+ sd $22, 48($sp)
+ ST $f24, 56($sp)
+ ST $f25, 64($sp)
+ ST $f26, 72($sp)
+ ST $f27, 80($sp)
+ ST $f28, 88($sp)
+ sd $23, 96($sp)
+ sd $24, 104($sp)
+ sd $25, 112($sp)
+ ST $f20,120($sp)
+ ST $f21,128($sp)
+ ST $f22,136($sp)
+ ST $f23,144($sp)
+
+
+ .align 5
+.L0_N4: # Loop N
+ ST ALPHA,152($sp) # Backup ALPHA
+ move MCO,M # Backup M
+
+ move NCO,N # Backup N
+ move KCO,K # Backup K
+
+ move AO,A # Backup A_addr
+ dsra N,NCO,2 # N=NCO/4 (number of 4-column panels)
+
+ dsll LDC,LDC,BASE_SHIFT # LDC*8Byte
+ dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5
+
+#if defined(TRMMKERNEL)
+ LDARG OFFSET,160($sp) # OFFSET is relate to the data part
+#endif
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK,OFFSET
+#endif
+
+ move BO,B # Backup B_addr
+ beq N,$0,.L0_N2 # N=0,NCO<4
+ dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte (delay slot)
+
+.L0_N4_Lb: # mr=4,nr=4
+ move CO1,C
+ dsra M,MCO,2 # M=MCO/4 (number of 4-row tiles)
+
+ move A,AO # Reset A
+ daddu CO2,C,LDC
+
+ daddu PREB,BO,SPANB # PreB point next panelB
+ daddu CO3,CO2,LDC
+
+ daddu PREA,AO,SPANA
+ daddu CO4,CO3,LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK,OFFSET
+#endif
+ beqz M,.L14_M2
+ daddu C,CO4,LDC # move C to next panel Cj (delay slot)
+.L10:
+# Init of one 4x4 tile: clear the 16 accumulators t11..t44 and preload the
+# first k-slice of operands a0..a3 / b0..b3.  The loop bodies .L11/.L16/.L18
+# read ALL of a0-a3 and b0-b3, so both the TRMM and GEMM paths must load all
+# eight before branching into them.
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # (SIDE=L and UPLO=L) or (SIDE=R and UPLO=U)
+#else
+ dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part
+ dsll TEMP,KK,2 + BASE_SHIFT
+
+ daddu A,A,K # move A B to data part
+ daddu B,BO,TEMP
+#endif
+ MTC $0,t11
+ MOV t21,t11
+ LD a0,0(A)
+
+ MOV t31,t11
+ MOV t41,t11
+ LD a1,1*SIZE(A)
+
+ MOV t12,t11
+ MOV t22,t11
+ LD b0,0(B)
+
+ MOV t32,t11
+ MOV t42,t11
+ LD b1,1*SIZE(B)
+
+ MOV t13,t11
+ MOV t23,t11
+ LD a2,2*SIZE(A)
+
+ MOV t33,t11
+ MOV t43,t11
+ LD b2,2*SIZE(B)
+
+ MOV t14,t11
+ MOV t24,t11
+ LD a3,3*SIZE(A)
+ LD b3,3*SIZE(B) # BUGFIX: b3 was never preloaded on the TRMM path, yet
+ # .L11/.L16/.L18 multiply by it (t14/t24/t34/t44 column)
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP,KCO,KK # temp is the length of the data part
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4 # S=L,U=L
+#else
+ daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part
+#endif
+ dsra K,TEMP,2 # K=TEMP/4: number of 4-deep k iterations
+ MOV t34,t11
+ beqz K,.L15
+ MOV t44,t11 # delay slot
+
+#else
+ move B,BO # Reset B
+ MTC $0,t11 # GEMM part NR=4,MR=4
+ LD a0,0(A)
+
+ MOV t21,t11
+ MOV t31,t11
+ LD a1,1*SIZE(A)
+
+ MOV t41,t11
+ MOV t12,t11
+ LD b0,0(B)
+
+ MOV t22,t11
+ MOV t32,t11
+ LD b1,1*SIZE(B)
+
+ MOV t42,t11
+ dsra K,KCO,2 # K=KCO/4
+ LD a2,2*SIZE(A)
+
+ MOV t13,t11
+ MOV t23,t11
+ LD b2,2*SIZE(B)
+
+ MOV t33,t11
+ MOV t43,t11
+ LD a3,3*SIZE(A)
+
+ MOV t14,t11
+ MOV t24,t11
+ LD b3,3*SIZE(B)
+
+ MOV t34,t11
+ beqz K,.L15
+ MOV t44,t11 # clear 16 results registers
+#endif
+
+ .align 5
+.L11: # k-loop unrolled 4x; software pipelined: each stage consumes the
+ # operands loaded by the previous stage while loading the next slice.
+ # FETCH (= ld to $0) is a prefetch of the next A/B/C cache lines.
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ LD a4,4*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ LD a5,5*SIZE(A)
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ LD b4,4*SIZE(B)
+
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ LD b5,5*SIZE(B)
+ FETCH $0,(PREB)
+
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+ LD a6,6*SIZE(A)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+ LD b6,6*SIZE(B)
+ FETCH $0,(PREA)
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+ LD a7,7*SIZE(A)
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+ LD b7,7*SIZE(B)
+
+.L12: # second k-slice: consume a4..a7/b4..b7, reload a0..a3/b0..b3
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+ LD a0,8*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+ LD a1,9*SIZE(A)
+
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+ LD b0,8*SIZE(B)
+
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+ LD b1,9*SIZE(B)
+
+ FETCH $0,4*SIZE(PREB)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+ LD a2,10*SIZE(A)
+
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ LD b2,10*SIZE(B)
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t33,t33,a6,b6
+ MADD t43,t43,a7,b6
+ LD a3,11*SIZE(A)
+
+ MADD t34,t34,a6,b7
+ MADD t44,t44,a7,b7
+ LD b3,11*SIZE(B)
+
+.L13: # third k-slice; A/B pointers advance here (A +16, B +16 elements)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ LD a4,12*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ LD a5,13*SIZE(A)
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ LD b4,12*SIZE(B)
+
+ FETCH $0,8*SIZE(PREA)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ LD b5,13*SIZE(B)
+
+ FETCH $0,8*SIZE(PREB)
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+ LD a6,14*SIZE(A)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+ daddu A,A,16*SIZE # 4mr*4kr
+ LD b6,14*SIZE(B)
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+ daddu B,B,16*SIZE # 4nr*4kr
+ LD a7,-1*SIZE(A) # -1 because A already advanced (= old 15*SIZE)
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+ LD b7,-1*SIZE(B)
+
+.L14: # fourth k-slice; reload slice 0 of the next iteration, loop back
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+ LD a0,0(A)
+
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+ LD a1,1*SIZE(A)
+
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+ daddiu K,K,-1
+ LD b0,0(B)
+
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+ daddu PREA,PREA,16*SIZE
+ LD b1,1*SIZE(B)
+
+ FETCH $0,12*SIZE(PREB)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+ LD a2,2*SIZE(A)
+
+ FETCH $0,-4*SIZE(PREA)
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ LD b2,2*SIZE(B)
+
+ MADD t33,t33,a6,b6
+ MADD t43,t43,a7,b6
+ daddu PREB,PREB,16*SIZE
+ LD a3,3*SIZE(A)
+
+ MADD t34,t34,a6,b7
+ MADD t44,t44,a7,b7
+ bnez K,.L11
+ LD b3,3*SIZE(B) # delay slot
+
+
+.L15: # k-tail: handle K%4 >= 2 (two more k steps)
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP, 2
+#endif
+ beqz K,.L18
+ nop
+
+.L16:
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ LD a4,4*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ LD a5,5*SIZE(A)
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ LD b4,4*SIZE(B)
+
+ FETCH $0,0(PREA)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ LD b5,5*SIZE(B)
+
+ FETCH $0,0(PREB)
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+ LD a6,6*SIZE(A)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+ daddu A,A,8*SIZE # 4mr*2kr
+ LD b6,6*SIZE(B)
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+ daddu B,B,8*SIZE # 4nr*2kr
+ LD a7,-1*SIZE(A)
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+ LD b7,-1*SIZE(B)
+
+.L17:
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+ LD a0,0*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+ LD a1,1*SIZE(A)
+
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+ LD b0,0*SIZE(B)
+
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+ LD b1,1*SIZE(B)
+
+ FETCH $0,4*SIZE(PREB)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+ LD a2,2*SIZE(A)
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ LD b2,2*SIZE(B)
+
+ MADD t33,t33,a6,b6
+ MADD t43,t43,a7,b6
+ daddu PREA,PREA,8*SIZE
+ LD a3,3*SIZE(A)
+
+ MADD t34,t34,a6,b7
+ MADD t44,t44,a7,b7
+ daddu PREB,PREB,8*SIZE
+ LD b3,3*SIZE(B)
+
+
+.L18: # k-tail: handle K%2 == 1 (last k step)
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L19
+ LD ALPHA,152($sp) # Get ALPHA (delay slot, harmless if branch taken)
+
+ FETCH $0,0(PREB)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # 4mr*kr
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ daddu B,B,4*SIZE # 4nr*kr
+
+ FETCH $0,0(PREA)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ daddu PREB,PREB,4*SIZE
+
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ daddu PREA,PREA,4*SIZE
+
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+
+.L19: # Write Back to C
+#ifndef TRMMKERNEL
+ # GEMM: C = C + alpha * (A*B); interleave loads/MADDs/stores so the
+ # FP latency of each MADD is hidden by the next load.
+ LD c11,0(CO1) # GEMM write part
+ LD c21,1*SIZE(CO1) # get 16 C
+ LD c31,2*SIZE(CO1)
+ LD c41,3*SIZE(CO1)
+
+ LD c12,0(CO2)
+ MADD t11,c11,t11,ALPHA
+ LD c22,1*SIZE(CO2)
+ MADD t21,c21,t21,ALPHA
+ LD c32,2*SIZE(CO2)
+ MADD t31,c31,t31,ALPHA
+ LD c42,3*SIZE(CO2)
+ MADD t41,c41,t41,ALPHA
+
+ LD c13,0(CO3)
+ MADD t12,c12,t12,ALPHA
+ LD c23,1*SIZE(CO3)
+ MADD t22,c22,t22,ALPHA
+ LD c33,2*SIZE(CO3)
+ MADD t32,c32,t32,ALPHA
+ LD c43,3*SIZE(CO3)
+ MADD t42,c42,t42,ALPHA
+
+ LD c14,0(CO4)
+ MADD t13,c13,t13,ALPHA
+ LD c24,1*SIZE(CO4)
+ MADD t23,c23,t23,ALPHA
+ LD c34,2*SIZE(CO4)
+ MADD t33,c33,t33,ALPHA
+ LD c44,3*SIZE(CO4)
+ MADD t43,c43,t43,ALPHA
+
+ ST t11,0(CO1)
+ MADD t14,c14,t14,ALPHA
+ ST t21,1*SIZE(CO1)
+ MADD t24,c24,t24,ALPHA
+ ST t31,2*SIZE(CO1)
+ MADD t34,c34,t34,ALPHA
+ ST t41,3*SIZE(CO1)
+ MADD t44,c44,t44,ALPHA
+ daddiu M,M,-1 # M--
+
+ ST t12,0(CO2)
+ ST t22,1*SIZE(CO2)
+ ST t32,2*SIZE(CO2)
+ ST t42,3*SIZE(CO2)
+
+ ST t13,0(CO3)
+ ST t23,1*SIZE(CO3)
+ ST t33,2*SIZE(CO3)
+ ST t43,3*SIZE(CO3)
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,4*SIZE(CO3)
+ FETCH $0,4*SIZE(CO4)
+
+ FETCH $0,8*SIZE(CO1)
+ FETCH $0,8*SIZE(CO2)
+ FETCH $0,8*SIZE(CO3)
+ FETCH $0,8*SIZE(CO4)
+
+ ST t14,0(CO4)
+ daddu CO1,CO1,4*SIZE # COi += 4
+ ST t24,1*SIZE(CO4)
+ daddu CO2,CO2,4*SIZE
+ ST t34,2*SIZE(CO4)
+ daddu CO3,CO3,4*SIZE
+ ST t44,3*SIZE(CO4)
+ daddu PREB,BO,SPANB
+
+ bnez M,.L10
+ daddu CO4,CO4,4*SIZE # delay slot
+
+#else
+ # TRMM: C = alpha * (A*B) — no accumulation with old C.
+ MUL t11, ALPHA, t11 # TRMM write back part
+ MUL t21, ALPHA, t21
+ MUL t31, ALPHA, t31
+ MUL t41, ALPHA, t41
+
+ ST t11, 0 * SIZE(CO1)
+ MUL t12, ALPHA, t12
+ ST t21, 1 * SIZE(CO1)
+ MUL t22, ALPHA, t22
+ ST t31, 2 * SIZE(CO1)
+ MUL t32, ALPHA, t32
+ ST t41, 3 * SIZE(CO1)
+ MUL t42, ALPHA, t42
+
+ ST t12, 0 * SIZE(CO2)
+ MUL t13, ALPHA, t13
+ ST t22, 1 * SIZE(CO2)
+ MUL t23, ALPHA, t23
+ ST t32, 2 * SIZE(CO2)
+ MUL t33, ALPHA, t33
+ ST t42, 3 * SIZE(CO2)
+ MUL t43, ALPHA, t43
+
+ ST t13, 0 * SIZE(CO3)
+ MUL t14, ALPHA, t14
+ ST t23, 1 * SIZE(CO3)
+ MUL t24, ALPHA, t24
+ ST t33, 2 * SIZE(CO3)
+ MUL t34, ALPHA, t34
+ ST t43, 3 * SIZE(CO3)
+ MUL t44, ALPHA, t44
+
+ ST t14, 0 * SIZE(CO4)
+ daddiu M,M,-1 # M--
+ ST t24, 1 * SIZE(CO4)
+ ST t34, 2 * SIZE(CO4)
+ ST t44, 3 * SIZE(CO4)
+ daddiu CO1,CO1, 4 * SIZE
+ daddiu CO2,CO2, 4 * SIZE
+ daddiu CO3,CO3, 4 * SIZE
+ daddiu CO4,CO4, 4 * SIZE
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,4*SIZE(CO3)
+ FETCH $0,4*SIZE(CO4)
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#ifdef LEFT
+ daddiu TEMP,TEMP, -4
+#else
+ daddiu TEMP,TEMP, -4
+#endif
+ dsll K,TEMP,2 + BASE_SHIFT
+ dsll TEMP,TEMP,2 + BASE_SHIFT
+ daddu A,A,K # mov A to the end of panel Ai
+ daddu B,B,TEMP # mov B to the end of panel Bj
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK,4
+#endif
+ bnez M,.L10
+ nop
+#endif
+
+
+ .align 3
+.L14_M2: # M-remainder: two rows left against the 4-wide B panel
+ andi M, MCO, 2 # nr=4,mr=2
+ beqz M,.L14_M1
+ nop
+
+.L20: # init 2x4 tile: clear t11..t24, preload a0,a1 and b0..b3
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K,KK,1 + BASE_SHIFT # mr=2
+ dsll TEMP,KK,2 + BASE_SHIFT # nr=4
+ daddu A,A,K
+ daddu B,BO,TEMP
+#endif
+
+ MTC $0,t11
+ LD a0,0*SIZE(A)
+ MOV t21,t11
+ LD a1,1*SIZE(A)
+
+ MOV t12,t11
+ LD b0,0*SIZE(B)
+ MOV t22,t11
+ LD b1,1*SIZE(B)
+
+ MOV t13,t11
+ LD b2,2*SIZE(B)
+ MOV t23,t11
+ LD b3,3*SIZE(B)
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#elif defined(LEFT)
+ daddiu TEMP,KK,2 # left part,controlled by mr, mr=2
+#else
+ daddiu TEMP,KK,4 # right part,controlled by nr,nr=4
+#endif
+ dsra K,TEMP,2
+ MOV t14,t11
+ beqz K,.L25
+ MOV t24,t11 # clear 2*4=8 results registers
+
+#else
+ move B,BO # Reset B
+ LD a0,0*SIZE(A)
+ MTC $0,t11
+ LD a1,1*SIZE(A)
+
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+ MOV t12,t11
+ LD b1,1*SIZE(B)
+
+ MOV t22,t11
+ dsra K,KCO,2
+ LD b2,2*SIZE(B)
+
+ MOV t13,t11
+ MOV t23,t11
+ LD b3,3*SIZE(B)
+
+ MOV t14,t11
+ beqz K,.L25
+ MOV t24,t11
+
+#endif
+
+.L21: # nr=4,mr=2,kr=4 — four k steps fully unrolled in-line
+ MADD t11,t11,a0,b0
+ LD a4,2*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD a5,3*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ LD b4,4*SIZE(B)
+ MADD t22,t22,a1,b1
+ LD b5,5*SIZE(B)
+
+ MADD t13,t13,a0,b2
+ LD b6,6*SIZE(B)
+ MADD t23,t23,a1,b2
+ LD b7,7*SIZE(B)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+ MADD t11,t11,a4,b4
+ LD a2,4*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD a3,5*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ LD b0,8*SIZE(B)
+ MADD t22,t22,a5,b5
+ LD b1,9*SIZE(B)
+
+ MADD t13,t13,a4,b6
+ LD b2,10*SIZE(B)
+ MADD t23,t23,a5,b6
+ LD b3,11*SIZE(B)
+
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ daddiu K,K,-1
+
+ MADD t11,t11,a2,b0
+ LD a6,6*SIZE(A)
+ MADD t21,t21,a3,b0
+ LD a7,7*SIZE(A)
+
+ MADD t12,t12,a2,b1
+ LD b4,12*SIZE(B)
+ MADD t22,t22,a3,b1
+ LD b5,13*SIZE(B)
+
+ MADD t13,t13,a2,b2
+ LD b6,14*SIZE(B)
+ MADD t23,t23,a3,b2
+ LD b7,15*SIZE(B)
+
+ MADD t14,t14,a2,b3
+ MADD t24,t24,a3,b3
+ daddu A,A,8*SIZE # 2mr*4kr
+ daddu B,B,16*SIZE # 4nr*4kr
+
+ MADD t11,t11,a6,b4
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a7,b4
+ LD a1,1*SIZE(A)
+
+ MADD t12,t12,a6,b5
+ LD b0,0*SIZE(B)
+ MADD t22,t22,a7,b5
+ LD b1,1*SIZE(B)
+
+ MADD t13,t13,a6,b6
+ LD b2,2*SIZE(B)
+ MADD t23,t23,a7,b6
+ LD b3,3*SIZE(B)
+
+ MADD t14,t14,a6,b7
+ bnez K,.L21
+ MADD t24,t24,a7,b7 # delay slot
+
+
+.L25: # k-tail: K%4 >= 2
+#ifndef TRMMKERNEL
+ andi K,KCO,2 # kr=2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L28
+ nop
+
+.L26:
+ MADD t11,t11,a0,b0
+ LD a4,2*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD a5,3*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ LD b4,4*SIZE(B)
+ MADD t22,t22,a1,b1
+ LD b5,5*SIZE(B)
+
+ MADD t13,t13,a0,b2
+ LD b6,6*SIZE(B)
+ MADD t23,t23,a1,b2
+ LD b7,7*SIZE(B)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+ daddu A,A,4*SIZE # 2mr*2kr
+ daddu B,B,8*SIZE # 4nr*2kr
+
+.L27:
+ MADD t11,t11,a4,b4
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD a1,1*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ LD b0,0*SIZE(B)
+ MADD t22,t22,a5,b5
+ LD b1,1*SIZE(B)
+
+ MADD t13,t13,a4,b6
+ LD b2,2*SIZE(B)
+ MADD t23,t23,a5,b6
+ LD b3,3*SIZE(B)
+
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+
+
+.L28: # kr=1 — last k step if K is odd
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L29
+ LD ALPHA,152($sp) # Get ALPHA (delay slot)
+
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,2*SIZE # 2mr*kr
+ daddu B,B,4*SIZE # 4nr*kr
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+.L29: # Write Back to C (2 rows x 4 columns)
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # GEMM write back part
+ LD c21,1*SIZE(CO1)
+
+ LD c12,0(CO2)
+ LD c22,1*SIZE(CO2)
+
+ LD c13,0(CO3)
+ MADD t11,c11,t11,ALPHA
+ LD c23,1*SIZE(CO3)
+ MADD t21,c21,t21,ALPHA
+
+ LD c14,0(CO4)
+ MADD t12,c12,t12,ALPHA
+ LD c24,1*SIZE(CO4)
+ MADD t22,c22,t22,ALPHA
+
+ ST t11,0(CO1)
+ MADD t13,c13,t13,ALPHA
+ ST t21,1*SIZE(CO1)
+ MADD t23,c23,t23,ALPHA
+
+ ST t12,0(CO2)
+ MADD t14,c14,t14,ALPHA
+ ST t22,1*SIZE(CO2)
+ MADD t24,c24,t24,ALPHA
+
+ ST t13,0(CO3)
+ daddu CO1,CO1,2*SIZE # COi += 2
+ ST t23,1*SIZE(CO3)
+ daddu CO2,CO2,2*SIZE
+
+ ST t14,0(CO4)
+ daddu CO3,CO3,2*SIZE
+ ST t24,1*SIZE(CO4)
+ daddu CO4,CO4,2*SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#else
+ MUL t11, ALPHA, t11 # TRMM write back part
+ MUL t21, ALPHA, t21
+
+ ST t11, 0 * SIZE(CO1)
+ MUL t12, ALPHA, t12
+ ST t21, 1 * SIZE(CO1)
+ MUL t22, ALPHA, t22
+
+ ST t12, 0 * SIZE(CO2)
+ MUL t13, ALPHA, t13
+ ST t22, 1 * SIZE(CO2)
+ MUL t23, ALPHA, t23
+
+ ST t13, 0 * SIZE(CO3)
+ MUL t14, ALPHA, t14
+ ST t23, 1 * SIZE(CO3)
+ MUL t24, ALPHA, t24
+
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
+ daddiu CO3,CO3, 2 * SIZE
+ daddiu CO4,CO4, 2 * SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#ifdef LEFT
+ daddiu TEMP,TEMP,-2
+#else
+ daddiu TEMP,TEMP,-4
+#endif
+ dsll K,TEMP,1 + BASE_SHIFT
+ dsll TEMP,TEMP,2 + BASE_SHIFT
+
+ daddu A,A,K # move A to next panel Ai
+ daddu B,B,TEMP # move B to next panel Bj
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2
+#endif
+#endif
+
+
+ .align 3
+.L14_M1: # M-remainder: one row left against the 4-wide B panel
+ andi M,MCO,1 # mr=1
+ beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
+ nop
+
+.L30: # init 1x4 tile: clear t11..t14, preload a0 and b0..b3.
+ # The loop bodies .L31/.L36/.L38 read ALL of b0..b3.
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K,KK, 0 + BASE_SHIFT
+ dsll TEMP,KK,2 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,BO,TEMP
+#endif
+ MTC $0,t11
+ MOV t12,t11
+ LD a0, 0 * SIZE(A) # a0
+
+ MOV t13,t11
+ LD b0,0*SIZE(B)
+ MOV t14,t11 # clear result registers
+ LD b1,1*SIZE(B)
+ LD b2,2*SIZE(B) # BUGFIX: b2/b3 were not preloaded on the TRMM path,
+ LD b3,3*SIZE(B) # yet .L31/.L36/.L38 multiply by them
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1
+#else
+ daddiu TEMP, KK, 4
+#endif
+ dsra K,TEMP, 2
+ nop
+ beqz K,.L35
+ nop
+
+#else
+ move B,BO # Reset B, GEMM part
+ dsra K,KCO,2 # K=KCO/4
+ LD a0, 0 * SIZE(A) # a0
+
+ MTC $0,t11
+ LD b0,0*SIZE(B)
+
+ MOV t12,t11
+ LD b1,1*SIZE(B)
+
+ MOV t13,t11
+ LD b2,2*SIZE(B)
+
+ MOV t14,t11
+ beqz K,.L35
+ LD b3,3*SIZE(B) # delay slot
+
+#endif
+
+.L31: # nr=4,mr=1,kr=4 — four k steps unrolled; a1..a3 loaded on the fly
+ LD a1, 1*SIZE(A) # load a1
+ MADD t11,t11,a0,b0
+
+ LD b4,4*SIZE(B)
+ LD b5,5*SIZE(B)
+ MADD t12,t12,a0,b1
+
+ LD b6,6*SIZE(B)
+ LD b7,7*SIZE(B)
+ MADD t13,t13,a0,b2
+ MADD t14,t14,a0,b3
+
+ LD a2, 2*SIZE(A) # a2
+ MADD t11,t11,a1,b4
+
+ LD b0,8*SIZE(B)
+ LD b1,9*SIZE(B)
+ MADD t12,t12,a1,b5
+
+ LD b2,10*SIZE(B)
+ LD b3,11*SIZE(B)
+ MADD t13,t13,a1,b6
+ MADD t14,t14,a1,b7
+
+ LD a3, 3*SIZE(A) # a3
+ MADD t11,t11,a2,b0
+ daddiu K,K,-1
+
+ LD b4,12*SIZE(B)
+ LD b5,13*SIZE(B)
+ MADD t12,t12,a2,b1
+ daddu A,A,4*SIZE # 1mr*4kr
+
+ LD b6,14*SIZE(B)
+ LD b7,15*SIZE(B)
+ MADD t13,t13,a2,b2
+ MADD t14,t14,a2,b3
+
+ LD a0, 0*SIZE(A) # a0 (A already advanced)
+ daddu B,B,16*SIZE # 4nr*4kr
+ MADD t11,t11,a3,b4
+
+ LD b0,0*SIZE(B)
+ MADD t12,t12,a3,b5
+ LD b1,1*SIZE(B)
+ MADD t13,t13,a3,b6
+
+ LD b2,2*SIZE(B)
+ MADD t14,t14,a3,b7
+ bnez K,.L31
+ LD b3,3*SIZE(B) # delay slot
+
+
+.L35: # k-tail: K%4 >= 2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L38
+ nop
+
+.L36:
+ LD a1,1*SIZE(A) # load a1
+ MADD t11,t11,a0,b0
+
+ LD b4,4*SIZE(B)
+ LD b5,5*SIZE(B)
+ MADD t12,t12,a0,b1
+ daddu A,A,2*SIZE # mr*2kr
+
+ LD b6,6*SIZE(B)
+ MADD t13,t13,a0,b2
+
+ LD b7,7*SIZE(B)
+ MADD t14,t14,a0,b3
+ daddu B,B,8*SIZE # 4nr*2kr
+
+
+.L37:
+ LD a0,0(A)
+ MADD t11,t11,a1,b4
+
+ LD b0,0*SIZE(B)
+ LD b1,1*SIZE(B)
+ MADD t12,t12,a1,b5
+
+ LD b2,2*SIZE(B)
+ LD b3,3*SIZE(B)
+ MADD t13,t13,a1,b6
+ MADD t14,t14,a1,b7
+
+
+.L38: # kr=1 — last k step if K is odd
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L39
+ LD ALPHA,152($sp) # Get ALPHA (delay slot)
+
+ MADD t11,t11,a0,b0
+ MADD t12,t12,a0,b1
+ daddu A,A,1*SIZE
+ daddu B,B,4*SIZE
+
+ MADD t13,t13,a0,b2
+ MADD t14,t14,a0,b3
+
+.L39: # Write Back (1 row x 4 columns)
+#ifndef TRMMKERNEL
+ LD c11,0(CO1)
+ LD c12,0(CO2)
+ LD c13,0(CO3)
+ LD c14,0(CO4)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t12,c12,t12,ALPHA
+ MADD t13,c13,t13,ALPHA
+ MADD t14,c14,t14,ALPHA
+
+ ST t11,0(CO1)
+ ST t12,0(CO2)
+ ST t13,0(CO3)
+ ST t14,0(CO4)
+#else
+ MUL t11, ALPHA, t11
+ MUL t12, ALPHA, t12
+ MUL t13, ALPHA, t13
+ MUL t14, ALPHA, t14
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -1
+#else
+ daddiu TEMP, TEMP, -4
+#endif
+
+ dsll K,TEMP, 0 + BASE_SHIFT
+ dsll TEMP,TEMP, 2 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,B,TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 1
+#endif
+#endif
+
+
+ .align 3
+.L0_N4_Loop: # mc finished — advance to the next 4-column B panel
+ daddiu N,N,-1 # N--
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ daddiu KK, KK,4
+#endif
+ bnez N,.L0_N4_Lb
+ move BO,B # Set BO point to next panel Bj (delay slot)
+
+ .align 5
+.L0_N2: # N-remainder: two columns of B left
+ andi N,NCO,2 # nr = 2
+ beqz N,.L0_N1
+ nop
+
+.L0_N2_Lb:
+ move CO1,C
+ daddu CO2,C,LDC
+
+ dsra M,MCO,2
+ move A,AO # Reset A
+
+ daddu PREA,AO,SPANA
+ daddu C,CO2,LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ beqz M,.L12_M2
+ nop
+
+.L40: # init 4x2 tile: clear t11..t42, preload a0..a3 and b0,b1
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K,KK, 2 + BASE_SHIFT
+ dsll TEMP, KK,1 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,BO,TEMP
+#endif
+ MTC $0,t11
+ LD a0,0*SIZE(A)
+ MOV t21,t11
+ LD a1,1*SIZE(A)
+
+ MOV t31,t11
+ LD b0,0*SIZE(B)
+ MOV t41,t11
+ LD b1,1*SIZE(B)
+
+ MOV t12,t11
+ LD a2,2*SIZE(A)
+ MOV t22,t11
+ LD a3,3*SIZE(A)
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra K,TEMP,2
+ MOV t32,t11
+ beqz K,.L45
+ MOV t42,t11 # delay slot
+
+#else
+ move B,BO # Reset B
+ LD a0,0*SIZE(A)
+ MTC $0,t11 # gemm part
+ LD a1,1*SIZE(A)
+
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+ MOV t31,t11
+ LD b1,1*SIZE(B)
+
+ MOV t41,t11
+ LD a2,2*SIZE(A)
+ dsra K,KCO,2 # K=KCO/4
+ LD a3,3*SIZE(A)
+
+ MOV t12,t11
+ MOV t22,t11
+
+ MOV t32,t11
+ beqz K,.L45
+ MOV t42,t11 # delay slot
+
+#endif
+
+.L41: # nr=2,mr=kr=4 — four k steps unrolled across .L41-.L44
+ MADD t11,t11,a0,b0
+ LD a4,4*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD a5,5*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ LD b4,2*SIZE(B)
+ MADD t22,t22,a1,b1
+ LD b5,3*SIZE(B)
+
+ MADD t31,t31,a2,b0
+ LD a6,6*SIZE(A)
+ MADD t41,t41,a3,b0
+ LD a7,7*SIZE(A)
+
+ FETCH $0,(PREA)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+
+.L42:
+ MADD t11,t11,a4,b4
+ LD a0,8*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD a1,9*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ LD b2,4*SIZE(B)
+ MADD t22,t22,a5,b5
+ LD b3,5*SIZE(B)
+
+ MADD t31,t31,a6,b4
+ LD a2,10*SIZE(A)
+ MADD t41,t41,a7,b4
+ LD a3,11*SIZE(A)
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+
+.L43:
+ MADD t11,t11,a0,b2
+ LD a4,12*SIZE(A)
+ MADD t21,t21,a1,b2
+ LD a5,13*SIZE(A)
+
+ MADD t12,t12,a0,b3
+ LD b6,6*SIZE(B)
+ MADD t22,t22,a1,b3
+ LD b7,7*SIZE(B)
+
+ MADD t31,t31,a2,b2
+ LD a6,14*SIZE(A)
+ MADD t41,t41,a3,b2
+ LD a7,15*SIZE(A)
+
+ FETCH $0,8*SIZE(PREA)
+ MADD t32,t32,a2,b3
+ MADD t42,t42,a3,b3
+
+ daddu A,A,16*SIZE # 4mr*4kr
+ daddu B,B,8*SIZE # 2nr*4kr
+
+.L44:
+ MADD t11,t11,a4,b6
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b6
+ LD a1,1*SIZE(A)
+
+
+ MADD t12,t12,a4,b7
+ LD b0,0*SIZE(B)
+ MADD t22,t22,a5,b7
+ LD b1,1*SIZE(B)
+
+ daddiu K,K,-1
+ daddu PREA,PREA,16*SIZE
+
+ MADD t31,t31,a6,b6
+ LD a2,2*SIZE(A)
+ MADD t41,t41,a7,b6
+ LD a3,3*SIZE(A)
+
+ FETCH $0,-4*SIZE(PREA)
+ MADD t32,t32,a6,b7
+ bnez K,.L41
+ MADD t42,t42,a7,b7 # delay slot
+
+
+.L45: # k-tail: K%4 >= 2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L48
+ nop
+
+.L46:
+ MADD t11,t11,a0,b0
+ LD a4,4*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD a5,5*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ LD b4,2*SIZE(B)
+ MADD t22,t22,a1,b1
+ LD b5,3*SIZE(B)
+
+ MADD t31,t31,a2,b0
+ LD a6,6*SIZE(A)
+ MADD t41,t41,a3,b0
+ LD a7,7*SIZE(A)
+
+ FETCH $0,0(PREA)
+ MADD t32,t32,a2,b1
+ daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
+
+ MADD t42,t42,a3,b1
+ daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+
+.L47:
+ MADD t11,t11,a4,b4
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD a1,1*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ LD b0,0*SIZE(B)
+ MADD t22,t22,a5,b5
+ LD b1,1*SIZE(B)
+
+ MADD t31,t31,a6,b4
+ LD a2,2*SIZE(A)
+ MADD t41,t41,a7,b4
+ LD a3,3*SIZE(A)
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+ daddu PREA,PREA,8*SIZE
+
+
+
+.L48: # kr=1 — last k step if K is odd
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L49
+ LD ALPHA,152($sp) # Get ALPHA (delay slot)
+
+ FETCH $0,0(PREA)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ daddu B,B,2*SIZE
+ daddu PREA,PREA,4*SIZE
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+
+.L49: # Write Back (4 rows x 2 columns)
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # gemm write back part Fetch 16 C
+ LD c21,1*SIZE(CO1)
+ LD c31,2*SIZE(CO1)
+ LD c41,3*SIZE(CO1)
+
+ LD c12,0(CO2)
+ MADD t11,c11,t11,ALPHA
+ LD c22,1*SIZE(CO2)
+ MADD t21,c21,t21,ALPHA
+ LD c32,2*SIZE(CO2)
+ MADD t31,c31,t31,ALPHA
+ LD c42,3*SIZE(CO2)
+ MADD t41,c41,t41,ALPHA
+
+ ST t11,0(CO1)
+ MADD t12,c12,t12,ALPHA
+ ST t21,1*SIZE(CO1)
+ MADD t22,c22,t22,ALPHA
+ ST t31,2*SIZE(CO1)
+ MADD t32,c32,t32,ALPHA
+ ST t41,3*SIZE(CO1)
+ MADD t42,c42,t42,ALPHA
+ daddiu M,M,-1
+
+ ST t12,0(CO2)
+ ST t22,1*SIZE(CO2)
+ ST t32,2*SIZE(CO2)
+ ST t42,3*SIZE(CO2)
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,8*SIZE(CO1)
+ FETCH $0,8*SIZE(CO2)
+
+ daddu CO1,CO1,4*SIZE
+ bnez M,.L40
+ daddu CO2,CO2,4*SIZE # delay slot
+
+#else
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+ MUL t31, ALPHA, t31
+ MUL t41, ALPHA, t41
+
+ MUL t12, ALPHA, t12
+ ST t11, 0 * SIZE(CO1)
+ MUL t22, ALPHA, t22
+ ST t21, 1 * SIZE(CO1)
+ MUL t32, ALPHA, t32
+ ST t31, 2 * SIZE(CO1)
+ MUL t42, ALPHA, t42
+ ST t41, 3 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ daddiu M,M,-1
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ daddiu CO1,CO1, 4*SIZE
+ daddiu CO2,CO2, 4*SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,4(CO1) # NOTE(review): byte offset 4, unlike 4*SIZE elsewhere;
+ FETCH $0,4(CO2) # prefetch-only so harmless — confirm intent
+
+#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -4
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+ dsll K,TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,B,TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 4
+#endif
+ bnez M,.L40
+ nop
+#endif
+
+
+ .align 3
+.L12_M2: # nr=2, M-remainder: two rows left
+ andi M,MCO,2 # mr = 2
+ beqz M,.L12_M1
+ nop
+
+.L50: # init 2x2 tile: clear t11..t22, preload a0,a1 and b0,b1.
+ # The loop bodies .L51/.L56/.L58 read b0 and b1.
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO
+#else
+ dsll K, KK, 1 + BASE_SHIFT #mr=2
+ dsll TEMP, KK, 1 + BASE_SHIFT #nr=2
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ MTC $0,t11
+ LD a0,0*SIZE(A)
+ MOV t21,t11
+ LD a1,1*SIZE(A)
+ LD b0,0*SIZE(B) # BUGFIX: b0/b1 were not preloaded on the TRMM path,
+ LD b1,1*SIZE(B) # yet .L51/.L56/.L58 multiply by them
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 2
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra K,TEMP,2
+ MOV t12,t11
+ beqz K,.L55
+ MOV t22,t11 # delay slot
+
+#else
+ move B,BO
+ LD a0,0*SIZE(A)
+ dsra K,KCO,2 # K=KCO/4
+ LD a1,1*SIZE(A)
+
+ MTC $0,t11
+ LD b0,0*SIZE(B)
+ MOV t21,t11
+ LD b1,1*SIZE(B)
+
+ MOV t12,t11
+ beqz K,.L55
+ MOV t22,t11 # delay slot
+
+#endif
+
+.L51: # nr=2 mr=2,kr=4 — four k steps unrolled in-line
+ MADD t11,t11,a0,b0
+ LD a4,2*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD b4,2*SIZE(B)
+
+ MADD t12,t12,a0,b1
+ LD a5,3*SIZE(A)
+ MADD t22,t22,a1,b1
+ LD b5,3*SIZE(B)
+
+ MADD t11,t11,a4,b4
+ LD a2,4*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD b2,4*SIZE(B)
+
+ MADD t12,t12,a4,b5
+ LD a3,5*SIZE(A)
+ MADD t22,t22,a5,b5
+ daddiu K,K,-1
+ LD b3,5*SIZE(B)
+
+ MADD t11,t11,a2,b2
+ LD a6,6*SIZE(A)
+ MADD t21,t21,a3,b2
+ daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
+ LD b6,6*SIZE(B)
+
+ MADD t12,t12,a2,b3
+ daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE
+ LD a7,-1*SIZE(A) # old 7*SIZE, A already advanced
+ MADD t22,t22,a3,b3
+ LD b7,-1*SIZE(B)
+
+ MADD t11,t11,a6,b6
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a7,b6
+ LD b0,0*SIZE(B)
+
+ MADD t12,t12,a6,b7
+ LD a1,1*SIZE(A)
+
+ MADD t22,t22,a7,b7
+ bnez K,.L51
+ LD b1,1*SIZE(B) # delay slot
+
+
+.L55: # k-tail: K%4 >= 2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L58
+ nop
+
+.L56:
+ MADD t11,t11,a0,b0
+ LD a4,2*SIZE(A)
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
+ LD b4,2*SIZE(B)
+
+ MADD t12,t12,a0,b1
+ daddu B,B,4*SIZE # 2nr*2kr
+ LD a5,-1*SIZE(A)
+ MADD t22,t22,a1,b1
+ LD b5,-1*SIZE(B)
+
+.L57:
+ MADD t11,t11,a4,b4
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD b0,0*SIZE(B)
+
+ MADD t12,t12,a4,b5
+ LD a1,1*SIZE(A)
+ MADD t22,t22,a5,b5
+ LD b1,1*SIZE(B)
+
+.L58: # kr=1 — last k step if K is odd
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP, 1
+#endif
+ beqz K,.L59
+ LD ALPHA,152($sp) # Get ALPHA (delay slot)
+
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
+ daddu B,B,2*SIZE # 2nr*kr
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+
+.L59: # Write Back (2 rows x 2 columns)
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # write gemm part back Fetch 16 C
+ LD c21,1*SIZE(CO1)
+ LD c12,0(CO2)
+ LD c22,1*SIZE(CO2)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t21,c21,t21,ALPHA
+ MADD t12,c12,t12,ALPHA
+ MADD t22,c22,t22,ALPHA
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+ ST t12,0(CO2)
+ ST t22,1*SIZE(CO2)
+
+ daddu CO1,CO1,2*SIZE
+ daddu CO2,CO2,2*SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+#else
+ daddiu M, M, -1 # NOTE(review): M decrement only on TRMM branch; the
+ # GEMM branch falls through to .L12_M1 so M is unused there
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+ MUL t12, ALPHA, t12
+ MUL t22, ALPHA, t22
+
+ ST t11, -2 * SIZE(CO1)
+ ST t21, -1 * SIZE(CO1)
+ ST t12, -2 * SIZE(CO2)
+ ST t22, -1 * SIZE(CO2)
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -2
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+
+ dsll K, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2
+#endif
+#endif
+
+
+ .align 3
+.L12_M1: # nr=2, M-remainder: one row left
+ andi M,MCO,1 # mr = 1
+ beqz M,.L0_N2_Loop
+ nop
+
+.L60: # init 1x2 tile: clear t11..t22, preload a0 and b0,b1.
+ # The loop bodies .L61/.L66/.L68 read both b0 and b1.
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K, KK, 0 + BASE_SHIFT
+ dsll TEMP, KK, 1 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ MTC $0,t11
+ LD a0, 0*SIZE(A) # a0
+
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+ LD b1,1*SIZE(B) # BUGFIX: b1 was not preloaded on the TRMM path, yet
+ # .L61/.L66/.L68 multiply by it
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra K,TEMP,2
+ MOV t12,t11
+ beqz K,.L65
+ MOV t22,t11 # delay slot
+
+#else
+ dsra K,KCO,2
+ move B,BO # Reset B
+ LD a0,0*SIZE(A)
+
+ MTC $0,t11
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+
+ MOV t12,t11
+ LD b1,1*SIZE(B)
+ beqz K,.L65
+ MOV t22,t11 # delay slot
+
+#endif
+
+# Main k-loop for the mr=1 x nr=2 tile: 4 rank-1 updates per pass, with the
+# loads for step k+1 issued between the MADDs of step k (software pipelining).
+.L61: # nr=2,mr=1,kr=4
+ LD a4, 1*SIZE(A) # a2
+ LD b4, 2*SIZE(B)
+ MADD t11,t11,a0,b0 # t11 += a[k] * b[k][0]
+
+ LD b5,3*SIZE(B)
+ MADD t12,t12,a0,b1 # t12 += a[k] * b[k][1]
+
+ LD a2, 2*SIZE(A) # a3
+ LD b2,4*SIZE(B)
+ MADD t11,t11,a4,b4
+
+ LD b3,5*SIZE(B)
+ MADD t12,t12,a4,b5
+
+ LD a6, 3*SIZE(A) # a4
+ daddiu K,K,-1
+ LD b6,6*SIZE(B)
+ MADD t11,t11,a2,b2
+
+ LD b7,7*SIZE(B)
+ MADD t12,t12,a2,b3
+ daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
+
+ LD a0, 0*SIZE(A)
+ daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
+
+ LD b0,0*SIZE(B)
+ MADD t11,t11,a6,b6
+
+ LD b1,1*SIZE(B)
+ bnez K,.L61
+ MADD t12,t12,a6,b7 # (branch delay slot)
+
+
+
+# K-remainder: two extra iterations if bit 1 of the trip count is set.
+.L65: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L68
+ nop
+
+.L66:
+ LD a4, 1*SIZE(A) # a1
+ MADD t11,t11,a0,b0
+ LD b4,2*SIZE(B)
+ daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
+
+ LD b5,3*SIZE(B)
+ MADD t12,t12,a0,b1
+ daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
+
+.L67:
+ LD a0,0(A) # a0 reloaded for the possible kr=1 tail
+ LD b0,0*SIZE(B)
+ MADD t11,t11,a4,b4
+
+ LD b1,1*SIZE(B)
+ MADD t12,t12,a4,b5
+
+
+# K-remainder: one final iteration if the trip count is odd.
+.L68: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L69
+ LD ALPHA,152($sp) # (delay slot, runs either way) Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t12,t12,a0,b1
+ daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=8
+ daddu B,B,2*SIZE # B+=2(nr)*1(kr)*8Byte=16
+
+
+# Write-back for the mr=1 x nr=2 tile.
+.L69: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 2 C (GEMM: C = alpha*AB + C)
+ LD c12,0(CO2)
+
+ MADD t11,c11,t11,ALPHA # c + alpha*t
+ MADD t12,c12,t12,ALPHA
+
+ ST t11,0(CO1)
+ ST t12,0(CO2)
+
+ daddu CO1,CO1,1*SIZE
+ daddu CO2,CO2,1*SIZE
+
+#else
+ MUL t11, ALPHA, t11 # TRMM: C = alpha*AB, no accumulate with old C
+ MUL t12, ALPHA, t12
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+
+ daddu CO1,CO1,1*SIZE
+ daddu CO2,CO2,1*SIZE
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK # skip the K range this triangular tile did not touch
+#ifdef LEFT
+ daddiu TEMP, TEMP, -1 # minus mr
+#else
+ daddiu TEMP, TEMP, -2 # minus nr
+#endif
+
+ dsll K, TEMP, 0 + BASE_SHIFT # A advances TEMP*1 (mr) elements
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # B advances TEMP*2 (nr) elements
+
+ daddu A, A, K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 1 # KK += mr
+#endif
+#endif
+
+# End of the nr=2 panel: bump KK for the RIGHT case and commit B as the new BO.
+.L0_N2_Loop:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ daddiu KK, KK, 2 # KK += nr
+#endif
+ move BO, B
+
+
+ .align 5
+# .L0_N1: last single column of B (nr=1); walk M in tiles of 4, then 2, then 1 rows.
+.L0_N1:
+ andi N,NCO,1 # nr = 1
+ beqz N,.L999 # no odd column -> restore registers and return
+ nop
+
+ move CO1,C # CO1 = output pointer for this column
+ dsra M,MCO,2 # M = MCO/4 four-row tiles
+
+ move A,AO # Reset A
+ daddu PREA,AO,SPANA # prefetch cursor into A
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+
+ beqz M,.L11_M2
+ daddu C,CO1,LDC # (branch delay slot)
+
+# mr=4 x nr=1 micro-tile setup: accumulators t11..t41 = c[0..3][0].
+.L70:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B, BO # Reset B
+#else
+ dsll K, KK, 2 + BASE_SHIFT # byte offset into A: KK * 4 (mr) elements
+ dsll TEMP, KK, 0 + BASE_SHIFT # byte offset into B: KK * 1 (nr) elements
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ MTC $0,t11
+ LD b0, 0*SIZE(B)
+
+ MOV t21,t11
+ LD a0,0*SIZE(A)
+ MOV t31,t11
+ LD a1,1*SIZE(A)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK # TEMP = K iterations for this tile
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4 # KK + mr
+#else
+ daddiu TEMP, KK, 1 # KK + nr
+#endif
+ dsra K,TEMP,2 # K = TEMP/4 unrolled kr=4 iterations
+ MOV t41,t11
+ LD a2,2*SIZE(A) # FIX: a2/a3 are consumed by .L71/.L76/.L78 before any reload, but were never loaded on this (TRMM) path; mirrors the non-TRMM setup below
+ beqz K,.L75
+ LD a3,3*SIZE(A) # (branch delay slot) FIX: was a nop
+#else
+ move B, BO # Reset B
+ dsra K,KCO,2 # K = KCO/4 unrolled kr=4 iterations
+ LD b0, 0*SIZE(B)
+
+ MTC $0,t11
+ LD a0,0*SIZE(A)
+ MOV t21,t11
+ LD a1,1*SIZE(A)
+
+ MOV t31,t11
+ LD a2,2*SIZE(A)
+ MOV t41,t11
+ beqz K,.L75
+ LD a3,3*SIZE(A) # (branch delay slot)
+
+#endif
+
+# Main k-loop for the mr=4 x nr=1 tile (kr=4), software-pipelined across
+# .L71-.L74; FETCH prefetches ahead in A through the PREA cursor.
+.L71: # nr=1,mr=kr=4
+ LD b4, 1*SIZE(B) # b1
+ MADD t11,t11,a0,b0 # column 0..3 of this k-step: t_i += a[i]*b
+
+ LD a4, 4*SIZE(A)
+ MADD t21,t21,a1,b0
+
+ LD a5, 5*SIZE(A)
+ FETCH $0,(PREA)
+
+ LD a6,6*SIZE(A)
+ MADD t31,t31,a2,b0
+
+ LD a7,7*SIZE(A)
+ MADD t41,t41,a3,b0
+
+.L72:
+ LD b2, 2*SIZE(B) # b2
+ MADD t11,t11,a4,b4
+
+ LD a0,8*SIZE(A)
+ MADD t21,t21,a5,b4
+
+ LD a1,9*SIZE(A)
+ FETCH $0,4*SIZE(PREA)
+
+ LD a2,10*SIZE(A)
+ MADD t31,t31,a6,b4
+
+ LD a3,11*SIZE(A)
+ MADD t41,t41,a7,b4
+
+.L73:
+ LD b6, 3*SIZE(B)
+ MADD t11,t11,a0,b2
+
+ LD a4,12*SIZE(A)
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ LD a5,13*SIZE(A)
+ MADD t21,t21,a1,b2
+
+ LD a6,14*SIZE(A)
+ FETCH $0,8*SIZE(PREA)
+ MADD t31,t31,a2,b2
+
+ LD a7,15*SIZE(A)
+ MADD t41,t41,a3,b2
+ daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
+
+.L74:
+ LD b0, 0*SIZE(B)
+ MADD t11,t11,a4,b6
+
+ LD a0,0*SIZE(A)
+ daddu PREA,PREA,16*SIZE # advance prefetch cursor in step with A
+
+ LD a1,1*SIZE(A)
+ MADD t21,t21,a5,b6
+
+ LD a2,2*SIZE(A)
+ daddiu K,K,-1
+ MADD t31,t31,a6,b6
+
+ LD a3,3*SIZE(A)
+ MADD t41,t41,a7,b6
+ bnez K,.L71
+ FETCH $0,-32(PREA) # (branch delay slot)
+
+
+# K-remainder for the mr=4 x nr=1 tile: two extra iterations if bit 1 is set.
+.L75: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L78
+ nop
+
+.L76:
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0
+
+ LD a4,4*SIZE(A)
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
+
+ LD a5,5*SIZE(A)
+ MADD t21,t21,a1,b0
+ FETCH $0,0(PREA)
+
+ LD a6,6*SIZE(A)
+ MADD t31,t31,a2,b0
+
+ LD a7,7*SIZE(A)
+ MADD t41,t41,a3,b0
+ daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+
+.L77:
+ LD b0,0(B)
+ MADD t11,t11,a4,b4
+
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b4
+ FETCH $0,4*SIZE(PREA)
+
+ LD a1,1*SIZE(A)
+ MADD t31,t31,a6,b4
+
+ LD a2,2*SIZE(A)
+ MADD t41,t41,a7,b4
+
+ LD a3,3*SIZE(A)
+ daddu PREA,PREA,8*SIZE
+
+
+
+# K-remainder: one final iteration if the trip count is odd.
+.L78: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L79
+ LD ALPHA,152($sp) # (delay slot, runs either way) Get ALPHA
+
+ FETCH $0,0(PREA)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ daddu B,B,1*SIZE # B+=1(nr)*1(kr)*8Byte=8
+ daddu PREA,PREA,4*SIZE
+
+
+# Write-back for the mr=4 x nr=1 tile; loops back to .L70 while four-row tiles remain.
+.L79: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 4 C (GEMM: C = alpha*AB + C)
+ LD c21,1*SIZE(CO1)
+ LD c31,2*SIZE(CO1)
+ LD c41,3*SIZE(CO1)
+
+ MADD t11,c11,t11,ALPHA # c + alpha*t
+ MADD t21,c21,t21,ALPHA
+ MADD t31,c31,t31,ALPHA
+ MADD t41,c41,t41,ALPHA
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+ ST t31,2*SIZE(CO1)
+ ST t41,3*SIZE(CO1)
+ daddiu M,M,-1 # M--
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,8*SIZE(CO1)
+
+ bnez M,.L70 # M!=0
+ daddu CO1,CO1,4*SIZE # (delay slot) COx += 4*8Byte
+#else
+ daddiu M,M,-1 # M--
+ MUL t11, ALPHA, t11 # TRMM: C = alpha*AB, no accumulate with old C
+ MUL t21, ALPHA, t21
+ MUL t31, ALPHA, t31
+ MUL t41, ALPHA, t41
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+ ST t31,2*SIZE(CO1)
+ ST t41,3*SIZE(CO1)
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,8*SIZE(CO1)
+
+ daddu CO1,CO1,4*SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK # skip the K range this triangular tile did not touch
+#ifdef LEFT
+ daddiu TEMP, TEMP, -4 # minus mr
+#else
+ daddiu TEMP, TEMP, -1 # minus nr
+#endif
+
+ dsll K, TEMP, 2 + BASE_SHIFT # A advances TEMP*4 (mr) elements
+ dsll TEMP, TEMP, 0 + BASE_SHIFT # B advances TEMP*1 (nr) elements
+
+ daddu A, A,K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 4 # KK += mr
+#endif
+ bnez M,.L70
+ nop
+#endif
+
+
+ .align 3
+# .L11_M2: nr=1 tail — two remaining rows of A (mr=2).
+.L11_M2:
+ andi M,MCO,2 # mr = 2
+ beqz M,.L11_M1
+ nop
+
+# mr=2 x nr=1 micro-tile setup: accumulators t11, t21.
+.L80:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B, BO
+#else
+ dsll K, KK, 1 + BASE_SHIFT # byte offset into A: KK * 2 (mr) elements
+ dsll TEMP, KK, 0 + BASE_SHIFT # byte offset into B: KK * 1 (nr) elements
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ LD b0, 0*SIZE(B)
+ MTC $0,t11
+
+ LD a0,0*SIZE(A)
+ MOV t21,t11
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK # TEMP = K iterations for this tile
+#elif defined(LEFT)
+ daddiu TEMP, KK, 2 # KK + mr
+#else
+ daddiu TEMP, KK, 1 # KK + nr
+#endif
+ dsra K,TEMP,2 # K=TEMP/4 unrolled kr=4 iterations (old comment wrongly said KCO/2)
+ beqz K,.L85
+ LD a1,1*SIZE(A) # (branch delay slot) FIX: a1 is consumed by .L81/.L86/.L88 before any reload, but was never loaded on this (TRMM) path; was a nop
+#else
+ move B, BO
+ dsra K,KCO,2 # K = KCO/4 unrolled kr=4 iterations
+ LD b0, 0*SIZE(B)
+
+ MTC $0,t11
+ MOV t21,t11
+ LD a0,0*SIZE(A)
+
+ beqz K,.L85
+ LD a1,1*SIZE(A) # (branch delay slot)
+
+#endif
+
+# Main k-loop for the mr=2 x nr=1 tile: 4 rank-1 updates per pass.
+.L81: # nr=1,mr=2,kr=4
+ LD b4, 1*SIZE(B)
+ LD a4,2*SIZE(A)
+ MADD t11,t11,a0,b0 # t11 += a[0][k]*b[k]
+ LD a5,3*SIZE(A)
+ MADD t21,t21,a1,b0 # t21 += a[1][k]*b[k]
+
+ LD b2, 2*SIZE(B)
+ LD a2,4*SIZE(A)
+ MADD t11,t11,a4,b4
+ LD a3,5*SIZE(A)
+ MADD t21,t21,a5,b4
+
+ LD b6, 3*SIZE(B)
+ LD a6,6*SIZE(A)
+ MADD t11,t11,a2,b2
+ LD a7,7*SIZE(A)
+ MADD t21,t21,a3,b2
+
+ daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ LD b0, 0*SIZE(B)
+ daddiu K,K,-1
+
+ LD a0,0*SIZE(A)
+ MADD t11,t11,a6,b6
+
+ LD a1,1*SIZE(A)
+ bnez K,.L81
+ MADD t21,t21,a7,b6 # (branch delay slot)
+
+# K-remainder: two extra iterations if bit 1 of the trip count is set.
+.L85: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L88
+ nop
+
+.L86:
+ LD b4, 1*SIZE(B)
+ LD a4,2*SIZE(A)
+ MADD t11,t11,a0,b0
+ LD a5,3*SIZE(A)
+ MADD t21,t21,a1,b0
+
+ daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
+
+ LD b0,0(B)
+ LD a0,0*SIZE(A)
+ MADD t11,t11,a4,b4
+ LD a1,1*SIZE(A)
+ MADD t21,t21,a5,b4
+
+
+
+# K-remainder: one final iteration if the trip count is odd.
+.L88: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L89
+ LD ALPHA,152($sp) # (delay slot, runs either way) Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
+ daddu B,B,1*SIZE # B+=1(nr)*1(kr)*8Byte=8
+
+
+# Write-back for the mr=2 x nr=1 tile.
+.L89: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 2 C (GEMM: C = alpha*AB + C)
+ LD c21,1*SIZE(CO1)
+
+ MADD t11,c11,t11,ALPHA # c + alpha*t
+ MADD t21,c21,t21,ALPHA
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+
+ FETCH $0,2*SIZE(CO1)
+
+ daddu CO1,CO1,2*SIZE # COx += 2*8Byte
+
+#else
+ daddu CO1,CO1,2*SIZE # COx += 2*8Byte; stores below use negative offsets
+ MUL t11, ALPHA, t11 # TRMM: C = alpha*AB, no accumulate with old C
+ MUL t21, ALPHA, t21
+
+ FETCH $0,0(CO1)
+ ST t11, -2 * SIZE(CO1)
+ ST t21, -1 * SIZE(CO1)
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK # skip the K range this triangular tile did not touch
+#ifdef LEFT
+ daddiu TEMP, TEMP, -2 # minus mr
+#else
+ daddiu TEMP, TEMP, -1 # minus nr
+#endif
+
+ dsll K, TEMP, 1 + BASE_SHIFT # A advances TEMP*2 (mr) elements
+ dsll TEMP, TEMP, 0 + BASE_SHIFT # B advances TEMP*1 (nr) elements
+
+ daddu A, A, K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2 # KK += mr
+#endif
+#endif
+
+
+ .align 3
+# .L11_M1: final 1x1 tile — last row of A against last column of B.
+.L11_M1:
+ andi M,MCO,1 # mr = 1
+ beqz M,.L999
+ nop
+
+# mr=1 x nr=1 micro-tile setup: single accumulator t11.
+.L90:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B, BO
+#else
+ dsll K, KK, 0 + BASE_SHIFT # byte offset into A: KK * 1 (mr) elements
+ dsll TEMP, KK, 0 + BASE_SHIFT # byte offset into B: KK * 1 (nr) elements
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+ MTC $0,t11 # t11 = 0
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK # TEMP = K iterations for this tile
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1 # KK + mr (mr = nr = 1, so both arms add 1)
+#else
+ daddiu TEMP, KK, 1 # KK + nr
+#endif
+ dsra K, TEMP, 2 # K = TEMP/4 unrolled kr=4 iterations
+ beqz K,.L95
+ nop
+
+#else
+ move B, BO
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+ dsra K,KCO,2 # K = KCO/4 unrolled kr=4 iterations
+ beqz K,.L95
+ MTC $0,t11 # (branch delay slot) t11 = 0
+#endif
+
+# Main k-loop for the 1x1 tile: a simple dot product, 4 terms per pass.
+.L91: # nr=mr=1,kr=4
+ LD a4, 1*SIZE(A)
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0 # t11 += a[k]*b[k]
+
+ LD a2, 2*SIZE(A)
+ LD b2, 2*SIZE(B)
+ MADD t11,t11,a4,b4
+
+ LD a6, 3*SIZE(A)
+ LD b6, 3*SIZE(B)
+ MADD t11,t11,a2,b2
+
+ daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+ MADD t11,t11,a6,b6
+
+ daddiu K,K,-1
+ bnez K,.L91
+ nop
+
+# K-remainder: two extra terms if bit 1 of the trip count is set.
+.L95: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L98
+ nop
+
+.L96:
+ LD a4, 1*SIZE(A)
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
+ daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
+
+ LD b0,0(B)
+ LD a0,0(A)
+ MADD t11,t11,a4,b4
+
+# K-remainder: one final term if the trip count is odd.
+.L98: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L99
+ LD ALPHA,152($sp) # (delay slot, runs either way) Get ALPHA
+
+ MADD t11,t11,a0,b0
+
+
+# Write-back for the 1x1 tile. Last tile of the kernel, so no TRMM
+# pointer/KK bookkeeping is needed afterwards.
+.L99: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 1 C (GEMM: C = alpha*AB + C)
+ MADD t11,c11,t11,ALPHA
+ ST t11,0(CO1)
+
+#else
+ MUL t11, ALPHA, t11 # TRMM: C = alpha*AB
+
+ ST t11, 0 * SIZE(CO1)
+#endif
+
+
+# Function epilogue: restore the callee-saved GPRs ($16-$25) and FPRs
+# ($f20-$f28) spilled by the prologue, then return; the 160-byte stack
+# frame is released in the jr delay slot.
+.L999: # End
+ ld $16, 0($sp)
+ ld $17, 8($sp)
+ ld $18, 16($sp)
+ ld $19, 24($sp)
+ ld $20, 32($sp)
+ ld $21, 40($sp)
+ ld $22, 48($sp)
+ LD $f24, 56($sp)
+ LD $f25, 64($sp)
+ LD $f26, 72($sp)
+ LD $f27, 80($sp)
+ LD $f28, 88($sp)
+ ld $23, 96($sp)
+ ld $24, 104($sp)
+ ld $25, 112($sp)
+ LD $f20,120($sp)
+ LD $f21,128($sp)
+ LD $f22,136($sp)
+ LD $f23,144($sp)
+
+ j $31 # return to caller
+ daddiu $sp, $sp, 160 # (branch delay slot) pop the frame
+
+ EPILOGUE