Modify single precision compiler conditions, increasing single precision kernel code...
author	traz <wangqian10@iscas.ac.cn>
Fri, 27 May 2011 09:47:17 +0000 (09:47 +0000)
committer	traz <wangqian10@iscas.ac.cn>
Fri, 27 May 2011 09:47:17 +0000 (09:47 +0000)
kernel/mips64/KERNEL
kernel/mips64/KERNEL.LOONGSON3A
kernel/mips64/sgemm_kernel_loongson3a.S [new file with mode: 0644]

index f6615bf..ebb447b 100644 (file)
@@ -91,10 +91,21 @@ ifndef ZGEMM_BETA
 ZGEMM_BETA = ../generic/zgemm_beta.c
 endif
 
+ifndef STRSMKERNEL_LN
 STRSMKERNEL_LN =  trsm_kernel_LN.S
+endif
+
+ifndef STRSMKERNEL_LT
 STRSMKERNEL_LT =  trsm_kernel_LT.S
+endif
+
+ifndef STRSMKERNEL_RN
 STRSMKERNEL_RN =  trsm_kernel_LT.S
+endif
+
+ifndef STRSMKERNEL_RT
 STRSMKERNEL_RT =  trsm_kernel_RT.S
+endif
 
 ifndef DTRSMKERNEL_LN
 DTRSMKERNEL_LN =  trsm_kernel_LN.S
index 0e387c0..e72ac14 100644 (file)
@@ -1,14 +1,24 @@
 SAXPYKERNEL=axpy_loongson3a.S
 DAXPYKERNEL=daxpy_loongson3a_simd.S
 
+SGEMMKERNEL    =  sgemm_kernel_loongson3a.S
+SGEMMONCOPY    = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
+SGEMMONCOPYOBJ =  sgemm_oncopy.o
+SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+
 DGEMMKERNEL    =  gemm_kernel_loongson3a.S
 DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
 DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
 DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 
+STRSMKERNEL_LN =  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT =  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN =  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT =  ../generic/trsm_kernel_RT.c
 
-DTRSMKERNEL_LN = trsm_kernel_LN_loongson3a.S 
-DTRSMKERNEL_LT = trsm_kernel_LT_loongson3a.S
-DTRSMKERNEL_RN = trsm_kernel_RN_loongson3a.S
-DTRSMKERNEL_RT = trsm_kernel_RT_loongson3a.S
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
diff --git a/kernel/mips64/sgemm_kernel_loongson3a.S b/kernel/mips64/sgemm_kernel_loongson3a.S
new file mode 100644 (file)
index 0000000..36c3b38
--- /dev/null
@@ -0,0 +1,2559 @@
+#define REALNAME ASMNAME
+#define ASSEMBLER
+#include "common.h"
+#define FETCH  ld
+#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
+#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
+
+#define M      $4
+#define        N       $5
+#define        K       $6
+#define A      $8
+#define B      $9
+#define C      $10
+#define LDC    $11
+
+#define AO     $12
+#define BO     $13
+
+#define CO1    $14
+#define CO2    $15
+#define CO3    $16
+#define CO4    $17
+
+#define KCO    $18
+#define MCO    $19
+#define NCO    $20
+
+#define SPANB  $21
+#define PREB   $23
+#define PREA   $24
+#define SPANA  $25
+
+#define ALPHA  $f15
+
+#if defined(TRMMKERNEL)
+#define        OFFSET  $2
+#define        KK      $3
+#define        TEMP    $7
+#endif
+
+#define R8     8
+#define        R9      9
+#define R14    14
+#define R15    15
+#define R16    16
+#define R17 17
+
+#define        t11     $f30
+#define        t21     $f31
+#define        t31     $f28
+#define        t41     $f29
+
+#define        t12     $f26
+#define        t22     $f27
+#define        t32     $f24
+#define        t42     $f25
+
+#define        t13     $f22
+#define        t23     $f23
+#define        t33     $f20
+#define        t43     $f21
+
+#define        t14     $f18
+#define        t24     $f19
+#define        t34     $f16
+#define        t44     $f17
+
+#define        c11     $f0
+#define        c21     $f1
+#define        c31     $f2
+#define        c41     $f3
+
+#define        c12     $f4
+#define        c22     $f5
+#define        c32     $f6
+#define        c42     $f7
+
+#define        c13     $f8
+#define        c23     $f9
+#define        c33     $f10
+#define c43    $f11
+
+#define        c14     $f12
+#define        c24     $f13
+#define        c34     $f14
+#define        c44     $f0
+
+#define        a0      $f0
+#define        a1      $f1
+#define        a2      $f2
+#define        a3      $f3
+#define        a4      $f4
+#define        a5      $f5
+#define        a6      $f6
+#define        a7      $f7
+#define        b0      $f8
+#define        b1      $f9
+#define        b2      $f10
+#define b3     $f11
+#define        b4      $f12
+#define        b5      $f13
+#define        b6      $f14
+#define        b7      $f15
+
+#define F31 31
+#define F30 30
+#define F29 29
+#define F28 28
+#define F27 27
+#define F26 26
+#define F25 25
+#define F24 24 
+#define F23 23
+#define F22 22
+#define F21 21
+#define F20 20
+#define F19 19
+#define F18 18
+#define F17 17
+#define F16 16 
+#define F15 15
+#define F14 14
+#define F13 13
+#define F12 12
+#define F11 11
+#define F10 10
+#define F9 9
+#define F8 8
+#define F7 7
+#define F6 6
+#define F5 5
+#define F4 4 
+#define F3 3 
+#define F2 2 
+#define F1 1 
+#define F0 0
+
+       PROLOGUE
+       
+       daddiu  $sp, $sp, -160
+       sd      $16,   0($sp)
+       sd      $17,   8($sp)
+       sd      $18,  16($sp)
+       sd      $19,  24($sp)
+       sd      $20,  32($sp)
+       sd      $21,  40($sp)
+       sd      $22,  48($sp)
+       ST      $f24, 56($sp)
+       ST      $f25, 64($sp)
+       ST      $f26, 72($sp)
+       ST      $f27, 80($sp)
+       ST      $f28, 88($sp)
+       sd      $23,  96($sp)
+       sd      $24, 104($sp)
+       sd      $25, 112($sp)
+       ST      $f20,120($sp)
+       ST      $f21,128($sp)
+       ST      $f22,136($sp)
+       ST      $f23,144($sp)
+
+
+       .align  5                                       
+.L0_N4:                                                                        #  Loop N
+       ST      ALPHA,152($sp)                                  #  Backup       ALPHA
+       move    MCO,M                                           #  Backup       M
+
+       move    NCO,N                                           #  Backup       N
+       move    KCO,K                                           #  Backup       K
+
+       move    AO,A                                            #  Backup       A_addr
+       dsra    N,NCO,2                                         #  N=NCO/4 (arithmetic shift right by 2)
+       
+       dsll    LDC,LDC,BASE_SHIFT                      #  LDC*8Byte
+       dsll    SPANB,KCO,2+BASE_SHIFT          #  SPANB=KC*4nr*8Byte=KC*2^5
+       
+#if defined(TRMMKERNEL)
+       LDARG   OFFSET,160($sp)                         #       OFFSET is relate to the data part               
+#endif
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       neg             KK,OFFSET                               
+#endif
+       
+       move    BO,B                                            #  Backup       B_addr
+       beq             N,$0,.L0_N2                                     #  N=0,NCO<4
+       dsll    SPANA,KCO,1+BASE_SHIFT          #  SPANA = KCO*2mr*8Byte
+
+.L0_N4_Lb:                                                             #       mr=4,nr=4
+       move    CO1,C                                                   
+       dsra    M,MCO,2                                         #  M=MCO/4 (arithmetic shift right by 2)
+       
+       move    A,AO                                            #  Reset A
+       daddu   CO2,C,LDC
+
+       daddu   PREB,BO,SPANB                           #  PreB point next panelB
+       daddu   CO3,CO2,LDC
+
+       daddu   PREA,AO,SPANA
+       daddu   CO4,CO3,LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+       move    KK,OFFSET                                       
+#endif
+       beqz    M,.L14_M2
+       daddu   C,CO4,LDC                                       #       move C to next panel Cj
+
+.L10:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO                                            #       (SIDE=L and UPLO=L) or (SIDE=R and UPLO=U)
+#else
+       dsll    K,KK,2 + BASE_SHIFT                     #  KK is the length that needs to span to the data part
+       dsll    TEMP,KK,2 + BASE_SHIFT
+
+       daddu   A,A,K                                           #  move A B to data part
+       daddu   B,BO,TEMP
+#endif
+       MTC             $0,t11
+       MOV     t21,t11
+       LD      a0,0(A)
+       
+       MOV     t31,t11
+       MOV     t41,t11
+       LD      a1,1*SIZE(A)
+
+       MOV     t12,t11
+       MOV     t22,t11
+       LD      b0,0(B)
+       
+       MOV     t32,t11
+       MOV     t42,t11
+       LD      b1,1*SIZE(B)
+
+       MOV     t13,t11
+       MOV     t23,t11
+       LD      a2,2*SIZE(A)
+
+       MOV     t33,t11
+       MOV     t43,t11
+       LD      b2,2*SIZE(B)
+
+       MOV     t14,t11
+       MOV     t24,t11
+       LD      a3,3*SIZE(A)
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP,KCO,KK                                     #  temp is the length of the data part
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 4                                     #       S=L,U=L 
+#else
+       daddiu  TEMP, KK, 4                                     #       S=R,U=U,for this two situation KK is the length of the data part
+#endif
+       dsra    K,TEMP,2                                        #  K=TEMP/4 (unroll by 4 over the data part)
+       MOV     t34,t11
+       beqz    K,.L15
+       MOV     t44,t11
+
+#else                                                  
+       move    B,BO                                            #       Reset B
+       MTC             $0,t11                                          #       GEMM part       NR=4,MR=4
+       LD      a0,0(A)
+
+       MOV     t21,t11
+       MOV     t31,t11
+       LD      a1,1*SIZE(A)
+
+       MOV     t41,t11
+       MOV     t12,t11
+       LD      b0,0(B)
+       
+       MOV     t22,t11
+       MOV     t32,t11
+       LD      b1,1*SIZE(B)
+
+       MOV     t42,t11
+       dsra    K,KCO,2                                         #  K=KCO/4 (kr=4 unrolled loop count)
+       LD      a2,2*SIZE(A)
+       
+       MOV     t13,t11
+       MOV     t23,t11
+       LD      b2,2*SIZE(B)
+       
+       MOV     t33,t11
+       MOV     t43,t11
+       LD      a3,3*SIZE(A)
+
+       MOV     t14,t11
+       MOV     t24,t11
+       LD      b3,3*SIZE(B)
+
+       MOV     t34,t11
+       beqz    K,.L15
+       MOV     t44,t11                                                 #       clear 16 results registers
+#endif
+       
+       .align  5
+.L11:                                                                  #  kr=4
+       MADD    t11,t11,a0,b0
+       MADD    t21,t21,a1,b0
+       LD      a4,4*SIZE(A)
+
+       MADD    t12,t12,a0,b1
+       MADD    t22,t22,a1,b1
+       LD      a5,5*SIZE(A)
+       
+       MADD    t31,t31,a2,b0
+       MADD    t41,t41,a3,b0
+       LD      b4,4*SIZE(B)
+       
+       MADD    t32,t32,a2,b1
+       MADD    t42,t42,a3,b1
+       LD      b5,5*SIZE(B)
+       FETCH           $0,(PREB)
+       
+       MADD    t13,t13,a0,b2
+       MADD    t23,t23,a1,b2
+       LD      a6,6*SIZE(A)
+       
+       MADD    t14,t14,a0,b3
+       MADD    t24,t24,a1,b3
+       LD      b6,6*SIZE(B)
+       FETCH           $0,(PREA)
+       
+       MADD    t33,t33,a2,b2
+       MADD    t43,t43,a3,b2
+       LD      a7,7*SIZE(A)
+       
+       MADD    t34,t34,a2,b3
+       MADD    t44,t44,a3,b3
+       LD      b7,7*SIZE(B)
+
+.L12:
+       MADD    t11,t11,a4,b4
+       MADD    t21,t21,a5,b4
+       LD      a0,8*SIZE(A)
+
+       MADD    t12,t12,a4,b5
+       MADD    t22,t22,a5,b5
+       LD      a1,9*SIZE(A)
+
+       MADD    t31,t31,a6,b4
+       MADD    t41,t41,a7,b4
+       LD      b0,8*SIZE(B)
+
+       MADD    t32,t32,a6,b5
+       MADD    t42,t42,a7,b5
+       LD      b1,9*SIZE(B)
+
+       FETCH           $0,4*SIZE(PREB)
+       MADD    t13,t13,a4,b6
+       MADD    t23,t23,a5,b6
+       LD      a2,10*SIZE(A)
+
+       MADD    t14,t14,a4,b7
+       MADD    t24,t24,a5,b7
+       LD      b2,10*SIZE(B)
+
+       FETCH           $0,4*SIZE(PREA)
+       MADD    t33,t33,a6,b6
+       MADD    t43,t43,a7,b6
+       LD      a3,11*SIZE(A)
+
+       MADD    t34,t34,a6,b7
+       MADD    t44,t44,a7,b7
+       LD      b3,11*SIZE(B)
+
+.L13:
+       MADD    t11,t11,a0,b0
+       MADD    t21,t21,a1,b0
+       LD      a4,12*SIZE(A)
+
+       MADD    t12,t12,a0,b1
+       MADD    t22,t22,a1,b1
+       LD      a5,13*SIZE(A)
+
+       MADD    t31,t31,a2,b0
+       MADD    t41,t41,a3,b0
+       LD      b4,12*SIZE(B)
+
+       FETCH           $0,8*SIZE(PREA)
+       MADD    t32,t32,a2,b1
+       MADD    t42,t42,a3,b1
+       LD      b5,13*SIZE(B)
+
+       FETCH           $0,8*SIZE(PREB)
+       MADD    t13,t13,a0,b2
+       MADD    t23,t23,a1,b2
+       LD      a6,14*SIZE(A)
+
+       MADD    t14,t14,a0,b3
+       MADD    t24,t24,a1,b3
+       daddu   A,A,16*SIZE                                     #  4mr*4kr
+       LD      b6,14*SIZE(B)
+
+       MADD    t33,t33,a2,b2
+       MADD    t43,t43,a3,b2
+       daddu   B,B,16*SIZE                                     #       4nr*4kr
+       LD      a7,-1*SIZE(A)
+
+       MADD    t34,t34,a2,b3
+       MADD    t44,t44,a3,b3
+       LD      b7,-1*SIZE(B)
+
+.L14:
+       MADD    t11,t11,a4,b4
+       MADD    t21,t21,a5,b4
+       LD      a0,0(A)
+
+       MADD    t12,t12,a4,b5
+       MADD    t22,t22,a5,b5
+       LD      a1,1*SIZE(A)
+
+       MADD    t31,t31,a6,b4
+       MADD    t41,t41,a7,b4
+       daddiu  K,K,-1
+       LD      b0,0(B)
+
+       MADD    t32,t32,a6,b5
+       MADD    t42,t42,a7,b5
+       daddu   PREA,PREA,16*SIZE
+       LD      b1,1*SIZE(B)
+
+       FETCH           $0,12*SIZE(PREB)
+       MADD    t13,t13,a4,b6
+       MADD    t23,t23,a5,b6
+       LD      a2,2*SIZE(A)
+
+       FETCH           $0,-4*SIZE(PREA)
+       MADD    t14,t14,a4,b7
+       MADD    t24,t24,a5,b7
+       LD      b2,2*SIZE(B)
+
+       MADD    t33,t33,a6,b6
+       MADD    t43,t43,a7,b6
+       daddu   PREB,PREB,16*SIZE
+       LD      a3,3*SIZE(A)
+
+       MADD    t34,t34,a6,b7
+       MADD    t44,t44,a7,b7
+       bnez    K,.L11
+       LD      b3,3*SIZE(B)
+
+
+.L15:                                                                  #  kr=2
+#ifndef TRMMKERNEL
+       andi    K,KCO,2                         
+#else
+       andi    K,TEMP, 2
+#endif
+       beqz    K,.L18
+       nop
+
+.L16:                  
+       MADD    t11,t11,a0,b0
+       MADD    t21,t21,a1,b0
+       LD      a4,4*SIZE(A)
+
+       MADD    t12,t12,a0,b1
+       MADD    t22,t22,a1,b1
+       LD      a5,5*SIZE(A)
+
+       MADD    t31,t31,a2,b0
+       MADD    t41,t41,a3,b0
+       LD      b4,4*SIZE(B)
+
+       FETCH           $0,0(PREA)
+       MADD    t32,t32,a2,b1
+       MADD    t42,t42,a3,b1
+       LD      b5,5*SIZE(B)
+
+       FETCH           $0,0(PREB)
+       MADD    t13,t13,a0,b2
+       MADD    t23,t23,a1,b2
+       LD      a6,6*SIZE(A)
+
+       MADD    t14,t14,a0,b3
+       MADD    t24,t24,a1,b3
+       daddu   A,A,8*SIZE                                      #       4mr*2kr
+       LD      b6,6*SIZE(B)
+
+       MADD    t33,t33,a2,b2
+       MADD    t43,t43,a3,b2
+       daddu   B,B,8*SIZE                                      #       4nr*2kr
+       LD      a7,-1*SIZE(A)
+
+       MADD    t34,t34,a2,b3
+       MADD    t44,t44,a3,b3
+       LD      b7,-1*SIZE(B)
+
+.L17:
+       MADD    t11,t11,a4,b4
+       MADD    t21,t21,a5,b4
+       LD      a0,0*SIZE(A)
+
+       MADD    t12,t12,a4,b5
+       MADD    t22,t22,a5,b5
+       LD      a1,1*SIZE(A)
+
+       MADD    t31,t31,a6,b4
+       MADD    t41,t41,a7,b4
+       LD      b0,0*SIZE(B)
+
+       MADD    t32,t32,a6,b5
+       MADD    t42,t42,a7,b5
+       LD      b1,1*SIZE(B)
+
+       FETCH           $0,4*SIZE(PREB)
+       MADD    t13,t13,a4,b6
+       MADD    t23,t23,a5,b6
+       LD      a2,2*SIZE(A)
+
+       FETCH           $0,4*SIZE(PREA)
+       MADD    t14,t14,a4,b7
+       MADD    t24,t24,a5,b7
+       LD      b2,2*SIZE(B)
+
+       MADD    t33,t33,a6,b6
+       MADD    t43,t43,a7,b6
+       daddu   PREA,PREA,8*SIZE
+       LD      a3,3*SIZE(A)
+
+       MADD    t34,t34,a6,b7
+       MADD    t44,t44,a7,b7
+       daddu   PREB,PREB,8*SIZE
+       LD      b3,3*SIZE(B)
+
+       
+.L18:                                                                  #       kr=1
+#ifndef TRMMKERNEL
+       andi    K,KCO,1
+#else
+       andi    K,TEMP,1
+#endif
+       beqz    K,.L19                            
+       LD      ALPHA,152($sp)                                  #  Get ALPHA
+       
+       FETCH           $0,0(PREB)
+       MADD    t11,t11,a0,b0
+       MADD    t21,t21,a1,b0
+       daddu   A,A,4*SIZE                                      #       4mr*kr
+
+       MADD    t12,t12,a0,b1
+       MADD    t22,t22,a1,b1
+       daddu   B,B,4*SIZE                                      #       4nr*kr
+
+       FETCH           $0,0(PREA)
+       MADD    t31,t31,a2,b0
+       MADD    t41,t41,a3,b0
+       daddu   PREB,PREB,4*SIZE
+
+       MADD    t32,t32,a2,b1
+       MADD    t42,t42,a3,b1
+       daddu   PREA,PREA,4*SIZE
+
+       MADD    t13,t13,a0,b2
+       MADD    t23,t23,a1,b2
+
+       MADD    t14,t14,a0,b3
+       MADD    t24,t24,a1,b3
+
+       MADD    t33,t33,a2,b2
+       MADD    t43,t43,a3,b2
+
+       MADD    t34,t34,a2,b3
+       MADD    t44,t44,a3,b3
+
+.L19:                                                                  #  Write Back to C
+#ifndef TRMMKERNEL                             
+       LD      c11,0(CO1)                                              #  GEMM write part 
+       LD      c21,1*SIZE(CO1)                                 #  get 16 C
+       LD      c31,2*SIZE(CO1)
+       LD      c41,3*SIZE(CO1)
+
+       LD      c12,0(CO2)
+       MADD    t11,c11,t11,ALPHA
+       LD      c22,1*SIZE(CO2)
+       MADD    t21,c21,t21,ALPHA
+       LD      c32,2*SIZE(CO2)
+       MADD    t31,c31,t31,ALPHA
+       LD      c42,3*SIZE(CO2)
+       MADD    t41,c41,t41,ALPHA
+
+       LD      c13,0(CO3)
+       MADD    t12,c12,t12,ALPHA
+       LD      c23,1*SIZE(CO3)
+       MADD    t22,c22,t22,ALPHA
+       LD      c33,2*SIZE(CO3)
+       MADD    t32,c32,t32,ALPHA
+       LD      c43,3*SIZE(CO3)
+       MADD    t42,c42,t42,ALPHA
+
+       LD      c14,0(CO4)
+       MADD    t13,c13,t13,ALPHA
+       LD      c24,1*SIZE(CO4)
+       MADD    t23,c23,t23,ALPHA
+       LD      c34,2*SIZE(CO4)
+       MADD    t33,c33,t33,ALPHA
+       LD      c44,3*SIZE(CO4)
+       MADD    t43,c43,t43,ALPHA
+
+       ST      t11,0(CO1)
+       MADD    t14,c14,t14,ALPHA
+       ST      t21,1*SIZE(CO1)
+       MADD    t24,c24,t24,ALPHA
+       ST      t31,2*SIZE(CO1)
+       MADD    t34,c34,t34,ALPHA
+       ST      t41,3*SIZE(CO1)
+       MADD    t44,c44,t44,ALPHA
+       daddiu  M,M,-1                                          #  M--
+
+       ST      t12,0(CO2)
+       ST      t22,1*SIZE(CO2)
+       ST      t32,2*SIZE(CO2)
+       ST      t42,3*SIZE(CO2)
+
+       ST      t13,0(CO3)
+       ST      t23,1*SIZE(CO3)
+       ST      t33,2*SIZE(CO3)
+       ST      t43,3*SIZE(CO3)
+
+       FETCH   $0,4*SIZE(CO1)
+       FETCH   $0,4*SIZE(CO2)
+       FETCH   $0,4*SIZE(CO3)
+       FETCH   $0,4*SIZE(CO4)
+
+       FETCH   $0,8*SIZE(CO1)
+       FETCH   $0,8*SIZE(CO2)
+       FETCH   $0,8*SIZE(CO3)
+       FETCH   $0,8*SIZE(CO4)
+
+       ST      t14,0(CO4)
+       daddu   CO1,CO1,4*SIZE                          #  COi += 4
+       ST      t24,1*SIZE(CO4)
+       daddu   CO2,CO2,4*SIZE
+       ST      t34,2*SIZE(CO4)
+       daddu   CO3,CO3,4*SIZE
+       ST      t44,3*SIZE(CO4)
+       daddu   PREB,BO,SPANB
+       
+       bnez    M,.L10                          
+       daddu   CO4,CO4,4*SIZE
+
+#else                                                  
+       MUL     t11, ALPHA, t11                                 #       TRMM write back part
+       MUL     t21, ALPHA, t21
+       MUL     t31, ALPHA, t31
+       MUL     t41, ALPHA, t41
+
+       ST      t11, 0 * SIZE(CO1)
+       MUL     t12, ALPHA, t12
+       ST      t21, 1 * SIZE(CO1)
+       MUL     t22, ALPHA, t22
+       ST      t31, 2 * SIZE(CO1)
+       MUL     t32, ALPHA, t32
+       ST      t41, 3 * SIZE(CO1)
+       MUL     t42, ALPHA, t42
+
+       ST      t12, 0 * SIZE(CO2)
+       MUL     t13, ALPHA, t13
+       ST      t22, 1 * SIZE(CO2)
+       MUL     t23, ALPHA, t23
+       ST      t32, 2 * SIZE(CO2)
+       MUL     t33, ALPHA, t33
+       ST      t42, 3 * SIZE(CO2)
+       MUL     t43, ALPHA, t43
+
+       ST      t13, 0 * SIZE(CO3)
+       MUL     t14, ALPHA, t14
+       ST      t23, 1 * SIZE(CO3)
+       MUL     t24, ALPHA, t24
+       ST      t33, 2 * SIZE(CO3)
+       MUL     t34, ALPHA, t34
+       ST      t43, 3 * SIZE(CO3)
+       MUL     t44, ALPHA, t44
+
+       ST      t14, 0 * SIZE(CO4)
+       daddiu  M,M,-1                                          #  M--
+       ST      t24, 1 * SIZE(CO4)
+       ST      t34, 2 * SIZE(CO4)
+       ST      t44, 3 * SIZE(CO4)
+       daddiu  CO1,CO1, 4 * SIZE
+       daddiu  CO2,CO2, 4 * SIZE
+       daddiu  CO3,CO3, 4 * SIZE
+       daddiu  CO4,CO4, 4 * SIZE       
+
+       FETCH   $0,4*SIZE(CO1)
+       FETCH   $0,4*SIZE(CO2)
+       FETCH   $0,4*SIZE(CO3)
+       FETCH   $0,4*SIZE(CO4)
+
+       FETCH   $0,0(CO1)
+       FETCH   $0,0(CO2)
+       FETCH   $0,0(CO3)
+       FETCH   $0,0(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP,KCO,KK                                                             
+#ifdef LEFT
+       daddiu  TEMP,TEMP, -4
+#else
+       daddiu  TEMP,TEMP, -4
+#endif
+       dsll    K,TEMP,2 + BASE_SHIFT
+       dsll    TEMP,TEMP,2 + BASE_SHIFT
+       daddu   A,A,K                                           #       mov A to the end of panel Ai
+       daddu   B,B,TEMP                                        #       mov B to the end of panel Bj
+#endif
+
+#ifdef LEFT                                                                            
+       daddiu  KK, KK,4
+#endif
+       bnez    M,.L10                                  
+       nop
+#endif
+
+
+       .align 3
+.L14_M2:
+       andi    M, MCO, 2                                       #       nr=4,mr=2
+       beqz    M,.L14_M1                       
+       nop
+
+.L20:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO                                            #       Reset B
+#else
+       dsll    K,KK,1 + BASE_SHIFT                     #       mr=2    
+       dsll    TEMP,KK,2 + BASE_SHIFT          #       nr=4
+       daddu   A,A,K
+       daddu   B,BO,TEMP
+#endif
+
+       MTC             $0,t11
+       LD      a0,0*SIZE(A)
+       MOV     t21,t11
+       LD      a1,1*SIZE(A)
+
+       MOV     t12,t11
+       LD      b0,0*SIZE(B)
+       MOV     t22,t11
+       LD      b1,1*SIZE(B)
+
+       MOV     t13,t11
+       LD      b2,2*SIZE(B)
+       MOV     t23,t11
+       LD      b3,3*SIZE(B)
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP,KCO,KK
+#elif defined(LEFT)
+       daddiu  TEMP,KK,2                                       #       left part,controlled by mr, mr=2
+#else
+       daddiu  TEMP,KK,4                                       #       right part,controlled by nr,nr=4
+#endif
+       dsra    K,TEMP,2
+       MOV     t14,t11
+       beqz    K,.L25
+       MOV     t24,t11                                                 #       clear 2*4=8 results registers
+
+#else
+       move    B,BO                                            #       Reset B 
+       LD      a0,0*SIZE(A)
+       MTC             $0,t11
+       LD      a1,1*SIZE(A)
+       
+       MOV     t21,t11
+       LD      b0,0*SIZE(B)
+       MOV     t12,t11
+       LD      b1,1*SIZE(B)
+
+       MOV     t22,t11
+       dsra    K,KCO,2                         
+       LD      b2,2*SIZE(B)
+
+       MOV     t13,t11
+       MOV     t23,t11
+       LD      b3,3*SIZE(B)
+
+       MOV     t14,t11
+       beqz    K,.L25
+       MOV     t24,t11
+
+#endif
+
+.L21:                                                                  #  nr=4,mr=2,kr=4
+       MADD    t11,t11,a0,b0
+       LD      a4,2*SIZE(A)
+       MADD    t21,t21,a1,b0
+       LD      a5,3*SIZE(A)
+
+       MADD    t12,t12,a0,b1
+       LD      b4,4*SIZE(B)
+       MADD    t22,t22,a1,b1
+       LD      b5,5*SIZE(B)
+
+       MADD    t13,t13,a0,b2
+       LD      b6,6*SIZE(B)
+       MADD    t23,t23,a1,b2
+       LD      b7,7*SIZE(B)
+
+       MADD    t14,t14,a0,b3
+       MADD    t24,t24,a1,b3
+       
+       MADD    t11,t11,a4,b4
+       LD      a2,4*SIZE(A)
+       MADD    t21,t21,a5,b4
+       LD      a3,5*SIZE(A)
+
+       MADD    t12,t12,a4,b5
+       LD      b0,8*SIZE(B)
+       MADD    t22,t22,a5,b5
+       LD      b1,9*SIZE(B)
+
+       MADD    t13,t13,a4,b6
+       LD      b2,10*SIZE(B)
+       MADD    t23,t23,a5,b6
+       LD      b3,11*SIZE(B)
+
+       MADD    t14,t14,a4,b7
+       MADD    t24,t24,a5,b7
+       daddiu  K,K,-1
+
+       MADD    t11,t11,a2,b0
+       LD      a6,6*SIZE(A)
+       MADD    t21,t21,a3,b0
+       LD      a7,7*SIZE(A)
+
+       MADD    t12,t12,a2,b1
+       LD      b4,12*SIZE(B)
+       MADD    t22,t22,a3,b1
+       LD      b5,13*SIZE(B)
+
+       MADD    t13,t13,a2,b2
+       LD      b6,14*SIZE(B)
+       MADD    t23,t23,a3,b2
+       LD      b7,15*SIZE(B)
+
+       MADD    t14,t14,a2,b3
+       MADD    t24,t24,a3,b3
+       daddu   A,A,8*SIZE                                      #  2mr*4kr
+       daddu   B,B,16*SIZE                                     #       4nr*4kr
+
+       MADD    t11,t11,a6,b4
+       LD      a0,0*SIZE(A)
+       MADD    t21,t21,a7,b4
+       LD      a1,1*SIZE(A)
+
+       MADD    t12,t12,a6,b5
+       LD      b0,0*SIZE(B)
+       MADD    t22,t22,a7,b5
+       LD      b1,1*SIZE(B)
+
+       MADD    t13,t13,a6,b6
+       LD      b2,2*SIZE(B)
+       MADD    t23,t23,a7,b6
+       LD      b3,3*SIZE(B)
+
+       MADD    t14,t14,a6,b7
+       bnez    K,.L21
+       MADD    t24,t24,a7,b7
+
+
+.L25:                                                                          
+#ifndef TRMMKERNEL
+       andi    K,KCO,2                                         #       kr=2
+#else
+       andi    K,TEMP,2
+#endif
+       beqz    K,.L28
+       nop
+
+.L26:                  
+       MADD    t11,t11,a0,b0
+       LD      a4,2*SIZE(A)
+       MADD    t21,t21,a1,b0
+       LD      a5,3*SIZE(A)
+
+       MADD    t12,t12,a0,b1
+       LD      b4,4*SIZE(B)
+       MADD    t22,t22,a1,b1
+       LD      b5,5*SIZE(B)
+
+       MADD    t13,t13,a0,b2
+       LD      b6,6*SIZE(B)
+       MADD    t23,t23,a1,b2
+       LD      b7,7*SIZE(B)
+       
+       MADD    t14,t14,a0,b3
+       MADD    t24,t24,a1,b3
+       daddu   A,A,4*SIZE                                      #       2mr*2kr
+       daddu   B,B,8*SIZE                                      #       4nr*2kr
+
+.L27:
+       MADD    t11,t11,a4,b4
+       LD      a0,0*SIZE(A)
+       MADD    t21,t21,a5,b4
+       LD      a1,1*SIZE(A)
+
+       MADD    t12,t12,a4,b5
+       LD      b0,0*SIZE(B)
+       MADD    t22,t22,a5,b5
+       LD      b1,1*SIZE(B)
+
+       MADD    t13,t13,a4,b6
+       LD      b2,2*SIZE(B)
+       MADD    t23,t23,a5,b6
+       LD      b3,3*SIZE(B)
+
+       MADD    t14,t14,a4,b7
+       MADD    t24,t24,a5,b7
+
+       
+.L28:                                                                  #       kr=1    
+#ifndef TRMMKERNEL
+       andi    K,KCO,1
+#else
+       andi    K,TEMP,1
+#endif
+       beqz    K,.L29                            
+       LD      ALPHA,152($sp)                                  #  Get ALPHA
+       
+       MADD    t11,t11,a0,b0
+       MADD    t21,t21,a1,b0
+       daddu   A,A,2*SIZE                                      #  2mr*kr
+       daddu   B,B,4*SIZE                                      #  4nr*kr
+
+       MADD    t12,t12,a0,b1
+       MADD    t22,t22,a1,b1
+
+       MADD    t13,t13,a0,b2
+       MADD    t23,t23,a1,b2
+
+       MADD    t14,t14,a0,b3
+       MADD    t24,t24,a1,b3
+
+.L29:                                                                  #  Write Back to C
+#ifndef TRMMKERNEL
+       LD      c11,0(CO1)                                              #       GEMM write back part
+       LD      c21,1*SIZE(CO1)                 
+
+       LD      c12,0(CO2)
+       LD      c22,1*SIZE(CO2)
+       
+       LD      c13,0(CO3)
+       MADD    t11,c11,t11,ALPHA
+       LD      c23,1*SIZE(CO3)
+       MADD    t21,c21,t21,ALPHA
+
+       LD      c14,0(CO4)
+       MADD    t12,c12,t12,ALPHA
+       LD      c24,1*SIZE(CO4)
+       MADD    t22,c22,t22,ALPHA
+
+       ST      t11,0(CO1)
+       MADD    t13,c13,t13,ALPHA
+       ST      t21,1*SIZE(CO1)
+       MADD    t23,c23,t23,ALPHA
+
+       ST      t12,0(CO2)
+       MADD    t14,c14,t14,ALPHA
+       ST      t22,1*SIZE(CO2)
+       MADD    t24,c24,t24,ALPHA
+
+       ST      t13,0(CO3)
+       daddu   CO1,CO1,2*SIZE                          #  COi += 2
+       ST      t23,1*SIZE(CO3)
+       daddu   CO2,CO2,2*SIZE
+
+       ST      t14,0(CO4)
+       daddu   CO3,CO3,2*SIZE
+       ST      t24,1*SIZE(CO4)
+       daddu   CO4,CO4,2*SIZE
+
+       FETCH   $0,0(CO1)
+       FETCH   $0,0(CO2)
+       FETCH   $0,0(CO3)
+       FETCH   $0,0(CO4)
+
+#else
+       MUL     t11, ALPHA, t11                                 #       TRMM write back part
+       MUL     t21, ALPHA, t21
+       
+       ST      t11, 0 * SIZE(CO1)
+       MUL     t12, ALPHA, t12
+       ST      t21, 1 * SIZE(CO1)
+       MUL     t22, ALPHA, t22
+       
+       ST      t12, 0 * SIZE(CO2)
+       MUL     t13, ALPHA, t13
+       ST      t22, 1 * SIZE(CO2)
+       MUL     t23, ALPHA, t23
+       
+       ST      t13, 0 * SIZE(CO3)
+       MUL     t14, ALPHA, t14
+       ST      t23, 1 * SIZE(CO3)
+       MUL     t24, ALPHA, t24
+       
+       ST      t14, 0 * SIZE(CO4)
+       ST      t24, 1 * SIZE(CO4)
+       
+       daddiu  CO1,CO1, 2 * SIZE
+       daddiu  CO2,CO2, 2 * SIZE
+       daddiu  CO3,CO3, 2 * SIZE
+       daddiu  CO4,CO4, 2 * SIZE
+
+       FETCH   $0,0(CO1)
+       FETCH   $0,0(CO2)
+       FETCH   $0,0(CO3)
+       FETCH   $0,0(CO4)
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP,KCO,KK
+#ifdef LEFT
+       daddiu  TEMP,TEMP,-2
+#else
+       daddiu  TEMP,TEMP,-4
+#endif
+       dsll    K,TEMP,1 + BASE_SHIFT
+       dsll    TEMP,TEMP,2 + BASE_SHIFT
+
+       daddu   A,A,K                                           #       move A to next panel Ai
+       daddu   B,B,TEMP                                        #       move B to next panel Bj
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 2
+#endif
+#endif
+
+
+       .align 3
+.L14_M1:                                               #  tail slice: mr=1 column of the current nr=4 panel
+       andi    M,MCO,1                                         #       mr=1    
+       beqz    M,.L0_N4_Loop                           #       M = 0, finishing one panel Bj
+       nop                                             #  branch delay slot (executes either way)
+
+.L30:                                                  #  nr=4, mr=1 tile: 4 scalar accumulators t11..t14
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO                                            #       Reset B
+#else
+       dsll    K,KK, 0 + BASE_SHIFT                    #  K    = KK * 1(mr) * SIZE
+       dsll    TEMP,KK,2 + BASE_SHIFT                  #  TEMP = KK * 4(nr) * SIZE
+
+       daddu   A,A,K                                   #  skip first KK rows of packed A
+       daddu   B,BO,TEMP                               #  skip first KK rows of packed B
+#endif
+       MTC             $0,t11                          #  zero the accumulators
+       MOV     t12,t11
+       LD      a0,     0 * SIZE(A)                                     #       a0
+
+       MOV     t13,t11
+       LD      b0,0*SIZE(B)
+       MOV     t14,t11                                                 #       clear result registers
+       LD      b1,1*SIZE(B)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  TEMP = K iterations for this triangular tile
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 1                             #  KK + mr(=1)
+#else
+       daddiu  TEMP, KK, 4                             #  KK + nr(=4)
+#endif
+       dsra    K,TEMP, 2                               #  K = TEMP/4: kr=4 unrolled loop count
+       nop
+       beqz    K,.L35
+       nop                                             #  branch delay slot
+
+#else
+       move    B,BO                                            #       Reset B, GEMM part
+       dsra    K,KCO,2                                         #       K=KCO/4 (kr=4 unrolled loop count)
+       LD      a0, 0 * SIZE(A)                                 #       a0
+
+       MTC             $0,t11                          #  zero the accumulators
+       LD      b0,0*SIZE(B)
+
+       MOV     t12,t11
+       LD      b1,1*SIZE(B)
+
+       MOV     t13,t11
+       LD      b2,2*SIZE(B)
+
+       MOV     t14,t11
+       beqz    K,.L35
+       LD      b3,3*SIZE(B)                            #  delay slot: harmless preload
+
+#endif
+
+.L31:                                                                  #       nr=4,mr=1,kr=4  
+       LD      a1,     1*SIZE(A)                                       #       load a1
+       MADD    t11,t11,a0,b0                           #  t11 += a0*b0 (one C element per b column)
+       
+       LD      b4,4*SIZE(B)
+       LD      b5,5*SIZE(B)
+       MADD    t12,t12,a0,b1
+       
+       LD      b6,6*SIZE(B)
+       LD      b7,7*SIZE(B)
+       MADD    t13,t13,a0,b2
+       MADD    t14,t14,a0,b3
+
+       LD      a2,     2*SIZE(A)                                       #       a2
+       MADD    t11,t11,a1,b4
+       
+       LD      b0,8*SIZE(B)
+       LD      b1,9*SIZE(B)
+       MADD    t12,t12,a1,b5
+       
+       LD      b2,10*SIZE(B)
+       LD      b3,11*SIZE(B)
+       MADD    t13,t13,a1,b6
+       MADD    t14,t14,a1,b7
+
+       LD      a3,     3*SIZE(A)                                       #       a3
+       MADD    t11,t11,a2,b0
+       daddiu  K,K,-1
+       
+       LD      b4,12*SIZE(B)
+       LD      b5,13*SIZE(B)
+       MADD    t12,t12,a2,b1
+       daddu   A,A,4*SIZE                                      #       1mr*4kr
+       
+       LD      b6,14*SIZE(B)
+       LD      b7,15*SIZE(B)
+       MADD    t13,t13,a2,b2
+       MADD    t14,t14,a2,b3
+
+       LD      a0,     0*SIZE(A)                                       #       a0
+       daddu   B,B,16*SIZE                                     #       4nr*4kr
+       MADD    t11,t11,a3,b4
+       
+       LD      b0,0*SIZE(B)
+       MADD    t12,t12,a3,b5
+       LD      b1,1*SIZE(B)
+       MADD    t13,t13,a3,b6
+
+       LD      b2,2*SIZE(B)
+       MADD    t14,t14,a3,b7
+       bnez    K,.L31
+       LD      b3,3*SIZE(B)                            #  delay slot: preload next b3
+
+
+.L35:                                                                  #  kr=2 tail (K%4 >= 2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,2                 
+#else
+       andi    K,TEMP,2
+#endif
+       beqz    K,.L38
+       nop                                             #  branch delay slot
+
+.L36:                  
+       LD      a1,1*SIZE(A)                                    #       load a1
+       MADD    t11,t11,a0,b0
+
+       LD      b4,4*SIZE(B)
+       LD      b5,5*SIZE(B)
+       MADD    t12,t12,a0,b1
+       daddu   A,A,2*SIZE                                      #       mr*2kr
+       
+       LD      b6,6*SIZE(B)
+       MADD    t13,t13,a0,b2
+       
+       LD      b7,7*SIZE(B)
+       MADD    t14,t14,a0,b3
+       daddu   B,B,8*SIZE                                      #       4nr*2kr
+
+
+.L37:                                                  #  second k step of the kr=2 tail
+       LD      a0,0(A)
+       MADD    t11,t11,a1,b4
+       
+       LD      b0,0*SIZE(B)
+       LD      b1,1*SIZE(B)
+       MADD    t12,t12,a1,b5
+       
+       LD      b2,2*SIZE(B)
+       LD      b3,3*SIZE(B)
+       MADD    t13,t13,a1,b6
+       MADD    t14,t14,a1,b7
+
+
+.L38:                                                                  #       kr=1 tail (K%2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,1
+#else
+       andi    K,TEMP,1
+#endif
+       beqz    K,.L39                          
+       LD      ALPHA,152($sp)                                  #  Get ALPHA (delay slot: loads on both paths)
+       
+       MADD    t11,t11,a0,b0
+       MADD    t12,t12,a0,b1
+       daddu   A,A,1*SIZE                              
+       daddu   B,B,4*SIZE
+       
+       MADD    t13,t13,a0,b2
+       MADD    t14,t14,a0,b3
+
+.L39:                                                                  #  Write Back
+#ifndef TRMMKERNEL                                     #  GEMM: C = alpha*(A*B) + C
+       LD      c11,0(CO1)                      
+       LD      c12,0(CO2)
+       LD      c13,0(CO3)
+       LD      c14,0(CO4)
+       
+       MADD    t11,c11,t11,ALPHA                       #  t11 = c11 + ALPHA*t11
+       MADD    t12,c12,t12,ALPHA
+       MADD    t13,c13,t13,ALPHA
+       MADD    t14,c14,t14,ALPHA
+
+       ST      t11,0(CO1)
+       ST      t12,0(CO2)
+       ST      t13,0(CO3)
+       ST      t14,0(CO4)
+#else                                                  #  TRMM: C = alpha*(A*B); then fix up A/B/KK
+       MUL     t11, ALPHA, t11
+       MUL     t12, ALPHA, t12
+       MUL     t13, ALPHA, t13
+       MUL     t14, ALPHA, t14
+
+       ST      t11,  0 * SIZE(CO1)
+       ST      t12,  0 * SIZE(CO2)
+       ST      t13,  0 * SIZE(CO3)
+       ST      t14,  0 * SIZE(CO4)
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  remaining K not consumed by this tile
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -1                          #  minus mr
+#else
+       daddiu  TEMP, TEMP, -4                          #  minus nr
+#endif
+
+       dsll    K,TEMP, 0 + BASE_SHIFT                  #  TEMP * 1(mr) * SIZE
+       dsll    TEMP,TEMP, 2 + BASE_SHIFT               #  TEMP * 4(nr) * SIZE
+
+       daddu   A,A,K                                   #  advance A past the untouched rows
+       daddu   B,B,TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 1                               #  consumed one more row of the triangle
+#endif
+#endif
+
+
+       .align  3
+.L0_N4_Loop:                                                           #       mc finished
+       daddiu  N,N,-1                                                  #  N--
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       daddiu  KK, KK,4 
+#endif
+       bnez    N,.L0_N4_Lb                     
+       move    BO,B                                                    #  Set BO point to next panel Bj (delay slot)
+
+       .align  5                                       
+.L0_N2:                                                #  handle the N%4==2 panel
+       andi    N,NCO,2                                                 #       nr = 2
+       beqz    N,.L0_N1                
+       nop                                             #  branch delay slot
+
+.L0_N2_Lb:                                             #  per-panel setup for nr=2
+       move    CO1,C                                   
+       daddu   CO2,C,LDC
+
+       dsra    M,MCO,2                                 #  M = MCO/4: number of full mr=4 tiles
+       move    A,AO                                                    #  Reset A
+
+       daddu   PREA,AO,SPANA                           #  prefetch pointer into A
+       daddu   C,CO2,LDC                               #  C advances two columns
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+       move    KK, OFFSET
+#endif
+       beqz    M,.L12_M2
+       nop                                             #  branch delay slot
+
+.L40:                                                  #  nr=2, mr=4 tile: accumulators t11..t41 / t12..t42
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO                                                    #       Reset B
+#else
+       dsll    K,KK, 2 + BASE_SHIFT                    #  KK * 4(mr) * SIZE
+       dsll    TEMP, KK,1 + BASE_SHIFT                 #  KK * 2(nr) * SIZE
+
+       daddu   A,A,K
+       daddu   B,BO,TEMP
+#endif
+       MTC             $0,t11                          #  zero the accumulators
+       LD      a0,0*SIZE(A)
+       MOV     t21,t11
+       LD      a1,1*SIZE(A)
+
+       MOV     t31,t11
+       LD      b0,0*SIZE(B)
+       MOV     t41,t11
+       LD      b1,1*SIZE(B)
+
+       MOV     t12,t11
+       LD      a2,2*SIZE(A)
+       MOV     t22,t11
+       LD      a3,3*SIZE(A)
+
+       
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP,KCO,KK                             #  K iterations for this triangular tile
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 4                             #  KK + mr(=4)
+#else
+       daddiu  TEMP, KK, 2                             #  KK + nr(=2)
+#endif
+       dsra    K,TEMP,2                                #  K = TEMP/4: kr=4 unrolled loop count
+       MOV     t32,t11
+       beqz    K,.L45
+       MOV     t42,t11                                 #  delay slot: finish clearing
+
+#else
+       move    B,BO                                                    #       Reset B
+       LD      a0,0*SIZE(A)
+       MTC             $0,t11                                                  #       gemm part: zero accumulators
+       LD      a1,1*SIZE(A)
+
+       MOV     t21,t11
+       LD      b0,0*SIZE(B)
+       MOV     t31,t11
+       LD      b1,1*SIZE(B)
+
+       MOV     t41,t11
+       LD      a2,2*SIZE(A)
+       dsra    K,KCO,2                                                 #       K=KCO/4 (kr=4 unrolled loop count)
+       LD      a3,3*SIZE(A)
+       
+       MOV     t12,t11
+       MOV     t22,t11
+       
+       MOV     t32,t11
+       beqz    K,.L45
+       MOV     t42,t11                                 #  delay slot: finish clearing
+
+#endif
+
+.L41:                                                                          #       nr=2,mr=kr=4: 4 software-pipelined k steps
+       MADD    t11,t11,a0,b0
+       LD      a4,4*SIZE(A)
+       MADD    t21,t21,a1,b0
+       LD      a5,5*SIZE(A)
+
+       MADD    t12,t12,a0,b1
+       LD      b4,2*SIZE(B)
+       MADD    t22,t22,a1,b1
+       LD      b5,3*SIZE(B)
+
+       MADD    t31,t31,a2,b0
+       LD      a6,6*SIZE(A)
+       MADD    t41,t41,a3,b0
+       LD      a7,7*SIZE(A)
+
+       FETCH           $0,(PREA)                       #  prefetch upcoming A
+       MADD    t32,t32,a2,b1
+       MADD    t42,t42,a3,b1
+
+.L42:                                                  #  k step 2
+       MADD    t11,t11,a4,b4
+       LD      a0,8*SIZE(A)
+       MADD    t21,t21,a5,b4
+       LD      a1,9*SIZE(A)
+
+       MADD    t12,t12,a4,b5
+       LD      b2,4*SIZE(B)
+       MADD    t22,t22,a5,b5
+       LD      b3,5*SIZE(B)
+
+       MADD    t31,t31,a6,b4
+       LD      a2,10*SIZE(A)
+       MADD    t41,t41,a7,b4
+       LD      a3,11*SIZE(A)
+
+       FETCH           $0,4*SIZE(PREA)
+       MADD    t32,t32,a6,b5
+       MADD    t42,t42,a7,b5
+
+.L43:                                                  #  k step 3
+       MADD    t11,t11,a0,b2
+       LD      a4,12*SIZE(A)
+       MADD    t21,t21,a1,b2
+       LD      a5,13*SIZE(A)
+
+       MADD    t12,t12,a0,b3
+       LD      b6,6*SIZE(B)
+       MADD    t22,t22,a1,b3
+       LD      b7,7*SIZE(B)
+
+       MADD    t31,t31,a2,b2
+       LD      a6,14*SIZE(A)
+       MADD    t41,t41,a3,b2
+       LD      a7,15*SIZE(A)
+
+       FETCH           $0,8*SIZE(PREA)
+       MADD    t32,t32,a2,b3
+       MADD    t42,t42,a3,b3
+       
+       daddu   A,A,16*SIZE                                             #       4mr*4kr
+       daddu   B,B,8*SIZE                                              #       2nr*4kr 
+
+.L44:                                                  #  k step 4 (reloads from the advanced pointers)
+       MADD    t11,t11,a4,b6
+       LD      a0,0*SIZE(A)
+       MADD    t21,t21,a5,b6
+       LD      a1,1*SIZE(A)
+
+
+       MADD    t12,t12,a4,b7
+       LD      b0,0*SIZE(B)
+       MADD    t22,t22,a5,b7
+       LD      b1,1*SIZE(B)
+
+       daddiu  K,K,-1
+       daddu   PREA,PREA,16*SIZE
+
+       MADD    t31,t31,a6,b6
+       LD      a2,2*SIZE(A)
+       MADD    t41,t41,a7,b6
+       LD      a3,3*SIZE(A)
+
+       FETCH           $0,-4*SIZE(PREA)
+       MADD    t32,t32,a6,b7
+       bnez    K,.L41
+       MADD    t42,t42,a7,b7                           #  delay slot: last MADD of the iteration
+
+
+.L45:                                                                          #       kr=2 tail (K%4 >= 2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,2                         
+#else
+       andi    K,TEMP,2
+#endif
+       beqz    K,.L48
+       nop                                             #  branch delay slot
+
+.L46:                  
+       MADD    t11,t11,a0,b0
+       LD      a4,4*SIZE(A)
+       MADD    t21,t21,a1,b0
+       LD      a5,5*SIZE(A)
+
+       MADD    t12,t12,a0,b1
+       LD      b4,2*SIZE(B)
+       MADD    t22,t22,a1,b1
+       LD      b5,3*SIZE(B)
+
+       MADD    t31,t31,a2,b0
+       LD      a6,6*SIZE(A)
+       MADD    t41,t41,a3,b0
+       LD      a7,7*SIZE(A)
+
+       FETCH           $0,0(PREA)
+       MADD    t32,t32,a2,b1
+       daddu   B,B,4*SIZE                                              #  B += 2(nr)*2(kr) elements
+       
+       MADD    t42,t42,a3,b1
+       daddu   A,A,8*SIZE                                              #  A += 4(mr)*2(kr) elements = 8*SIZE
+
+.L47:                                                  #  second k step of the kr=2 tail
+       MADD    t11,t11,a4,b4
+       LD      a0,0*SIZE(A)
+       MADD    t21,t21,a5,b4
+       LD      a1,1*SIZE(A)
+
+       MADD    t12,t12,a4,b5
+       LD      b0,0*SIZE(B)
+       MADD    t22,t22,a5,b5
+       LD      b1,1*SIZE(B)
+
+       MADD    t31,t31,a6,b4
+       LD      a2,2*SIZE(A)
+       MADD    t41,t41,a7,b4
+       LD      a3,3*SIZE(A)
+
+       FETCH           $0,4*SIZE(PREA)
+       MADD    t32,t32,a6,b5
+       MADD    t42,t42,a7,b5
+       daddu   PREA,PREA,8*SIZE
+
+
+       
+.L48:                                                                          #        kr=1 tail (K%2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,1
+#else
+       andi    K,TEMP,1
+#endif
+       beqz    K,.L49                           
+       LD      ALPHA,152($sp)                                          #  Get ALPHA (delay slot: loads on both paths)
+       
+       FETCH           $0,0(PREA)
+       MADD    t11,t11,a0,b0
+       MADD    t21,t21,a1,b0
+       daddu   A,A,4*SIZE                                              #  A += 4(mr)*1(kr) elements
+
+       MADD    t12,t12,a0,b1
+       MADD    t22,t22,a1,b1
+       daddu   B,B,2*SIZE
+       daddu   PREA,PREA,4*SIZE
+
+       MADD    t31,t31,a2,b0
+       MADD    t41,t41,a3,b0
+
+       MADD    t32,t32,a2,b1
+       MADD    t42,t42,a3,b1
+
+.L49:                                                                          #  Write Back
+#ifndef TRMMKERNEL                                     #  GEMM: C = alpha*(A*B) + C
+       LD      c11,0(CO1)                                                      #  gemm write back part Fetch 16 C
+       LD      c21,1*SIZE(CO1)                 
+       LD      c31,2*SIZE(CO1)
+       LD      c41,3*SIZE(CO1)
+
+       LD      c12,0(CO2)
+       MADD    t11,c11,t11,ALPHA                       #  t11 = c11 + ALPHA*t11
+       LD      c22,1*SIZE(CO2)
+       MADD    t21,c21,t21,ALPHA
+       LD      c32,2*SIZE(CO2)
+       MADD    t31,c31,t31,ALPHA
+       LD      c42,3*SIZE(CO2)
+       MADD    t41,c41,t41,ALPHA
+
+       ST      t11,0(CO1)
+       MADD    t12,c12,t12,ALPHA
+       ST      t21,1*SIZE(CO1)
+       MADD    t22,c22,t22,ALPHA
+       ST      t31,2*SIZE(CO1)
+       MADD    t32,c32,t32,ALPHA
+       ST      t41,3*SIZE(CO1)
+       MADD    t42,c42,t42,ALPHA
+       daddiu  M,M,-1                          
+
+       ST      t12,0(CO2)
+       ST      t22,1*SIZE(CO2)
+       ST      t32,2*SIZE(CO2)
+       ST      t42,3*SIZE(CO2)
+
+       FETCH   $0,4*SIZE(CO1)
+       FETCH   $0,4*SIZE(CO2)
+       FETCH   $0,8*SIZE(CO1)
+       FETCH   $0,8*SIZE(CO2)
+
+       daddu   CO1,CO1,4*SIZE                  
+       bnez    M,.L40                          
+       daddu   CO2,CO2,4*SIZE                          #  delay slot: advance CO2 either way
+
+#else                                                  #  TRMM: C = alpha*(A*B); then fix up A/B/KK
+       MUL     t11, ALPHA, t11
+       MUL     t21, ALPHA, t21
+       MUL     t31, ALPHA, t31
+       MUL     t41, ALPHA, t41
+       
+       MUL     t12, ALPHA, t12
+       ST      t11, 0 * SIZE(CO1)
+       MUL     t22, ALPHA, t22
+       ST      t21, 1 * SIZE(CO1)
+       MUL     t32, ALPHA, t32
+       ST      t31, 2 * SIZE(CO1)
+       MUL     t42, ALPHA, t42
+       ST      t41, 3 * SIZE(CO1)
+       
+       ST      t12, 0 * SIZE(CO2)
+       daddiu  M,M,-1
+       ST      t22, 1 * SIZE(CO2)
+       ST      t32, 2 * SIZE(CO2)
+       ST      t42, 3 * SIZE(CO2)
+       
+       daddiu  CO1,CO1, 4*SIZE
+       daddiu  CO2,CO2, 4*SIZE
+
+       FETCH   $0,0(CO1)
+       FETCH   $0,0(CO2)
+       FETCH   $0,4(CO1)
+       FETCH   $0,4(CO2)
+
+#if ( defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  remaining K not consumed by this tile
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -4                          #  minus mr
+#else
+       daddiu  TEMP, TEMP, -2                          #  minus nr
+#endif
+       dsll    K,TEMP, 2 + BASE_SHIFT                  #  TEMP * 4(mr) * SIZE
+       dsll    TEMP, TEMP, 1 + BASE_SHIFT              #  TEMP * 2(nr) * SIZE
+
+       daddu   A,A,K
+       daddu   B,B,TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 4
+#endif
+       bnez    M,.L40
+       nop                                             #  branch delay slot
+#endif
+
+
+       .align 3
+.L12_M2:                                               #  tail slice: mr=2 of the nr=2 panel
+       andi    M,MCO,2                                         #       mr = 2
+       beqz    M,.L12_M1                       
+       nop                                             #  branch delay slot
+
+.L50:                                                  #  nr=2, mr=2 tile: accumulators t11,t21 / t12,t22
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO
+#else
+       dsll    K,    KK, 1 + BASE_SHIFT        #mr=2
+       dsll    TEMP, KK, 1 + BASE_SHIFT        #nr=2
+
+       daddu   A, A, K
+       daddu   B, BO,  TEMP
+#endif
+       MTC             $0,t11                          #  zero the accumulators
+       LD      a0,0*SIZE(A)
+       MOV     t21,t11
+       LD      a1,1*SIZE(A)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  K iterations for this triangular tile
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 2                             #  KK + mr(=2)
+#else
+       daddiu  TEMP, KK, 2                             #  KK + nr(=2): same value, kept for symmetry
+#endif
+       dsra    K,TEMP,2                                #  K = TEMP/4: kr=4 unrolled loop count
+       MOV     t12,t11
+       beqz    K,.L55
+       MOV     t22,t11                                 #  delay slot: finish clearing
+
+#else
+       move    B,BO
+       LD      a0,0*SIZE(A)
+       dsra    K,KCO,2                                         #  K=KCO/4 (kr=4 unrolled loop count)
+       LD      a1,1*SIZE(A)
+
+       MTC             $0,t11                          #  zero the accumulators
+       LD      b0,0*SIZE(B)
+       MOV     t21,t11
+       LD      b1,1*SIZE(B)
+       
+       MOV     t12,t11
+       beqz    K,.L55
+       MOV     t22,t11                                 #  delay slot: finish clearing
+
+#endif
+
+.L51:                                                                  #  nr=2 mr=2,kr=4: four k steps per iteration
+       MADD    t11,t11,a0,b0
+       LD      a4,2*SIZE(A)
+       MADD    t21,t21,a1,b0
+       LD      b4,2*SIZE(B)
+
+       MADD    t12,t12,a0,b1
+       LD      a5,3*SIZE(A)
+       MADD    t22,t22,a1,b1
+       LD      b5,3*SIZE(B)
+
+       MADD    t11,t11,a4,b4
+       LD      a2,4*SIZE(A)
+       MADD    t21,t21,a5,b4
+       LD      b2,4*SIZE(B)
+
+       MADD    t12,t12,a4,b5
+       LD      a3,5*SIZE(A)
+       MADD    t22,t22,a5,b5
+       daddiu  K,K,-1
+       LD      b3,5*SIZE(B)
+
+       MADD    t11,t11,a2,b2
+       LD      a6,6*SIZE(A)
+       MADD    t21,t21,a3,b2
+       daddu   A,A,8*SIZE                                      #  A += 2(mr)*4(kr) elements = 8*SIZE
+       LD      b6,6*SIZE(B)
+
+       MADD    t12,t12,a2,b3
+       daddu   B,B,8*SIZE                                      #  B += 2(nr)*4(kr) elements = 8*SIZE
+       LD      a7,-1*SIZE(A)
+       MADD    t22,t22,a3,b3
+       LD      b7,-1*SIZE(B)
+
+       MADD    t11,t11,a6,b6
+       LD      a0,0*SIZE(A)
+       MADD    t21,t21,a7,b6
+       LD      b0,0*SIZE(B)
+
+       MADD    t12,t12,a6,b7
+       LD      a1,1*SIZE(A)
+
+       MADD    t22,t22,a7,b7
+       bnez    K,.L51
+       LD      b1,1*SIZE(B)                            #  delay slot: preload next b1
+
+
+.L55:                                                                  #       kr=2 tail (K%4 >= 2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,2                         
+#else
+       andi    K,TEMP,2
+#endif
+       beqz    K,.L58
+       nop                                             #  branch delay slot
+
+.L56:                  
+       MADD    t11,t11,a0,b0
+       LD      a4,2*SIZE(A)
+       MADD    t21,t21,a1,b0
+       daddu   A,A,4*SIZE                                      #  A += 2(mr)*2(kr) elements
+       LD      b4,2*SIZE(B)
+
+       MADD    t12,t12,a0,b1
+       daddu   B,B,4*SIZE                                      #       2nr*2kr
+       LD      a5,-1*SIZE(A)
+       MADD    t22,t22,a1,b1
+       LD      b5,-1*SIZE(B)
+
+.L57:                                                  #  second k step of the kr=2 tail
+       MADD    t11,t11,a4,b4
+       LD      a0,0*SIZE(A)
+       MADD    t21,t21,a5,b4
+       LD      b0,0*SIZE(B)
+
+       MADD    t12,t12,a4,b5
+       LD      a1,1*SIZE(A)
+       MADD    t22,t22,a5,b5
+       LD      b1,1*SIZE(B)
+
+.L58:                                                                  #  kr=1 tail (K%2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,1
+#else
+       andi    K,TEMP, 1
+#endif
+       beqz    K,.L59                          
+       LD      ALPHA,152($sp)                                  #  Get ALPHA (delay slot: loads on both paths)
+       
+       MADD    t11,t11,a0,b0
+       MADD    t21,t21,a1,b0
+       daddu   A,A,2*SIZE                                      #       A += 2(mr)*1(kr) elements
+       daddu   B,B,2*SIZE                                      #       2nr*kr
+
+       MADD    t12,t12,a0,b1
+       MADD    t22,t22,a1,b1
+
+
+.L59:                                                                  #  Write Back
+#ifndef TRMMKERNEL                                     #  GEMM: C = alpha*(A*B) + C
+       LD      c11,0(CO1)                                              #  write gemm part back Fetch 16 C
+       LD      c21,1*SIZE(CO1)                 
+       LD      c12,0(CO2)
+       LD      c22,1*SIZE(CO2)
+       
+       MADD    t11,c11,t11,ALPHA                       #  t11 = c11 + ALPHA*t11
+       MADD    t21,c21,t21,ALPHA
+       MADD    t12,c12,t12,ALPHA
+       MADD    t22,c22,t22,ALPHA
+
+       ST      t11,0(CO1)
+       ST      t21,1*SIZE(CO1)
+       ST      t12,0(CO2)
+       ST      t22,1*SIZE(CO2)
+
+       daddu   CO1,CO1,2*SIZE                  
+       daddu   CO2,CO2,2*SIZE
+
+       FETCH   $0,0(CO1)
+       FETCH   $0,0(CO2)
+#else                                                  #  TRMM: C = alpha*(A*B); then fix up A/B/KK
+       daddiu  M, M, -1
+       daddiu  CO1,CO1, 2 * SIZE
+       daddiu  CO2,CO2, 2 * SIZE
+       MUL     t11, ALPHA, t11
+       MUL     t21, ALPHA, t21
+       MUL     t12, ALPHA, t12
+       MUL     t22, ALPHA, t22
+
+       ST      t11, -2 * SIZE(CO1)
+       ST      t21, -1 * SIZE(CO1)
+       ST      t12, -2 * SIZE(CO2)
+       ST      t22, -1 * SIZE(CO2)
+
+       FETCH   $0,0(CO1)
+       FETCH   $0,0(CO2)
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  remaining K not consumed by this tile
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -2                          #  minus mr
+#else
+       daddiu  TEMP, TEMP, -2                          #  minus nr (same value here)
+#endif
+
+       dsll    K,    TEMP, 1 + BASE_SHIFT              #  TEMP * 2(mr) * SIZE
+       dsll    TEMP, TEMP, 1 + BASE_SHIFT              #  TEMP * 2(nr) * SIZE
+
+       daddu   A, A, K
+       daddu   B, B, TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 2
+#endif
+#endif
+
+
+       .align 3
+.L12_M1:                                               #  tail slice: mr=1 of the nr=2 panel
+       andi    M,MCO,1                                 #       mr = 1
+       beqz    M,.L0_N2_Loop           
+       nop                                             #  branch delay slot
+
+.L60:                                                  #  nr=2, mr=1 tile: accumulators t11,t12
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO                                    #       Reset B
+#else
+       dsll    K,    KK, 0 + BASE_SHIFT                #  KK * 1(mr) * SIZE
+       dsll    TEMP, KK, 1 + BASE_SHIFT                #  KK * 2(nr) * SIZE
+
+       daddu   A, A, K
+       daddu   B, BO,  TEMP
+#endif
+       MTC             $0,t11                          #  zero the accumulators
+       LD      a0, 0*SIZE(A)                           #       a0
+       
+       MOV     t21,t11
+       LD      b0,0*SIZE(B)            
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  K iterations for this triangular tile
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 1                             #  KK + mr(=1)
+#else
+       daddiu  TEMP, KK, 2                             #  KK + nr(=2)
+#endif
+       dsra    K,TEMP,2                                #  K = TEMP/4: kr=4 unrolled loop count
+       MOV     t12,t11
+       beqz    K,.L65
+       MOV     t22,t11                                 #  delay slot: finish clearing
+
+#else
+       dsra    K,KCO,2                                 #  K=KCO/4 (kr=4 unrolled loop count)
+       move    B,BO                                    #  Reset B
+       LD      a0,0*SIZE(A)
+       
+       MTC             $0,t11                          #  zero the accumulators
+       MOV     t21,t11
+       LD      b0,0*SIZE(B)
+
+       MOV     t12,t11
+       LD      b1,1*SIZE(B)
+       beqz    K,.L65
+       MOV     t22,t11                                 #  delay slot: finish clearing
+
+#endif
+
+.L61:                                                          #       nr=2,mr=1,kr=4  
+       LD      a4,     1*SIZE(A)                               #       a2
+       LD      b4, 2*SIZE(B)
+       MADD    t11,t11,a0,b0                           #  t11 += a0*b0
+       
+       LD      b5,3*SIZE(B)
+       MADD    t12,t12,a0,b1
+
+       LD      a2,     2*SIZE(A)                               #       a3
+       LD      b2,4*SIZE(B)
+       MADD    t11,t11,a4,b4
+       
+       LD      b3,5*SIZE(B)
+       MADD    t12,t12,a4,b5
+
+       LD      a6,     3*SIZE(A)                               #       a4
+       daddiu  K,K,-1
+       LD      b6,6*SIZE(B)
+       MADD    t11,t11,a2,b2
+       
+       LD      b7,7*SIZE(B)
+       MADD    t12,t12,a2,b3
+       daddu   A,A,4*SIZE                              #  A += 1(mr)*4(kr) elements
+
+       LD      a0,     0*SIZE(A)
+       daddu   B,B,8*SIZE                              #  B += 2(nr)*4(kr) elements = 8*SIZE
+       
+       LD      b0,0*SIZE(B)    
+       MADD    t11,t11,a6,b6
+       
+       LD      b1,1*SIZE(B)
+       bnez    K,.L61
+       MADD    t12,t12,a6,b7                           #  delay slot: last MADD of the iteration
+
+
+
+.L65:                                                          #  kr=2 tail (K%4 >= 2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,2                         
+#else
+       andi    K,TEMP,2
+#endif
+       beqz    K,.L68
+       nop                                             #  branch delay slot
+
+.L66:                  
+       LD      a4,     1*SIZE(A)                               #       a1
+       MADD    t11,t11,a0,b0
+       LD      b4,2*SIZE(B)
+       daddu   A,A,2*SIZE                              #  A += 1(mr)*2(kr) elements
+       
+       LD      b5,3*SIZE(B)
+       MADD    t12,t12,a0,b1
+       daddu   B,B,4*SIZE
+
+.L67:                                                  #  second k step of the kr=2 tail
+       LD      a0,0(A)                                         #       a0
+       LD      b0,0*SIZE(B)
+       MADD    t11,t11,a4,b4
+       
+       LD      b1,1*SIZE(B)
+       MADD    t12,t12,a4,b5
+
+
+.L68:                                                          #   kr=1 tail (K%2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,1
+#else
+       andi    K,TEMP,1
+#endif
+       beqz    K,.L69                            
+       LD      ALPHA,152($sp)                          #  Get ALPHA (delay slot: loads on both paths)
+       
+       MADD    t11,t11,a0,b0
+       MADD    t12,t12,a0,b1
+       daddu   A,A,1*SIZE                              #  A += 1(mr)*1(kr) element
+       daddu   B,B,2*SIZE
+
+
+.L69:                                                          #  Write Back
+#ifndef TRMMKERNEL                                     #  GEMM: C = alpha*(A*B) + C
+       LD      c11,0(CO1)                                      #  Fetch 16 C
+       LD      c12,0(CO2)
+       
+       MADD    t11,c11,t11,ALPHA                       #  t11 = c11 + ALPHA*t11
+       MADD    t12,c12,t12,ALPHA
+
+       ST      t11,0(CO1)
+       ST      t12,0(CO2)
+
+       daddu   CO1,CO1,1*SIZE          
+       daddu   CO2,CO2,1*SIZE
+
+#else                                                  #  TRMM: C = alpha*(A*B); then fix up A/B/KK
+       MUL     t11, ALPHA, t11
+       MUL     t12, ALPHA, t12
+
+       ST      t11,  0 * SIZE(CO1)
+       ST      t12,  0 * SIZE(CO2)
+
+       daddu   CO1,CO1,1*SIZE                  
+       daddu   CO2,CO2,1*SIZE
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  remaining K not consumed by this tile
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -1                          #  minus mr
+#else
+       daddiu  TEMP, TEMP, -2                          #  minus nr
+#endif
+
+       dsll    K,    TEMP, 0 + BASE_SHIFT              #  TEMP * 1(mr) * SIZE
+       dsll    TEMP, TEMP, 1 + BASE_SHIFT              #  TEMP * 2(nr) * SIZE
+
+       daddu   A, A, K
+       daddu   B, B, TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 1
+#endif
+#endif
+
+.L0_N2_Loop:                                           #  nr=2 panel finished
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       daddiu  KK, KK, 2
+#endif
+       move    BO, B                                   #  BO points at the next panel of B
+
+
+       .align  5                                       
+.L0_N1:                                                #  handle the N%2==1 panel
+       andi    N,NCO,1                                 #  nr = 1
+       beqz    N,.L999                                 
+       nop                                             #  branch delay slot
+
+       move    CO1,C                           
+       dsra    M,MCO,2                                 #  M = MCO/4: number of full mr=4 tiles
+       
+       move    A,AO                                    #  Reset A
+       daddu   PREA,AO,SPANA                           #  prefetch pointer into A
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+       move    KK, OFFSET
+#endif
+
+       beqz    M,.L11_M2
+       daddu   C,CO1,LDC                               #  delay slot: C advances one column
+
+.L70:                                                  #  nr=1, mr=4 tile: accumulators t11,t21,t31,t41
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B, BO                                   #       Reset B
+#else
+       dsll    K,    KK, 2 + BASE_SHIFT                #  KK * 4(mr) * SIZE
+       dsll    TEMP, KK, 0 + BASE_SHIFT                #  KK * 1(nr) * SIZE
+
+       daddu   A, A, K
+       daddu   B, BO,  TEMP
+#endif
+       MTC             $0,t11                          #  zero the accumulators
+       LD      b0,     0*SIZE(B)
+       
+       MOV     t21,t11
+       LD      a0,0*SIZE(A)
+       MOV     t31,t11
+       LD      a1,1*SIZE(A)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  K iterations for this triangular tile
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 4                             #  KK + mr(=4)
+#else
+       daddiu  TEMP, KK, 1                             #  KK + nr(=1)
+#endif
+       dsra    K,TEMP,2                                #  K = TEMP/4: kr=4 unrolled loop count
+       MOV     t41,t11
+       beqz    K,.L75
+       nop                                             #  branch delay slot
+#else
+       move    B, BO                                   #       Reset B
+       dsra    K,KCO,2                                 #  K=KCO/4 (kr=4 unrolled loop count)
+       LD      b0,     0*SIZE(B)
+       
+       MTC             $0,t11                          #  zero the accumulators
+       LD      a0,0*SIZE(A)
+       MOV     t21,t11
+       LD      a1,1*SIZE(A)
+       
+       MOV     t31,t11
+       LD      a2,2*SIZE(A)
+       MOV     t41,t11
+       beqz    K,.L75
+       LD      a3,3*SIZE(A)                            #  delay slot: harmless preload
+
+#endif
+
+.L71:                                                          #  nr=1,mr=kr=4: four k steps per iteration
+       LD      b4,     1*SIZE(B)                               #       b1
+       MADD    t11,t11,a0,b0
+       
+       LD      a4,     4*SIZE(A)
+       MADD    t21,t21,a1,b0
+
+       LD      a5,     5*SIZE(A)
+       FETCH           $0,(PREA)                       #  prefetch upcoming A
+
+       LD      a6,6*SIZE(A)
+       MADD    t31,t31,a2,b0
+
+       LD      a7,7*SIZE(A)
+       MADD    t41,t41,a3,b0
+
+.L72:                                                  #  k step 2
+       LD      b2,     2*SIZE(B)                               #       b2
+       MADD    t11,t11,a4,b4
+       
+       LD      a0,8*SIZE(A)
+       MADD    t21,t21,a5,b4
+
+       LD      a1,9*SIZE(A)
+       FETCH           $0,4*SIZE(PREA)
+
+       LD      a2,10*SIZE(A)
+       MADD    t31,t31,a6,b4
+       
+       LD      a3,11*SIZE(A)
+       MADD    t41,t41,a7,b4
+
+.L73:                                                  #  k step 3
+       LD      b6,     3*SIZE(B)
+       MADD    t11,t11,a0,b2
+       
+       LD      a4,12*SIZE(A)
+       daddu   B,B,4*SIZE                              #  B += 1(nr)*4(kr) elements
+       
+       LD      a5,13*SIZE(A)
+       MADD    t21,t21,a1,b2
+
+       LD      a6,14*SIZE(A)
+       FETCH           $0,8*SIZE(PREA)
+       MADD    t31,t31,a2,b2
+
+       LD      a7,15*SIZE(A)
+       MADD    t41,t41,a3,b2
+       daddu   A,A,16*SIZE                             #  A += 4(mr)*4(kr) elements = 16*SIZE
+
+.L74:                                                  #  k step 4 (reloads from the advanced pointers)
+       LD      b0,     0*SIZE(B)
+       MADD    t11,t11,a4,b6
+       
+       LD      a0,0*SIZE(A)
+       daddu   PREA,PREA,16*SIZE
+
+       LD      a1,1*SIZE(A)
+       MADD    t21,t21,a5,b6
+
+       LD      a2,2*SIZE(A)
+       daddiu  K,K,-1
+       MADD    t31,t31,a6,b6
+
+       LD      a3,3*SIZE(A)
+       MADD    t41,t41,a7,b6
+       bnez    K,.L71
+       FETCH           $0,-32(PREA)                    #  delay slot: prefetch
+
+
+.L75:                                                          #  kr=2 tail (K%4 >= 2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,2                         
+#else
+       andi    K,TEMP,2
+#endif
+       beqz    K,.L78
+       nop                                             #  branch delay slot
+
+.L76:                  
+       LD      b4,     1*SIZE(B)
+       MADD    t11,t11,a0,b0
+       
+       LD      a4,4*SIZE(A)
+       daddu   B,B,2*SIZE                              #  B += 1(nr)*2(kr) elements
+       
+       LD      a5,5*SIZE(A)
+       MADD    t21,t21,a1,b0
+       FETCH           $0,0(PREA)
+
+       LD      a6,6*SIZE(A)
+       MADD    t31,t31,a2,b0
+
+       LD      a7,7*SIZE(A)
+       MADD    t41,t41,a3,b0
+       daddu   A,A,8*SIZE                              #  A += 4(mr)*2(kr) elements = 8*SIZE
+
+.L77:                                                  #  second k step of the kr=2 tail
+       LD      b0,0(B)
+       MADD    t11,t11,a4,b4
+
+       LD      a0,0*SIZE(A)
+       MADD    t21,t21,a5,b4
+       FETCH           $0,4*SIZE(PREA)
+
+       LD      a1,1*SIZE(A)
+       MADD    t31,t31,a6,b4
+
+       LD      a2,2*SIZE(A)
+       MADD    t41,t41,a7,b4
+
+       LD      a3,3*SIZE(A)
+       daddu   PREA,PREA,8*SIZE
+
+
+       
+.L78:                                                          #   kr=1 tail (K%2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,1
+#else
+       andi    K,TEMP,1
+#endif
+       beqz    K,.L79                           
+       LD      ALPHA,152($sp)                          #  Get ALPHA (delay slot: loads on both paths)
+       
+       FETCH           $0,0(PREA)
+       MADD    t11,t11,a0,b0
+       MADD    t21,t21,a1,b0
+       daddu   A,A,4*SIZE                              #  A += 4(mr)*1(kr) elements
+
+       MADD    t31,t31,a2,b0
+       MADD    t41,t41,a3,b0
+       daddu   B,B,1*SIZE
+       daddu   PREA,PREA,4*SIZE
+
+
+.L79:                                                          #  Write Back
+#ifndef TRMMKERNEL                                     #  GEMM: C = alpha*(A*B) + C
+       LD      c11,0(CO1)                                      #  Fetch 16 C
+       LD      c21,1*SIZE(CO1)                 
+       LD      c31,2*SIZE(CO1)
+       LD      c41,3*SIZE(CO1)
+
+       MADD    t11,c11,t11,ALPHA                       #  t11 = c11 + ALPHA*t11
+       MADD    t21,c21,t21,ALPHA
+       MADD    t31,c31,t31,ALPHA
+       MADD    t41,c41,t41,ALPHA
+
+       ST      t11,0(CO1)
+       ST      t21,1*SIZE(CO1)
+       ST      t31,2*SIZE(CO1)
+       ST      t41,3*SIZE(CO1)
+       daddiu  M,M,-1                                  #  M--
+
+       FETCH   $0,4*SIZE(CO1)
+       FETCH   $0,8*SIZE(CO1)
+
+       bnez    M,.L70                                  #  M!=0
+       daddu   CO1,CO1,4*SIZE                  #  delay slot: CO1 += 4 elements (4*SIZE bytes)
+#else                                                  #  TRMM: C = alpha*(A*B); then fix up A/B/KK
+       daddiu  M,M,-1                                  #  M--
+       MUL     t11, ALPHA, t11
+       MUL     t21, ALPHA, t21
+       MUL     t31, ALPHA, t31
+       MUL     t41, ALPHA, t41
+
+       ST      t11,0(CO1)
+       ST      t21,1*SIZE(CO1)
+       ST      t31,2*SIZE(CO1)
+       ST      t41,3*SIZE(CO1)
+
+       FETCH   $0,4*SIZE(CO1)
+       FETCH   $0,8*SIZE(CO1)
+
+       daddu   CO1,CO1,4*SIZE                  
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  remaining K not consumed by this tile
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -4                          #  minus mr
+#else
+       daddiu  TEMP, TEMP, -1                          #  minus nr
+#endif
+
+       dsll    K,    TEMP, 2 + BASE_SHIFT              #  TEMP * 4(mr) * SIZE
+       dsll    TEMP, TEMP, 0 + BASE_SHIFT              #  TEMP * 1(nr) * SIZE
+
+       daddu   A, A,K
+       daddu   B, B, TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 4
+#endif
+       bnez    M,.L70                          
+       nop                                             #  branch delay slot
+#endif
+
+
+       .align 3
+.L11_M2:
+       andi    M,MCO,2                                 #  mr = 2: handle a 2-row remainder strip
+       beqz    M,.L11_M1                       
+       nop                                             #  (delay slot)
+
+.L80:                                          
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B, BO                                   #  B starts at the panel origin
+#else
+       dsll    K,    KK, 1 + BASE_SHIFT                #  A byte offset: KK * 2 elements * SIZE (mr=2)
+       dsll    TEMP, KK, 0 + BASE_SHIFT                #  B byte offset: KK * 1 element  * SIZE (nr=1)
+
+       daddu   A, A, K                                 #  skip the part already covered by KK
+       daddu   B, BO,  TEMP
+#endif
+       LD      b0,     0*SIZE(B)
+       MTC             $0,t11                          #  zero the 2x1 accumulators
+       
+       LD      a0,0*SIZE(A)
+       MOV             t21,t11
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  inner K length for this tile
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 2                             #  LEFT: K = KK + mr(2)
+#else
+       daddiu  TEMP, KK, 1                             #  !LEFT: K = KK + nr(1)
+#endif
+       dsra    K,TEMP,2                                #  K = TEMP/4: trip count of the kr=4 loop
+       beqz    K,.L85
+       nop
+#else
+       move    B, BO
+       dsra    K,KCO,2                                 #  K = KCO/4: trip count of the kr=4 loop
+       LD      b0,     0*SIZE(B)
+
+       MTC             $0,t11                          #  zero the 2x1 accumulators
+       MOV             t21,t11
+       LD      a0,0*SIZE(A)
+       
+       beqz    K,.L85
+       LD      a1,1*SIZE(A)                            #  (delay slot) preload second A element
+
+#endif
+
+.L81:                                                          #  nr=1,mr=2,kr=4: unrolled-by-4 inner loop
+       LD      b4,     1*SIZE(B)
+       LD      a4,2*SIZE(A)
+       MADD    t11,t11,a0,b0                           #  k+0
+       LD      a5,3*SIZE(A)
+       MADD    t21,t21,a1,b0
+
+       LD      b2,     2*SIZE(B)
+       LD      a2,4*SIZE(A)
+       MADD    t11,t11,a4,b4                           #  k+1
+       LD      a3,5*SIZE(A)
+       MADD    t21,t21,a5,b4
+       
+       LD      b6,     3*SIZE(B)
+       LD      a6,6*SIZE(A)
+       MADD    t11,t11,a2,b2                           #  k+2
+       LD      a7,7*SIZE(A)
+       MADD    t21,t21,a3,b2
+
+       daddu   A,A,8*SIZE                              #  A += 2(mr)*4(kr) elements
+       daddu   B,B,4*SIZE                              #  B += 1(nr)*4(kr) elements
+
+       LD      b0,     0*SIZE(B)                       #  preload for the next iteration
+       daddiu  K,K,-1
+
+       LD      a0,0*SIZE(A)
+       MADD    t11,t11,a6,b6                           #  k+3
+
+       LD      a1,1*SIZE(A)
+       bnez    K,.L81
+       MADD    t21,t21,a7,b6                           #  (delay slot) last MADD of the iteration
+
+.L85:                                                          #  kr=2 remainder (mr=2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,2                         
+#else
+       andi    K,TEMP,2
+#endif
+       beqz    K,.L88
+       nop                                             #  (delay slot)
+
+.L86:                  
+       LD      b4,     1*SIZE(B)
+       LD      a4,2*SIZE(A)
+       MADD    t11,t11,a0,b0
+       LD      a5,3*SIZE(A)
+       MADD    t21,t21,a1,b0
+       
+       daddu   A,A,4*SIZE                              #  A += 2(mr)*2(kr) elements
+       daddu   B,B,2*SIZE                              #  B += 1(nr)*2(kr) elements
+       
+       LD      b0,0(B)                                 #  preload for the kr=1 remainder
+       LD      a0,0*SIZE(A)
+       MADD    t11,t11,a4,b4
+       LD      a1,1*SIZE(A)
+       MADD    t21,t21,a5,b4
+
+
+       
+.L88:                                                          #  kr=1 remainder (mr=2)
+#ifndef TRMMKERNEL
+       andi    K,KCO,1
+#else
+       andi    K,TEMP,1
+#endif
+       beqz    K,.L89                          
+       LD      ALPHA,152($sp)                          #  (delay slot) ALPHA is loaded on both paths
+       
+       MADD    t11,t11,a0,b0
+       MADD    t21,t21,a1,b0
+       daddu   A,A,2*SIZE                              #  A += 2(mr)*1(kr) elements
+       daddu   B,B,1*SIZE                              #  B += 1(nr)*1(kr) elements
+
+
+
+.L89:                                                          #  Write Back (mr=2, nr=1 tile)
+#ifndef TRMMKERNEL
+       LD      c11,0(CO1)                                      #  load the 2 existing C values
+       LD      c21,1*SIZE(CO1)                 
+
+       MADD    t11,c11,t11,ALPHA                       #  GEMM update: C = C + ALPHA*AB
+       MADD    t21,c21,t21,ALPHA
+
+       ST      t11,0(CO1)
+       ST      t21,1*SIZE(CO1)
+
+       FETCH   $0,2*SIZE(CO1)
+       
+       daddu   CO1,CO1,2*SIZE                  #  CO1 += 2 elements
+
+#else
+       daddu   CO1,CO1,2*SIZE                  #  CO1 += 2 elements first; stores use negative offsets
+       MUL     t11, ALPHA, t11                         #  TRMM: C = ALPHA*AB, no accumulate
+       MUL     t21, ALPHA, t21
+
+       FETCH   $0,0(CO1)
+       ST      t11, -2 * SIZE(CO1)
+       ST      t21, -1 * SIZE(CO1)
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  TEMP = K iterations not consumed by this tile
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -2                          #  LEFT: account for mr=2
+#else
+       daddiu  TEMP, TEMP, -1                          #  !LEFT: account for nr=1
+#endif
+
+       dsll    K,    TEMP, 1 + BASE_SHIFT              #  A byte offset: TEMP * 2 elements * SIZE
+       dsll    TEMP, TEMP, 0 + BASE_SHIFT              #  B byte offset: TEMP * 1 element  * SIZE
+
+       daddu   A, A, K                                 #  advance A/B past the untouched tail
+       daddu   B, B, TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 2                               #  KK += mr
+#endif
+#endif
+
+
+       .align 3
+.L11_M1:
+       andi            M,MCO,1                         #   mr = 1: final single-row remainder
+       beqz    M,.L999                 
+       nop                                             #  (delay slot)
+
+.L90:                  
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,  BO
+#else
+       dsll    K,    KK, 0 + BASE_SHIFT                #  A byte offset: KK * 1 element * SIZE (mr=1)
+       dsll    TEMP, KK, 0 + BASE_SHIFT                #  B byte offset: KK * 1 element * SIZE (nr=1)
+
+       daddu   A, A, K
+       daddu   B, BO,  TEMP
+#endif
+       LD      a0,     0*SIZE(A)
+       LD      b0,     0*SIZE(B)
+       MTC             $0,t11                          #  zero the 1x1 accumulator
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                           #  inner K length for this tile
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 1                             #  LEFT: K = KK + mr(1)
+#else
+       daddiu  TEMP, KK, 1                             #  !LEFT: K = KK + nr(1); same value since mr==nr==1
+#endif
+       dsra    K,  TEMP, 2                             #  K = TEMP/4: trip count of the kr=4 loop
+       beqz    K,.L95
+       nop
+
+#else
+       move    B,  BO
+       LD      a0,     0*SIZE(A)
+       LD      b0,     0*SIZE(B)
+       dsra    K,KCO,2                                 #  K = KCO/4
+       beqz    K,.L95
+       MTC             $0,t11                          #  (delay slot) zero the accumulator
+#endif
+
+.L91:                                                          #  nr=mr=1,kr=4: unrolled dot-product loop
+       LD      a4,     1*SIZE(A)
+       LD      b4,     1*SIZE(B)
+       MADD    t11,t11,a0,b0                           #  k+0
+       
+       LD      a2,     2*SIZE(A)
+       LD      b2,     2*SIZE(B)
+       MADD    t11,t11,a4,b4                           #  k+1
+
+       LD      a6,     3*SIZE(A)
+       LD      b6,     3*SIZE(B)
+       MADD    t11,t11,a2,b2                           #  k+2
+       
+       daddu   A,A,4*SIZE                              #  A += 1(mr)*4(kr) elements
+       daddu   B,B,4*SIZE                              #  B += 1(nr)*4(kr) elements
+
+       LD      a0,     0*SIZE(A)                       #  preload for the next iteration
+       LD      b0,     0*SIZE(B)
+       MADD    t11,t11,a6,b6                           #  k+3
+       
+       daddiu  K,K,-1
+       bnez    K,.L91
+       nop                                             #  (delay slot)
+
+.L95:                                                          #  kr=2 remainder (mr=1)
+#ifndef TRMMKERNEL
+       andi    K,KCO,2                 
+#else
+       andi    K,TEMP,2
+#endif
+       beqz    K,.L98
+       nop                                             #  (delay slot)
+
+.L96:                  
+       LD      a4,     1*SIZE(A)
+       LD      b4,     1*SIZE(B)
+       MADD    t11,t11,a0,b0
+       daddu   B,B,2*SIZE                              #  B += 1(nr)*2(kr) elements
+       daddu   A,A,2*SIZE                              #  A += 1(mr)*2(kr) elements
+
+       LD      b0,0(B)                                 #  preload for the kr=1 remainder
+       LD      a0,0(A)
+       MADD    t11,t11,a4,b4
+       
+.L98:                                                          #  kr=1 remainder (mr=1)
+#ifndef TRMMKERNEL
+       andi            K,KCO,1
+#else
+       andi    K,TEMP,1
+#endif
+       beqz    K,.L99                          
+       LD      ALPHA,152($sp)                          #  (delay slot) ALPHA is loaded on both paths
+
+       MADD    t11,t11,a0,b0
+
+
+
+.L99:                                                          #  Write Back (mr=1, nr=1 tile)
+#ifndef TRMMKERNEL
+       LD      c11,0(CO1)                                      #  load the single existing C value
+       MADD    t11,c11,t11,ALPHA                       #  GEMM update: C = C + ALPHA*AB
+       ST      t11,0(CO1)
+
+#else
+       MUL     t11, ALPHA, t11                         #  TRMM: C = ALPHA*AB, no accumulate
+
+       ST      t11,  0 * SIZE(CO1)
+#endif
+
+
+
+.L999:                                                 #  End: restore saved registers and return
+       ld      $16,   0($sp)                           #  restore $16-$22 ($s0-$s6, callee-saved)
+       ld      $17,   8($sp)
+       ld      $18,  16($sp)
+       ld      $19,  24($sp)
+       ld      $20,  32($sp)
+       ld      $21,  40($sp)
+       ld      $22,  48($sp)
+       LD      $f24, 56($sp)                           #  restore FP registers saved in the prologue
+       LD      $f25, 64($sp)                           #  NOTE(review): offsets assumed to mirror the
+       LD      $f26, 72($sp)                           #  (not visible here) prologue stores -- verify
+       LD      $f27, 80($sp)
+       LD      $f28, 88($sp)
+       ld      $23,  96($sp)                           #  restore $23-$25
+       ld      $24, 104($sp)
+       ld      $25, 112($sp)
+       LD      $f20,120($sp)
+       LD      $f21,128($sp)
+       LD      $f22,136($sp)
+       LD      $f23,144($sp)
+
+       j       $31                                     #  return to caller
+       daddiu  $sp, $sp, 160                           #  (delay slot) release the 160-byte frame
+
+       EPILOGUE