Completely dtrmm function.
author traz <wangqian10@iscas.ac.cn>
Sun, 17 Apr 2011 20:26:49 +0000 (20:26 +0000)
committer traz <wangqian10@iscas.ac.cn>
Sun, 17 Apr 2011 20:26:49 +0000 (20:26 +0000)
kernel/mips64/gemm_kernel_loongson3a.S

index 389b38f462f6920393833638593880cc1963f47e..a785c3e0dc27cf50b81f0fd2bce559c498647df2 100644 (file)
@@ -3,6 +3,7 @@
 #define FETCH   ld
 
 #define REALNAME ASMNAME
+
 #define ASSEMBLER
 #include "common.h"
 
        MOV     t22,t11
        gsLQC1(R9,F9,F8,0)                      #b0,b1
        
-       dsra    K,KCO,2                         #  K=KCO/2
        MOV     t13,t11
        gsLQC1(R9,F11,F10,1)            #b2,b3
        
        nop
 
 .L30:                                          #  N=4, M=1 tile: position A/B, clear accumulators, compute K loop count
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO                            #  start at the panel base BO
+#else
+       dsll    K,KK, 0 + BASE_SHIFT            #  K used as scratch: A byte offset = KK * 1 element
+       dsll    TEMP,KK,2 + BASE_SHIFT          #  B byte offset = KK * 4 elements
+
+       daddu   A,A,K                           #  A += KK * 1 element
+       daddu   B,BO,TEMP                       #  B = BO + KK * 4 elements
+#endif
+
+       gsLQC1(R8,F1,F0,0)
+       gsLQC1(R9,F9,F8,0)                      #b0,b1
+       MTC             $0,t11                  #  t11 = 0, then copied into the other accumulators
+       gsLQC1(R9,F11,F10,1)            #b2,b3
+       MOV     t12,t11
+       MOV     t13,t11
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 1                     #  TEMP = KK + 1
+#else
+       daddiu  TEMP, KK, 4                     #  TEMP = KK + 4
+#endif
+       dsra    K,TEMP, 2                       #  K = TEMP >> 2 (TEMP/4); TEMP is reused at .L35/.L38 for the remainder
+
+       beqz    K,.L35
+       MOV     t14,t11                         #  follows the branch; presumably a delay-slot instruction - confirm .set noreorder
+#else
+       move    B,BO
         gsLQC1(R8,F1,F0,0)
         dsra    K,KCO,2                         #  K = KCO >> 2 (KCO/4)
         gsLQC1(R9,F9,F8,0)                      #b0,b1
         gsLQC1(R9,F11,F10,1)            #b2,b3
         MOV     t12,t11
         MOV     t13,t11
+       dsra    K,KCO,2                         #  same shift as above, recomputed
         beqz    K,.L35
         MOV     t14,t11
+#endif
 
 .L31:                                                  #  N=4 m=1,=K=4
        gsLQC1(R8,F3,F2,1)      
        MADD    t14,t14,a3,b7
 
 .L35:                                                  #  N=4 M=1 K=2
+#ifndef TRMMKERNEL
        and             K,KCO,2                         #  k = KCO&2
+#else
+       and             K,TEMP,2                        #  TRMM: test bit 1 of TEMP (the K length set up at .L30) instead of KCO
+#endif
        beqz    K,.L38
        nop
 
        MADD    t14,t14,a1,b7
        
 .L38:                                                  #  N=4, M=1, K=1
+#ifndef TRMMKERNEL
        and             K,KCO,1                         #  k = KCO&1
+#else
+       andi    K,TEMP,1                                #  TRMM: k = TEMP&1
+#endif
        beqz    K,.L39                          #  skip the last single-k step when the bit is clear
        LD      ALPHA,152($sp)          #  Get ALPHA from the stack frame (offset 152)
        
        MADD    t14,t14,a0,b3
 
 .L39:                                                  #  Write Back
+#ifndef TRMMKERNEL
        LD      c11,0(CO1)                      #  Fetch 16 C
        LD      c12,0(CO2)
        LD      c13,0(CO3)
        ST      t12,0(CO2)
        ST      t13,0(CO3)
        ST      t14,0(CO4)
+#else
+       MUL     t11, ALPHA, t11
+       MUL     t12, ALPHA, t12
+       MUL     t13, ALPHA, t13
+       MUL     t14, ALPHA, t14
+
+       ST      t11,  0 * SIZE(CO1)
+       ST      t12,  0 * SIZE(CO2)
+       ST      t13,  0 * SIZE(CO3)
+       ST      t14,  0 * SIZE(CO4)
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -1
+#else
+       daddiu  TEMP, TEMP, -4
+#endif
+
+       dsll    K,TEMP, 0 + BASE_SHIFT
+       dsll    TEMP,TEMP, 2 + BASE_SHIFT
+
+       daddu   A,A,K
+       daddu   B,BO,TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 1
+#endif
+#endif
 
 
 .L0_N4_Loop:                                   #  end of one N=4 column panel
-       daddu   BO,BO,SPANB                     #  BO point to next panel B
        daddiu  N,N,-1                          #  N--
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       daddiu  KK, KK,4 
+#endif
        bnez    N,.L0_N4_Lb                     #  N!=0
-       move    B,BO                            #  Set B
+       move    BO,B                            #  BO = B: the new panel base is taken from the current B (presumably already past this panel - verify)
 
 
 
 .L0_N2_Lb:                                     #  N=2 panel setup
        move    CO1,C                           #  Set C        
        dsra    M,MCO,2                         #  M = MCO >> 2 (MCO/4)
-       
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+       move    KK, OFFSET                      #  restart KK from OFFSET for this panel
+#endif
+
        dsll    SPANB,KCO,1+BASE_SHIFT                  #  SPANB=KC*NR(2)*8Byte=KC*16=KC*2^4
        move    A,AO                            #  Reset A
 
        daddu   C,CO2,LDC
 
 .L40:                                          #  N=2, M=4 tile: position A/B, clear accumulators, compute K loop count
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO
+#else
+       dsll    K,KK, 2 + BASE_SHIFT    # mr=4: A byte offset = KK * 4 elements
+       dsll    TEMP, KK,1 + BASE_SHIFT # nr=2: B byte offset = KK * 2 elements
+
+       daddu   A,A,K
+       daddu   B,BO,TEMP
+#endif
        MTC             $0,t11
        MOV     t21,t11
        gsLQC1(R8,F1,F0,0)                      #a0,a1
        MOV     t41,t11                         #  NOTE(review): no MOV t31,t11 is visible in this TRMM branch (the GEMM branch has one) - confirm t31 is cleared
        gsLQC1(R9,F9,F8,0)                      #b0,b1
 
+       MOV     t12,t11
+       gsLQC1(R8,F3,F2,1)                      #a2,a3
+
+       MOV     t22,t11
+       MOV     t32,t11
+
+       MOV     t42,t11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP,KCO,KK                     #  TEMP = KCO - KK
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 4                     #  TEMP = KK + 4
+#else
+       daddiu  TEMP, KK, 2                     #  TEMP = KK + 2
+#endif
+       dsra    K,TEMP,2                                #  K = TEMP >> 2 (TEMP/4)
+       beqz    K,.L45
+       nop
+#else
+       move    B,BO
+       MTC             $0,t11                          #  gemm part
+       MOV     t21,t11
+       gsLQC1(R8,F1,F0,0)                      #a0,a1
+
+       MOV     t31,t11
+       MOV     t41,t11
+       gsLQC1(R9,F9,F8,0)                      #b0,b1
+
        dsra    K,KCO,2                         #  K = KCO >> 2 (KCO/4)
        MOV     t12,t11
        gsLQC1(R8,F3,F2,1)                      #a2,a3
        MOV     t42,t11
        beqz    K,.L45
        nop
+#endif
 
 .L41:                                                  #  N=2,M=K=4
        gsLQC1(R8,F5,F4,2)                      #  R8=A
 
 
 .L45:                                                  #  N=2 M=4 K=2
+#ifndef TRMMKERNEL
        and             K,KCO,2                         #  k = KCO&2
+#else
+       andi    K,TEMP,2                                #  TRMM: k = TEMP&2 (TEMP set up at .L40)
+#endif
        beqz    K,.L48
        nop
 
 
        
 .L48:                                                  #  N=2, M=4, K=1
+#ifndef TRMMKERNEL
        and             K,KCO,1                         #  k = KCO&1
+#else
+       andi    K,TEMP,1                                #  TRMM: k = TEMP&1
+#endif
        beqz    K,.L49                          #  skip the last single-k step when the bit is clear
        LD      ALPHA,152($sp)          #  Get ALPHA from the stack frame (offset 152)
        
        MADD    t42,t42,a3,b1
 
 .L49:                                                  #  Write Back for the N=2, M=4 tile
-       LD      c11,0(CO1)                      #  Fetch 16 C
+#ifndef TRMMKERNEL
+       LD      c11,0(CO1)                      #  gemm write back part: load existing C values
        LD      c21,1*SIZE(CO1)                 
        LD      c31,2*SIZE(CO1)
        LD      c41,3*SIZE(CO1)
        FETCH   $0,8*SIZE(CO2)
 
        daddu   CO1,CO1,4*SIZE                  #  CO1 += 4*SIZE
-       daddu   CO2,CO2,4*SIZE
        bnez    M,.L40                          #  M!=0
-       move    B,BO                            #  Reset B
+       daddu   CO2,CO2,4*SIZE                  #  CO2 += 4*SIZE, placed right after the branch
+#else
+       daddiu  M,M,-1                          #  M--
+
+       daddiu  CO1,CO1, 4*SIZE                 #  advance C pointers first; the stores below use negative offsets
+       daddiu  CO2,CO2, 4*SIZE
+
+       MUL     t11, ALPHA, t11                 #  TRMM path: scale accumulators by ALPHA; no C load in this branch
+       MUL     t21, ALPHA, t21
+       MUL     t31, ALPHA, t31
+       MUL     t41, ALPHA, t41
+       
+       MUL     t12, ALPHA, t12
+       MUL     t22, ALPHA, t22
+       MUL     t32, ALPHA, t32
+       MUL     t42, ALPHA, t42
+
+       ST      t11, -4 * SIZE(CO1)
+       ST      t21, -3 * SIZE(CO1)
+       ST      t31, -2 * SIZE(CO1)
+       ST      t41, -1 * SIZE(CO1)
+       
+       ST      t12, -4 * SIZE(CO2)
+       ST      t22, -3 * SIZE(CO2)
+       ST      t32, -2 * SIZE(CO2)
+       ST      t42, -1 * SIZE(CO2)
 
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -4                  #  TEMP = KCO - KK - 4
+#else
+       daddiu  TEMP, TEMP, -2                  #  TEMP = KCO - KK - 2
+#endif
+
+       dsll    K,TEMP, 2 + BASE_SHIFT          #  A byte skip = TEMP * 4 elements
+       dsll    TEMP, TEMP, 1 + BASE_SHIFT      #  B byte skip = TEMP * 2 elements
+
+       daddu   A,A,K
+       daddu   B,B,TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 4                       #  KK advances by the tile height (4 rows)
+#endif
+
+       bnez    M,.L40                          #  next M=4 tile while M != 0
+       nop
+#endif
 
 .L12_M2:
        and             M,MCO,2                         #  Remainder M = 2
        nop
 
 .L50:                                          #  N=2, M=2 tile: position A/B, clear accumulators, compute K loop count
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO
+#else
+       dsll    K,    KK, 1 + BASE_SHIFT        #mr=2: A byte offset = KK * 2 elements
+       dsll    TEMP, KK, 1 + BASE_SHIFT        #nr=2: B byte offset = KK * 2 elements
+
+       daddu   A, A, K
+       daddu   B, BO,  TEMP
+#endif
+       MTC             $0,t11
+       gsLQC1(R8,F1,F0,0)                      #a0,a1
+
+       MOV     t21,t11
+       MOV     t12,t11
+       gsLQC1(R9,F9,F8,0)                      #b0,b1
+       
+       MOV     t22,t11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 2                     #  TEMP = KK + 2
+#else
+       daddiu  TEMP, KK, 2                     #  TEMP = KK + 2 (same as the LEFT case for this tile)
+#endif
+       dsra    K,TEMP,2                                #  K = TEMP >> 2 (TEMP/4)
+       beqz    K,.L55
+       nop
+
+#else
+       move    B,BO
        dsra    K,KCO,2                         #  K = KCO >> 2 (KCO/4)
        MTC             $0,t11
        gsLQC1(R8,F1,F0,0)                      #a0,a1
        MOV     t22,t11
        beqz    K,.L55
        nop
+#endif
 
 .L51:                                                  #  N=2 m=2,=K=4
        gsLQC1(R8,F5,F4,1)                      #  R8=A
        MADD    t22,t22,a7,b7
 
 .L55:                                                  #  N=2 M=2 K=2
+#ifndef TRMMKERNEL
        and             K,KCO,2                         #  k = KCO&2
+#else
+       andi    K,TEMP,2                                #  TRMM: k = TEMP&2 (TEMP set up at .L50)
+#endif
+       NOP
        beqz    K,.L58
        nop
 
 
        
 .L58:                                                  #  N=2, M=2, K=1
+#ifndef TRMMKERNEL
        and             K,KCO,1                         #  k = KCO&1
+#else
+       and             K,  TEMP, 1                     #  TRMM: k = TEMP&1
+#endif
        beqz    K,.L59                          #  skip the last single-k step when the bit is clear
        LD      ALPHA,152($sp)          #  Get ALPHA from the stack frame (offset 152)
        
 
 
 .L59:                                                  #  Write Back for the N=2, M=2 tile
-       LD      c11,0(CO1)                      #  Fetch 16 C
+#ifndef TRMMKERNEL
+       LD      c11,0(CO1)                      #  write gemm part back: load existing C values
        LD      c21,1*SIZE(CO1)                 
        LD      c12,0(CO2)
        LD      c22,1*SIZE(CO2)
        ST      t11,0(CO1)
        ST      t21,1*SIZE(CO1)
        ST      t12,0(CO2)
-       move    B,BO                            #  Reset B
        ST      t22,1*SIZE(CO2)
 
        daddu   CO1,CO1,2*SIZE                  #  CO1 += 2*SIZE
 
        FETCH   $0,0(CO1)
        FETCH   $0,0(CO2)
+#else
+       daddiu  M, M, -1
+
+       daddiu  CO1,CO1, 2 * SIZE               #  advance C pointers first; the stores below use negative offsets
+       daddiu  CO2,CO2, 2 * SIZE
+
+       MUL     t11, ALPHA, t11                 #  TRMM path: scale accumulators by ALPHA; no C load in this branch
+       MUL     t21, ALPHA, t21
+       MUL     t12, ALPHA, t12
+       MUL     t22, ALPHA, t22
+
+       ST      t11, -2 * SIZE(CO1)
+       ST      t21, -1 * SIZE(CO1)
+       ST      t12, -2 * SIZE(CO2)
+       ST      t22, -1 * SIZE(CO2)
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -2                  #  TEMP = KCO - KK - 2
+#else
+       daddiu  TEMP, TEMP, -2                  #  TEMP = KCO - KK - 2 (same as the LEFT case for this tile)
+#endif
+
+       dsll    K,    TEMP, 1 + BASE_SHIFT      #  A byte skip = TEMP * 2 elements
+       dsll    TEMP, TEMP, 1 + BASE_SHIFT      #  B byte skip = TEMP * 2 elements
+
+       daddu   A, A, K
+       daddu   B, B, TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 2                       #  KK advances by the tile height (2 rows)
+#endif
+       FETCH   $0,0(CO1)
+       FETCH   $0,0(CO2)
+
+#endif
 
 
 .L12_M1:
        nop
 
 .L60:                                          #  N=2, M=1 tile: position A/B, clear accumulators, compute K loop count
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,BO
+#else
+       dsll    K,    KK, 0 + BASE_SHIFT        #  A byte offset = KK * 1 element
+       dsll    TEMP, KK, 1 + BASE_SHIFT        #  B byte offset = KK * 2 elements
+
+       daddu   A, A, K
+       daddu   B, BO,  TEMP
+#endif
+       MTC             $0,t11
+       gsLQC1(R8,F4,F0,0)
+
+       MOV     t21,t11
+       MOV     t12,t11
+       gsLQC1(R9,F9,F8,0)                      #b0,b1
+       
+       MOV     t22,t11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 1                     #  TEMP = KK + 1
+#else
+       daddiu  TEMP, KK, 2                     #  TEMP = KK + 2
+#endif
+       dsra    K,TEMP,2                                #  K = TEMP >> 2 (TEMP/4)
+       beqz    K,.L65
+       nop
+
+#else
        dsra    K,KCO,2                         #  K = KCO >> 2 (KCO/4)
        MTC             $0,t11
+       move    B,BO                            #  Reset B
        gsLQC1(R8,F4,F0,0)
 
        MOV     t21,t11
        MOV     t22,t11
        beqz    K,.L65
        nop
+#endif
 
 .L61:                                                  #  N=2 m=1,=K=4
        gsLQC1(R9,F13,F12,1)            #  R9=B
        MADD    t12,t12,a6,b7
 
 .L65:                                                  #  N=2 M=1 K=2
+#ifndef TRMMKERNEL
        and             K,KCO,2                         #  k = KCO&2
+#else
+       and             K,TEMP,2                        #  TRMM: k = TEMP&2 (TEMP set up at .L60)
+#endif
        beqz    K,.L68
        nop
 
 
        
 .L68:                                                  #  N=2, M=1, K=1
+#ifndef TRMMKERNEL
        and             K,KCO,1                         #  k = KCO&1
+#else
+       and             K,TEMP,1                        #  TRMM: k = TEMP&1
+#endif
        beqz    K,.L69                          #  skip the last single-k step when the bit is clear
        LD      ALPHA,152($sp)          #  Get ALPHA from the stack frame (offset 152)
        
 
 
 .L69:                                                  #  Write Back for the N=2, M=1 tile
+#ifndef TRMMKERNEL
        LD      c11,0(CO1)                      #  GEMM path: load existing C values
        LD      c12,0(CO2)
        
 
        ST      t11,0(CO1)
        ST      t12,0(CO2)
-       move    B,BO                            #  Reset B
 
        daddu   CO1,CO1,1*SIZE                  #  CO1 += 1*SIZE
        daddu   CO2,CO2,1*SIZE
 
        FETCH   $0,0(CO1)
        FETCH   $0,0(CO2)
+#else
+       MUL     t11, ALPHA, t11                 #  TRMM path: scale accumulators by ALPHA; no C load in this branch
+       MUL     t12, ALPHA, t12
 
+       ST      t11,  0 * SIZE(CO1)
+       ST      t12,  0 * SIZE(CO2)
 
-.L0_N2_Loop:
-       daddu   BO,BO,SPANB                     #  BO+=KC*2N
-       move    B,BO                            #  Set B
+       daddu   CO1,CO1,1*SIZE                  #  CO1 += 1*SIZE
+       daddu   CO2,CO2,1*SIZE
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -1                  #  TEMP = KCO - KK - 1
+#else
+       daddiu  TEMP, TEMP, -2                  #  TEMP = KCO - KK - 2
+#endif
 
+       dsll    K,    TEMP, 0 + BASE_SHIFT      #  A byte skip = TEMP * 1 element
+       dsll    TEMP, TEMP, 1 + BASE_SHIFT      #  B byte skip = TEMP * 2 elements
+
+       daddu   A, A, K
+       daddu   B, B, TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 1                       #  KK advances by the tile height (1 row)
+#endif
+#endif
+
+.L0_N2_Loop:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       daddiu  KK, KK, 2                       #  KK advances by the panel width (2 columns)
+#endif
+       move    BO, B                           #  new panel base is taken from the current B (presumably already past this panel - verify)
 
 
        .align  5                                       
 
        move    CO1,C                           #  Set C        
        dsra    M,MCO,2                         #  M = MCO >> 2 (MCO/4)
-       
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+       move    KK, OFFSET                      #  restart KK from OFFSET for this panel
+#endif
+
        move    A,AO                            #  Reset A
        beqz    M,.L11_M2
        daddu   PREA,AO,SPANA
 
 
 .L70:                                          
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B, BO
+#else
+       dsll    K,    KK, 2 + BASE_SHIFT
+       dsll    TEMP, KK, 0 + BASE_SHIFT
+
+       daddu   AO, AO, K
+       daddu   B, BO,  TEMP
+#endif
+       gsLQC1(R9,F12,F8,0)
+       MTC             $0,t11
+       gsLQC1(R8,F1,F0,0)                      #a0,a1
+       MOV     t21,t11
+       gsLQC1(R8,F3,F2,1)                      #a2,a3
+       MOV     t31,t11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 4
+#else
+       daddiu  TEMP, KK, 1
+#endif
+       dsra    K,TEMP,2                                #  K=KCO/2
+       beqz    K,.L75
+       MOV     t41,t11
+#else
+       move    B, BO
        dsra    K,KCO,2                         #  K=KCO/2
        gsLQC1(R9,F12,F8,0)
        MTC             $0,t11
        MOV     t31,t11
        beqz    K,.L75
        MOV     t41,t11
+#endif
+
 
 .L71:                                                  #  N=1,M=K=4
        gsLQC1(R8,F5,F4,2)                      #  R8=A
 
 
 .L75:                                                  #  N=1 M=4 K=2
+#ifndef TRMMKERNEL
        and             K,KCO,2                         #  k = KCO&2
+#else
+       and             K,TEMP,2                        #  TRMM: k = TEMP&2 (TEMP set up at .L70)
+#endif
        beqz    K,.L78
        nop
 
 
        
 .L78:                                                  #  N=1, M=4, K=1
+#ifndef TRMMKERNEL
        and             K,KCO,1                         #  k = KCO&1
+#else
+       and             K,TEMP,1                        #  TRMM: k = TEMP&1
+#endif
        beqz    K,.L79                          #  skip the last single-k step when the bit is clear
        LD      ALPHA,152($sp)          #  Get ALPHA from the stack frame (offset 152)
        
 
 
 .L79:                                                  #  Write Back for the N=1, M=4 tile
+#ifndef TRMMKERNEL
        LD      c11,0(CO1)                      #  GEMM path: load existing C values
        LD      c21,1*SIZE(CO1)                 
        LD      c31,2*SIZE(CO1)
 
        daddu   CO1,CO1,4*SIZE                  #  CO1 += 4*SIZE
        bnez    M,.L70                          #  M!=0
-       move    B,BO                            #  Reset B
+       nop
+#else
+       daddiu  M,M,-1                          #  M--
+       MUL     t11, ALPHA, t11                 #  TRMM path: scale accumulators by ALPHA; no C load in this branch
+       MUL     t21, ALPHA, t21
+       MUL     t31, ALPHA, t31
+       MUL     t41, ALPHA, t41
+
+       ST      t11,0(CO1)
+       ST      t21,1*SIZE(CO1)
+       ST      t31,2*SIZE(CO1)
+       ST      t41,3*SIZE(CO1)
+
+       daddu   CO1,CO1,4*SIZE                  #  CO1 += 4*SIZE
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -4                  #  TEMP = KCO - KK - 4
+#else
+       daddiu  TEMP, TEMP, -1                  #  TEMP = KCO - KK - 1
+#endif
+
+       dsll    K,    TEMP, 2 + BASE_SHIFT      #  A byte skip = TEMP * 4 elements
+       dsll    TEMP, TEMP, 0 + BASE_SHIFT      #  B byte skip = TEMP * 1 element
+
+       daddu   A, A,K
+       daddu   B, B, TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 4                       #  KK advances by the tile height (4 rows)
+#endif
+       bnez    M,.L70                          #  M!=0
+       nop
+#endif
 
 
 
        nop
 
 .L80:                                          #  N=1, M=2 tile: position A/B, clear accumulators, compute K loop count
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B, BO
+#else
+       dsll    K,    KK, 1 + BASE_SHIFT        #  A byte offset = KK * 2 elements
+       dsll    TEMP, KK, 0 + BASE_SHIFT        #  B byte offset = KK * 1 element
+
+       daddu   A, A, K
+       daddu   B, BO,  TEMP
+#endif
+
+       gsLQC1(R9,F12,F8,0)
+       MTC             $0,t11
+       gsLQC1(R8,F1,F0,0)                      #a0,a1
+       MOV             t21,t11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 2                     #  TEMP = KK + 2
+#else
+       daddiu  TEMP, KK, 1                     #  TEMP = KK + 1
+#endif
+       dsra    K,TEMP,2                                #  K = TEMP >> 2 (TEMP/4)
+       beqz    K,.L85
+       nop
+#else
+       move    B, BO
        dsra    K,KCO,2                         #  K = KCO >> 2 (KCO/4)
        gsLQC1(R9,F12,F8,0)
        MTC             $0,t11
        MOV             t21,t11
        beqz    K,.L85
        nop
+#endif
 
 .L81:                                                  #  N=1,M=2,K=4
        gsLQC1(R8,F5,F4,1)                      #  R8=A
 
 
 .L85:                                                  #  N=1 M=2 K=2
+#ifndef TRMMKERNEL
        and             K,KCO,2                         #  k = KCO&2
+#else
+       andi    K,TEMP,2                                #  TRMM: k = TEMP&2 (TEMP set up at .L80)
+#endif
+
        beqz    K,.L88
        nop
 
 
        
 .L88:                                                  #  N=1, M=2, K=1
+#ifndef TRMMKERNEL
        and             K,KCO,1                         #  k = KCO&1
+#else
+       andi    K,TEMP,1                                #  TRMM: k = TEMP&1
+#endif
+
        beqz    K,.L89                          #  skip the last single-k step when the bit is clear
        LD      ALPHA,152($sp)          #  Get ALPHA from the stack frame (offset 152)
        
 
 
 .L89:                                                  #  Write Back for the N=1, M=2 tile
+#ifndef TRMMKERNEL
        LD      c11,0(CO1)                      #  GEMM path: load existing C values
        LD      c21,1*SIZE(CO1)                 
 
        FETCH   $0,2*SIZE(CO1)
        
        daddu   CO1,CO1,2*SIZE                  #  CO1 += 2*SIZE
-       move    B,BO                            #  Reset B
+#else
+       daddu   CO1,CO1,2*SIZE                  #  CO1 += 2*SIZE first; the stores below use negative offsets
+       MUL     t11, ALPHA, t11                 #  TRMM path: scale accumulators by ALPHA; no C load in this branch
+       MUL     t21, ALPHA, t21
+
+       ST      t11, -2 * SIZE(CO1)
+       ST      t21, -1 * SIZE(CO1)
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#ifdef LEFT
+       daddiu  TEMP, TEMP, -2                  #  TEMP = KCO - KK - 2
+#else
+       daddiu  TEMP, TEMP, -1                  #  TEMP = KCO - KK - 1
+#endif
+
+       dsll    K,    TEMP, 1 + BASE_SHIFT      #  A byte skip = TEMP * 2 elements
+       dsll    TEMP, TEMP, 0 + BASE_SHIFT      #  B byte skip = TEMP * 1 element
+
+       daddu   A, A, K
+       daddu   B, B, TEMP
+#endif
+
+#ifdef LEFT
+       daddiu  KK, KK, 2                       #  KK advances by the tile height (2 rows)
+#endif
+#endif
+
 
 
 .L11_M1:
        beqz    M,.L999                         #  M = 0, End
        nop
 
-.L90:                                          
+.L90:                                          #  N=1, M=1 tile: position A/B, clear accumulator, compute K loop count
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       move    B,  BO
+#else
+       dsll    K,    KK, 0 + BASE_SHIFT        #  A byte offset = KK * 1 element
+       dsll    TEMP, KK, 0 + BASE_SHIFT        #  B byte offset = KK * 1 element
+
+       daddu   A, A, K
+       daddu   B, BO,  TEMP
+#endif
+       gsLQC1(R8,F4,F0,0)
+       MTC             $0,t11
+       gsLQC1(R9,F12,F8,0)
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       dsubu   TEMP, KCO, KK                   #  TEMP = KCO - KK
+#elif defined(LEFT)
+       daddiu  TEMP, KK, 1                     #  TEMP = KK + 1
+#else
+       daddiu  TEMP, KK, 1                     #  TEMP = KK + 1 (same as the LEFT case for this tile)
+#endif
+       dsra    K,  TEMP, 2                     #  K = TEMP >> 2 (TEMP/4)
+       beqz    K,.L95
+       nop
+
+#else
+       move    B,  BO
        dsra    K,KCO,2                         #  K = KCO >> 2 (KCO/4)
        gsLQC1(R8,F4,F0,0)
        gsLQC1(R9,F12,F8,0)
        beqz    K,.L95
        MTC             $0,t11
+#endif
 
 .L91:                                                  #  N=1,M=1,K=4
        gsLQC1(R8,F6,F2,1)
        nop
 
 .L95:                                                  #  N=1 M=1 K=2
+#ifndef TRMMKERNEL
        and             K,KCO,2                         #  k = KCO&2
+#else
+       andi    K,TEMP,2                                #  TRMM: k = TEMP&2 (TEMP set up at .L90)
+#endif
        beqz    K,.L98
        nop
 
 
        
 .L98:                                                  #  N=1, M=1, K=1
+#ifndef TRMMKERNEL
        and             K,KCO,1                         #  k = KCO&1
+#else
+       andi    K,TEMP,1                                #  TRMM: k = TEMP&1
+#endif
        beqz    K,.L99                          #  skip the last single-k step when the bit is clear
        LD      ALPHA,152($sp)          #  Get ALPHA from the stack frame (offset 152)
        MADD    t11,t11,a0,b0
 
 
 .L99:                                                  #  Write Back for the N=1, M=1 tile
+#ifndef TRMMKERNEL
        LD      c11,0(CO1)                      #  GEMM path: load the existing C value
        MADD    t11,c11,t11,ALPHA               #  presumably t11 = c11 + t11*ALPHA - confirm MADD operand order in common.h
        ST      t11,0(CO1)
+#else
+       MUL     t11, ALPHA, t11                 #  TRMM path: scale by ALPHA; no C load in this branch
 
-
+       ST      t11,  0 * SIZE(CO1)
+#endif
 
 
 .L999:                                                 #  End