Appending gemmkernel and trmmkernel C code in kernel/generic, this code can be used...
authorWang Qian <wangqian10@iscas.ac.cn>
Tue, 10 Jan 2012 17:16:13 +0000 (17:16 +0000)
committerWang Qian <wangqian10@iscas.ac.cn>
Tue, 10 Jan 2012 17:16:13 +0000 (17:16 +0000)
kernel/Makefile.L3
kernel/generic/gemmkernel_2x2.c [new file with mode: 0644]
kernel/generic/trmmkernel_2x2.c [new file with mode: 0644]
kernel/generic/zgemmkernel_2x2.c [new file with mode: 0644]
kernel/generic/ztrmmkernel_2x2.c [new file with mode: 0644]
kernel/mips64/KERNEL.LOONGSON3B

index 4e331a4..4f419dc 100644 (file)
@@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
 $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
        $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
 
+ifeq ($(TARGET), LOONGSON3B)                                                                                            
+$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+
+$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+
+$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+
+$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+
+$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+
+$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+
+$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+
+$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+
+$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+else
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
 
@@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
 
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+endif
 
 $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c
new file mode 100644 (file)
index 0000000..3645ef1
--- /dev/null
@@ -0,0 +1,157 @@
+#include "common.h"
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
+#ifdef TRMMKERNEL
+               ,BLASLONG offset
+#endif
+               ) 
+{
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*ptrba,*ptrbb;
+   FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
+   for (j=0; j<bn/2; j+=1) 
+     {
+        C0 = C;
+        C1 = C0+ldc;
+        ptrba = ba;
+        for (i=0; i<bm/2; i+=1) 
+          {
+             ptrbb = bb;
+             res0 = 0;
+             res1 = 0;
+             res2 = 0;
+             res3 = 0;
+             for (k=0; k<bk/4; k+=1) 
+               {
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res2 = res2+load0*load3;
+                  res3 = res3+load2*load3;
+                  load4 = ptrba[2*1+0];
+                  load5 = ptrbb[2*1+0];
+                  res0 = res0+load4*load5;
+                  load6 = ptrba[2*1+1];
+                  res1 = res1+load6*load5;
+                  load7 = ptrbb[2*1+1];
+                  res2 = res2+load4*load7;
+                  res3 = res3+load6*load7;
+                  load0 = ptrba[2*2+0];
+                  load1 = ptrbb[2*2+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*2+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*2+1];
+                  res2 = res2+load0*load3;
+                  res3 = res3+load2*load3;
+                  load4 = ptrba[2*3+0];
+                  load5 = ptrbb[2*3+0];
+                  res0 = res0+load4*load5;
+                  load6 = ptrba[2*3+1];
+                  res1 = res1+load6*load5;
+                  load7 = ptrbb[2*3+1];
+                  res2 = res2+load4*load7;
+                  res3 = res3+load6*load7;
+                  ptrba = ptrba+8;
+                  ptrbb = ptrbb+8;
+               }
+             for (k=0; k<(bk&3); k+=1) 
+               {
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res2 = res2+load0*load3;
+                  res3 = res3+load2*load3;
+                  ptrba = ptrba+2;
+                  ptrbb = ptrbb+2;
+               }
+             res0 = res0*alpha;
+             C0[0] = C0[0]+res0;
+             res1 = res1*alpha;
+             C0[1] = C0[1]+res1;
+             res2 = res2*alpha;
+             C1[0] = C1[0]+res2;
+             res3 = res3*alpha;
+             C1[1] = C1[1]+res3;
+             C0 = C0+2;
+             C1 = C1+2;
+          }
+        for (i=0; i<(bm&1); i+=1) 
+          {
+             ptrbb = bb;
+             res0 = 0;
+             res1 = 0;
+             for (k=0; k<bk; k+=1) 
+               {
+                  load0 = ptrba[0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrbb[2*0+1];
+                  res1 = res1+load0*load2;
+                  ptrba = ptrba+1;
+                  ptrbb = ptrbb+2;
+               }
+             res0 = res0*alpha;
+             C0[0] = C0[0]+res0;
+             res1 = res1*alpha;
+             C1[0] = C1[0]+res1;
+             C0 = C0+1;
+             C1 = C1+1;
+          }
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+     }
+   for (j=0; j<(bn&1); j+=1) 
+     {
+        C0 = C;
+        ptrba = ba;
+        for (i=0; i<bm/2; i+=1) 
+          {
+             ptrbb = bb;
+             res0 = 0;
+             res1 = 0;
+             for (k=0; k<bk; k+=1) 
+               {
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  ptrba = ptrba+2;
+                  ptrbb = ptrbb+1;
+               }
+             res0 = res0*alpha;
+             C0[0] = C0[0]+res0;
+             res1 = res1*alpha;
+             C0[1] = C0[1]+res1;
+             C0 = C0+2;
+          }
+        for (i=0; i<(bm&1); i+=1) 
+          {
+             ptrbb = bb;
+             res0 = 0;
+             for (k=0; k<bk; k+=1) 
+               {
+                  load0 = ptrba[0+0];
+                  load1 = ptrbb[0+0];
+                  res0 = res0+load0*load1;
+                  ptrba = ptrba+1;
+                  ptrbb = ptrbb+1;
+               }
+             res0 = res0*alpha;
+             C0[0] = C0[0]+res0;
+             C0 = C0+1;
+          }
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+     }
+   return 0;
+}
diff --git a/kernel/generic/trmmkernel_2x2.c b/kernel/generic/trmmkernel_2x2.c
new file mode 100644 (file)
index 0000000..5b16806
--- /dev/null
@@ -0,0 +1,280 @@
+#include "common.h"
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
+#ifdef TRMMKERNEL
+               ,BLASLONG offset
+#endif
+               ) 
+{
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*ptrba,*ptrbb;
+   FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
+   BLASLONG off, temp;
+#if defined(TRMMKERNEL) && !defined(LEFT)
+   off = -offset; 
+#endif
+   for (j=0; j<bn/2; j+=1) 
+     {
+        C0 = C;
+        C1 = C0+ldc;
+#if defined(TRMMKERNEL) && defined(LEFT)
+               off = offset;
+#endif
+        ptrba = ba;
+        for (i=0; i<bm/2; i+=1) 
+          {
+#if (defined(LEFT) &&  defined(TRANSA)) || \
+                         (!defined(LEFT) && !defined(TRANSA))
+             ptrbb = bb;
+#else
+                         ptrba += off*2;
+                         ptrbb = bb + off*2;
+#endif
+             res0 = 0;
+             res1 = 0;
+             res2 = 0;
+             res3 = 0;
+#if (defined(LEFT) && !defined(TRANSA)) || \
+                        (!defined(LEFT) && defined(TRANSA))
+                        temp = bk-off;
+#elif defined(LEFT) 
+                        temp = off+2;
+#else
+                        temp = off+2;
+#endif
+             for (k=0; k<temp/4; k+=1) 
+               {
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res2 = res2+load0*load3;
+                  res3 = res3+load2*load3;
+                  load4 = ptrba[2*1+0];
+                  load5 = ptrbb[2*1+0];
+                  res0 = res0+load4*load5;
+                  load6 = ptrba[2*1+1];
+                  res1 = res1+load6*load5;
+                  load7 = ptrbb[2*1+1];
+                  res2 = res2+load4*load7;
+                  res3 = res3+load6*load7;
+                  load0 = ptrba[2*2+0];
+                  load1 = ptrbb[2*2+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*2+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*2+1];
+                  res2 = res2+load0*load3;
+                  res3 = res3+load2*load3;
+                  load4 = ptrba[2*3+0];
+                  load5 = ptrbb[2*3+0];
+                  res0 = res0+load4*load5;
+                  load6 = ptrba[2*3+1];
+                  res1 = res1+load6*load5;
+                  load7 = ptrbb[2*3+1];
+                  res2 = res2+load4*load7;
+                  res3 = res3+load6*load7;
+                  ptrba = ptrba+8;
+                  ptrbb = ptrbb+8;
+               }
+             for (k=0; k<(temp&3); k+=1) 
+               {
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res2 = res2+load0*load3;
+                  res3 = res3+load2*load3;
+                  ptrba = ptrba+2;
+                  ptrbb = ptrbb+2;
+               }
+             res0 = res0*alpha;
+             C0[0] = res0;
+             res1 = res1*alpha;
+             C0[1] = res1;
+             res2 = res2*alpha;
+             C1[0] = res2;
+             res3 = res3*alpha;
+             C1[1] = res3;
+#if ( defined(LEFT) && defined(TRANSA)) || \
+                        (!defined(LEFT) && !defined(TRANSA)) 
+                        temp = bk - off;
+#ifdef LEFT
+                        temp -= 2;
+#else 
+                        temp -= 2;
+#endif
+                        ptrba += temp*2;
+                        ptrbb += temp*2;
+#endif
+#ifdef LEFT
+                        off += 2;
+#endif
+             C0 = C0+2;
+             C1 = C1+2;
+          }
+        for (i=0; i<(bm&1); i+=1) 
+          {
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+             ptrbb = bb;
+#else 
+                        ptrba += off;
+                        ptrbb = bb+off*2;
+#endif
+             res0 = 0;
+             res1 = 0;
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+                        temp = bk-off;
+#elif defined(LEFT)
+                        temp = off+1;
+#else 
+                        temp = off+2;
+#endif
+             for (k=0; k<temp; k+=1) 
+               {
+                  load0 = ptrba[0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrbb[2*0+1];
+                  res1 = res1+load0*load2;
+                  ptrba = ptrba+1;
+                  ptrbb = ptrbb+2;
+               }
+             res0 = res0*alpha;
+             C0[0] = res0;
+             res1 = res1*alpha;
+             C1[0] = res1;
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+                        temp = bk-off;
+#ifdef LEFT
+                        temp -= 1;
+#else 
+                        temp -= 2;
+#endif
+                        ptrba += temp;
+                        ptrbb += temp*2;
+#endif
+#ifdef LEFT    
+                        off += 1;
+#endif
+             C0 = C0+1;
+             C1 = C1+1;
+          }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 2;
+#endif
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+     }
+   for (j=0; j<(bn&1); j+=1) 
+     {
+        C0 = C;
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+               off = offset;
+#endif
+        ptrba = ba;
+        for (i=0; i<bm/2; i+=1) 
+          {
+#if (defined(LEFT) &&  defined(TRANSA)) || \
+                         (!defined(LEFT) && !defined(TRANSA))
+                         ptrbb = bb;
+#else 
+                         ptrba += off*2;
+                         ptrbb = bb + off;
+#endif
+             res0 = 0;
+             res1 = 0;
+#if (defined(LEFT) && !defined(TRANSA)) || \
+                        (!defined(LEFT) && defined(TRANSA))
+                        temp = bk-off;
+#elif defined(LEFT)
+                        temp = off+2;
+#else
+                        temp = off+1;
+#endif
+                        for (k=0; k<temp; k+=1) 
+               {
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  ptrba = ptrba+2;
+                  ptrbb = ptrbb+1;
+               }
+             res0 = res0*alpha;
+             C0[0] = res0;
+             res1 = res1*alpha;
+                        C0[1] = res1;
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+                        (!defined(LEFT) && !defined(TRANSA))
+                        temp = bk - off;
+#ifdef LEFT
+                        temp -= 2;
+#else 
+                        temp -= 1;
+#endif
+                        ptrba += temp*2;
+                        ptrbb += temp;
+#endif
+#ifdef LEFT
+                        off += 2;
+#endif
+
+             C0 = C0+2;
+          }
+        for (i=0; i<(bm&1); i+=1) 
+          {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+             ptrbb = bb;
+#else 
+                        ptrba += off;
+                        ptrbb = bb+off;
+#endif
+             res0 = 0;
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+                        temp = bk-off;
+#elif defined(LEFT)
+                        temp = off + 1;
+#else 
+                        temp = off + 1;
+#endif
+                        for (k=0; k<temp; k+=1) 
+               {
+                  load0 = ptrba[0+0];
+                  load1 = ptrbb[0+0];
+                  res0 = res0+load0*load1;
+                  ptrba = ptrba+1;
+                  ptrbb = ptrbb+1;
+               }
+             res0 = res0*alpha;
+             C0[0] = res0;
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+                        temp = bk-off;
+#ifdef LEFT
+                        temp -= 1;
+#else 
+                        temp -= 1;
+#endif
+                        ptrba += temp;
+                        ptrbb += temp;
+#endif
+#ifdef LEFT
+                        off += 1;
+#endif
+             C0 = C0+1;
+          }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 1;
+#endif
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+     }
+   return 0;
+}
diff --git a/kernel/generic/zgemmkernel_2x2.c b/kernel/generic/zgemmkernel_2x2.c
new file mode 100644 (file)
index 0000000..cb2a26e
--- /dev/null
@@ -0,0 +1,838 @@
+#include "common.h"
+/********************************
+  ADD1 a*c
+  ADD2 b*c
+  ADD3 a*d
+  ADD4 b*d
+*********************************/
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
+#ifdef TRMMKERNEL
+               , BLASLONG offset
+#endif
+               )
+{
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*ptrba,*ptrbb;
+   FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
+   for (j=0; j<bn/2; j+=1) 
+     {
+        C0 = C;
+        C1 = C0+2*ldc;
+        ptrba = ba;
+        for (i=0; i<bm/2; i+=1) 
+          {
+             ptrbb = bb;
+             res0 = 0;
+             res1 = 0;
+             res2 = 0;
+             res3 = 0;
+             res4 = 0;
+             res5 = 0;
+             res6 = 0;
+             res7 = 0;
+             for (k=0; k<bk/4; k+=1) 
+               {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3+load5*load1;
+                  res2 = res2-load5*load3;
+                  res3 = res3+load4*load3;
+                  load6 = ptrbb[4*0+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5+load2*load6;
+                  load7 = ptrbb[4*0+3];
+                  res4 = res4-load2*load7;
+                  res5 = res5+load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7+load5*load6;
+                  res6 = res6-load5*load7;
+                  res7 = res7+load4*load7;
+                  load8 = ptrba[4*1+0];
+                  load9 = ptrbb[4*1+0];
+                  res0 = res0+load8*load9;
+                  load10 = ptrba[4*1+1];
+                  res1 = res1+load10*load9;
+                  load11 = ptrbb[4*1+1];
+                  res0 = res0-load10*load11;
+                  res1 = res1+load8*load11;
+                  load12 = ptrba[4*1+2];
+                  res2 = res2+load12*load9;
+                  load13 = ptrba[4*1+3];
+                  res3 = res3+load13*load9;
+                  res2 = res2-load13*load11;
+                  res3 = res3+load12*load11;
+                  load14 = ptrbb[4*1+2];
+                  res4 = res4+load8*load14;
+                  res5 = res5+load10*load14;
+                  load15 = ptrbb[4*1+3];
+                  res4 = res4-load10*load15;
+                  res5 = res5+load8*load15;
+                  res6 = res6+load12*load14;
+                  res7 = res7+load13*load14;
+                  res6 = res6-load13*load15;
+                  res7 = res7+load12*load15;
+                  load0 = ptrba[4*2+0];
+                  load1 = ptrbb[4*2+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*2+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[4*2+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrba[4*2+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*2+3];
+                  res3 = res3+load5*load1;
+                  res2 = res2-load5*load3;
+                  res3 = res3+load4*load3;
+                  load6 = ptrbb[4*2+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5+load2*load6;
+                  load7 = ptrbb[4*2+3];
+                  res4 = res4-load2*load7;
+                  res5 = res5+load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7+load5*load6;
+                  res6 = res6-load5*load7;
+                  res7 = res7+load4*load7;
+                  load8 = ptrba[4*3+0];
+                  load9 = ptrbb[4*3+0];
+                  res0 = res0+load8*load9;
+                  load10 = ptrba[4*3+1];
+                  res1 = res1+load10*load9;
+                  load11 = ptrbb[4*3+1];
+                  res0 = res0-load10*load11;
+                  res1 = res1+load8*load11;
+                  load12 = ptrba[4*3+2];
+                  res2 = res2+load12*load9;
+                  load13 = ptrba[4*3+3];
+                  res3 = res3+load13*load9;
+                  res2 = res2-load13*load11;
+                  res3 = res3+load12*load11;
+                  load14 = ptrbb[4*3+2];
+                  res4 = res4+load8*load14;
+                  res5 = res5+load10*load14;
+                  load15 = ptrbb[4*3+3];
+                  res4 = res4-load10*load15;
+                  res5 = res5+load8*load15;
+                  res6 = res6+load12*load14;
+                  res7 = res7+load13*load14;
+                  res6 = res6-load13*load15;
+                  res7 = res7+load12*load15;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3+load5*load1;
+                  res2 = res2+load5*load3;
+                  res3 = res3-load4*load3;
+                  load6 = ptrbb[4*0+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5+load2*load6;
+                  load7 = ptrbb[4*0+3];
+                  res4 = res4+load2*load7;
+                  res5 = res5-load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7+load5*load6;
+                  res6 = res6+load5*load7;
+                  res7 = res7-load4*load7;
+                  load8 = ptrba[4*1+0];
+                  load9 = ptrbb[4*1+0];
+                  res0 = res0+load8*load9;
+                  load10 = ptrba[4*1+1];
+                  res1 = res1+load10*load9;
+                  load11 = ptrbb[4*1+1];
+                  res0 = res0+load10*load11;
+                  res1 = res1-load8*load11;
+                  load12 = ptrba[4*1+2];
+                  res2 = res2+load12*load9;
+                  load13 = ptrba[4*1+3];
+                  res3 = res3+load13*load9;
+                  res2 = res2+load13*load11;
+                  res3 = res3-load12*load11;
+                  load14 = ptrbb[4*1+2];
+                  res4 = res4+load8*load14;
+                  res5 = res5+load10*load14;
+                  load15 = ptrbb[4*1+3];
+                  res4 = res4+load10*load15;
+                  res5 = res5-load8*load15;
+                  res6 = res6+load12*load14;
+                  res7 = res7+load13*load14;
+                  res6 = res6+load13*load15;
+                  res7 = res7-load12*load15;
+                  load0 = ptrba[4*2+0];
+                  load1 = ptrbb[4*2+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*2+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[4*2+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrba[4*2+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*2+3];
+                  res3 = res3+load5*load1;
+                  res2 = res2+load5*load3;
+                  res3 = res3-load4*load3;
+                  load6 = ptrbb[4*2+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5+load2*load6;
+                  load7 = ptrbb[4*2+3];
+                  res4 = res4+load2*load7;
+                  res5 = res5-load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7+load5*load6;
+                  res6 = res6+load5*load7;
+                  res7 = res7-load4*load7;
+                  load8 = ptrba[4*3+0];
+                  load9 = ptrbb[4*3+0];
+                  res0 = res0+load8*load9;
+                  load10 = ptrba[4*3+1];
+                  res1 = res1+load10*load9;
+                  load11 = ptrbb[4*3+1];
+                  res0 = res0+load10*load11;
+                  res1 = res1-load8*load11;
+                  load12 = ptrba[4*3+2];
+                  res2 = res2+load12*load9;
+                  load13 = ptrba[4*3+3];
+                  res3 = res3+load13*load9;
+                  res2 = res2+load13*load11;
+                  res3 = res3-load12*load11;
+                  load14 = ptrbb[4*3+2];
+                  res4 = res4+load8*load14;
+                  res5 = res5+load10*load14;
+                  load15 = ptrbb[4*3+3];
+                  res4 = res4+load10*load15;
+                  res5 = res5-load8*load15;
+                  res6 = res6+load12*load14;
+                  res7 = res7+load13*load14;
+                  res6 = res6+load13*load15;
+                  res7 = res7-load12*load15;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3-load5*load1;
+                  res2 = res2+load5*load3;
+                  res3 = res3+load4*load3;
+                  load6 = ptrbb[4*0+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5-load2*load6;
+                  load7 = ptrbb[4*0+3];
+                  res4 = res4+load2*load7;
+                  res5 = res5+load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7-load5*load6;
+                  res6 = res6+load5*load7;
+                  res7 = res7+load4*load7;
+                  load8 = ptrba[4*1+0];
+                  load9 = ptrbb[4*1+0];
+                  res0 = res0+load8*load9;
+                  load10 = ptrba[4*1+1];
+                  res1 = res1-load10*load9;
+                  load11 = ptrbb[4*1+1];
+                  res0 = res0+load10*load11;
+                  res1 = res1+load8*load11;
+                  load12 = ptrba[4*1+2];
+                  res2 = res2+load12*load9;
+                  load13 = ptrba[4*1+3];
+                  res3 = res3-load13*load9;
+                  res2 = res2+load13*load11;
+                  res3 = res3+load12*load11;
+                  load14 = ptrbb[4*1+2];
+                  res4 = res4+load8*load14;
+                  res5 = res5-load10*load14;
+                  load15 = ptrbb[4*1+3];
+                  res4 = res4+load10*load15;
+                  res5 = res5+load8*load15;
+                  res6 = res6+load12*load14;
+                  res7 = res7-load13*load14;
+                  res6 = res6+load13*load15;
+                  res7 = res7+load12*load15;
+                  load0 = ptrba[4*2+0];
+                  load1 = ptrbb[4*2+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*2+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[4*2+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrba[4*2+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*2+3];
+                  res3 = res3-load5*load1;
+                  res2 = res2+load5*load3;
+                  res3 = res3+load4*load3;
+                  load6 = ptrbb[4*2+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5-load2*load6;
+                  load7 = ptrbb[4*2+3];
+                  res4 = res4+load2*load7;
+                  res5 = res5+load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7-load5*load6;
+                  res6 = res6+load5*load7;
+                  res7 = res7+load4*load7;
+                  load8 = ptrba[4*3+0];
+                  load9 = ptrbb[4*3+0];
+                  res0 = res0+load8*load9;
+                  load10 = ptrba[4*3+1];
+                  res1 = res1-load10*load9;
+                  load11 = ptrbb[4*3+1];
+                  res0 = res0+load10*load11;
+                  res1 = res1+load8*load11;
+                  load12 = ptrba[4*3+2];
+                  res2 = res2+load12*load9;
+                  load13 = ptrba[4*3+3];
+                  res3 = res3-load13*load9;
+                  res2 = res2+load13*load11;
+                  res3 = res3+load12*load11;
+                  load14 = ptrbb[4*3+2];
+                  res4 = res4+load8*load14;
+                  res5 = res5-load10*load14;
+                  load15 = ptrbb[4*3+3];
+                  res4 = res4+load10*load15;
+                  res5 = res5+load8*load15;
+                  res6 = res6+load12*load14;
+                  res7 = res7-load13*load14;
+                  res6 = res6+load13*load15;
+                  res7 = res7+load12*load15;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3-load5*load1;
+                  res2 = res2-load5*load3;
+                  res3 = res3-load4*load3;
+                  load6 = ptrbb[4*0+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5-load2*load6;
+                  load7 = ptrbb[4*0+3];
+                  res4 = res4-load2*load7;
+                  res5 = res5-load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7-load5*load6;
+                  res6 = res6-load5*load7;
+                  res7 = res7-load4*load7;
+                  load8 = ptrba[4*1+0];
+                  load9 = ptrbb[4*1+0];
+                  res0 = res0+load8*load9;
+                  load10 = ptrba[4*1+1];
+                  res1 = res1-load10*load9;
+                  load11 = ptrbb[4*1+1];
+                  res0 = res0-load10*load11;
+                  res1 = res1-load8*load11;
+                  load12 = ptrba[4*1+2];
+                  res2 = res2+load12*load9;
+                  load13 = ptrba[4*1+3];
+                  res3 = res3-load13*load9;
+                  res2 = res2-load13*load11;
+                  res3 = res3-load12*load11;
+                  load14 = ptrbb[4*1+2];
+                  res4 = res4+load8*load14;
+                  res5 = res5-load10*load14;
+                  load15 = ptrbb[4*1+3];
+                  res4 = res4-load10*load15;
+                  res5 = res5-load8*load15;
+                  res6 = res6+load12*load14;
+                  res7 = res7-load13*load14;
+                  res6 = res6-load13*load15;
+                  res7 = res7-load12*load15;
+                  load0 = ptrba[4*2+0];
+                  load1 = ptrbb[4*2+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*2+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[4*2+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrba[4*2+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*2+3];
+                  res3 = res3-load5*load1;
+                  res2 = res2-load5*load3;
+                  res3 = res3-load4*load3;
+                  load6 = ptrbb[4*2+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5-load2*load6;
+                  load7 = ptrbb[4*2+3];
+                  res4 = res4-load2*load7;
+                  res5 = res5-load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7-load5*load6;
+                  res6 = res6-load5*load7;
+                  res7 = res7-load4*load7;
+                  load8 = ptrba[4*3+0];
+                  load9 = ptrbb[4*3+0];
+                  res0 = res0+load8*load9;
+                  load10 = ptrba[4*3+1];
+                  res1 = res1-load10*load9;
+                  load11 = ptrbb[4*3+1];
+                  res0 = res0-load10*load11;
+                  res1 = res1-load8*load11;
+                  load12 = ptrba[4*3+2];
+                  res2 = res2+load12*load9;
+                  load13 = ptrba[4*3+3];
+                  res3 = res3-load13*load9;
+                  res2 = res2-load13*load11;
+                  res3 = res3-load12*load11;
+                  load14 = ptrbb[4*3+2];
+                  res4 = res4+load8*load14;
+                  res5 = res5-load10*load14;
+                  load15 = ptrbb[4*3+3];
+                  res4 = res4-load10*load15;
+                  res5 = res5-load8*load15;
+                  res6 = res6+load12*load14;
+                  res7 = res7-load13*load14;
+                  res6 = res6-load13*load15;
+                  res7 = res7-load12*load15;
+#endif
+                  ptrba = ptrba+16;
+                  ptrbb = ptrbb+16;
+               }
+             for (k=0; k<(bk&3); k+=1) 
+               {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3+load5*load1;
+                  res2 = res2-load5*load3;
+                  res3 = res3+load4*load3;
+                  load6 = ptrbb[4*0+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5+load2*load6;
+                  load7 = ptrbb[4*0+3];
+                  res4 = res4-load2*load7;
+                  res5 = res5+load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7+load5*load6;
+                  res6 = res6-load5*load7;
+                  res7 = res7+load4*load7;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3+load5*load1;
+                  res2 = res2+load5*load3;
+                  res3 = res3-load4*load3;
+                  load6 = ptrbb[4*0+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5+load2*load6;
+                  load7 = ptrbb[4*0+3];
+                  res4 = res4+load2*load7;
+                  res5 = res5-load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7+load5*load6;
+                  res6 = res6+load5*load7;
+                  res7 = res7-load4*load7;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3-load5*load1;
+                  res2 = res2+load5*load3;
+                  res3 = res3+load4*load3;
+                  load6 = ptrbb[4*0+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5-load2*load6;
+                  load7 = ptrbb[4*0+3];
+                  res4 = res4+load2*load7;
+                  res5 = res5+load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7-load5*load6;
+                  res6 = res6+load5*load7;
+                  res7 = res7+load4*load7;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3-load5*load1;
+                  res2 = res2-load5*load3;
+                  res3 = res3-load4*load3;
+                  load6 = ptrbb[4*0+2];
+                  res4 = res4+load0*load6;
+                  res5 = res5-load2*load6;
+                  load7 = ptrbb[4*0+3];
+                  res4 = res4-load2*load7;
+                  res5 = res5-load0*load7;
+                  res6 = res6+load4*load6;
+                  res7 = res7-load5*load6;
+                  res6 = res6-load5*load7;
+                  res7 = res7-load4*load7;
+#endif
+                  ptrba = ptrba+4;
+                  ptrbb = ptrbb+4;
+               }
+             load0 = res0*alphar;
+             C0[0] = C0[0]+load0;
+             load1 = res1*alphar;
+             C0[1] = C0[1]+load1;
+             load0 = res1*alphai;
+             C0[0] = C0[0]-load0;
+             load1 = res0*alphai;
+             C0[1] = C0[1]+load1;
+             load2 = res2*alphar;
+             C0[2] = C0[2]+load2;
+             load3 = res3*alphar;
+             C0[3] = C0[3]+load3;
+             load2 = res3*alphai;
+             C0[2] = C0[2]-load2;
+             load3 = res2*alphai;
+             C0[3] = C0[3]+load3;
+             load4 = res4*alphar;
+             C1[0] = C1[0]+load4;
+             load5 = res5*alphar;
+             C1[1] = C1[1]+load5;
+             load4 = res5*alphai;
+             C1[0] = C1[0]-load4;
+             load5 = res4*alphai;
+             C1[1] = C1[1]+load5;
+             load6 = res6*alphar;
+             C1[2] = C1[2]+load6;
+             load7 = res7*alphar;
+             C1[3] = C1[3]+load7;
+             load6 = res7*alphai;
+             C1[2] = C1[2]-load6;
+             load7 = res6*alphai;
+             C1[3] = C1[3]+load7;
+             C0 = C0+4;
+             C1 = C1+4;
+          }
+        for (i=0; i<(bm&1); i+=1) 
+          {
+             ptrbb = bb;
+             res0 = 0;
+             res1 = 0;
+             res2 = 0;
+             res3 = 0;
+             for (k=0; k<bk; k+=1) 
+               {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrbb[4*0+2];
+                  res2 = res2+load0*load4;
+                  res3 = res3+load2*load4;
+                  load5 = ptrbb[4*0+3];
+                  res2 = res2-load2*load5;
+                  res3 = res3+load0*load5;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                                 load0 = ptrba[2*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrbb[4*0+2];
+                  res2 = res2+load0*load4;
+                  res3 = res3+load2*load4;
+                  load5 = ptrbb[4*0+3];
+                  res2 = res2+load2*load5;
+                  res3 = res3-load0*load5;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrbb[4*0+2];
+                  res2 = res2+load0*load4;
+                  res3 = res3-load2*load4;
+                  load5 = ptrbb[4*0+3];
+                  res2 = res2+load2*load5;
+                  res3 = res3+load0*load5;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[4*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[4*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrbb[4*0+2];
+                  res2 = res2+load0*load4;
+                  res3 = res3-load2*load4;
+                  load5 = ptrbb[4*0+3];
+                  res2 = res2-load2*load5;
+                  res3 = res3-load0*load5;
+#endif
+                  ptrba = ptrba+2;
+                  ptrbb = ptrbb+4;
+               }
+             load0 = res0*alphar;
+             C0[0] = C0[0]+load0;
+             load1 = res1*alphar;
+             C0[1] = C0[1]+load1;
+             load0 = res1*alphai;
+             C0[0] = C0[0]-load0;
+             load1 = res0*alphai;
+             C0[1] = C0[1]+load1;
+             load2 = res2*alphar;
+             C1[0] = C1[0]+load2;
+             load3 = res3*alphar;
+             C1[1] = C1[1]+load3;
+             load2 = res3*alphai;
+             C1[0] = C1[0]-load2;
+             load3 = res2*alphai;
+             C1[1] = C1[1]+load3;
+             C0 = C0+2;
+             C1 = C1+2;
+          }
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+     }
+   for (j=0; j<(bn&1); j+=1) 
+     {
+        C0 = C;
+        ptrba = ba;
+        for (i=0; i<bm/2; i+=1) 
+          {
+             ptrbb = bb;
+             res0 = 0;
+             res1 = 0;
+             res2 = 0;
+             res3 = 0;
+             for (k=0; k<bk; k+=1) 
+               {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3+load5*load1;
+                  res2 = res2-load5*load3;
+                  res3 = res3+load4*load3;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3+load5*load1;
+                  res2 = res2+load5*load3;
+                  res3 = res3-load4*load3;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1+load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3-load5*load1;
+                  res2 = res2+load5*load3;
+                  res3 = res3+load4*load3;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                  load0 = ptrba[4*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[4*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1-load0*load3;
+                  load4 = ptrba[4*0+2];
+                  res2 = res2+load4*load1;
+                  load5 = ptrba[4*0+3];
+                  res3 = res3-load5*load1;
+                  res2 = res2-load5*load3;
+                  res3 = res3-load4*load3;
+#endif
+                  ptrba = ptrba+4;
+                  ptrbb = ptrbb+2;
+               }
+             load0 = res0*alphar;
+             C0[0] = C0[0]+load0;
+             load1 = res1*alphar;
+             C0[1] = C0[1]+load1;
+             load0 = res1*alphai;
+             C0[0] = C0[0]-load0;
+             load1 = res0*alphai;
+             C0[1] = C0[1]+load1;
+             load2 = res2*alphar;
+             C0[2] = C0[2]+load2;
+             load3 = res3*alphar;
+             C0[3] = C0[3]+load3;
+             load2 = res3*alphai;
+             C0[2] = C0[2]-load2;
+             load3 = res2*alphai;
+             C0[3] = C0[3]+load3;
+             C0 = C0+4;
+          }
+        for (i=0; i<(bm&1); i+=1) 
+          {
+             ptrbb = bb;
+             res0 = 0;
+             res1 = 0;
+             for (k=0; k<bk; k+=1) 
+               {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1+load0*load3;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1+load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1-load0*load3;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                                 load0 = ptrba[2*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res0 = res0+load2*load3;
+                  res1 = res1+load0*load3;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                  load0 = ptrba[2*0+0];
+                  load1 = ptrbb[2*0+0];
+                  res0 = res0+load0*load1;
+                  load2 = ptrba[2*0+1];
+                  res1 = res1-load2*load1;
+                  load3 = ptrbb[2*0+1];
+                  res0 = res0-load2*load3;
+                  res1 = res1-load0*load3;
+#endif
+                  ptrba = ptrba+2;
+                  ptrbb = ptrbb+2;
+               }
+             load0 = res0*alphar;
+             C0[0] = C0[0]+load0;
+             load1 = res1*alphar;
+             C0[1] = C0[1]+load1;
+             load0 = res1*alphai;
+             C0[0] = C0[0]-load0;
+             load1 = res0*alphai;
+             C0[1] = C0[1]+load1;
+             C0 = C0+2;
+          }
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+     }
+   return 0;
+}
diff --git a/kernel/generic/ztrmmkernel_2x2.c b/kernel/generic/ztrmmkernel_2x2.c
new file mode 100644 (file)
index 0000000..b7c6539
--- /dev/null
@@ -0,0 +1,923 @@
+#include "common.h"
+/********************************
+  ADD1 a*c
+  ADD2 b*c
+  ADD3 a*d
+  ADD4 b*d
+ *********************************/
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
+               FLOAT* C,BLASLONG ldc, BLASLONG offset)
+{
+       BLASLONG i,j,k;
+       FLOAT *C0,*C1,*ptrba,*ptrbb;
+       FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
+       BLASLONG off, temp;
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       off = -offset;
+#endif
+       for (j=0; j<bn/2; j+=1) 
+       {
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+               off = offset;
+#endif
+               C0 = C;
+               C1 = C0+2*ldc;
+               ptrba = ba;
+               for (i=0; i<bm/2; i+=1) 
+               {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+                       ptrbb = bb;
+#else
+                       ptrba += off*2*2;
+                       ptrbb = bb+off*2*2;
+#endif
+                       res0 = 0;
+                       res1 = 0;
+                       res2 = 0;
+                       res3 = 0;
+                       res4 = 0;
+                       res5 = 0;
+                       res6 = 0;
+                       res7 = 0;
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+                       temp = bk - off;
+#elif defined(LEFT)
+                       temp = off + 2;
+#else 
+                       temp = off + 2;
+#endif
+                       for (k=0; k<temp/4; k+=1) 
+                       {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3+load5*load1;
+                               res2 = res2-load5*load3;
+                               res3 = res3+load4*load3;
+                               load6 = ptrbb[4*0+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5+load2*load6;
+                               load7 = ptrbb[4*0+3];
+                               res4 = res4-load2*load7;
+                               res5 = res5+load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7+load5*load6;
+                               res6 = res6-load5*load7;
+                               res7 = res7+load4*load7;
+                               load8 = ptrba[4*1+0];
+                               load9 = ptrbb[4*1+0];
+                               res0 = res0+load8*load9;
+                               load10 = ptrba[4*1+1];
+                               res1 = res1+load10*load9;
+                               load11 = ptrbb[4*1+1];
+                               res0 = res0-load10*load11;
+                               res1 = res1+load8*load11;
+                               load12 = ptrba[4*1+2];
+                               res2 = res2+load12*load9;
+                               load13 = ptrba[4*1+3];
+                               res3 = res3+load13*load9;
+                               res2 = res2-load13*load11;
+                               res3 = res3+load12*load11;
+                               load14 = ptrbb[4*1+2];
+                               res4 = res4+load8*load14;
+                               res5 = res5+load10*load14;
+                               load15 = ptrbb[4*1+3];
+                               res4 = res4-load10*load15;
+                               res5 = res5+load8*load15;
+                               res6 = res6+load12*load14;
+                               res7 = res7+load13*load14;
+                               res6 = res6-load13*load15;
+                               res7 = res7+load12*load15;
+                               load0 = ptrba[4*2+0];
+                               load1 = ptrbb[4*2+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*2+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[4*2+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrba[4*2+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*2+3];
+                               res3 = res3+load5*load1;
+                               res2 = res2-load5*load3;
+                               res3 = res3+load4*load3;
+                               load6 = ptrbb[4*2+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5+load2*load6;
+                               load7 = ptrbb[4*2+3];
+                               res4 = res4-load2*load7;
+                               res5 = res5+load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7+load5*load6;
+                               res6 = res6-load5*load7;
+                               res7 = res7+load4*load7;
+                               load8 = ptrba[4*3+0];
+                               load9 = ptrbb[4*3+0];
+                               res0 = res0+load8*load9;
+                               load10 = ptrba[4*3+1];
+                               res1 = res1+load10*load9;
+                               load11 = ptrbb[4*3+1];
+                               res0 = res0-load10*load11;
+                               res1 = res1+load8*load11;
+                               load12 = ptrba[4*3+2];
+                               res2 = res2+load12*load9;
+                               load13 = ptrba[4*3+3];
+                               res3 = res3+load13*load9;
+                               res2 = res2-load13*load11;
+                               res3 = res3+load12*load11;
+                               load14 = ptrbb[4*3+2];
+                               res4 = res4+load8*load14;
+                               res5 = res5+load10*load14;
+                               load15 = ptrbb[4*3+3];
+                               res4 = res4-load10*load15;
+                               res5 = res5+load8*load15;
+                               res6 = res6+load12*load14;
+                               res7 = res7+load13*load14;
+                               res6 = res6-load13*load15;
+                               res7 = res7+load12*load15;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3+load5*load1;
+                               res2 = res2+load5*load3;
+                               res3 = res3-load4*load3;
+                               load6 = ptrbb[4*0+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5+load2*load6;
+                               load7 = ptrbb[4*0+3];
+                               res4 = res4+load2*load7;
+                               res5 = res5-load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7+load5*load6;
+                               res6 = res6+load5*load7;
+                               res7 = res7-load4*load7;
+                               load8 = ptrba[4*1+0];
+                               load9 = ptrbb[4*1+0];
+                               res0 = res0+load8*load9;
+                               load10 = ptrba[4*1+1];
+                               res1 = res1+load10*load9;
+                               load11 = ptrbb[4*1+1];
+                               res0 = res0+load10*load11;
+                               res1 = res1-load8*load11;
+                               load12 = ptrba[4*1+2];
+                               res2 = res2+load12*load9;
+                               load13 = ptrba[4*1+3];
+                               res3 = res3+load13*load9;
+                               res2 = res2+load13*load11;
+                               res3 = res3-load12*load11;
+                               load14 = ptrbb[4*1+2];
+                               res4 = res4+load8*load14;
+                               res5 = res5+load10*load14;
+                               load15 = ptrbb[4*1+3];
+                               res4 = res4+load10*load15;
+                               res5 = res5-load8*load15;
+                               res6 = res6+load12*load14;
+                               res7 = res7+load13*load14;
+                               res6 = res6+load13*load15;
+                               res7 = res7-load12*load15;
+                               load0 = ptrba[4*2+0];
+                               load1 = ptrbb[4*2+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*2+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[4*2+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrba[4*2+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*2+3];
+                               res3 = res3+load5*load1;
+                               res2 = res2+load5*load3;
+                               res3 = res3-load4*load3;
+                               load6 = ptrbb[4*2+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5+load2*load6;
+                               load7 = ptrbb[4*2+3];
+                               res4 = res4+load2*load7;
+                               res5 = res5-load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7+load5*load6;
+                               res6 = res6+load5*load7;
+                               res7 = res7-load4*load7;
+                               load8 = ptrba[4*3+0];
+                               load9 = ptrbb[4*3+0];
+                               res0 = res0+load8*load9;
+                               load10 = ptrba[4*3+1];
+                               res1 = res1+load10*load9;
+                               load11 = ptrbb[4*3+1];
+                               res0 = res0+load10*load11;
+                               res1 = res1-load8*load11;
+                               load12 = ptrba[4*3+2];
+                               res2 = res2+load12*load9;
+                               load13 = ptrba[4*3+3];
+                               res3 = res3+load13*load9;
+                               res2 = res2+load13*load11;
+                               res3 = res3-load12*load11;
+                               load14 = ptrbb[4*3+2];
+                               res4 = res4+load8*load14;
+                               res5 = res5+load10*load14;
+                               load15 = ptrbb[4*3+3];
+                               res4 = res4+load10*load15;
+                               res5 = res5-load8*load15;
+                               res6 = res6+load12*load14;
+                               res7 = res7+load13*load14;
+                               res6 = res6+load13*load15;
+                               res7 = res7-load12*load15;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3-load5*load1;
+                               res2 = res2+load5*load3;
+                               res3 = res3+load4*load3;
+                               load6 = ptrbb[4*0+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5-load2*load6;
+                               load7 = ptrbb[4*0+3];
+                               res4 = res4+load2*load7;
+                               res5 = res5+load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7-load5*load6;
+                               res6 = res6+load5*load7;
+                               res7 = res7+load4*load7;
+                               load8 = ptrba[4*1+0];
+                               load9 = ptrbb[4*1+0];
+                               res0 = res0+load8*load9;
+                               load10 = ptrba[4*1+1];
+                               res1 = res1-load10*load9;
+                               load11 = ptrbb[4*1+1];
+                               res0 = res0+load10*load11;
+                               res1 = res1+load8*load11;
+                               load12 = ptrba[4*1+2];
+                               res2 = res2+load12*load9;
+                               load13 = ptrba[4*1+3];
+                               res3 = res3-load13*load9;
+                               res2 = res2+load13*load11;
+                               res3 = res3+load12*load11;
+                               load14 = ptrbb[4*1+2];
+                               res4 = res4+load8*load14;
+                               res5 = res5-load10*load14;
+                               load15 = ptrbb[4*1+3];
+                               res4 = res4+load10*load15;
+                               res5 = res5+load8*load15;
+                               res6 = res6+load12*load14;
+                               res7 = res7-load13*load14;
+                               res6 = res6+load13*load15;
+                               res7 = res7+load12*load15;
+                               load0 = ptrba[4*2+0];
+                               load1 = ptrbb[4*2+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*2+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[4*2+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrba[4*2+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*2+3];
+                               res3 = res3-load5*load1;
+                               res2 = res2+load5*load3;
+                               res3 = res3+load4*load3;
+                               load6 = ptrbb[4*2+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5-load2*load6;
+                               load7 = ptrbb[4*2+3];
+                               res4 = res4+load2*load7;
+                               res5 = res5+load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7-load5*load6;
+                               res6 = res6+load5*load7;
+                               res7 = res7+load4*load7;
+                               load8 = ptrba[4*3+0];
+                               load9 = ptrbb[4*3+0];
+                               res0 = res0+load8*load9;
+                               load10 = ptrba[4*3+1];
+                               res1 = res1-load10*load9;
+                               load11 = ptrbb[4*3+1];
+                               res0 = res0+load10*load11;
+                               res1 = res1+load8*load11;
+                               load12 = ptrba[4*3+2];
+                               res2 = res2+load12*load9;
+                               load13 = ptrba[4*3+3];
+                               res3 = res3-load13*load9;
+                               res2 = res2+load13*load11;
+                               res3 = res3+load12*load11;
+                               load14 = ptrbb[4*3+2];
+                               res4 = res4+load8*load14;
+                               res5 = res5-load10*load14;
+                               load15 = ptrbb[4*3+3];
+                               res4 = res4+load10*load15;
+                               res5 = res5+load8*load15;
+                               res6 = res6+load12*load14;
+                               res7 = res7-load13*load14;
+                               res6 = res6+load13*load15;
+                               res7 = res7+load12*load15;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3-load5*load1;
+                               res2 = res2-load5*load3;
+                               res3 = res3-load4*load3;
+                               load6 = ptrbb[4*0+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5-load2*load6;
+                               load7 = ptrbb[4*0+3];
+                               res4 = res4-load2*load7;
+                               res5 = res5-load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7-load5*load6;
+                               res6 = res6-load5*load7;
+                               res7 = res7-load4*load7;
+                               load8 = ptrba[4*1+0];
+                               load9 = ptrbb[4*1+0];
+                               res0 = res0+load8*load9;
+                               load10 = ptrba[4*1+1];
+                               res1 = res1-load10*load9;
+                               load11 = ptrbb[4*1+1];
+                               res0 = res0-load10*load11;
+                               res1 = res1-load8*load11;
+                               load12 = ptrba[4*1+2];
+                               res2 = res2+load12*load9;
+                               load13 = ptrba[4*1+3];
+                               res3 = res3-load13*load9;
+                               res2 = res2-load13*load11;
+                               res3 = res3-load12*load11;
+                               load14 = ptrbb[4*1+2];
+                               res4 = res4+load8*load14;
+                               res5 = res5-load10*load14;
+                               load15 = ptrbb[4*1+3];
+                               res4 = res4-load10*load15;
+                               res5 = res5-load8*load15;
+                               res6 = res6+load12*load14;
+                               res7 = res7-load13*load14;
+                               res6 = res6-load13*load15;
+                               res7 = res7-load12*load15;
+                               load0 = ptrba[4*2+0];
+                               load1 = ptrbb[4*2+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*2+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[4*2+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrba[4*2+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*2+3];
+                               res3 = res3-load5*load1;
+                               res2 = res2-load5*load3;
+                               res3 = res3-load4*load3;
+                               load6 = ptrbb[4*2+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5-load2*load6;
+                               load7 = ptrbb[4*2+3];
+                               res4 = res4-load2*load7;
+                               res5 = res5-load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7-load5*load6;
+                               res6 = res6-load5*load7;
+                               res7 = res7-load4*load7;
+                               load8 = ptrba[4*3+0];
+                               load9 = ptrbb[4*3+0];
+                               res0 = res0+load8*load9;
+                               load10 = ptrba[4*3+1];
+                               res1 = res1-load10*load9;
+                               load11 = ptrbb[4*3+1];
+                               res0 = res0-load10*load11;
+                               res1 = res1-load8*load11;
+                               load12 = ptrba[4*3+2];
+                               res2 = res2+load12*load9;
+                               load13 = ptrba[4*3+3];
+                               res3 = res3-load13*load9;
+                               res2 = res2-load13*load11;
+                               res3 = res3-load12*load11;
+                               load14 = ptrbb[4*3+2];
+                               res4 = res4+load8*load14;
+                               res5 = res5-load10*load14;
+                               load15 = ptrbb[4*3+3];
+                               res4 = res4-load10*load15;
+                               res5 = res5-load8*load15;
+                               res6 = res6+load12*load14;
+                               res7 = res7-load13*load14;
+                               res6 = res6-load13*load15;
+                               res7 = res7-load12*load15;
+#endif
+                               ptrba = ptrba+16;
+                               ptrbb = ptrbb+16;
+                       }
+                       for (k=0; k<(temp&3); k+=1) 
+                       {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3+load5*load1;
+                               res2 = res2-load5*load3;
+                               res3 = res3+load4*load3;
+                               load6 = ptrbb[4*0+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5+load2*load6;
+                               load7 = ptrbb[4*0+3];
+                               res4 = res4-load2*load7;
+                               res5 = res5+load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7+load5*load6;
+                               res6 = res6-load5*load7;
+                               res7 = res7+load4*load7;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3+load5*load1;
+                               res2 = res2+load5*load3;
+                               res3 = res3-load4*load3;
+                               load6 = ptrbb[4*0+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5+load2*load6;
+                               load7 = ptrbb[4*0+3];
+                               res4 = res4+load2*load7;
+                               res5 = res5-load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7+load5*load6;
+                               res6 = res6+load5*load7;
+                               res7 = res7-load4*load7;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3-load5*load1;
+                               res2 = res2+load5*load3;
+                               res3 = res3+load4*load3;
+                               load6 = ptrbb[4*0+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5-load2*load6;
+                               load7 = ptrbb[4*0+3];
+                               res4 = res4+load2*load7;
+                               res5 = res5+load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7-load5*load6;
+                               res6 = res6+load5*load7;
+                               res7 = res7+load4*load7;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3-load5*load1;
+                               res2 = res2-load5*load3;
+                               res3 = res3-load4*load3;
+                               load6 = ptrbb[4*0+2];
+                               res4 = res4+load0*load6;
+                               res5 = res5-load2*load6;
+                               load7 = ptrbb[4*0+3];
+                               res4 = res4-load2*load7;
+                               res5 = res5-load0*load7;
+                               res6 = res6+load4*load6;
+                               res7 = res7-load5*load6;
+                               res6 = res6-load5*load7;
+                               res7 = res7-load4*load7;
+#endif
+                               ptrba = ptrba+4;
+                               ptrbb = ptrbb+4;
+                       }
+                       load0 = res0*alphar-res1*alphai;
+                       load1 = res1*alphar+res0*alphai;
+                       C0[0] = load0;
+                       C0[1] = load1;
+
+                       load2 = res2*alphar-res3*alphai;
+                       load3 = res3*alphar+res2*alphai;
+                       C0[2] = load2;
+                       C0[3] = load3;
+
+                       load4 = res4*alphar-res5*alphai;
+                       load5 = res5*alphar+res4*alphai;
+                       C1[0] = load4;
+                       C1[1] = load5;
+
+                       load6 = res6*alphar-res7*alphai;
+                       load7 = res7*alphar+res6*alphai;
+                       C1[2] = load6;
+                       C1[3] = load7;
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+                       (!defined(LEFT) && !defined(TRANSA))
+                       temp = bk - off;
+#ifdef LEFT
+                       temp -= 2;
+#else
+                       temp -= 2;
+#endif
+                       ptrba += temp*2*2;
+                       ptrbb += temp*2*2;
+#endif
+#ifdef LEFT
+                       off += 2;
+#endif
+
+                       C0 = C0+4;
+                       C1 = C1+4;
+               }
+               for (i=0; i<(bm&1); i+=1) 
+               {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+                       ptrbb = bb;
+#else 
+                       ptrba += off*2;
+                       ptrbb = bb + off*2*2;
+#endif
+                       res0 = 0;
+                       res1 = 0;
+                       res2 = 0;
+                       res3 = 0;
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+                       temp = bk - off;
+#elif defined(LEFT)
+                       temp = off+1;
+#else
+                       temp = off+2;
+#endif
+                       for (k=0; k<temp; k+=1) 
+                       {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                               load0 = ptrba[2*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[2*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrbb[4*0+2];
+                               res2 = res2+load0*load4;
+                               res3 = res3+load2*load4;
+                               load5 = ptrbb[4*0+3];
+                               res2 = res2-load2*load5;
+                               res3 = res3+load0*load5;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                               load0 = ptrba[2*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[2*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrbb[4*0+2];
+                               res2 = res2+load0*load4;
+                               res3 = res3+load2*load4;
+                               load5 = ptrbb[4*0+3];
+                               res2 = res2+load2*load5;
+                               res3 = res3-load0*load5;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                               load0 = ptrba[2*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[2*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrbb[4*0+2];
+                               res2 = res2+load0*load4;
+                               res3 = res3-load2*load4;
+                               load5 = ptrbb[4*0+3];
+                               res2 = res2+load2*load5;
+                               res3 = res3+load0*load5;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                               load0 = ptrba[2*0+0];
+                               load1 = ptrbb[4*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[2*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[4*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrbb[4*0+2];
+                               res2 = res2+load0*load4;
+                               res3 = res3-load2*load4;
+                               load5 = ptrbb[4*0+3];
+                               res2 = res2-load2*load5;
+                               res3 = res3-load0*load5;
+#endif
+                               ptrba = ptrba+2;
+                               ptrbb = ptrbb+4;
+                       }
+                       load0 = res0*alphar-res1*alphai;
+                       load1 = res1*alphar+res0*alphai;
+                       C0[0] = load0;
+                       C0[1] = load1;
+
+                       load2 = res2*alphar-res3*alphai;
+                       load3 = res3*alphar+res2*alphai;
+                       C1[0] = load2;
+                       C1[1] = load3;
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+                       (!defined(LEFT) && !defined(TRANSA))
+                       temp = bk - off;
+#ifdef LEFT
+                       temp -= 1;
+#else 
+                       temp -= 2;
+#endif
+                       ptrba += temp*2;
+                       ptrbb += temp*2*2;
+#endif
+#ifdef LEFT
+                       off += 1;
+#endif
+                       C0 = C0+2;
+                       C1 = C1+2;
+               }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 2;
+#endif
+               k = (bk<<2);
+               bb = bb+k;
+               i = (ldc<<2);
+               C = C+i;
+       }
+       for (j=0; j<(bn&1); j+=1) 
+       {
+               C0 = C;
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+               off = offset;
+#endif
+               ptrba = ba;
+               for (i=0; i<bm/2; i+=1) 
+               {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+                       ptrbb = bb;
+#else 
+                       ptrba += off*2*2;
+                       ptrbb = bb+off*2;
+#endif
+                       res0 = 0;
+                       res1 = 0;
+                       res2 = 0;
+                       res3 = 0;
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+                       temp = bk - off;
+#elif defined(LEFT)
+                       temp = off + 2;
+#else
+                       temp = off + 1;
+#endif
+                       for (k=0; k<temp; k+=1) 
+                       {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[2*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[2*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3+load5*load1;
+                               res2 = res2-load5*load3;
+                               res3 = res3+load4*load3;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[2*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[2*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3+load5*load1;
+                               res2 = res2+load5*load3;
+                               res3 = res3-load4*load3;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[2*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[2*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1+load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3-load5*load1;
+                               res2 = res2+load5*load3;
+                               res3 = res3+load4*load3;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                               load0 = ptrba[4*0+0];
+                               load1 = ptrbb[2*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[4*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[2*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1-load0*load3;
+                               load4 = ptrba[4*0+2];
+                               res2 = res2+load4*load1;
+                               load5 = ptrba[4*0+3];
+                               res3 = res3-load5*load1;
+                               res2 = res2-load5*load3;
+                               res3 = res3-load4*load3;
+#endif
+                               ptrba = ptrba+4;
+                               ptrbb = ptrbb+2;
+                       }
+                       load0 = res0*alphar-res1*alphai;
+                       load1 = res1*alphar+res0*alphai;
+                       C0[0] = load0;
+                       C0[1] = load1;
+
+                       load2 = res2*alphar-res3*alphai;
+                       load3 = res3*alphar+res2*alphai;
+                       C0[2] = load2;
+                       C0[3] = load3;
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+                       (!defined(LEFT) && !defined(TRANSA))
+                       temp = bk-off;
+#ifdef LEFT
+                       temp -= 2;
+#else
+                       temp -= 1;
+#endif
+                       ptrba += temp*2*2;
+                       ptrbb += temp*2;
+#endif
+#ifdef LEFT
+                       off += 2;
+#endif
+                       C0 = C0+4;
+               }
+               for (i=0; i<(bm&1); i+=1) 
+               {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+                       ptrbb = bb;
+#else 
+                       ptrba += off*2;
+                       ptrbb = bb + off*2;
+#endif
+                       res0 = 0;
+                       res1 = 0;
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+                       temp = bk-off;
+#elif defined(LEFT)
+                       temp = off + 1;
+#else 
+                       temp = off + 1;
+#endif
+                       for (k=0; k<temp; k+=1) 
+                       {
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                               load0 = ptrba[2*0+0];
+                               load1 = ptrbb[2*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[2*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[2*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1+load0*load3;
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                               load0 = ptrba[2*0+0];
+                               load1 = ptrbb[2*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[2*0+1];
+                               res1 = res1+load2*load1;
+                               load3 = ptrbb[2*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1-load0*load3;
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                               load0 = ptrba[2*0+0];
+                               load1 = ptrbb[2*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[2*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[2*0+1];
+                               res0 = res0+load2*load3;
+                               res1 = res1+load0*load3;
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                               load0 = ptrba[2*0+0];
+                               load1 = ptrbb[2*0+0];
+                               res0 = res0+load0*load1;
+                               load2 = ptrba[2*0+1];
+                               res1 = res1-load2*load1;
+                               load3 = ptrbb[2*0+1];
+                               res0 = res0-load2*load3;
+                               res1 = res1-load0*load3;
+#endif
+                               ptrba = ptrba+2;
+                               ptrbb = ptrbb+2;
+                       }
+                       load0 = res0*alphar-res1*alphai;
+                       load1 = res1*alphar+res0*alphai;
+                       C0[0] = load0;
+                       C0[1] = load1;
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+                       (!defined(LEFT) && !defined(TRANSA))
+                       temp = bk - off;
+#ifdef LEFT
+                       temp -= 1;
+#else 
+                       temp -= 1;
+#endif
+                       ptrba += temp*2;
+                       ptrbb += temp*2;
+#endif
+#ifdef LEFT
+                       off += 1;
+#endif
+                       C0 = C0+2;
+               }
+               k = (bk<<1);
+               bb = bb+k;
+               i = (ldc<<1);
+               C = C+i;
+       }
+       return 0;
+}
index b98f263..df4380d 100644 (file)
@@ -10,26 +10,30 @@ CGEMVTKERNEL = zgemv_t_loongson3a.c
 ZGEMVNKERNEL = zgemv_n_loongson3a.c
 ZGEMVTKERNEL = zgemv_t_loongson3a.c
 
+STRMMKERNEL    = ../generic/trmmkernel_2x2.c
+DTRMMKERNEL    = ../generic/trmmkernel_2x2.c
+CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 
-SGEMMKERNEL    =  sgemm_kernel_loongson3b_4x4.S                
-SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c          
+SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
 SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 
-DGEMMKERNEL    =  dgemm_kernel_loongson3b_4x4.S
-DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
-DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
+DGEMMKERNEL    =  ../generic/gemmkernel_2x2.c          
+DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
+DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
 DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 
-CGEMMKERNEL    =  cgemm_kernel_loongson3b_2x2.S
+CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
 CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
 CGEMMONCOPYOBJ =  cgemm_oncopy.o
 CGEMMOTCOPYOBJ =  cgemm_otcopy.o
 
-ZGEMMKERNEL    =  zgemm_kernel_loongson3b_2x2.S
+ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
 ZGEMMONCOPYOBJ =  zgemm_oncopy.o