added generic trmm kernels and modified Makefile.L3
author wernsaar <wernsaar@googlemail.com>
Tue, 30 Jul 2013 18:18:57 +0000 (20:18 +0200)
committer wernsaar <wernsaar@googlemail.com>
Tue, 30 Jul 2013 18:18:57 +0000 (20:18 +0200)
kernel/Makefile.L3
kernel/generic/trmmkernel_16x2.c [new file with mode: 0644]
kernel/generic/trmmkernel_8x2.c [new file with mode: 0644]

index 2dcae0b..f8152ac 100644 (file)
@@ -582,6 +582,24 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+
+else
+
+ifdef STRMMKERNEL
+# STRMMKERNEL is set: build LN/LT/RN/RT from it (L/R -> -DLEFT/-ULEFT, N/T -> -UTRANSA/-DTRANSA).
+$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+
+$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+
+$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+
+$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+
+# STRMMKERNEL is not set: the rules after this else take SGEMMKERNEL as their source instead.
 else
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -595,17 +613,79 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 
+endif
+
+ifdef DTRMMKERNEL
+# DTRMMKERNEL is set: each variant (LN/LT/RN/RT) prefers its own DTRMMKERNEL_xx source and otherwise uses DTRMMKERNEL.
+ifdef DTRMMKERNEL_LN
+$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+else
+$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+endif
+
+ifdef DTRMMKERNEL_LT
+$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+else
+$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+endif
+
+ifdef DTRMMKERNEL_RN
+$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+else
+$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+endif
+
+ifdef DTRMMKERNEL_RT
+$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+else
+$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+endif
+
+else
+# DTRMMKERNEL is not set: use DTRMMKERNEL_xx when defined (the variable each ifdef tests), otherwise DGEMMKERNEL.
+ifdef DTRMMKERNEL_LN
+$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+else
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+endif
 
+ifdef DTRMMKERNEL_LT
+$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+else
 $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+endif
 
+ifdef DTRMMKERNEL_RN
+$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+else
 $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+endif
 
+ifdef DTRMMKERNEL_RT
+$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+else
 $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+endif
+
+endif
+
+ifdef QTRMMKERNEL
 
 $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -619,6 +699,50 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 
+else
+# NOTE(review): this fallback and the visible rules of the QTRMMKERNEL branch above both build from QGEMMKERNEL; QTRMMKERNEL is tested but never used as a source -- confirm intent.
+$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+
+endif
+
+ifdef CTRMMKERNEL
+# CTRMMKERNEL is set: build all eight ctrmm variants (LN, LT, LR, LC, RN, RT, RR, RC) from it.
+$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+
+else
+# CTRMMKERNEL is not set: the rules below take CGEMMKERNEL as their source instead.
 $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
 
@@ -643,6 +767,37 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
 $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 
+endif
+
+ifdef ZTRMMKERNEL
+# ZTRMMKERNEL is set: build all eight ztrmm variants (LN, LT, LR, LC, RN, RT, RR, RC) from it.
+$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+
+
+else
+# ZTRMMKERNEL is not set: the rules below take ZGEMMKERNEL as their source instead.
 $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
 
@@ -666,8 +821,38 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
 
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+
+endif
 endif
 
+ifdef XTRMMKERNEL
+# XTRMMKERNEL is set: build all eight xtrmm variants (LN, LT, LR, LC, RN, RT, RR, RC) from it.
+$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
+       $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+
+else
+# XTRMMKERNEL is not set: the rules below take XGEMMKERNEL as their source instead.
 $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
 
@@ -692,6 +877,9 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
 $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
        $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 
+endif
+
+
 $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
        $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
 
diff --git a/kernel/generic/trmmkernel_16x2.c b/kernel/generic/trmmkernel_16x2.c
new file mode 100644 (file)
index 0000000..437fa09
--- /dev/null
@@ -0,0 +1,1151 @@
+#include "common.h"
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) 
+{
+
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*ptrba,*ptrbb;
+
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+   FLOAT res0_4;
+   FLOAT res0_5;
+   FLOAT res0_6;
+   FLOAT res0_7;
+
+   FLOAT res0_8;
+   FLOAT res0_9;
+   FLOAT res0_10;
+   FLOAT res0_11;
+   FLOAT res0_12;
+   FLOAT res0_13;
+   FLOAT res0_14;
+   FLOAT res0_15;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+   FLOAT res1_4;
+   FLOAT res1_5;
+   FLOAT res1_6;
+   FLOAT res1_7;
+
+   FLOAT res1_8;
+   FLOAT res1_9;
+   FLOAT res1_10;
+   FLOAT res1_11;
+   FLOAT res1_12;
+   FLOAT res1_13;
+   FLOAT res1_14;
+   FLOAT res1_15;
+
+   FLOAT a0;
+   FLOAT a1;
+
+   FLOAT b0;
+   FLOAT b1;
+
+   BLASLONG off, temp;
+
+#if !defined(LEFT)
+   off = -offset; 
+#endif
+
+
+
+   for (j=0; j<bn/2; j+=1) 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+       off = offset;
+#endif
+
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1) 
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*16;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+               res0_4 = 0;
+               res0_5 = 0;
+               res0_6 = 0;
+               res0_7 = 0;
+
+               res0_8  = 0;
+               res0_9  = 0;
+               res0_10 = 0;
+               res0_11 = 0;
+               res0_12 = 0;
+               res0_13 = 0;
+               res0_14 = 0;
+               res0_15 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+               res1_2 = 0;
+               res1_3 = 0;
+               res1_4 = 0;
+               res1_5 = 0;
+               res1_6 = 0;
+               res1_7 = 0;
+
+               res1_8  = 0;
+               res1_9  = 0;
+               res1_10 = 0;
+               res1_11 = 0;
+               res1_12 = 0;
+               res1_13 = 0;
+               res1_14 = 0;
+               res1_15 = 0;
+
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+16;  // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+                       res1_2 += a0*b1;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+                       res1_3 += a1*b1;
+
+                       a0 = ptrba[4];
+                       res0_4 += a0*b0;
+                       res1_4 += a0*b1;
+
+                       a1 = ptrba[5];
+                       res0_5 += a1*b0;
+                       res1_5 += a1*b1;
+
+                       a0 = ptrba[6];
+                       res0_6 += a0*b0;
+                       res1_6 += a0*b1;
+
+                       a1 = ptrba[7];
+                       res0_7 += a1*b0;
+                       res1_7 += a1*b1;
+
+                       a0 = ptrba[8];
+                       res0_8 += a0*b0;
+                       res1_8 += a0*b1;
+
+                       a1 = ptrba[9];
+                       res0_9 += a1*b0;
+                       res1_9 += a1*b1;
+
+                       a0 = ptrba[10];
+                       res0_10 += a0*b0;
+                       res1_10 += a0*b1;
+
+                       a1 = ptrba[11];
+                       res0_11 += a1*b0;
+                       res1_11 += a1*b1;
+
+                       a0 = ptrba[12];
+                       res0_12 += a0*b0;
+                       res1_12 += a0*b1;
+
+                       a1 = ptrba[13];
+                       res0_13 += a1*b0;
+                       res1_13 += a1*b1;
+
+                       a0 = ptrba[14];
+                       res0_14 += a0*b0;
+                       res1_14 += a0*b1;
+
+                       a1 = ptrba[15];
+                       res0_15 += a1*b0;
+                       res1_15 += a1*b1;
+
+
+                       ptrba = ptrba+16;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+               res0_4 *= alpha;
+               res0_5 *= alpha;
+               res0_6 *= alpha;
+               res0_7 *= alpha;
+
+               res0_8  *= alpha;
+               res0_9  *= alpha;
+               res0_10 *= alpha;
+               res0_11 *= alpha;
+               res0_12 *= alpha;
+               res0_13 *= alpha;
+               res0_14 *= alpha;
+               res0_15 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+               res1_2 *= alpha;
+               res1_3 *= alpha;
+               res1_4 *= alpha;
+               res1_5 *= alpha;
+               res1_6 *= alpha;
+               res1_7 *= alpha;
+
+               res1_8  *= alpha;
+               res1_9  *= alpha;
+               res1_10 *= alpha;
+               res1_11 *= alpha;
+               res1_12 *= alpha;
+               res1_13 *= alpha;
+               res1_14 *= alpha;
+               res1_15 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+               C0[4] = res0_4;
+               C0[5] = res0_5;
+               C0[6] = res0_6;
+               C0[7] = res0_7;
+
+               C0[8]  = res0_8;
+               C0[9]  = res0_9;
+               C0[10] = res0_10;
+               C0[11] = res0_11;
+               C0[12] = res0_12;
+               C0[13] = res0_13;
+               C0[14] = res0_14;
+               C0[15] = res0_15;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+               C1[2] = res1_2;
+               C1[3] = res1_3;
+               C1[4] = res1_4;
+               C1[5] = res1_5;
+               C1[6] = res1_6;
+               C1[7] = res1_7;
+
+               C1[8]  = res1_8;
+               C1[9]  = res1_9;
+               C1[10] = res1_10;
+               C1[11] = res1_11;
+               C1[12] = res1_12;
+               C1[13] = res1_13;
+               C1[14] = res1_14;
+               C1[15] = res1_15;
+
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 16; // number of values in A
+#else 
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*16;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 16; // number of values in A
+#endif
+
+               C0 = C0+16;
+               C1 = C1+16;
+       }
+
+
+
+
+        if ( bm & 8) 
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*8;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+               res0_4 = 0;
+               res0_5 = 0;
+               res0_6 = 0;
+               res0_7 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+               res1_2 = 0;
+               res1_3 = 0;
+               res1_4 = 0;
+               res1_5 = 0;
+               res1_6 = 0;
+               res1_7 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+8;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+                       res1_2 += a0*b1;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+                       res1_3 += a1*b1;
+
+                       a0 = ptrba[4];
+                       res0_4 += a0*b0;
+                       res1_4 += a0*b1;
+
+                       a1 = ptrba[5];
+                       res0_5 += a1*b0;
+                       res1_5 += a1*b1;
+
+                       a0 = ptrba[6];
+                       res0_6 += a0*b0;
+                       res1_6 += a0*b1;
+
+                       a1 = ptrba[7];
+                       res0_7 += a1*b0;
+                       res1_7 += a1*b1;
+
+                       ptrba = ptrba+8;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+               res0_4 *= alpha;
+               res0_5 *= alpha;
+               res0_6 *= alpha;
+               res0_7 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+               res1_2 *= alpha;
+               res1_3 *= alpha;
+               res1_4 *= alpha;
+               res1_5 *= alpha;
+               res1_6 *= alpha;
+               res1_7 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+               C0[4] = res0_4;
+               C0[5] = res0_5;
+               C0[6] = res0_6;
+               C0[7] = res0_7;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+               C1[2] = res1_2;
+               C1[3] = res1_3;
+               C1[4] = res1_4;
+               C1[5] = res1_5;
+               C1[6] = res1_6;
+               C1[7] = res1_7;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 8; // number of values in A
+#else 
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*8;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 8; // number of values in A
+#endif
+
+               C0 = C0+8;
+               C1 = C1+8;
+       }
+
+       if ( bm & 4 )
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*4;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+               res1_2 = 0;
+               res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+4;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+                       res1_2 += a0*b1;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+                       res1_3 += a1*b1;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+               res1_2 *= alpha;
+               res1_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+               C1[2] = res1_2;
+               C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 4; // number of values in A
+#else 
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*4;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+               C1 = C1+4;
+
+       }
+
+       if ( bm & 2 )
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+2;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else 
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+               C1 = C1+2;
+
+       }
+
+       if ( bm & 1 )
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+
+               res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+1;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+
+               res1_0 *= alpha;
+
+               C0[0] = res0_0;
+
+               C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else 
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+               C1 = C1+1;
+
+       }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+
+
+
+
+
+   for (j=0; j<(bn&1); j+=1) 
+   {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+       off = offset;
+#endif
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1) 
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*16;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+               res0_4 = 0;
+               res0_5 = 0;
+               res0_6 = 0;
+               res0_7 = 0;
+
+               res0_8  = 0;
+               res0_9  = 0;
+               res0_10 = 0;
+               res0_11 = 0;
+               res0_12 = 0;
+               res0_13 = 0;
+               res0_14 = 0;
+               res0_15 = 0;
+
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+16;  // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+
+                       a0 = ptrba[4];
+                       res0_4 += a0*b0;
+
+                       a1 = ptrba[5];
+                       res0_5 += a1*b0;
+
+                       a0 = ptrba[6];
+                       res0_6 += a0*b0;
+
+                       a1 = ptrba[7];
+                       res0_7 += a1*b0;
+
+                       a0 = ptrba[8];
+                       res0_8 += a0*b0;
+
+                       a1 = ptrba[9];
+                       res0_9 += a1*b0;
+
+                       a0 = ptrba[10];
+                       res0_10 += a0*b0;
+
+                       a1 = ptrba[11];
+                       res0_11 += a1*b0;
+
+                       a0 = ptrba[12];
+                       res0_12 += a0*b0;
+
+                       a1 = ptrba[13];
+                       res0_13 += a1*b0;
+
+                       a0 = ptrba[14];
+                       res0_14 += a0*b0;
+
+                       a1 = ptrba[15];
+                       res0_15 += a1*b0;
+
+
+                       ptrba = ptrba+16;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+               res0_4 *= alpha;
+               res0_5 *= alpha;
+               res0_6 *= alpha;
+               res0_7 *= alpha;
+
+               res0_8  *= alpha;
+               res0_9  *= alpha;
+               res0_10 *= alpha;
+               res0_11 *= alpha;
+               res0_12 *= alpha;
+               res0_13 *= alpha;
+               res0_14 *= alpha;
+               res0_15 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+               C0[4] = res0_4;
+               C0[5] = res0_5;
+               C0[6] = res0_6;
+               C0[7] = res0_7;
+
+               C0[8]  = res0_8;
+               C0[9]  = res0_9;
+               C0[10] = res0_10;
+               C0[11] = res0_11;
+               C0[12] = res0_12;
+               C0[13] = res0_13;
+               C0[14] = res0_14;
+               C0[15] = res0_15;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 16; // number of values in A
+#else 
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*16;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 16; // number of values in A
+#endif
+
+               C0 = C0+16;
+       }
+
+
+
+
+        if ( bm & 8 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*8;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+               res0_4 = 0;
+               res0_5 = 0;
+               res0_6 = 0;
+               res0_7 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+8;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+
+                       a0 = ptrba[4];
+                       res0_4 += a0*b0;
+
+                       a1 = ptrba[5];
+                       res0_5 += a1*b0;
+
+                       a0 = ptrba[6];
+                       res0_6 += a0*b0;
+
+                       a1 = ptrba[7];
+                       res0_7 += a1*b0;
+
+                       ptrba = ptrba+8;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+               res0_4 *= alpha;
+               res0_5 *= alpha;
+               res0_6 *= alpha;
+               res0_7 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+               C0[4] = res0_4;
+               C0[5] = res0_5;
+               C0[6] = res0_6;
+               C0[7] = res0_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 8; // number of values in A
+#else 
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*8;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 8; // number of values in A
+#endif
+
+               C0 = C0+8;
+       }
+
+       if ( bm & 4 )
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*4;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+4;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 4; // number of values in A
+#else 
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*4;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+
+       }
+
+       if ( bm & 2 )
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+2;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else 
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+
+       }
+
+       if ( bm & 1 )
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+1;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+
+               C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else 
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+
+       }
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}
diff --git a/kernel/generic/trmmkernel_8x2.c b/kernel/generic/trmmkernel_8x2.c
new file mode 100644 (file)
index 0000000..5af289c
--- /dev/null
@@ -0,0 +1,750 @@
+#include "common.h"
+/*
+ * Generic C TRMM micro-kernel working on tiles of up to 8 rows by 2 columns.
+ *
+ * For every tile of C the kernel sums products of values read from ba and bb
+ * over a range of k steps, multiplies each sum by alpha and stores it into C.
+ * C is only written, never read: the stores overwrite the previous contents.
+ *
+ * Tiling
+ *   - Columns of C are processed two at a time (bn/2 passes), followed by one
+ *     single-column pass when bn is odd.
+ *   - Inside each pass, rows are processed in tiles of 8 (bm/8 iterations),
+ *     then in remainder tiles of 4, 2 and 1 rows selected by the bits of bm.
+ *
+ * Input layout implied by the indexing
+ *   - One k step of an r-row tile reads r consecutive values from ba.
+ *   - One k step reads 2 consecutive values from bb (1 in the single-column
+ *     pass).
+ *
+ * Compile-time switches
+ *   - LEFT and TRANSA select which part of the k range each tile uses; see the
+ *     handling of `off` and `temp` in the body.
+ *
+ * Parameters
+ *   bm, bn   number of rows / columns of C to produce
+ *   bk       number of k steps stored per tile in ba and bb
+ *   alpha    scalar applied to every sum before it is stored
+ *   ba, bb   input buffers, only read
+ *   C, ldc   output buffer and the distance between two consecutive columns
+ *   offset   start value for `off`: taken as-is when LEFT is defined, negated
+ *            otherwise
+ *
+ * Returns 0 in all cases.
+ */
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) 
+{
+
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*ptrba,*ptrbb;
+   // resC_R accumulates the sum for row R of the current tile; C=0 is stored to C0, C=1 to C1.
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+   FLOAT res0_4;
+   FLOAT res0_5;
+   FLOAT res0_6;
+   FLOAT res0_7;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+   FLOAT res1_4;
+   FLOAT res1_5;
+   FLOAT res1_6;
+   FLOAT res1_7;
+   // a0 and a1 are used alternately for successive ba values; b0 and b1 hold the bb values of one k step.
+   FLOAT a0;
+   FLOAT a1;
+
+   FLOAT b0;
+   FLOAT b1;
+   // off is the k offset of the current tile; temp holds the trip count and later the skip count derived from it.
+   BLASLONG off, temp;
+   // Without LEFT, off starts at -offset and is advanced once per column pass (see the end of each j loop).
+#if !defined(LEFT)
+   off = -offset; 
+#endif
+
+   // Main pass: two columns of C (C0 and C1) per iteration.
+
+   for (j=0; j<bn/2; j+=1) 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        // With LEFT, off restarts at offset for every column pass. NOTE(review): guarded by TRMMKERNEL, so a LEFT build without it would leave off unset.
+#if defined(TRMMKERNEL) && defined(LEFT)
+               off = offset;
+#endif
+
+        // Every column pass reads ba again from its start; ptrba then moves through it tile by tile.
+        ptrba = ba;
+
+        for (i=0; i<bm/8; i+=1) 
+        {
+               // Choose the first k step: start of bb with ptrba unchanged, or skip the first off steps in both buffers.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*8;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+               res0_4 = 0;
+               res0_5 = 0;
+               res0_6 = 0;
+               res0_7 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+               res1_2 = 0;
+               res1_3 = 0;
+               res1_4 = 0;
+               res1_5 = 0;
+               res1_6 = 0;
+               res1_7 = 0;
+
+
+               // temp = number of k steps for this tile: the bk-off steps left after the skip, or off plus the tile extent.
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+8;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+               // Each k step adds the products of 8 ba values with 2 bb values to the 16 accumulators.
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+                       res1_2 += a0*b1;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+                       res1_3 += a1*b1;
+
+                       a0 = ptrba[4];
+                       res0_4 += a0*b0;
+                       res1_4 += a0*b1;
+
+                       a1 = ptrba[5];
+                       res0_5 += a1*b0;
+                       res1_5 += a1*b1;
+
+                       a0 = ptrba[6];
+                       res0_6 += a0*b0;
+                       res1_6 += a0*b1;
+
+                       a1 = ptrba[7];
+                       res0_7 += a1*b0;
+                       res1_7 += a1*b1;
+
+                       ptrba = ptrba+8;
+                       ptrbb = ptrbb+2;
+                }
+               // Scale by alpha, then store: the tile of C is overwritten, not accumulated into.
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+               res0_4 *= alpha;
+               res0_5 *= alpha;
+               res0_6 *= alpha;
+               res0_7 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+               res1_2 *= alpha;
+               res1_3 *= alpha;
+               res1_4 *= alpha;
+               res1_5 *= alpha;
+               res1_6 *= alpha;
+               res1_7 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+               C0[4] = res0_4;
+               C0[5] = res0_5;
+               C0[6] = res0_6;
+               C0[7] = res0_7;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+               C1[2] = res1_2;
+               C1[3] = res1_3;
+               C1[4] = res1_4;
+               C1[5] = res1_5;
+               C1[6] = res1_6;
+               C1[7] = res1_7;
+
+               // Step over the k steps the loop did not visit so ptrba has moved by bk*8 in total; ptrbb is reassigned at the top of the next tile.
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 8; // number of values in A
+#else 
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*8;
+               ptrbb += temp*2;
+#endif
+               // With LEFT, the next tile uses an offset that is larger by the 8 rows just finished.
+#ifdef LEFT
+               off += 8; // number of values in A
+#endif
+
+               C0 = C0+8;
+               C1 = C1+8;
+       }
+       // Remainder rows: tiles of 4, 2 and 1 rows, selected by the corresponding bits of bm.
+       if ( bm & 4 )
+       {
+               // 4x2 tile: same scheme as the 8-row tile with a ba step of 4.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*4;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+               res1_2 = 0;
+               res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+4;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+                       res1_2 += a0*b1;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+                       res1_3 += a1*b1;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+               res1_2 *= alpha;
+               res1_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+               C1[2] = res1_2;
+               C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 4; // number of values in A
+#else 
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*4;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+               C1 = C1+4;
+
+       }
+
+       if ( bm & 2 )
+       {
+               // 2x2 tile: same scheme with a ba step of 2.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+2;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else 
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+               C1 = C1+2;
+
+       }
+
+       if ( bm & 1 )
+       {
+               // 1x2 tile: same scheme with a ba step of 1.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+
+               res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+1;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+
+               res1_0 *= alpha;
+
+               C0[0] = res0_0;
+
+               C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else 
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+               C1 = C1+1;
+
+       }
+
+       // Without LEFT, off advances by the 2 columns just finished.
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 2;
+#endif
+        // Next column pair: bb advances by 2*bk values and C by 2*ldc (k and i are reused as scratch here).
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+
+
+
+
+   // Trailing pass: runs once when bn is odd and handles the last column with one bb value per k step.
+   for (j=0; j<(bn&1); j+=1) 
+   {
+        C0 = C;
+        // With LEFT, off restarts at offset for this pass as well.
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+       off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/8; i+=1) 
+        {
+               // 8x1 tile: same offset handling as in the two-column pass, with a bb step of 1.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*8;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+               res0_4 = 0;
+               res0_5 = 0;
+               res0_6 = 0;
+               res0_7 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+8;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+
+                       a0 = ptrba[4];
+                       res0_4 += a0*b0;
+
+                       a1 = ptrba[5];
+                       res0_5 += a1*b0;
+
+                       a0 = ptrba[6];
+                       res0_6 += a0*b0;
+
+                       a1 = ptrba[7];
+                       res0_7 += a1*b0;
+
+                       ptrba = ptrba+8;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+               res0_4 *= alpha;
+               res0_5 *= alpha;
+               res0_6 *= alpha;
+               res0_7 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+               C0[4] = res0_4;
+               C0[5] = res0_5;
+               C0[6] = res0_6;
+               C0[7] = res0_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 8; // number of values in A
+#else 
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*8;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 8; // number of values in A
+#endif
+
+               C0 = C0+8;
+       }
+
+       if ( bm & 4 )
+       {
+               // 4x1 tile.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*4;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+4;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 4; // number of values in A
+#else 
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*4;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+
+       }
+
+       if ( bm & 2 )
+       {
+               // 2x1 tile.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+2;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else 
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+
+       }
+
+       if ( bm & 1 )
+       {
+               // 1x1 tile.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT) 
+               temp = off+1;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++) 
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+
+               C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else 
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+
+       }
+
+
+       // Without LEFT, off advances by the single column just finished.
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 1;
+#endif
+        // Advance bb by bk values and C by one column.
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}