author     Benedikt Huber <benedikt.huber@theobroma-systems.com>
           Thu, 9 Oct 2014 13:52:10 +0000 (06:52 -0700)
committer  Zhang Xianyi <traits.zhang@gmail.com>
           Tue, 11 Nov 2014 14:19:23 +0000 (22:19 +0800)
Optimizations for APM's xgene-1 (aarch64).

1) General system updates to better support ARMv8.  Previously a plain "make all" did not work and TARGET=ARMV8 had to be supplied explicitly (see the build example below).
2) sgemm 4x4 kernel in assembler using SIMD, plus the configuration changes to use it.
3) strmm 4x4 kernel in C.  Since the sgemm kernel works on 4x4 blocks, the trmm kernel must also do 4xN.
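
A minimal build example, assuming an aarch64 host; the target can still be
selected explicitly with:

    make TARGET=ARMV8

With the cpuid_arm64.c detection added below, a plain "make" should now pick
ARMV8 automatically as well.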

Added Dave Nuechterlein to the contributors list.

CONTRIBUTORS.md
common_arm64.h
cpuid_arm64.c [new file with mode: 0644]
getarch.c
kernel/arm64/KERNEL.ARMV8
kernel/arm64/sgemm_kernel_4x4.S [new file with mode: 0644]
kernel/generic/trmmkernel_4x4.c [new file with mode: 0644]
param.h

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 18a218c..02d15b7 100644 (file)
@@ -117,5 +117,9 @@ In chronological order:
 * Isaac Dunham <https://github.com/idunham>
   * [2014-08-03] Fixed link error on Linux/musl
 
+* Dave Nuechterlein
+  * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
+                 ARMv8 support.
+
 * [Your name or handle] <[email or website]>
   * [Date] [Brief summary of your changes]
diff --git a/common_arm64.h b/common_arm64.h
index 8a66a17..4855493 100644 (file)
@@ -119,9 +119,9 @@ static inline int blas_quickdivide(blasint x, blasint y){
 }
 
 #if defined(DOUBLE)
-#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
+#define GET_IMAGE(res)  __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
 #else
-#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
+#define GET_IMAGE(res)  __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
 #endif
 
 #define GET_IMAGE_CANCEL
@@ -138,7 +138,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #if defined(ASSEMBLER) && !defined(NEEDPARAM)
 
 #define PROLOGUE \
-       .arm             ;\
        .global REALNAME ;\
        .func   REALNAME  ;\
 REALNAME:
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
new file mode 100644 (file)
index 0000000..c7a27f8
--- /dev/null
@@ -0,0 +1,217 @@
+/**************************************************************************
+  Copyright (c) 2013, The OpenBLAS Project
+  All rights reserved.
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  1. Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in
+  the documentation and/or other materials provided with the
+  distribution.
+  3. Neither the name of the OpenBLAS project nor the names of
+  its contributors may be used to endorse or promote products
+  derived from this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  *****************************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+
+#define CPU_UNKNOWN            0
+#define CPU_ARMV8              1
+
+static char *cpuname[] = {
+  "UNKOWN",
+  "ARMV8"
+};
+
+
+int get_feature(char *search)
+{
+
+#ifdef linux
+       FILE *infile;
+       char buffer[2048], *p,*t;
+       p = (char *) NULL ;
+
+       infile = fopen("/proc/cpuinfo", "r");
+       if (infile == NULL) return 0;
+
+       while (fgets(buffer, sizeof(buffer), infile))
+       {
+
+               if (!strncmp("Features", buffer, 8))
+               {
+                       p = strchr(buffer, ':') + 2;
+                       break;
+               }
+       }
+
+       fclose(infile);
+
+
+       if( p == NULL ) return 0;
+
+       t = strtok(p," ");
+       while( (t = strtok(NULL," ")) != NULL )
+       {
+               if (!strcmp(t, search))   { return(1); }
+       }
+
+#endif
+       return(0);
+}
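+
+/*
+ * Usage sketch (this helper has no caller yet in this commit): test a token
+ * from the "Features" line of /proc/cpuinfo, e.g. get_feature("asimd") for
+ * Advanced SIMD.  "asimd" is only an example token, not something the code
+ * here depends on.
+ */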
+
+
+int detect(void)
+{
+
+#ifdef linux
+
+       FILE *infile;
+       char buffer[512], *p;
+       p = (char *) NULL ;
+
+       infile = fopen("/proc/cpuinfo", "r");
+       if (infile == NULL) return CPU_UNKNOWN;
+
+       while (fgets(buffer, sizeof(buffer), infile))
+       {
+
+               if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
+               {
+                       p = strchr(buffer, ':') + 2;
+                       break;
+               }
+       }
+
+       fclose(infile);
+
+       if(p != NULL)
+       {
+
+               if (strstr(p, "AArch64"))
+               {
+                               return CPU_ARMV8;
+
+               }
+
+
+       }
+#endif
+
+       return CPU_UNKNOWN;
+}
+
+char *get_corename(void)
+{
+       return cpuname[detect()];
+}
+
+void get_architecture(void)
+{
+       printf("ARM");
+}
+
+void get_subarchitecture(void)
+{
+       int d = detect();
+       switch (d)
+       {
+
+               case CPU_ARMV8:
+                       printf("ARMV8");
+                       break;
+
+               default:
+                       printf("UNKNOWN");
+                       break;
+       }
+}
+
+void get_subdirname(void)
+{
+       printf("arm64");
+}
+
+void get_cpuconfig(void)
+{
+
+       int d = detect();
+       switch (d)
+       {
+
+               case CPU_ARMV8:
+                       printf("#define ARMV8\n");
+                       printf("#define L1_DATA_SIZE 32768\n");
+                       printf("#define L1_DATA_LINESIZE 64\n");
+                       printf("#define L2_SIZE 262144\n");
+                       printf("#define L2_LINESIZE 64\n");
+                       printf("#define DTB_DEFAULT_ENTRIES 64\n");
+                       printf("#define DTB_SIZE 4096\n");
+                       printf("#define L2_ASSOCIATIVE 4\n");
+                       break;
+
+
+       }
+}
+
+
+void get_libname(void)
+{
+
+       int d = detect();
+       switch (d)
+       {
+
+               case CPU_ARMV8:
+                       printf("armv8\n");
+                       break;
+
+       }
+}
+
+
+void get_features(void)
+{
+
+#ifdef linux
+       FILE *infile;
+       char buffer[2048], *p,*t;
+       p = (char *) NULL ;
+
+       infile = fopen("/proc/cpuinfo", "r");
+       if (infile == NULL) return;
+
+       while (fgets(buffer, sizeof(buffer), infile))
+       {
+
+               if (!strncmp("Features", buffer, 8))
+               {
+                       p = strchr(buffer, ':') + 2;
+                       break;
+               }
+       }
+
+       fclose(infile);
+
+
+       if( p == NULL ) return;
+
+       t = strtok(p," ");
+       while( (t = strtok(NULL," ")) != NULL )
+       {
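+               /* feature tokens are parsed but not reported yet */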
+       }
+
+#endif
+       return;
+}
+
+
index 3e99142..ded347e 100644 (file)
--- a/getarch.c
+++ b/getarch.c
@@ -746,12 +746,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SUBARCHITECTURE "ARMV8"
 #define SUBDIRNAME      "arm64"
 #define ARCHCONFIG   "-DARMV8 " \
-       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
-       "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
-       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
-       "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " 
 #define LIBNAME   "armv8"
-#define CORENAME  "ARMV8"
+#define CORENAME  "XGENE1"
 #else
 #endif
 
@@ -801,6 +800,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
 
+#ifdef __aarch64__
+#include "cpuid_arm64.c"
+#define OPENBLAS_SUPPORTED
+#endif
+
 
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
@@ -856,7 +860,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("CORE=%s\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
     printf("CORE=%s\n", get_corename());
 #endif
 #endif
@@ -956,7 +960,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
     printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
 #endif
 #endif
diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index 27157da..4fc0968 100644 (file)
@@ -80,14 +80,14 @@ DGEMVTKERNEL = ../arm/gemv_t.c
 CGEMVTKERNEL = ../arm/zgemv_t.c
 ZGEMVTKERNEL = ../arm/zgemv_t.c
 
-STRMMKERNEL    = ../generic/trmmkernel_2x2.c
+STRMMKERNEL    = ../generic/trmmkernel_4x4.c
 DTRMMKERNEL    = ../generic/trmmkernel_2x2.c
 CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 ZTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 
-SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
-SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMKERNEL    =  sgemm_kernel_4x4.S
+SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
 SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 
diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S
new file mode 100644 (file)
index 0000000..7863329
--- /dev/null
@@ -0,0 +1,1327 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/23 Saar
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*
+*
+* 2013/11/02 Saar
+*      UNROLL_N                4
+*      UNROLL_M                4
+*      DGEMM_P                 128
+*      DGEMM_Q                 240
+*      DGEMM_R                 12288
+*      A_PRE                   128
+*      B_PRE                   128
+*      C_PRE                   32
+*
+* Performance on Odroid U2:
+*
+* 3072x3072            1 Core:         2.62 GFLOPS     ATLAS: 2.69     GFLOPS
+* 3072x3072            2 Cores:        5.23 GFLOPS     ATLAS: 5.27     GFLOPS
+* 3072x3072            3 Cores:        7.78 GFLOPS     ATLAS: 7.87     GFLOPS
+* 3072x3072            4 Cores:       10.10 GFLOPS     ATLAS: 9.98     GFLOPS
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/*                   X0          X1          X2          s0        X3        x4       x5           x6*/
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
+
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define pB x10
+#define counterJ x11
+#define tempALPHA x12
+#define pCRow0 x13
+#define pCRow1 x14
+#define pCRow2 x15
+#define pA x16
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 pB
+// 11 counterJ
+// 12 tempALPHA      
+// 13 pCRow0
+// 14 pCRow1
+// 15 pCRow2
+// 16 pA
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 orig ALPHA -> a00
+//v01 a01
+//v02 a02
+//v03 a03
+//v04 a10
+//v05 a11
+//v06 a12
+//v07 a13
+//v08 must save b00
+//v09 must save b01
+//v10 must save b02
+//v11 must save b03
+//v12 must save b10
+//v13 must save b11
+//v14 must save b12
+//v15 must save b13
+//v16 must save  C00
+//v17 must save  C01
+//v18  C02
+//v19  C03
+//v20  C10
+//v21  C11
+//v22  C12
+//v23  C13
+//v24  C20
+//v25  C21
+//v26  C22
+//v27  C23
+//v28  C30
+//v29  C31
+//v30  C32
+//v31  C33
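+
+// Each KERNEL4x4 step performs a rank-1 update C(4x4) += A(4x1) * B(1x4):
+// a 4-float column of A (v0 or v4) is multiplied by each of the four B
+// scalars (v8/v10 or v12/v14) with fmla-by-element into the accumulators
+// v16/v20/v24/v28; alpha is applied once per tile in SAVE4x4.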
+
+//        add     sp,sp,#-(6*16)
+//        stp     x18,x19,[sp,#(0*16)]
+//        stp     x20,x21,[sp,#(1*16)]
+
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x4
+
+       fsub            v16.4s , v16.4s , v16.4s
+       fsub            v20.4s , v20.4s , v20.4s
+       fsub            v24.4s , v24.4s , v24.4s
+       fsub            v28.4s , v28.4s , v28.4s
+
+.endm
+
+.macro KERNEL4x4_I
+
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+        ld1     {v0.4s},[pA],#16
+
+        fmulx   v16.4s, v0.4s, v8.s[0]
+        fmulx   v20.4s, v0.4s, v8.s[1]
+        fmulx   v24.4s, v0.4s, v10.s[0]
+        fmulx   v28.4s, v0.4s, v10.s[1]
+
+        ld1     {v12.2s},[pB],#8   // for next round
+        ld1     {v14.2s},[pB],#8   // for next round
+        ld1     {v4.4s},[pA],#16   // for next round
+
+
+.endm
+
+
+.macro KERNEL4x4_M2
+
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v14.s[0]
+       fmla    v28.4s, v4.4s, v14.s[1]
+
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+        ld1     {v0.4s},[pA],#16
+
+.endm
+
+
+.macro KERNEL4x4_M1
+
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v10.s[0]
+       fmla    v28.4s, v0.4s, v10.s[1]
+
+        ld1     {v12.2s},[pB],#8
+        ld1     {v14.2s},[pB],#8
+        ld1     {v4.4s},[pA],#16
+
+.endm
+
+
+
+.macro KERNEL4x4_E
+
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v14.s[0]
+       fmla    v28.4s, v4.4s, v14.s[1]
+
+.endm
+
+
+
+
+.macro KERNEL4x4_SUB
+
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+       ld1     {v0.4s} , [pA],#16
+
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v10.s[0]
+       fmla    v28.4s, v0.4s, v10.s[1]
+
+.endm
+
+
+
+
+.macro SAVE4x4
+
+       add     pCRow1, pCRow0, LDC    // create a second row pointer from the first row pointer
+       mov     v0.d[0], tempALPHA
+
+        ld1     {v8.4s},[pCRow0]   // load 4 values of C from first row
+        fmla     v8.4s ,v16.4s,v0.s[0]
+       st1     {v8.4s},[pCRow0],#16 // store C from first row
+
+        ld1     {v12.4s},[pCRow1]   // load 4 values of C from second row
+        fmla     v12.4s ,v20.4s,v0.s[0]
+       st1     {v12.4s},[pCRow1] // store C from second row
+
+       add     pCRow2, pCRow1, LDC        // Row2 points to third row 
+
+        ld1     {v8.4s},[pCRow2]   // load 4 values of C from third row
+        fmla     v8.4s ,v24.4s,v0.s[0]
+       st1     {v8.4s} ,[pCRow2]  // store C from third row
+
+       add     pCRow1, pCRow2 , LDC // row1 points to fourth row
+
+        ld1     {v12.4s},[pCRow1]   // load 4 values of C from fourth row
+        fmla     v12.4s ,v28.4s,v0.s[0]
+       st1     {v12.4s},[pCRow1]  // store fourth row
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+       fmov            s20, s16
+       fmov            s21, s16
+       fmov            s24, s16
+       fmov            s25, s16
+       fmov            s28, s16
+       fmov            s29, s16
+
+.endm
+
+
+
+.macro KERNEL2x4_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+       ldr     s10, [ pB, #8 ]
+       ldr     s11, [ pB, #12 ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+
+       fmadd   s20  , s0,  s9, s20  
+       fmadd   s21  , s1,  s9, s21  
+
+       fmadd   s24  , s0,  s10,        s24  
+       fmadd   s25  , s1,  s10,        s25  
+
+       fmadd   s28  , s0,  s11,        s28  
+       fmadd   s29  , s1,  s11,        s29  
+       add     pA , pA, #8
+       add     pB , pB, #16
+
+.endm
+
+            #define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1
+            #define L1ST( op1, op2, op3) ldr op1, [op2,  op3]
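+            // cpp helpers (this .S file is run through the C preprocessor):
+            // F1ST(d, a, b) expands to "fmadd d, a, b, d", i.e. d += a*b,
+            // and L1ST(d, base, off) expands to "ldr d, [base, off]".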
+
+.macro SAVE2x4
+
+       add     pCRow1 , pCRow0, LDC
+       add     pCRow2  , pCRow1, LDC
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0)
+       L1ST (  s9,pCRow0, #4 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+
+       str     s8 , [pCRow0, #0]
+       str     s9 , [pCRow0, #4 ]
+
+       ldr     s12, [pCRow1, #0]
+       ldr     s13, [pCRow1, #4 ]
+
+       F1ST (  s12, s0 , s20)
+       F1ST (  s13, s0 , s21)
+
+       str     s12, [pCRow1, #0]
+       str     s13, [pCRow1, #4 ]
+
+       L1ST (  s8,pCRow2 , #0)
+       L1ST (  s9,pCRow2 , #4 )
+
+       F1ST (  s8 , s0 , s24)
+       F1ST (  s9 , s0 , s25)
+
+       str     s8 , [pCRow2 , #0]
+       str     s9 , [pCRow2 , #4 ]
+
+       add     pCRow1, pCRow2 , LDC
+
+       ldr     s12, [pCRow1, #0]
+       ldr     s13, [pCRow1, #4 ]
+
+       F1ST (  s12, s0 , s28)
+       F1ST (  s13, s0 , s29)
+
+       str     s12, [pCRow1, #0]
+       str     s13, [pCRow1, #4 ]
+
+       add     pCRow0, pCRow0, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x4
+
+       fsub            s16 , s16 , s16
+       fmov            s20, s16
+       fmov            s24, s16
+       fmov            s28, s16
+
+.endm
+
+
+
+.macro KERNEL1x4_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+       ldr     s10, [ pB, #8 ]
+       ldr     s11, [ pB, #12 ]
+
+       ldr     s0 , [ pA ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s20  , s0,  s9, s20  
+       fmadd   s24  , s0,  s10,        s24  
+       fmadd   s28  , s0,  s11,        s28  
+
+       add     pA , pA, #4
+       add     pB , pB, #16
+
+.endm
+
+.macro SAVE1x4
+
+       add     pCRow1 , pCRow0, LDC
+       add     pCRow2  , pCRow1, LDC
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0)
+       F1ST (  s8 , s0 , s16)
+       str     s8 , [pCRow0, #0]
+
+       L1ST (  s12,pCRow1, #0)
+       F1ST (  s12, s0 , s20)
+       str     s12, [pCRow1, #0]
+
+       L1ST (  s8,pCRow2 , #0)
+       F1ST (  s8 , s0 , s24)
+       str     s8 , [pCRow2 , #0]
+
+       add     pCRow1, pCRow2 , LDC
+
+       L1ST (  s12,pCRow1, #0)
+       F1ST (  s12, s0 , s28)
+       str     s12, [pCRow1, #0]
+
+       add     pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x2
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+       fmov            s18, s16
+       fmov            s19, s16
+       fmov            s20, s16
+       fmov            s21, s16
+       fmov            s22, s16
+       fmov            s23, s16
+
+.endm
+
+
+
+.macro KERNEL4x2_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+       ldr     s2 , [ pA, #8 ]
+       ldr     s3 , [ pA, #12 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+       fmadd   s18  , s2,  s8, s18  
+       fmadd   s19  , s3,  s8, s19  
+
+       fmadd   s20  , s0,  s9, s20  
+       fmadd   s21  , s1,  s9, s21  
+       fmadd   s22  , s2,  s9, s22  
+       fmadd   s23  , s3,  s9, s23  
+
+       add     pA , pA, #16
+       add     pB , pB, #8
+
+.endm
+
+.macro SAVE4x2
+
+       add     pCRow1 , pCRow0, LDC
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0)
+       L1ST (  s9,pCRow0, #4 )
+       L1ST (  s10,pCRow0, #8 )
+       L1ST (  s11,pCRow0, #12 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+       F1ST (  s10, s0 , s18)
+       F1ST (  s11, s0 , s19)
+
+       str     s8 , [pCRow0]
+       str     s9 , [pCRow0, #4 ]
+       str     s10, [pCRow0, #8 ]
+       str     s11, [pCRow0, #12 ]
+
+       L1ST (  s12,pCRow1, #0)
+       L1ST (  s13,pCRow1, #4 )
+       L1ST (  s14,pCRow1, #8 )
+       L1ST (  s15,pCRow1, #12 )
+
+       F1ST (  s12, s0 , s20)
+       F1ST (  s13, s0 , s21)
+       F1ST (  s14, s0 , s22)
+       F1ST (  s15, s0 , s23)
+
+       str     s12, [pCRow1]
+       str     s13, [pCRow1, #4 ]
+       str     s14, [pCRow1, #8 ]
+       str     s15, [pCRow1, #12 ]
+
+       add     pCRow0, pCRow0, #16
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x2
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+       fmov            s20, s16
+       fmov            s21, s16
+
+.endm
+
+
+
+.macro KERNEL2x2_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+
+       fmadd   s20  , s0,  s9, s20  
+       fmadd   s21  , s1,  s9, s21  
+
+       add     pA , pA, #8
+       add     pB , pB, #8
+
+.endm
+
+.macro SAVE2x2
+
+       add     pCRow1 , pCRow0, LDC
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0 )
+       L1ST (  s9,pCRow0, #4 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+
+       str     s8 , [pCRow0]
+       str     s9 , [pCRow0, #4 ]
+
+       L1ST (  s12,pCRow1, #0 )
+       L1ST (  s13,pCRow1, #4 )
+
+       F1ST (  s12, s0 , s20)
+       F1ST (  s13, s0 , s21)
+
+       str     s12, [pCRow1]
+       str     s13, [pCRow1, #4 ]
+
+       add     pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+
+       fsub            s16 , s16 , s16
+       fmov            s20, s16
+
+.endm
+
+
+
+.macro KERNEL1x2_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+
+       ldr     s0 , [ pA ]
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s20  , s0,  s9, s20  
+
+       add     pA , pA, #4
+       add     pB , pB, #8
+
+.endm
+
+.macro SAVE1x2
+
+       add     pCRow1 , pCRow0, LDC
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0)
+       F1ST (  s8 , s0 , s16)
+       str     s8 , [pCRow0]
+
+       L1ST (  s12,pCRow1, #0)
+       F1ST (  s12, s0 , s20)
+       str     s12, [pCRow1]
+
+       add     pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x1
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+       fmov            s18, s16
+       fmov            s19, s16
+
+.endm
+
+
+
+.macro KERNEL4x1_SUB
+
+       ldr     s8 , [ pB ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+       ldr     s2 , [ pA, #8 ]
+       ldr     s3 , [ pA, #12 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+       fmadd   s18  , s2,  s8, s18  
+       fmadd   s19  , s3,  s8, s19  
+
+       add     pA , pA, #16
+       add     pB , pB, #4
+
+.endm
+
+.macro SAVE4x1
+
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0 )
+       L1ST (  s9,pCRow0, #4 )
+       L1ST (  s10,pCRow0, #8 )
+       L1ST (  s11,pCRow0, #12 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+       F1ST (  s10, s0 , s18)
+       F1ST (  s11, s0 , s19)
+
+       str     s8 , [pCRow0]
+       str     s9 , [pCRow0, #4 ]
+       str     s10, [pCRow0, #8 ]
+       str     s11, [pCRow0, #12 ]
+
+       add     pCRow0, pCRow0, #16
+
+.endm
+
+
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+
+.endm
+
+
+
+.macro KERNEL2x1_SUB
+
+       ldr     s8 , [ pB ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+
+       add     pA , pA, #8
+       add     pB , pB, #4
+
+.endm
+
+.macro SAVE2x1
+
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0 )
+       L1ST (  s9,pCRow0, #4 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+
+       str     s8 , [pCRow0]
+       str     s9 , [pCRow0, #4 ]
+
+       add     pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+
+       fsub            s16 , s16 , s16
+
+.endm
+
+
+
+.macro KERNEL1x1_SUB
+
+       ldr     s8 , [ pB ]
+
+       ldr     s0 , [ pA ]
+
+       fmadd   s16  , s0,  s8, s16  
+
+       add     pA , pA, #4
+       add     pB , pB, #4
+
+.endm
+
+.macro SAVE1x1
+
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0 )
+       F1ST (  s8 , s0 , s16)
+       str     s8 , [pCRow0]
+
+       add     pCRow0, pCRow0, #4
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
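+// Loop structure: counterJ walks N in blocks of 4 columns, counterI walks M
+// in blocks of 4 rows, and counterL walks K (software-pipelined in pairs for
+// the 4x4 tile, unrolled by 8 in the edge kernels).  Leftover N and M fall
+// through to the 2x and 1x variants below.
+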
+       PROLOGUE
+
+       .align 5
+        add     sp,sp,#-(5*16)
+        stp     d8,d9,[sp,#(0*16)]
+        stp     d10,d11,[sp,#(1*16)]
+        stp     d12,d13,[sp,#(2*16)]
+        stp     d14,d15,[sp,#(3*16)]
+        stp     d16,d17,[sp,#(4*16)]
+
+        mov     tempALPHA, v0.d[0]
+       lsl     LDC, LDC, #2                                    // ldc = ldc * 4
+
+       mov     pB, origPB
+
+       mov     counterJ, origN
+       asr     counterJ, counterJ, #2                                  // J = J / 4
+       cmp     counterJ, #0
+       ble     sgemm_kernel_L2_BEGIN
+
+sgemm_kernel_L4_BEGIN:
+
+       mov     pCRow0, pC                                              // pCRow0 = C
+        add     pC,pC,LDC, lsl #2
+
+       mov     pA, origPA                                              // pA = start of A array
+
+
+
+sgemm_kernel_L4_M4_BEGIN:
+
+       mov     counterI, origM
+       asr     counterI, counterI, #2                                  // counterI = counterI / 4
+       cmp     counterI, #0
+       ble     sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+       mov     pB, origPB
+       asr     counterL , origK, #1                                    // L = K / 2
+       cmp     counterL , #2                                           // is there at least 4 to do?
+       blt     sgemm_kernel_L4_M4_32
+
+
+
+       KERNEL4x4_I     //do one in the K
+       KERNEL4x4_M2    //do another in the K
+
+       subs    counterL, counterL, #2  // subtract 2: one K pair done above, one reserved for the tail
+       ble     sgemm_kernel_L4_M4_22a
+       .align 5
+
+sgemm_kernel_L4_M4_22:
+
+       KERNEL4x4_M1
+       KERNEL4x4_M2
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+       KERNEL4x4_M1
+       KERNEL4x4_E
+
+       b        sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:   // less than 4 to do in the K direction
+
+       tst     counterL, #1
+       ble     sgemm_kernel_L4_M4_40
+
+       KERNEL4x4_I
+
+       KERNEL4x4_E
+
+       b        sgemm_kernel_L4_M4_44
+
+
+sgemm_kernel_L4_M4_40:
+
+       INIT4x4
+
+
+sgemm_kernel_L4_M4_44:
+
+       ands    counterL , origK, #1
+       ble     sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+       KERNEL4x4_SUB
+
+       subs    counterL, counterL, #1
+       bne     sgemm_kernel_L4_M4_46
+
+sgemm_kernel_L4_M4_100:
+
+       SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+       subs    counterI, counterI, #1
+       bne     sgemm_kernel_L4_M4_20
+
+
+sgemm_kernel_L4_M2_BEGIN:
+
+       mov     counterI, origM
+       tst     counterI , #3
+       ble     sgemm_kernel_L4_END
+
+       tst     counterI, #2                                    // leftover block of 2 rows (M mod 4 >= 2)?
+       ble     sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+       INIT2x4
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+       KERNEL2x4_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+       SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+
+sgemm_kernel_L4_M1_BEGIN:
+
+       tst     counterI, #1                                    // leftover single row (M odd)?
+       ble     sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+       INIT1x4
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+       KERNEL1x4_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+       SAVE1x4
+
+
+sgemm_kernel_L4_END:
+
+       add     origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+
+       subs    counterJ, counterJ , #1                                         // j--
+       bgt     sgemm_kernel_L4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+
+       mov     counterJ , origN
+       tst     counterJ , #3
+       ble     sgemm_kernel_L999   // done if N is a multiple of 4
+
+       tst     counterJ , #2
+       ble     sgemm_kernel_L1_BEGIN
+
+       mov     pCRow0, pC                                              // pCRow0 = pC
+       add     pC , pC, LDC, lsl #1
+
+       mov     pA, origPA                                              // pA = A
+
+
+
+sgemm_kernel_L2_M4_BEGIN:
+
+       mov     counterI, origM
+       asr     counterI, counterI, #2                                  // counterI = counterI / 4
+       cmp     counterI,#0
+       ble     sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+       INIT4x2
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL,#0
+       ble     sgemm_kernel_L2_M4_40
+       .align 5
+
+sgemm_kernel_L2_M4_22:
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+       KERNEL4x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+       SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+       subs    counterI, counterI, #1
+       bgt     sgemm_kernel_L2_M4_20
+
+
+sgemm_kernel_L2_M2_BEGIN:
+
+       mov     counterI, origM
+       tst     counterI , #3
+       ble     sgemm_kernel_L2_END
+
+       tst     counterI, #2                                    // leftover block of 2 rows (M mod 4 >= 2)?
+       ble     sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+       INIT2x2
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+        cmp    counterL,#0
+       ble     sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+       KERNEL2x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+       SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+
+sgemm_kernel_L2_M1_BEGIN:
+
+       tst     counterI, #1                                    // leftover single row (M odd)?
+       ble     sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+       INIT1x2
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+        cmp     counterL, #0
+       ble     sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+       KERNEL1x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+       SAVE1x2
+
+
+sgemm_kernel_L2_END:
+       add     origPB, origPB, origK, lsl #3                                   // B = B + K * 2 * 4
+
+/*********************************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+       mov     counterJ , origN
+       tst     counterJ , #1
+       ble     sgemm_kernel_L999 // done
+
+
+       mov     pCRow0, pC                                              // pCRow0 = C
+       add     pC , pCRow0 , LDC                                 // C01 is the current line, update pC to point to next
+
+       mov     pA, origPA                                              // pA = A
+
+
+
+sgemm_kernel_L1_M4_BEGIN:
+
+       mov     counterI, origM
+       asr     counterI, counterI, #2                                  // counterI = counterI / 4
+       cmp     counterI, #0
+       ble     sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+       INIT4x1
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L1_M4_40
+       .align 5
+
+sgemm_kernel_L1_M4_22:
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+       KERNEL4x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+       SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+       subs    counterI, counterI, #1
+       bgt     sgemm_kernel_L1_M4_20
+
+
+sgemm_kernel_L1_M2_BEGIN:
+
+       mov     counterI, origM
+       tst     counterI , #3
+       ble     sgemm_kernel_L1_END
+
+       tst     counterI, #2                                    // leftover block of 2 rows (M mod 4 >= 2)?
+       ble     sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+       INIT2x1
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+       KERNEL2x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+       SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+
+sgemm_kernel_L1_M1_BEGIN:
+
+       tst     counterI, #1                                    // leftover single row (M odd)?
+       ble     sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+       INIT1x1
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+       KERNEL1x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+       SAVE1x1
+
+
+sgemm_kernel_L1_END:
+
+
+sgemm_kernel_L999:
+       mov     x0, #0                                          // set return value
+        ldp     d8,d9,[sp,#(0*16)]
+        ldp     d10,d11,[sp,#(1*16)]
+        ldp     d12,d13,[sp,#(2*16)]
+        ldp     d14,d15,[sp,#(3*16)]
+        ldp     d16,d17,[sp,#(4*16)]
+        add     sp,sp,#(5*16)
+       ret
+
+       EPILOGUE
+
diff --git a/kernel/generic/trmmkernel_4x4.c b/kernel/generic/trmmkernel_4x4.c
new file mode 100644 (file)
index 0000000..a85828c
--- /dev/null
@@ -0,0 +1,875 @@
+#include "common.h"
+#include <stdbool.h>
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+
+   FLOAT res2_0;
+   FLOAT res2_1;
+   FLOAT res2_2;
+   FLOAT res2_3;
+
+   FLOAT res3_0;
+   FLOAT res3_1;
+   FLOAT res3_2;
+   FLOAT res3_3;
+
+   FLOAT a0;
+   FLOAT a1;
+
+   FLOAT b0;
+   FLOAT b1;
+   FLOAT b2;
+   FLOAT b3;
+
+   BLASLONG off, temp;
+
+   bool left;
+   bool transposed;
+   bool backwards;
+
+#ifdef LEFT
+   left = true;
+#else
+   left = false;
+#endif
+
+#ifdef TRANSA
+   transposed = true;
+#else
+   transposed = false;
+#endif
+
+   backwards = left != transposed;
+
+   if (!left) {
+      off = -offset;
+   }
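+
+   /*
+    * off tracks how far the current block is into the triangular factor.
+    * When backwards (left != transposed) the first off*4 packed values of A
+    * and B are skipped and the inner K loop covers the remaining bk-off
+    * steps; otherwise the loop covers the leading off+4 steps and the
+    * pointers are advanced past the unused tail after the block is stored.
+    */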
+
+
+   for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+
+        if (left) {
+            off = offset;
+        }
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x4
+       {
+
+               ptrbb = bb;
+                if (backwards)
+                {
+                  ptrba += off*4; // number of values in A
+                  ptrbb += off*4; // number of values in B
+                }
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+               res1_2 = 0;
+               res1_3 = 0;
+
+               res2_0 = 0;
+               res2_1 = 0;
+               res2_2 = 0;
+               res2_3 = 0;
+
+               res3_0 = 0;
+               res3_1 = 0;
+               res3_2 = 0;
+               res3_3 = 0;
+
+                temp = backwards ? bk-off :
+                             left ? off + 4 : // number of values in A
+                                    off + 4;  // number of values in B
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+                       b2 = ptrbb[2];
+                       b3 = ptrbb[3];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+                       res2_0 += a0*b2;
+                       res3_0 += a0*b3;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+                       res2_1 += a1*b2;
+                       res3_1 += a1*b3;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+                       res1_2 += a0*b1;
+                       res2_2 += a0*b2;
+                       res3_2 += a0*b3;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+                       res1_3 += a1*b1;
+                       res2_3 += a1*b2;
+                       res3_3 += a1*b3;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+4;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+               res1_2 *= alpha;
+               res1_3 *= alpha;
+
+               res2_0 *= alpha;
+               res2_1 *= alpha;
+               res2_2 *= alpha;
+               res2_3 *= alpha;
+
+               res3_0 *= alpha;
+               res3_1 *= alpha;
+               res3_2 *= alpha;
+               res3_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+               C1[2] = res1_2;
+               C1[3] = res1_3;
+
+               C2[0] = res2_0;
+               C2[1] = res2_1;
+               C2[2] = res2_2;
+               C2[3] = res2_3;
+
+               C3[0] = res3_0;
+               C3[1] = res3_1;
+               C3[2] = res3_2;
+               C3[3] = res3_3;
+
+               if (!backwards) {
+                    temp = bk-off;
+                    temp = left ? temp - 4 : // number of values in A
+                                  temp - 4;  // number of values in B
+
+                    ptrba += temp*4; // number of values in A
+                   ptrbb += temp*4; // number of values in B
+                }
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+               C1 = C1+4;
+               C2 = C2+4;
+               C3 = C3+4;
+
+       }
+
+       if ( bm & 2 ) // do any 2x4 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*4;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+
+               res2_0 = 0;
+               res2_1 = 0;
+
+               res3_0 = 0;
+               res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+2;   // number of values in A
+#else
+               temp = off+4;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+                       b2 = ptrbb[2];
+                       b3 = ptrbb[3];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+                       res2_0 += a0*b2;
+                       res3_0 += a0*b3;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+                       res2_1 += a1*b2;
+                       res3_1 += a1*b3;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+4;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+
+               res2_0 *= alpha;
+               res2_1 *= alpha;
+
+               res3_0 *= alpha;
+               res3_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+
+               C2[0] = res2_0;
+               C2[1] = res2_1;
+
+               C3[0] = res3_0;
+               C3[1] = res3_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else
+               temp -= 4; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+               C1 = C1+2;
+               C2 = C2+2;
+               C3 = C3+2;
+
+       }
+
+       if ( bm & 1 ) // do any 1x4 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*4;
+#endif
+
+               res0_0 = 0;
+               res1_0 = 0;
+               res2_0 = 0;
+               res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+1;   // number of values in A
+#else
+               temp = off+4;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+                       b2 = ptrbb[2];
+                       b3 = ptrbb[3];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+                       res2_0 += a0*b2;
+                       res3_0 += a0*b3;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+4;
+                }
+
+               res0_0 *= alpha;
+
+               res1_0 *= alpha;
+
+               res2_0 *= alpha;
+
+               res3_0 *= alpha;
+
+               C0[0] = res0_0;
+
+               C1[0] = res1_0;
+
+               C2[0] = res2_0;
+
+               C3[0] = res3_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else
+               temp -= 4; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+               C1 = C1+1;
+               C2 = C2+1;
+               C3 = C3+1;
+
+       }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+   for (j=0; j<(bn&2); j+=2) // do the Mx2 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+               off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x2
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*4;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+               res1_2 = 0;
+               res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+4;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+                       res1_2 += a0*b1;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+                       res1_3 += a1*b1;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+               res1_2 *= alpha;
+               res1_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+               C1[2] = res1_2;
+               C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 4; // number of values in A
+#else
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*4;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+               C1 = C1+4;
+
+       }
+
+       if ( bm & 2 ) // do any 2x2 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+2;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+               C1 = C1+2;
+
+       }
+
+       if ( bm & 1 ) // do any 1x2 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+
+               res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+1;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+
+               res1_0 *= alpha;
+
+               C0[0] = res0_0;
+
+               C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+               C1 = C1+1;
+
+       }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+
+
+
+
+
+   for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
+   {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+       off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*4;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+4;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 4; // number of values in A
+#else
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*4;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+
+       }
+
+       if ( bm & 2 ) // do any 2x1 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+2;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+
+       }
+
+       if ( bm & 1 ) // do any 1x1 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+1;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+
+               C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+
+       }
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}
diff --git a/param.h b/param.h
index 3e20f58..d7a427b 100644 (file)
--- a/param.h
+++ b/param.h
@@ -2039,8 +2039,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M  2
-#define SGEMM_DEFAULT_UNROLL_N  2
+#define SGEMM_DEFAULT_UNROLL_M  4
+#define SGEMM_DEFAULT_UNROLL_N  4
 
 #define DGEMM_DEFAULT_UNROLL_M  2
 #define DGEMM_DEFAULT_UNROLL_N  2