author     Benedikt Huber <benedikt.huber@theobroma-systems.com>
           Thu, 9 Oct 2014 13:52:10 +0000 (06:52 -0700)
committer  Zhang Xianyi <traits.zhang@gmail.com>
           Tue, 11 Nov 2014 14:19:23 +0000 (22:19 +0800)
Optimizations for APM's xgene-1 (aarch64).

1) General system updates to better support ARMv8.  Previously a plain "make all" did not work and TARGET=ARMV8 had to be supplied explicitly (see the build example below).
2) sgemm 4x4 kernel in assembler using SIMD, plus the configuration changes to use it.
3) strmm 4x4 kernel in C.  Since the sgemm kernel works on 4x4 blocks, the trmm kernel must also do 4xN.
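
A minimal build example, assuming an aarch64 host; the target can still be
selected explicitly with:

    make TARGET=ARMV8

With the cpuid_arm64.c detection added below, a plain "make" should now pick
ARMV8 automatically as well.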

Added Dave Nuechterlein to the contributors list.

CONTRIBUTORS.md
common_arm64.h
cpuid_arm64.c [new file with mode: 0644]
getarch.c
kernel/arm64/KERNEL.ARMV8
kernel/arm64/sgemm_kernel_4x4.S [new file with mode: 0644]
kernel/generic/trmmkernel_4x4.c [new file with mode: 0644]
param.h

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 18a218c..02d15b7 100644 (file)
@@ -117,5 +117,9 @@ In chronological order:
 * Isaac Dunham <https://github.com/idunham>
   * [2014-08-03] Fixed link error on Linux/musl
 
+* Dave Nuechterlein
+  * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
+                 ARMv8 support.
+
 * [Your name or handle] <[email or website]>
   * [Date] [Brief summary of your changes]
diff --git a/common_arm64.h b/common_arm64.h
index 8a66a17..4855493 100644 (file)
@@ -119,9 +119,9 @@ static inline int blas_quickdivide(blasint x, blasint y){
 }
 
 #if defined(DOUBLE)
-#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
+#define GET_IMAGE(res)  __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
 #else
-#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
+#define GET_IMAGE(res)  __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
 #endif
 
 #define GET_IMAGE_CANCEL
@@ -138,7 +138,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #if defined(ASSEMBLER) && !defined(NEEDPARAM)
 
 #define PROLOGUE \
-       .arm             ;\
        .global REALNAME ;\
        .func   REALNAME  ;\
 REALNAME:
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
new file mode 100644 (file)
index 0000000..c7a27f8
--- /dev/null
@@ -0,0 +1,217 @@
+/**************************************************************************
+  Copyright (c) 2013, The OpenBLAS Project
+  All rights reserved.
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  1. Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in
+  the documentation and/or other materials provided with the
+  distribution.
+  3. Neither the name of the OpenBLAS project nor the names of
+  its contributors may be used to endorse or promote products
+  derived from this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  *****************************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+
+#define CPU_UNKNOWN            0
+#define CPU_ARMV8              1
+
+static char *cpuname[] = {
+  "UNKOWN",
+  "ARMV8"
+};
+
+
+int get_feature(char *search)
+{
+
+#ifdef linux
+       FILE *infile;
+       char buffer[2048], *p,*t;
+       p = (char *) NULL ;
+
+       infile = fopen("/proc/cpuinfo", "r");
+       if (infile == NULL) return 0;
+
+       while (fgets(buffer, sizeof(buffer), infile))
+       {
+
+               if (!strncmp("Features", buffer, 8))
+               {
+                       p = strchr(buffer, ':') + 2;
+                       break;
+               }
+       }
+
+       fclose(infile);
+
+
+       if( p == NULL ) return 0;
+
+       t = strtok(p," ");
+       while( (t = strtok(NULL," ")) != NULL )
+       {
+               if (!strcmp(t, search))   { return(1); }
+       }
+
+#endif
+       return(0);
+}
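+
+/*
+ * Usage sketch (this helper has no caller yet in this commit): test a token
+ * from the "Features" line of /proc/cpuinfo, e.g. get_feature("asimd") for
+ * Advanced SIMD.  "asimd" is only an example token, not something the code
+ * here depends on.
+ */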
+
+
+int detect(void)
+{
+
+#ifdef linux
+
+       FILE *infile;
+       char buffer[512], *p;
+       p = (char *) NULL ;
+
+       infile = fopen("/proc/cpuinfo", "r");
+       if (infile == NULL) return CPU_UNKNOWN;
+
+       while (fgets(buffer, sizeof(buffer), infile))
+       {
+
+               if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
+               {
+                       p = strchr(buffer, ':') + 2;
+                       break;
+               }
+       }
+
+       fclose(infile);
+
+       if(p != NULL)
+       {
+
+               if (strstr(p, "AArch64"))
+               {
+                               return CPU_ARMV8;
+
+               }
+
+
+       }
+#endif
+
+       return CPU_UNKNOWN;
+}
+
+char *get_corename(void)
+{
+       return cpuname[detect()];
+}
+
+void get_architecture(void)
+{
+       printf("ARM");
+}
+
+void get_subarchitecture(void)
+{
+       int d = detect();
+       switch (d)
+       {
+
+               case CPU_ARMV8:
+                       printf("ARMV8");
+                       break;
+
+               default:
+                       printf("UNKNOWN");
+                       break;
+       }
+}
+
+void get_subdirname(void)
+{
+       printf("arm64");
+}
+
+void get_cpuconfig(void)
+{
+
+       int d = detect();
+       switch (d)
+       {
+
+               case CPU_ARMV8:
+                       printf("#define ARMV8\n");
+                       printf("#define L1_DATA_SIZE 32768\n");
+                       printf("#define L1_DATA_LINESIZE 64\n");
+                       printf("#define L2_SIZE 262144\n");
+                       printf("#define L2_LINESIZE 64\n");
+                       printf("#define DTB_DEFAULT_ENTRIES 64\n");
+                       printf("#define DTB_SIZE 4096\n");
+                       printf("#define L2_ASSOCIATIVE 4\n");
+                       break;
+
+
+       }
+}
+
+
+void get_libname(void)
+{
+
+       int d = detect();
+       switch (d)
+       {
+
+               case CPU_ARMV8:
+                       printf("armv8\n");
+                       break;
+
+       }
+}
+
+
+void get_features(void)
+{
+
+#ifdef linux
+       FILE *infile;
+       char buffer[2048], *p,*t;
+       p = (char *) NULL ;
+
+       infile = fopen("/proc/cpuinfo", "r");
+       if (infile == NULL) return;
+
+       while (fgets(buffer, sizeof(buffer), infile))
+       {
+
+               if (!strncmp("Features", buffer, 8))
+               {
+                       p = strchr(buffer, ':') + 2;
+                       break;
+               }
+       }
+
+       fclose(infile);
+
+
+       if( p == NULL ) return;
+
+       t = strtok(p," ");
+       while( (t = strtok(NULL," ")) != NULL )
+       {
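+               /* feature tokens are parsed but not reported yet */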
+       }
+
+#endif
+       return;
+}
+
+
index 3e99142..ded347e 100644 (file)
--- a/getarch.c
+++ b/getarch.c
@@ -746,12 +746,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SUBARCHITECTURE "ARMV8"
 #define SUBDIRNAME      "arm64"
 #define ARCHCONFIG   "-DARMV8 " \
-       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
-       "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
-       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
-       "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " 
 #define LIBNAME   "armv8"
-#define CORENAME  "ARMV8"
+#define CORENAME  "XGENE1"
 #else
 #endif
 
@@ -801,6 +800,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
 
+#ifdef __aarch64__
+#include "cpuid_arm64.c"
+#define OPENBLAS_SUPPORTED
+#endif
+
 
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
@@ -856,7 +860,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("CORE=%s\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
     printf("CORE=%s\n", get_corename());
 #endif
 #endif
@@ -956,7 +960,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
     printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
 #endif
 #endif
diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index 27157da..4fc0968 100644 (file)
@@ -80,14 +80,14 @@ DGEMVTKERNEL = ../arm/gemv_t.c
 CGEMVTKERNEL = ../arm/zgemv_t.c
 ZGEMVTKERNEL = ../arm/zgemv_t.c
 
-STRMMKERNEL    = ../generic/trmmkernel_2x2.c
+STRMMKERNEL    = ../generic/trmmkernel_4x4.c
 DTRMMKERNEL    = ../generic/trmmkernel_2x2.c
 CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 ZTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 
-SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
-SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMKERNEL    =  sgemm_kernel_4x4.S
+SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
 SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 
diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S
new file mode 100644 (file)
index 0000000..7863329
--- /dev/null
@@ -0,0 +1,1327 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/23 Saar
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*
+*
+* 2013/11/02 Saar
+*      UNROLL_N                4
+*      UNROLL_M                4
+*      DGEMM_P                 128
+*      DGEMM_Q                 240
+*      DGEMM_R                 12288
+*      A_PRE                   128
+*      B_PRE                   128
+*      C_PRE                   32
+*
+* Performance on Odroid U2:
+*
+* 3072x3072            1 Core:         2.62 GFLOPS     ATLAS: 2.69     GFLOPS
+* 3072x3072            2 Cores:        5.23 GFLOPS     ATLAS: 5.27     GFLOPS
+* 3072x3072            3 Cores:        7.78 GFLOPS     ATLAS: 7.87     GFLOPS
+* 3072x3072            4 Cores:       10.10 GFLOPS     ATLAS: 9.98     GFLOPS
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/*                   X0          X1          X2          s0        X3        x4       x5           x6*/
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
+
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define pB x10
+#define counterJ x11
+#define tempALPHA x12
+#define pCRow0 x13
+#define pCRow1 x14
+#define pCRow2 x15
+#define pA x16
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 pB
+// 11 counterJ
+// 12 tempALPHA      
+// 13 pCRow0
+// 14 pCRow1
+// 15 pCRow2
+// 16 pA
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 orig ALPHA -> a00
+//v01 a01
+//v02 a02
+//v03 a03
+//v04 a10
+//v05 a11
+//v06 a12
+//v07 a13
+//v08 must save b00
+//v09 must save b01
+//v10 must save b02
+//v11 must save b03
+//v12 must save b10
+//v13 must save b11
+//v14 must save b12
+//v15 must save b13
+//v16 must save  C00
+//v17 must save  C01
+//v18  C02
+//v19  C03
+//v20  C10
+//v21  C11
+//v22  C12
+//v23  C13
+//v24  C20
+//v25  C21
+//v26  C22
+//v27  C23
+//v28  C30
+//v29  C31
+//v30  C32
+//v31  C33
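+
+// Each KERNEL4x4 step performs a rank-1 update C(4x4) += A(4x1) * B(1x4):
+// a 4-float column of A (v0 or v4) is multiplied by each of the four B
+// scalars (v8/v10 or v12/v14) with fmla-by-element into the accumulators
+// v16/v20/v24/v28; alpha is applied once per tile in SAVE4x4.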
+
+//        add     sp,sp,#-(6*16)
+//        stp     x18,x19,[sp,#(0*16)]
+//        stp     x20,x21,[sp,#(1*16)]
+
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x4
+
+       fsub            v16.4s , v16.4s , v16.4s
+       fsub            v20.4s , v20.4s , v20.4s
+       fsub            v24.4s , v24.4s , v24.4s
+       fsub            v28.4s , v28.4s , v28.4s
+
+.endm
+
+.macro KERNEL4x4_I
+
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+        ld1     {v0.4s},[pA],#16
+
+        fmulx   v16.4s, v0.4s, v8.s[0]
+        fmulx   v20.4s, v0.4s, v8.s[1]
+        fmulx   v24.4s, v0.4s, v10.s[0]
+        fmulx   v28.4s, v0.4s, v10.s[1]
+
+        ld1     {v12.2s},[pB],#8   // for next round
+        ld1     {v14.2s},[pB],#8   // for next round
+        ld1     {v4.4s},[pA],#16   // for next round
+
+
+.endm
+
+
+.macro KERNEL4x4_M2
+
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v14.s[0]
+       fmla    v28.4s, v4.4s, v14.s[1]
+
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+        ld1     {v0.4s},[pA],#16
+
+.endm
+
+
+.macro KERNEL4x4_M1
+
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v10.s[0]
+       fmla    v28.4s, v0.4s, v10.s[1]
+
+        ld1     {v12.2s},[pB],#8
+        ld1     {v14.2s},[pB],#8
+        ld1     {v4.4s},[pA],#16
+
+.endm
+
+
+
+.macro KERNEL4x4_E
+
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v14.s[0]
+       fmla    v28.4s, v4.4s, v14.s[1]
+
+.endm
+
+
+
+
+.macro KERNEL4x4_SUB
+
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+       ld1     {v0.4s} , [pA],#16
+
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v10.s[0]
+       fmla    v28.4s, v0.4s, v10.s[1]
+
+.endm
+
+
+
+
+.macro SAVE4x4
+
+       add     pCRow1, pCRow0, LDC    // create a second row pointer from the first row pointer
+       mov     v0.d[0], tempALPHA
+
+        ld1     {v8.4s},[pCRow0]   // load 4 values of C from first row
+        fmla     v8.4s ,v16.4s,v0.s[0]
+       st1     {v8.4s},[pCRow0],#16 // store C from first row
+
+        ld1     {v12.4s},[pCRow1]   // load 4 values of C from second row
+        fmla     v12.4s ,v20.4s,v0.s[0]
+       st1     {v12.4s},[pCRow1] // store C from second row
+
+       add     pCRow2, pCRow1, LDC        // Row2 points to third row 
+
+        ld1     {v8.4s},[pCRow2]   // load 4 values of C from third row
+        fmla     v8.4s ,v24.4s,v0.s[0]
+       st1     {v8.4s} ,[pCRow2]  // store C from third row
+
+       add     pCRow1, pCRow2 , LDC // row1 points to fourth row
+
+        ld1     {v12.4s},[pCRow1]   // load 4 values of C from fourth row
+        fmla     v12.4s ,v28.4s,v0.s[0]
+       st1     {v12.4s},[pCRow1]  // store fourth row
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+       fmov            s20, s16
+       fmov            s21, s16
+       fmov            s24, s16
+       fmov            s25, s16
+       fmov            s28, s16
+       fmov            s29, s16
+
+.endm
+
+
+
+.macro KERNEL2x4_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+       ldr     s10, [ pB, #8 ]
+       ldr     s11, [ pB, #12 ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+
+       fmadd   s20  , s0,  s9, s20  
+       fmadd   s21  , s1,  s9, s21  
+
+       fmadd   s24  , s0,  s10,        s24  
+       fmadd   s25  , s1,  s10,        s25  
+
+       fmadd   s28  , s0,  s11,        s28  
+       fmadd   s29  , s1,  s11,        s29  
+       add     pA , pA, #8
+       add     pB , pB, #16
+
+.endm
+
+            #define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1
+            #define L1ST( op1, op2, op3) ldr op1, [op2,  op3]
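+            // cpp helpers (this .S file is run through the C preprocessor):
+            // F1ST(d, a, b) expands to "fmadd d, a, b, d", i.e. d += a*b,
+            // and L1ST(d, base, off) expands to "ldr d, [base, off]".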
+
+.macro SAVE2x4
+
+       add     pCRow1 , pCRow0, LDC
+       add     pCRow2  , pCRow1, LDC
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0)
+       L1ST (  s9,pCRow0, #4 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+
+       str     s8 , [pCRow0, #0]
+       str     s9 , [pCRow0, #4 ]
+
+       ldr     s12, [pCRow1, #0]
+       ldr     s13, [pCRow1, #4 ]
+
+       F1ST (  s12, s0 , s20)
+       F1ST (  s13, s0 , s21)
+
+       str     s12, [pCRow1, #0]
+       str     s13, [pCRow1, #4 ]
+
+       L1ST (  s8,pCRow2 , #0)
+       L1ST (  s9,pCRow2 , #4 )
+
+       F1ST (  s8 , s0 , s24)
+       F1ST (  s9 , s0 , s25)
+
+       str     s8 , [pCRow2 , #0]
+       str     s9 , [pCRow2 , #4 ]
+
+       add     pCRow1, pCRow2 , LDC
+
+       ldr     s12, [pCRow1, #0]
+       ldr     s13, [pCRow1, #4 ]
+
+       F1ST (  s12, s0 , s28)
+       F1ST (  s13, s0 , s29)
+
+       str     s12, [pCRow1, #0]
+       str     s13, [pCRow1, #4 ]
+
+       add     pCRow0, pCRow0, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x4
+
+       fsub            s16 , s16 , s16
+       fmov            s20, s16
+       fmov            s24, s16
+       fmov            s28, s16
+
+.endm
+
+
+
+.macro KERNEL1x4_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+       ldr     s10, [ pB, #8 ]
+       ldr     s11, [ pB, #12 ]
+
+       ldr     s0 , [ pA ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s20  , s0,  s9, s20  
+       fmadd   s24  , s0,  s10,        s24  
+       fmadd   s28  , s0,  s11,        s28  
+
+       add     pA , pA, #4
+       add     pB , pB, #16
+
+.endm
+
+.macro SAVE1x4
+
+       add     pCRow1 , pCRow0, LDC
+       add     pCRow2  , pCRow1, LDC
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0)
+       F1ST (  s8 , s0 , s16)
+       str     s8 , [pCRow0, #0]
+
+       L1ST (  s12,pCRow1, #0)
+       F1ST (  s12, s0 , s20)
+       str     s12, [pCRow1, #0]
+
+       L1ST (  s8,pCRow2 , #0)
+       F1ST (  s8 , s0 , s24)
+       str     s8 , [pCRow2 , #0]
+
+       add     pCRow1, pCRow2 , LDC
+
+       L1ST (  s12,pCRow1, #0)
+       F1ST (  s12, s0 , s28)
+       str     s12, [pCRow1, #0]
+
+       add     pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x2
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+       fmov            s18, s16
+       fmov            s19, s16
+       fmov            s20, s16
+       fmov            s21, s16
+       fmov            s22, s16
+       fmov            s23, s16
+
+.endm
+
+
+
+.macro KERNEL4x2_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+       ldr     s2 , [ pA, #8 ]
+       ldr     s3 , [ pA, #12 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+       fmadd   s18  , s2,  s8, s18  
+       fmadd   s19  , s3,  s8, s19  
+
+       fmadd   s20  , s0,  s9, s20  
+       fmadd   s21  , s1,  s9, s21  
+       fmadd   s22  , s2,  s9, s22  
+       fmadd   s23  , s3,  s9, s23  
+
+       add     pA , pA, #16
+       add     pB , pB, #8
+
+.endm
+
+.macro SAVE4x2
+
+       add     pCRow1 , pCRow0, LDC
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0)
+       L1ST (  s9,pCRow0, #4 )
+       L1ST (  s10,pCRow0, #8 )
+       L1ST (  s11,pCRow0, #12 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+       F1ST (  s10, s0 , s18)
+       F1ST (  s11, s0 , s19)
+
+       str     s8 , [pCRow0]
+       str     s9 , [pCRow0, #4 ]
+       str     s10, [pCRow0, #8 ]
+       str     s11, [pCRow0, #12 ]
+
+       L1ST (  s12,pCRow1, #0)
+       L1ST (  s13,pCRow1, #4 )
+       L1ST (  s14,pCRow1, #8 )
+       L1ST (  s15,pCRow1, #12 )
+
+       F1ST (  s12, s0 , s20)
+       F1ST (  s13, s0 , s21)
+       F1ST (  s14, s0 , s22)
+       F1ST (  s15, s0 , s23)
+
+       str     s12, [pCRow1]
+       str     s13, [pCRow1, #4 ]
+       str     s14, [pCRow1, #8 ]
+       str     s15, [pCRow1, #12 ]
+
+       add     pCRow0, pCRow0, #16
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x2
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+       fmov            s20, s16
+       fmov            s21, s16
+
+.endm
+
+
+
+.macro KERNEL2x2_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+
+       fmadd   s20  , s0,  s9, s20  
+       fmadd   s21  , s1,  s9, s21  
+
+       add     pA , pA, #8
+       add     pB , pB, #8
+
+.endm
+
+.macro SAVE2x2
+
+       add     pCRow1 , pCRow0, LDC
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0 )
+       L1ST (  s9,pCRow0, #4 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+
+       str     s8 , [pCRow0]
+       str     s9 , [pCRow0, #4 ]
+
+       L1ST (  s12,pCRow1, #0 )
+       L1ST (  s13,pCRow1, #4 )
+
+       F1ST (  s12, s0 , s20)
+       F1ST (  s13, s0 , s21)
+
+       str     s12, [pCRow1]
+       str     s13, [pCRow1, #4 ]
+
+       add     pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+
+       fsub            s16 , s16 , s16
+       fmov            s20, s16
+
+.endm
+
+
+
+.macro KERNEL1x2_SUB
+
+       ldr     s8 , [ pB ]
+       ldr     s9 , [ pB, #4 ]
+
+       ldr     s0 , [ pA ]
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s20  , s0,  s9, s20  
+
+       add     pA , pA, #4
+       add     pB , pB, #8
+
+.endm
+
+.macro SAVE1x2
+
+       add     pCRow1 , pCRow0, LDC
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0)
+       F1ST (  s8 , s0 , s16)
+       str     s8 , [pCRow0]
+
+       L1ST (  s12,pCRow1, #0)
+       F1ST (  s12, s0 , s20)
+       str     s12, [pCRow1]
+
+       add     pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x1
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+       fmov            s18, s16
+       fmov            s19, s16
+
+.endm
+
+
+
+.macro KERNEL4x1_SUB
+
+       ldr     s8 , [ pB ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+       ldr     s2 , [ pA, #8 ]
+       ldr     s3 , [ pA, #12 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+       fmadd   s18  , s2,  s8, s18  
+       fmadd   s19  , s3,  s8, s19  
+
+       add     pA , pA, #16
+       add     pB , pB, #4
+
+.endm
+
+.macro SAVE4x1
+
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0 )
+       L1ST (  s9,pCRow0, #4 )
+       L1ST (  s10,pCRow0, #8 )
+       L1ST (  s11,pCRow0, #12 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+       F1ST (  s10, s0 , s18)
+       F1ST (  s11, s0 , s19)
+
+       str     s8 , [pCRow0]
+       str     s9 , [pCRow0, #4 ]
+       str     s10, [pCRow0, #8 ]
+       str     s11, [pCRow0, #12 ]
+
+       add     pCRow0, pCRow0, #16
+
+.endm
+
+
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+
+       fsub            s16 , s16 , s16
+       fmov            s17, s16
+
+.endm
+
+
+
+.macro KERNEL2x1_SUB
+
+       ldr     s8 , [ pB ]
+
+       ldr     s0 , [ pA ]
+       ldr     s1 , [ pA, #4 ]
+
+       fmadd   s16  , s0,  s8, s16  
+       fmadd   s17  , s1,  s8, s17  
+
+       add     pA , pA, #8
+       add     pB , pB, #4
+
+.endm
+
+.macro SAVE2x1
+
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0 )
+       L1ST (  s9,pCRow0, #4 )
+
+       F1ST (  s8 , s0 , s16)
+       F1ST (  s9 , s0 , s17)
+
+       str     s8 , [pCRow0]
+       str     s9 , [pCRow0, #4 ]
+
+       add     pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+
+       fsub            s16 , s16 , s16
+
+.endm
+
+
+
+.macro KERNEL1x1_SUB
+
+       ldr     s8 , [ pB ]
+
+       ldr     s0 , [ pA ]
+
+       fmadd   s16  , s0,  s8, s16  
+
+       add     pA , pA, #4
+       add     pB , pB, #4
+
+.endm
+
+.macro SAVE1x1
+
+
+       mov     v0.d[0], tempALPHA
+
+       L1ST (  s8,pCRow0, #0 )
+       F1ST (  s8 , s0 , s16)
+       str     s8 , [pCRow0]
+
+       add     pCRow0, pCRow0, #4
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
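+// Loop structure: counterJ walks N in blocks of 4 columns, counterI walks M
+// in blocks of 4 rows, and counterL walks K (software-pipelined in pairs for
+// the 4x4 tile, unrolled by 8 in the edge kernels).  Leftover N and M fall
+// through to the 2x and 1x variants below.
+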
+       PROLOGUE
+
+       .align 5
+        add     sp,sp,#-(5*16)
+        stp     d8,d9,[sp,#(0*16)]
+        stp     d10,d11,[sp,#(1*16)]
+        stp     d12,d13,[sp,#(2*16)]
+        stp     d14,d15,[sp,#(3*16)]
+        stp     d16,d17,[sp,#(4*16)]
+
+        mov     tempALPHA, v0.d[0]
+       lsl     LDC, LDC, #2                                    // ldc = ldc * 4
+
+       mov     pB, origPB
+
+       mov     counterJ, origN
+       asr     counterJ, counterJ, #2                                  // J = J / 4
+       cmp     counterJ, #0
+       ble     sgemm_kernel_L2_BEGIN
+
+sgemm_kernel_L4_BEGIN:
+
+       mov     pCRow0, pC                                              // pCRow0 = C
+        add     pC,pC,LDC, lsl #2
+
+       mov     pA, origPA                                              // pA = start of A array
+
+
+
+sgemm_kernel_L4_M4_BEGIN:
+
+       mov     counterI, origM
+       asr     counterI, counterI, #2                                  // counterI = counterI / 4
+       cmp     counterI, #0
+       ble     sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+       mov     pB, origPB
+       asr     counterL , origK, #1                                    // L = K / 2
+       cmp     counterL , #2                                           // is there at least 4 to do?
+       blt     sgemm_kernel_L4_M4_32
+
+
+
+       KERNEL4x4_I     //do one in the K
+       KERNEL4x4_M2    //do another in the K
+
+       subs    counterL, counterL, #2  // subtract 2: one K pair done above, one reserved for the tail
+       ble     sgemm_kernel_L4_M4_22a
+       .align 5
+
+sgemm_kernel_L4_M4_22:
+
+       KERNEL4x4_M1
+       KERNEL4x4_M2
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+       KERNEL4x4_M1
+       KERNEL4x4_E
+
+       b        sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:   // less than 4 to do in the K direction
+
+       tst     counterL, #1
+       ble     sgemm_kernel_L4_M4_40
+
+       KERNEL4x4_I
+
+       KERNEL4x4_E
+
+       b        sgemm_kernel_L4_M4_44
+
+
+sgemm_kernel_L4_M4_40:
+
+       INIT4x4
+
+
+sgemm_kernel_L4_M4_44:
+
+       ands    counterL , origK, #1
+       ble     sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+       KERNEL4x4_SUB
+
+       subs    counterL, counterL, #1
+       bne     sgemm_kernel_L4_M4_46
+
+sgemm_kernel_L4_M4_100:
+
+       SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+       subs    counterI, counterI, #1
+       bne     sgemm_kernel_L4_M4_20
+
+
+sgemm_kernel_L4_M2_BEGIN:
+
+       mov     counterI, origM
+       tst     counterI , #3
+       ble     sgemm_kernel_L4_END
+
+       tst     counterI, #2                                    // leftover block of 2 rows (M mod 4 >= 2)?
+       ble     sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+       INIT2x4
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+       KERNEL2x4_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+       KERNEL2x4_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+       SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+
+sgemm_kernel_L4_M1_BEGIN:
+
+       tst     counterI, #1                                    // leftover single row (M odd)?
+       ble     sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+       INIT1x4
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+       KERNEL1x4_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+       KERNEL1x4_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+       SAVE1x4
+
+
+sgemm_kernel_L4_END:
+
+       add     origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+
+       subs    counterJ, counterJ , #1                                         // j--
+       bgt     sgemm_kernel_L4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+
+       mov     counterJ , origN
+       tst     counterJ , #3
+       ble     sgemm_kernel_L999   // done if N is a multiple of 4
+
+       tst     counterJ , #2
+       ble     sgemm_kernel_L1_BEGIN
+
+       mov     pCRow0, pC                                              // pCRow0 = pC
+       add     pC , pC, LDC, lsl #1
+
+       mov     pA, origPA                                              // pA = A
+
+
+
+sgemm_kernel_L2_M4_BEGIN:
+
+       mov     counterI, origM
+       asr     counterI, counterI, #2                                  // counterI = counterI / 4
+       cmp     counterI,#0
+       ble     sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+       INIT4x2
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL,#0
+       ble     sgemm_kernel_L2_M4_40
+       .align 5
+
+sgemm_kernel_L2_M4_22:
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+       KERNEL4x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+       SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+       subs    counterI, counterI, #1
+       bgt     sgemm_kernel_L2_M4_20
+
+
+sgemm_kernel_L2_M2_BEGIN:
+
+       mov     counterI, origM
+       tst     counterI , #3
+       ble     sgemm_kernel_L2_END
+
+       tst     counterI, #2                                    // leftover block of 2 rows (M mod 4 >= 2)?
+       ble     sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+       INIT2x2
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+        cmp    counterL,#0
+       ble     sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+       KERNEL2x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+       SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+
+sgemm_kernel_L2_M1_BEGIN:
+
+       tst     counterI, #1                                    // leftover single row (M odd)?
+       ble     sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+       INIT1x2
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+        cmp     counterL, #0
+       ble     sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+       KERNEL1x2_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+       SAVE1x2
+
+
+sgemm_kernel_L2_END:
+       add     origPB, origPB, origK, lsl #3                                   // B = B + K * 2 * 4
+
+/*********************************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+       mov     counterJ , origN
+       tst     counterJ , #1
+       ble     sgemm_kernel_L999 // done
+
+
+       mov     pCRow0, pC                                              // pCRow0 = C
+       add     pC , pCRow0 , LDC                                 // C01 is the current line, update pC to point to next
+
+       mov     pA, origPA                                              // pA = A
+
+
+
+sgemm_kernel_L1_M4_BEGIN:
+
+       mov     counterI, origM
+       asr     counterI, counterI, #2                                  // counterI = counterI / 4
+       cmp     counterI, #0
+       ble     sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+       INIT4x1
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L1_M4_40
+       .align 5
+
+sgemm_kernel_L1_M4_22:
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+       KERNEL4x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+       SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+       subs    counterI, counterI, #1
+       bgt     sgemm_kernel_L1_M4_20
+
+
+sgemm_kernel_L1_M2_BEGIN:
+
+       mov     counterI, origM
+       tst     counterI , #3
+       ble     sgemm_kernel_L1_END
+
+       tst     counterI, #2                                    // leftover block of 2 rows (M mod 4 >= 2)?
+       ble     sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+       INIT2x1
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+       KERNEL2x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+       SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+
+sgemm_kernel_L1_M1_BEGIN:
+
+       tst     counterI, #1                                    // leftover single row (M odd)?
+       ble     sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+       INIT1x1
+
+       mov     pB, origPB
+       asr     counterL , origK, #3                                    // counterL = counterL / 8
+       cmp     counterL , #0
+       ble     sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+       ands    counterL , origK, #7                                    // counterL = counterL % 8
+       ble     sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+       KERNEL1x1_SUB
+
+       subs    counterL, counterL, #1
+       bgt     sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+       SAVE1x1
+
+
+sgemm_kernel_L1_END:
+
+
+sgemm_kernel_L999:
+       mov     x0, #0                                          // set return value
+        ldp     d8,d9,[sp,#(0*16)]
+        ldp     d10,d11,[sp,#(1*16)]
+        ldp     d12,d13,[sp,#(2*16)]
+        ldp     d14,d15,[sp,#(3*16)]
+        ldp     d16,d17,[sp,#(4*16)]
+        add     sp,sp,#(5*16)
+       ret
+
+       EPILOGUE
+
diff --git a/kernel/generic/trmmkernel_4x4.c b/kernel/generic/trmmkernel_4x4.c
new file mode 100644 (file)
index 0000000..a85828c
--- /dev/null
@@ -0,0 +1,875 @@
+#include "common.h"
+#include <stdbool.h>
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+
+   FLOAT res2_0;
+   FLOAT res2_1;
+   FLOAT res2_2;
+   FLOAT res2_3;
+
+   FLOAT res3_0;
+   FLOAT res3_1;
+   FLOAT res3_2;
+   FLOAT res3_3;
+
+   FLOAT a0;
+   FLOAT a1;
+
+   FLOAT b0;
+   FLOAT b1;
+   FLOAT b2;
+   FLOAT b3;
+
+   BLASLONG off, temp;
+
+   bool left;
+   bool transposed;
+   bool backwards;
+
+#ifdef LEFT
+   left = true;
+#else
+   left = false;
+#endif
+
+#ifdef TRANSA
+   transposed = true;
+#else
+   transposed = false;
+#endif
+
+   backwards = left != transposed;
+
+   if (!left) {
+      off = -offset;
+   }
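+
+   /*
+    * off tracks how far the current block is into the triangular factor.
+    * When backwards (left != transposed) the first off*4 packed values of A
+    * and B are skipped and the inner K loop covers the remaining bk-off
+    * steps; otherwise the loop covers the leading off+4 steps and the
+    * pointers are advanced past the unused tail after the block is stored.
+    */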
+
+
+   for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+
+        if (left) {
+            off = offset;
+        }
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x4
+       {
+
+               ptrbb = bb;
+                if (backwards)
+                {
+                  ptrba += off*4; // number of values in A
+                  ptrbb += off*4; // number of values in B
+                }
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+               res1_2 = 0;
+               res1_3 = 0;
+
+               res2_0 = 0;
+               res2_1 = 0;
+               res2_2 = 0;
+               res2_3 = 0;
+
+               res3_0 = 0;
+               res3_1 = 0;
+               res3_2 = 0;
+               res3_3 = 0;
+
+                temp = backwards ? bk-off :
+                             left ? off + 4 : // number of values in A
+                                    off + 4;  // number of values in B
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+                       b2 = ptrbb[2];
+                       b3 = ptrbb[3];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+                       res2_0 += a0*b2;
+                       res3_0 += a0*b3;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+                       res2_1 += a1*b2;
+                       res3_1 += a1*b3;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+                       res1_2 += a0*b1;
+                       res2_2 += a0*b2;
+                       res3_2 += a0*b3;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+                       res1_3 += a1*b1;
+                       res2_3 += a1*b2;
+                       res3_3 += a1*b3;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+4;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+               res1_2 *= alpha;
+               res1_3 *= alpha;
+
+               res2_0 *= alpha;
+               res2_1 *= alpha;
+               res2_2 *= alpha;
+               res2_3 *= alpha;
+
+               res3_0 *= alpha;
+               res3_1 *= alpha;
+               res3_2 *= alpha;
+               res3_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+               C1[2] = res1_2;
+               C1[3] = res1_3;
+
+               C2[0] = res2_0;
+               C2[1] = res2_1;
+               C2[2] = res2_2;
+               C2[3] = res2_3;
+
+               C3[0] = res3_0;
+               C3[1] = res3_1;
+               C3[2] = res3_2;
+               C3[3] = res3_3;
+
+               if (!backwards) {
+                    temp = bk-off;
+                    temp = left ? temp - 4 : // number of values in A
+                                  temp - 4;  // number of values in B
+
+                    ptrba += temp*4; // number of values in A
+                   ptrbb += temp*4; // number of values in B
+                }
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+               C1 = C1+4;
+               C2 = C2+4;
+               C3 = C3+4;
+
+       }
+
+       if ( bm & 2 ) // do any 2x4 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*4;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+
+               res2_0 = 0;
+               res2_1 = 0;
+
+               res3_0 = 0;
+               res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+2;   // number of values in A
+#else
+               temp = off+4;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+                       b2 = ptrbb[2];
+                       b3 = ptrbb[3];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+                       res2_0 += a0*b2;
+                       res3_0 += a0*b3;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+                       res2_1 += a1*b2;
+                       res3_1 += a1*b3;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+4;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+
+               res2_0 *= alpha;
+               res2_1 *= alpha;
+
+               res3_0 *= alpha;
+               res3_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+
+               C2[0] = res2_0;
+               C2[1] = res2_1;
+
+               C3[0] = res3_0;
+               C3[1] = res3_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else
+               temp -= 4; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+               C1 = C1+2;
+               C2 = C2+2;
+               C3 = C3+2;
+
+       }
+
+       if ( bm & 1 ) // do any 1x4 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*4;
+#endif
+
+               res0_0 = 0;
+               res1_0 = 0;
+               res2_0 = 0;
+               res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+1;   // number of values in A
+#else
+               temp = off+4;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+                       b2 = ptrbb[2];
+                       b3 = ptrbb[3];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+                       res2_0 += a0*b2;
+                       res3_0 += a0*b3;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+4;
+                }
+
+               res0_0 *= alpha;
+
+               res1_0 *= alpha;
+
+               res2_0 *= alpha;
+
+               res3_0 *= alpha;
+
+               C0[0] = res0_0;
+
+               C1[0] = res1_0;
+
+               C2[0] = res2_0;
+
+               C3[0] = res3_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else
+               temp -= 4; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+               C1 = C1+1;
+               C2 = C2+1;
+               C3 = C3+1;
+
+       }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+   for (j=0; j<(bn&2); j+=2) // do the Mx2 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+               off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x2
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*4;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+               res1_2 = 0;
+               res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+4;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+                       res1_2 += a0*b1;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+                       res1_3 += a1*b1;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+               res1_2 *= alpha;
+               res1_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+               C1[2] = res1_2;
+               C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 4; // number of values in A
+#else
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*4;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+               C1 = C1+4;
+
+       }
+
+       if ( bm & 2 ) // do any 2x2 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+               res1_0 = 0;
+               res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+2;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+                       res1_1 += a1*b1;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               res1_0 *= alpha;
+               res1_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+               C1[0] = res1_0;
+               C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+               C1 = C1+2;
+
+       }
+
+       if ( bm & 1 ) // do any 1x2 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*2;
+#endif
+
+               res0_0 = 0;
+
+               res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+1;   // number of values in A
+#else
+               temp = off+2;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+                       b1 = ptrbb[1];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+                       res1_0 += a0*b1;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+2;
+                }
+
+               res0_0 *= alpha;
+
+               res1_0 *= alpha;
+
+               C0[0] = res0_0;
+
+               C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else
+               temp -= 2; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+               C1 = C1+1;
+
+       }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+
+
+
+
+
+   for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
+   {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+       off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*4;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+               res0_2 = 0;
+               res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+4;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       a0 = ptrba[2];
+                       res0_2 += a0*b0;
+
+                       a1 = ptrba[3];
+                       res0_3 += a1*b0;
+
+                       ptrba = ptrba+4;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+               res0_2 *= alpha;
+               res0_3 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+               C0[2] = res0_2;
+               C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 4; // number of values in A
+#else
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*4;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 4; // number of values in A
+#endif
+
+               C0 = C0+4;
+
+       }
+
+       if ( bm & 2 ) // do any 2x1 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*2;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+               res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+2;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       a1 = ptrba[1];
+                       res0_1 += a1*b0;
+
+                       ptrba = ptrba+2;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+               res0_1 *= alpha;
+
+               C0[0] = res0_0;
+               C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 2; // number of values in A
+#else
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*2;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 2; // number of values in A
+#endif
+
+               C0 = C0+2;
+
+       }
+
+       if ( bm & 1 ) // do any 1x1 loop
+       {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               ptrbb = bb;
+#else
+               ptrba += off*1;
+               ptrbb = bb + off*1;
+#endif
+
+               res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+               temp = bk-off;
+#elif defined(LEFT)
+               temp = off+1;   // number of values in A
+#else
+               temp = off+1;   // number of values in B
+#endif
+
+               for (k=0; k<temp; k++)
+                {
+                       b0 = ptrbb[0];
+
+                       a0 = ptrba[0];
+                       res0_0 += a0*b0;
+
+                       ptrba = ptrba+1;
+                       ptrbb = ptrbb+1;
+                }
+
+               res0_0 *= alpha;
+
+               C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+               temp = bk - off;
+#ifdef LEFT
+               temp -= 1; // number of values in A
+#else
+               temp -= 1; // number of values in B
+#endif
+               ptrba += temp*1;
+               ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+               off += 1; // number of values in A
+#endif
+
+               C0 = C0+1;
+
+       }
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+               off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}
diff --git a/param.h b/param.h
index 3e20f58..d7a427b 100644 (file)
--- a/param.h
+++ b/param.h
@@ -2039,8 +2039,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M  2
-#define SGEMM_DEFAULT_UNROLL_N  2
+#define SGEMM_DEFAULT_UNROLL_M  4
+#define SGEMM_DEFAULT_UNROLL_N  4
 
 #define DGEMM_DEFAULT_UNROLL_M  2
 #define DGEMM_DEFAULT_UNROLL_N  2