From 58c90d5937cc5cc225e96cc60457401c07e07165 Mon Sep 17 00:00:00 2001
From: Benedikt Huber <benedikt.huber@theobroma-systems.com>
Date: Thu, 9 Oct 2014 06:52:10 -0700
Subject: [PATCH] Optimizations for APM's X-Gene 1 (aarch64).

1) General system updates to support ARMv8 better. A plain `make all` did not work; one had to supply TARGET=ARMV8.
2) sgemm 4x4 kernel in assembler using SIMD, and configuration changes to use it.
3) strmm 4x4 kernel in C. Since the sgemm kernel works on 4x4 blocks, the trmm kernel must also work on 4xN blocks.

Added Dave Nuechterlein to the contributors list.
---
 CONTRIBUTORS.md                 |    4 +
 common_arm64.h                  |    5 +-
 cpuid_arm64.c                   |  217 +++++
 getarch.c                       |   18 +-
 kernel/arm64/KERNEL.ARMV8       |    8 +-
 kernel/arm64/sgemm_kernel_4x4.S | 1327 +++++++++++++++++++++++++++++++
 kernel/generic/trmmkernel_4x4.c |  875 ++++++++++++++++++++
 param.h                         |    4 +-
 8 files changed, 2442 insertions(+), 16 deletions(-)
 create mode 100644 cpuid_arm64.c
 create mode 100644 kernel/arm64/sgemm_kernel_4x4.S
 create mode 100644 kernel/generic/trmmkernel_4x4.c

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 18a218ce..02d15b7f 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -117,5 +117,9 @@ In chronological order:
 * Isaac Dunham <https://github.com/idunham>
   * [2014-08-03] Fixed link error on Linux/musl
 
+* Dave Nuechterlein
+  * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
+                 ARMv8 support.
+
 * [Your name or handle] <[email or website]>
   * [Date] [Brief summary of your changes]
diff --git a/common_arm64.h b/common_arm64.h
index 8a66a170..4855493d 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -119,9 +119,9 @@ static inline int blas_quickdivide(blasint x, blasint y){
 }
 
 #if defined(DOUBLE)
-#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
+#define GET_IMAGE(res)  __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
 #else
-#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
+#define GET_IMAGE(res)  __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
 #endif
 
 #define GET_IMAGE_CANCEL
@@ -138,7 +138,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #if defined(ASSEMBLER) && !defined(NEEDPARAM)
 
 #define PROLOGUE \
-	.arm		 ;\
 	.global	REALNAME ;\
 	.func	REALNAME  ;\
 REALNAME:
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
new file mode 100644
index 00000000..c7a27f89
--- /dev/null
+++ b/cpuid_arm64.c
@@ -0,0 +1,217 @@
+/**************************************************************************
+  Copyright (c) 2013, The OpenBLAS Project
+  All rights reserved.
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  1. Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in
+  the documentation and/or other materials provided with the
+  distribution.
+  3. Neither the name of the OpenBLAS project nor the names of
+  its contributors may be used to endorse or promote products
+  derived from this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  *****************************************************************************/
+
+#include <string.h>
+
+#define CPU_UNKNOWN     	0
+#define CPU_ARMV8       	1
+
+static char *cpuname[] = {	/* core names, indexed by the CPU_* id returned by detect() */
+  "UNKOWN",	/* NOTE(review): spelled "UNKOWN" while get_subarchitecture() prints "UNKNOWN" -- confirm */
+  "ARMV8"
+};
+
+
+int get_feature(char *search)
+{
+	/* Return 1 if "search" is listed on the "Features" line of
+	 * /proc/cpuinfo (Linux only), 0 otherwise or on any failure. */
+#ifdef linux
+	FILE *infile;
+	char buffer[2048], *p = NULL, *t;
+
+	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return(0);	/* e.g. /proc is not mounted */
+
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+		if (!strncmp("Features", buffer, 8))
+		{
+			p = strchr(buffer, ':');	/* stays NULL if the line has no ':' */
+			break;
+		}
+	}
+
+	fclose(infile);
+
+	if (p == NULL) return(0);
+
+	/* Skip the ':' itself; split on blanks and on the newline kept by
+	 * fgets() so that the first and the last feature can match too. */
+	for (t = strtok(p + 1, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
+	{
+		if (!strcmp(t, search))   { return(1); }
+	}
+
+#endif
+	return(0);
+}
+
+
+int detect(void)
+{
+	/* Identify the CPU from /proc/cpuinfo (Linux only).  Returns
+	 * CPU_ARMV8 when the first "model name" or "Processor" line
+	 * mentions "AArch64", CPU_UNKNOWN otherwise or on any failure. */
+#ifdef linux
+
+	FILE *infile;
+	char buffer[512], *p = NULL;
+
+	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL)
+	{
+		/* fgets()/fclose() on a NULL stream would be undefined. */
+		return CPU_UNKNOWN;
+	}
+
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+		if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
+		{
+			/* Keep NULL (rather than NULL + 2) if there is no ':'. */
+			p = strchr(buffer, ':');
+			break;
+		}
+	}
+
+	fclose(infile);
+
+	if (p != NULL && strstr(p, "AArch64"))
+	{
+		return CPU_ARMV8;
+	}
+
+#endif
+
+	return CPU_UNKNOWN;
+}
+
+char *get_corename(void)
+{	/* Return the cpuname[] entry for the id reported by detect(). */
+	return cpuname[detect()];
+}
+
+void get_architecture(void)
+{	/* Print the architecture name to stdout (no trailing newline). */
+	printf("ARM");
+}
+
+void get_subarchitecture(void)
+{
+	/* Print the sub-architecture name of the detected core to
+	 * stdout (no trailing newline); "UNKNOWN" if not recognised. */
+
+	if (detect() == CPU_ARMV8)
+	{
+		printf("ARMV8");
+	}
+	else
+	{
+		printf("UNKNOWN");
+	}
+
+}
+
+void get_subdirname(void)
+{	/* Print the sub-directory name to stdout (no trailing newline). */
+	printf("arm64");
+}
+
+void get_cpuconfig(void)
+{
+	/* Print the cache/TLB #define lines for the detected core;
+	 * nothing is printed for an unrecognised CPU.
+	 * NOTE(review): L2_ASSOCIATIVE is 4 here, but the forced ARMV8
+	 * ARCHCONFIG in getarch.c uses 32 -- confirm which is right. */
+	if (detect() != CPU_ARMV8)
+	{
+		return;
+	}
+
+	printf("#define ARMV8\n");
+	printf("#define L1_DATA_SIZE 32768\n");
+	printf("#define L1_DATA_LINESIZE 64\n");
+	printf("#define L2_SIZE 262144\n");
+	printf("#define L2_LINESIZE 64\n");
+	printf("#define DTB_DEFAULT_ENTRIES 64\n");
+	printf("#define DTB_SIZE 4096\n");
+	printf("#define L2_ASSOCIATIVE 4\n");
+
+}
+
+
+void get_libname(void)
+{
+	/* Print the library name for the detected core, followed by a
+	 * newline; nothing is printed for an unrecognised CPU. */
+
+	if (detect() != CPU_ARMV8)
+	{
+		return;
+	}
+
+	printf("armv8\n");
+
+}
+
+
+void get_features(void)
+{
+	/* Walk the "Features" line of /proc/cpuinfo (Linux only).  The
+	 * token loop body is empty, so nothing is reported for ARMv8. */
+
+#ifdef linux
+	FILE *infile;
+	char buffer[2048], *p = NULL, *t;
+
+	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return;	/* e.g. /proc is not mounted */
+
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+		if (!strncmp("Features", buffer, 8))
+		{
+			p = strchr(buffer, ':');	/* stays NULL if the line has no ':' */
+			break;
+		}
+	}
+
+	fclose(infile);
+
+	if (p == NULL) return;
+
+	for (t = strtok(p + 1, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
+	{
+		/* no feature is acted upon yet */
+	}
+
+#endif
+	return;
+}
+
+
diff --git a/getarch.c b/getarch.c
index 3e991425..ded347ec 100644
--- a/getarch.c
+++ b/getarch.c
@@ -746,12 +746,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SUBARCHITECTURE "ARMV8"
 #define SUBDIRNAME      "arm64"
 #define ARCHCONFIG   "-DARMV8 " \
-       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
-       "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
-       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
-       "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " 
 #define LIBNAME   "armv8"
-#define CORENAME  "ARMV8"
+#define CORENAME  "XGENE1"
 #else
 #endif
 
@@ -801,6 +800,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
 
+#ifdef __aarch64__
+#include "cpuid_arm64.c"
+#define OPENBLAS_SUPPORTED
+#endif
+
 
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
@@ -856,7 +860,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("CORE=%s\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
     printf("CORE=%s\n", get_corename());
 #endif
 #endif
@@ -956,7 +960,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
     printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
 #endif
 #endif
diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index 27157dad..4fc0968c 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -80,14 +80,14 @@ DGEMVTKERNEL = ../arm/gemv_t.c
 CGEMVTKERNEL = ../arm/zgemv_t.c
 ZGEMVTKERNEL = ../arm/zgemv_t.c
 
-STRMMKERNEL	= ../generic/trmmkernel_2x2.c
+STRMMKERNEL	= ../generic/trmmkernel_4x4.c
 DTRMMKERNEL	= ../generic/trmmkernel_2x2.c
 CTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
 ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
 
-SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
-SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMKERNEL    =  sgemm_kernel_4x4.S
+SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
 SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 
diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S
new file mode 100644
index 00000000..78633297
--- /dev/null
+++ b/kernel/arm64/sgemm_kernel_4x4.S
@@ -0,0 +1,1327 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/23 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+*
+* 2013/11/02 Saar
+*	UNROLL_N		4
+*	UNROLL_M		4
+*	DGEMM_P			128
+*	DGEMM_Q			240
+*	DGEMM_R			12288
+*	A_PRE			128
+*	B_PRE			128
+*	C_PRE			32
+*
+* Performance on Odroid U2:
+*
+* 3072x3072		1 Core:		2.62 GFLOPS	ATLAS: 2.69	GFLOPS
+* 3072x3072		2 Cores:	5.23 GFLOPS	ATLAS: 5.27	GFLOPS
+* 3072x3072		3 Cores:	7.78 GFLOPS	ATLAS: 7.87	GFLOPS
+* 3072x3072		4 Cores:       10.10 GFLOPS	ATLAS: 9.98	GFLOPS
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/*                   X0          X1          X2          s0        X3        x4       x5           x6*/
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc)*/
+
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define pB x10
+#define counterJ x11
+#define tempALPHA x12
+#define pCRow0 x13
+#define pCRow1 x14
+#define pCRow2 x15
+#define pA x16
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 pB
+// 11 counterJ
+// 12 tempALPHA      
+// 13 pCRow0
+// 14 pCRow1
+// 15 pCRow2
+// 16 pA
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 orig ALPHA -> a00
+//v01 a01
+//v02 a02
+//v03 a03
+//v04 a10
+//v05 a11
+//v06 a12
+//v07 a13
+//v08 must save b00
+//v09 must save b01
+//v10 must save b02
+//v11 must save b03
+//v12 must save b10
+//v13 must save b11
+//v14 must save b12
+//v15 must save b13
+//v16 must save  C00
+//v17 must save  C01
+//v18  C02
+//v19  C03
+//v20  C10
+//v21  C11
+//v22  C12
+//v23  C13
+//v24  C20
+//v25  C21
+//v26  C22
+//v27  C23
+//v28  C30
+//v29  C31
+//v30  C32
+//v31  C33
+
+//        add     sp,sp,#-(6*16)
+//        stp     x18,x19,[sp,#(0*16)]
+//        stp     x20,x21,[sp,#(1*16)]
+
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x4
+// Zero the four accumulators; fsub x,x,x gives NaN when x holds NaN or Inf.
+	movi     	v16.4s, #0
+	movi     	v20.4s, #0
+	movi     	v24.4s, #0
+	movi     	v28.4s, #0
+
+.endm
+
+.macro KERNEL4x4_I
+// First k step: overwrites v16/v20/v24/v28 with a product, then preloads the next step.
+        ld1     {v8.2s},[pB],#8      // two values from B
+        ld1     {v10.2s},[pB],#8     // next two values from B
+        ld1     {v0.4s},[pA],#16     // four values from A
+
+        fmul    v16.4s, v0.4s, v8.s[0]    // plain fmul: fmulx returns 2.0 for 0*Inf instead of NaN
+        fmul    v20.4s, v0.4s, v8.s[1]
+        fmul    v24.4s, v0.4s, v10.s[0]
+        fmul    v28.4s, v0.4s, v10.s[1]
+
+        ld1     {v12.2s},[pB],#8   // for next round
+        ld1     {v14.2s},[pB],#8   // for next round
+        ld1     {v4.4s},[pA],#16   // for next round
+
+
+.endm
+
+
+.macro KERNEL4x4_M2
+
+	fmla  	v16.4s, v4.4s, v12.s[0]
+	fmla  	v20.4s, v4.4s, v12.s[1]
+	fmla 	v24.4s, v4.4s, v14.s[0]
+	fmla  	v28.4s, v4.4s, v14.s[1]
+
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+        ld1     {v0.4s},[pA],#16
+
+.endm
+
+
+.macro KERNEL4x4_M1
+
+	fmla 	v16.4s, v0.4s, v8.s[0]
+	fmla 	v20.4s, v0.4s, v8.s[1]
+	fmla 	v24.4s, v0.4s, v10.s[0]
+	fmla 	v28.4s, v0.4s, v10.s[1]
+
+        ld1     {v12.2s},[pB],#8
+        ld1     {v14.2s},[pB],#8
+        ld1     {v4.4s},[pA],#16
+
+.endm
+
+
+
+.macro KERNEL4x4_E
+
+	fmla 	v16.4s, v4.4s, v12.s[0]
+	fmla 	v20.4s, v4.4s, v12.s[1]
+	fmla 	v24.4s, v4.4s, v14.s[0]
+	fmla 	v28.4s, v4.4s, v14.s[1]
+
+.endm
+
+
+
+
+.macro KERNEL4x4_SUB
+
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+	ld1	{v0.4s} , [pA],#16
+
+	fmla 	v16.4s, v0.4s, v8.s[0]
+	fmla 	v20.4s, v0.4s, v8.s[1]
+	fmla 	v24.4s, v0.4s, v10.s[0]
+	fmla 	v28.4s, v0.4s, v10.s[1]
+
+.endm
+
+
+
+
+.macro SAVE4x4
+
+	add	pCRow1, pCRow0, LDC    // create a second row pointer from the first row pointer
+	mov	v0.d[0], tempALPHA
+
+        ld1     {v8.4s},[pCRow0]   // load 4 values of C from first row
+        fmla     v8.4s ,v16.4s,v0.s[0]
+	st1 	{v8.4s},[pCRow0],#16 // store C from first row
+
+        ld1     {v12.4s},[pCRow1]   // load 4 values of C from second row
+        fmla     v12.4s ,v20.4s,v0.s[0]
+	st1 	{v12.4s},[pCRow1] // store C from second row
+
+	add	pCRow2, pCRow1, LDC        // Row2 points to third row 
+
+        ld1     {v8.4s},[pCRow2]   // load 4 values of C from third row
+        fmla     v8.4s ,v24.4s,v0.s[0]
+	st1 	{v8.4s} ,[pCRow2]  // store C from third row
+
+	add	pCRow1, pCRow2 , LDC // row1 points to fourth row
+
+        ld1     {v12.4s},[pCRow1]   // load 4 values of C from fourth row
+        fmla     v12.4s ,v28.4s,v0.s[0]
+	st1     {v12.4s},[pCRow1]  // store fourth row
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+// Zero the eight accumulators; fsub x,x,x gives NaN when x holds NaN or Inf.
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s20, s16
+	fmov		s21, s16
+	fmov		s24, s16
+	fmov		s25, s16
+	fmov		s28, s16
+	fmov		s29, s16
+
+.endm
+
+
+
+.macro KERNEL2x4_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+	ldr	s10, [ pB, #8 ]
+	ldr	s11, [ pB, #12 ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+
+	fmadd 	s20  , s0,  s9,	s20  
+	fmadd 	s21  , s1,  s9,	s21  
+
+	fmadd 	s24  , s0,  s10,	s24  
+	fmadd 	s25  , s1,  s10,	s25  
+
+	fmadd 	s28  , s0,  s11,	s28  
+	fmadd 	s29  , s1,  s11,	s29  
+	add	pA , pA, #8
+	add	pB , pB, #16
+
+.endm
+
+            #define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1
+            #define L1ST( op1, op2, op3) ldr op1, [op2,  op3]
+
+.macro SAVE2x4
+
+	add	pCRow1 , pCRow0, LDC
+	add	pCRow2  , pCRow1, LDC
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0)
+	L1ST (	s9,pCRow0, #4 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+
+	str 	s8 , [pCRow0, #0]
+	str 	s9 , [pCRow0, #4 ]
+
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4 ]
+
+	F1ST ( 	s12, s0 , s20)
+	F1ST ( 	s13, s0 , s21)
+
+	str 	s12, [pCRow1, #0]
+	str 	s13, [pCRow1, #4 ]
+
+	L1ST (	s8,pCRow2 , #0)
+	L1ST (	s9,pCRow2 , #4 )
+
+	F1ST ( 	s8 , s0 , s24)
+	F1ST ( 	s9 , s0 , s25)
+
+	str 	s8 , [pCRow2 , #0]
+	str 	s9 , [pCRow2 , #4 ]
+
+	add	pCRow1, pCRow2 , LDC
+
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4 ]
+
+	F1ST ( 	s12, s0 , s28)
+	F1ST ( 	s13, s0 , s29)
+
+	str 	s12, [pCRow1, #0]
+	str 	s13, [pCRow1, #4 ]
+
+	add	pCRow0, pCRow0, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x4
+// Zero the four accumulators; fsub x,x,x gives NaN when x holds NaN or Inf.
+	fmov		s16, wzr
+	fmov		s20, s16
+	fmov		s24, s16
+	fmov		s28, s16
+
+.endm
+
+
+
+.macro KERNEL1x4_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+	ldr	s10, [ pB, #8 ]
+	ldr	s11, [ pB, #12 ]
+
+	ldr	s0 , [ pA ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s20  , s0,  s9,	s20  
+	fmadd 	s24  , s0,  s10,	s24  
+	fmadd 	s28  , s0,  s11,	s28  
+
+	add	pA , pA, #4
+	add	pB , pB, #16
+
+.endm
+
+.macro SAVE1x4
+
+	add	pCRow1 , pCRow0, LDC
+	add	pCRow2  , pCRow1, LDC
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0)
+	F1ST ( 	s8 , s0 , s16)
+	str 	s8 , [pCRow0, #0]
+
+	L1ST (	s12,pCRow1, #0)
+	F1ST ( 	s12, s0 , s20)
+	str 	s12, [pCRow1, #0]
+
+	L1ST (	s8,pCRow2 , #0)
+	F1ST ( 	s8 , s0 , s24)
+	str 	s8 , [pCRow2 , #0]
+
+	add	pCRow1, pCRow2 , LDC
+
+	L1ST (	s12,pCRow1, #0)
+	F1ST ( 	s12, s0 , s28)
+	str 	s12, [pCRow1, #0]
+
+	add	pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x2
+// Zero the eight accumulators; fsub x,x,x gives NaN when x holds NaN or Inf.
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s18, s16
+	fmov		s19, s16
+	fmov		s20, s16
+	fmov		s21, s16
+	fmov		s22, s16
+	fmov		s23, s16
+
+.endm
+
+
+
+.macro KERNEL4x2_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+	ldr	s2 , [ pA, #8 ]
+	ldr	s3 , [ pA, #12 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+	fmadd 	s18  , s2,  s8,	s18  
+	fmadd 	s19  , s3,  s8,	s19  
+
+	fmadd 	s20  , s0,  s9,	s20  
+	fmadd 	s21  , s1,  s9,	s21  
+	fmadd 	s22  , s2,  s9,	s22  
+	fmadd 	s23  , s3,  s9,	s23  
+
+	add	pA , pA, #16
+	add	pB , pB, #8
+
+.endm
+
+.macro SAVE4x2
+
+	add	pCRow1 , pCRow0, LDC
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0)
+	L1ST (	s9,pCRow0, #4 )
+	L1ST (	s10,pCRow0, #8 )
+	L1ST (	s11,pCRow0, #12 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+	F1ST ( 	s10, s0 , s18)
+	F1ST ( 	s11, s0 , s19)
+
+	str 	s8 , [pCRow0]
+	str 	s9 , [pCRow0, #4 ]
+	str 	s10, [pCRow0, #8 ]
+	str 	s11, [pCRow0, #12 ]
+
+	L1ST (	s12,pCRow1, #0)
+	L1ST (	s13,pCRow1, #4 )
+	L1ST (	s14,pCRow1, #8 )
+	L1ST (	s15,pCRow1, #12 )
+
+	F1ST ( 	s12, s0 , s20)
+	F1ST ( 	s13, s0 , s21)
+	F1ST ( 	s14, s0 , s22)
+	F1ST ( 	s15, s0 , s23)
+
+	str 	s12, [pCRow1]
+	str 	s13, [pCRow1, #4 ]
+	str 	s14, [pCRow1, #8 ]
+	str 	s15, [pCRow1, #12 ]
+
+	add	pCRow0, pCRow0, #16
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x2
+// Zero the four accumulators; fsub x,x,x gives NaN when x holds NaN or Inf.
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s20, s16
+	fmov		s21, s16
+
+.endm
+
+
+
+.macro KERNEL2x2_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+
+	fmadd 	s20  , s0,  s9,	s20  
+	fmadd 	s21  , s1,  s9,	s21  
+
+	add	pA , pA, #8
+	add	pB , pB, #8
+
+.endm
+
+.macro SAVE2x2
+
+	add	pCRow1 , pCRow0, LDC
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0 )
+	L1ST (	s9,pCRow0, #4 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+
+	str 	s8 , [pCRow0]
+	str 	s9 , [pCRow0, #4 ]
+
+	L1ST (	s12,pCRow1, #0 )
+	L1ST (	s13,pCRow1, #4 )
+
+	F1ST ( 	s12, s0 , s20)
+	F1ST ( 	s13, s0 , s21)
+
+	str 	s12, [pCRow1]
+	str 	s13, [pCRow1, #4 ]
+
+	add	pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+// Zero both accumulators; fsub x,x,x gives NaN when x holds NaN or Inf.
+	fmov		s16, wzr
+	fmov		s20, s16
+
+.endm
+
+
+
+.macro KERNEL1x2_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+
+	ldr	s0 , [ pA ]
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s20  , s0,  s9,	s20  
+
+	add	pA , pA, #4
+	add	pB , pB, #8
+
+.endm
+
+.macro SAVE1x2
+
+	add	pCRow1 , pCRow0, LDC
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0)
+	F1ST ( 	s8 , s0 , s16)
+	str 	s8 , [pCRow0]
+
+	L1ST (	s12,pCRow1, #0)
+	F1ST ( 	s12, s0 , s20)
+	str 	s12, [pCRow1]
+
+	add	pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x1
+// Zero the four accumulators; fsub x,x,x gives NaN when x holds NaN or Inf.
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s18, s16
+	fmov		s19, s16
+
+.endm
+
+
+
+.macro KERNEL4x1_SUB
+
+	ldr	s8 , [ pB ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+	ldr	s2 , [ pA, #8 ]
+	ldr	s3 , [ pA, #12 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+	fmadd 	s18  , s2,  s8,	s18  
+	fmadd 	s19  , s3,  s8,	s19  
+
+	add	pA , pA, #16
+	add	pB , pB, #4
+
+.endm
+
+.macro SAVE4x1
+
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0 )
+	L1ST (	s9,pCRow0, #4 )
+	L1ST (	s10,pCRow0, #8 )
+	L1ST (	s11,pCRow0, #12 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+	F1ST ( 	s10, s0 , s18)
+	F1ST ( 	s11, s0 , s19)
+
+	str 	s8 , [pCRow0]
+	str 	s9 , [pCRow0, #4 ]
+	str 	s10, [pCRow0, #8 ]
+	str 	s11, [pCRow0, #12 ]
+
+	add	pCRow0, pCRow0, #16
+
+.endm
+
+
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+// Zero both accumulators; fsub x,x,x gives NaN when x holds NaN or Inf.
+	fmov		s16, wzr
+	fmov		s17, s16
+
+.endm
+
+
+
+.macro KERNEL2x1_SUB
+
+	ldr	s8 , [ pB ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+
+	add	pA , pA, #8
+	add	pB , pB, #4
+
+.endm
+
+.macro SAVE2x1
+
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0 )
+	L1ST (	s9,pCRow0, #4 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+
+	str 	s8 , [pCRow0]
+	str 	s9 , [pCRow0, #4 ]
+
+	add	pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+// Zero the accumulator; fsub x,x,x gives NaN when x holds NaN or Inf.
+	fmov		s16, wzr
+
+.endm
+
+
+
+.macro KERNEL1x1_SUB
+
+	ldr	s8 , [ pB ]
+
+	ldr	s0 , [ pA ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+
+	add	pA , pA, #4
+	add	pB , pB, #4
+
+.endm
+
+.macro SAVE1x1
+
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0 )
+	F1ST ( 	s8 , s0 , s16)
+	str 	s8 , [pCRow0]
+
+	add	pCRow0, pCRow0, #4
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+        add     sp,sp,#-(5*16)				// reserve 80 bytes of stack
+        stp     d8,d9,[sp,#(0*16)]			// d8-d15 are callee-saved in AAPCS64
+        stp     d10,d11,[sp,#(1*16)]
+        stp     d12,d13,[sp,#(2*16)]
+        stp     d14,d15,[sp,#(3*16)]
+        stp     d16,d17,[sp,#(4*16)]			// NOTE(review): v16/v17 are caller-saved in AAPCS64, this save looks unnecessary
+
+        mov     tempALPHA, v0.d[0]			// keep alpha in a general register, v0 is reloaded with A values
+	lsl	LDC, LDC, #2					// ldc = ldc * 4
+
+	mov	pB, origPB
+
+	mov	counterJ, origN
+	asr 	counterJ, counterJ, #2					// J = J / 4
+	cmp 	counterJ, #0
+	ble	sgemm_kernel_L2_BEGIN				// no full block of 4 columns
+
+sgemm_kernel_L4_BEGIN:
+
+	mov	pCRow0, pC						// pCRow0 = C
+        add     pC,pC,LDC, lsl #2
+
+	mov	pA, origPA						// pA = start of A array
+
+
+
+sgemm_kernel_L4_M4_BEGIN:
+
+	mov	counterI, origM
+	asr 	counterI, counterI, #2					// counterI = counterI / 4
+	cmp 	counterI, #0
+	ble	sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+	mov	pB, origPB
+	asr 	counterL , origK, #1					// L = K / 2
+	cmp	counterL , #2                                           // is there at least 4 to do?
+	blt	sgemm_kernel_L4_M4_32
+
+
+
+	KERNEL4x4_I     //do one in the K
+	KERNEL4x4_M2    //do another in the K
+
+	subs	counterL, counterL, #2  // subtract 2, since one is always done at the tail
+	ble	sgemm_kernel_L4_M4_22a
+	.align 5
+
+sgemm_kernel_L4_M4_22:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	 sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:   // less than 4 to do in the K direction
+
+	tst	counterL, #1
+	ble	sgemm_kernel_L4_M4_40
+
+	KERNEL4x4_I
+
+	KERNEL4x4_E
+
+	b	 sgemm_kernel_L4_M4_44
+
+
+sgemm_kernel_L4_M4_40:
+
+	INIT4x4
+
+
+sgemm_kernel_L4_M4_44:
+
+	ands	counterL , origK, #1
+	ble	sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+	KERNEL4x4_SUB
+
+	subs	counterL, counterL, #1
+	bne	sgemm_kernel_L4_M4_46
+
+sgemm_kernel_L4_M4_100:
+
+	SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+	subs	counterI, counterI, #1
+	bne	sgemm_kernel_L4_M4_20
+
+
+sgemm_kernel_L4_M2_BEGIN:					// M remainder after the 4-row blocks
+
+	mov	counterI, origM
+	tst	counterI , #3
+	ble	sgemm_kernel_L4_END				// after tst, ble acts as beq: M % 4 == 0, nothing left
+
+	tst	counterI, #2					// is bit 1 of M set (a 2-row block left)?
+	ble	sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+	INIT2x4
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+	ands	counterL , origK, #7					// counterL = counterL % 8
+	ble	sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+	KERNEL2x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+	SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+
+sgemm_kernel_L4_M1_BEGIN:
+
+	tst	counterI, #1					// counterI = counterI % 2
+	ble	sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+	INIT1x4
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+	ands	counterL , origK, #7					// counterL = counterL % 8
+	ble	sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+	KERNEL1x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+	SAVE1x4
+
+
+sgemm_kernel_L4_END:
+
+	add	origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+
+	subs	counterJ, counterJ , #1						// j--
+	bgt	sgemm_kernel_L4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+sgemm_kernel_L2_BEGIN:   // N remainder: fewer than 4 columns left
+
+	mov	counterJ , origN
+	tst	counterJ , #3
+	ble	sgemm_kernel_L999   // after tst, ble acts as beq: N % 4 == 0, nothing left to do
+
+	tst	counterJ , #2
+	ble	sgemm_kernel_L1_BEGIN	// bit 1 of N clear: no 2-column block, try the single column
+
+	mov	pCRow0, pC						// pCRow0 = pC
+	add	pC , pC, LDC, lsl #1					// pC += 2 * LDC (LDC was scaled to bytes in the prologue)
+
+	mov	pA, origPA						// pA = A
+
+
+
+sgemm_kernel_L2_M4_BEGIN:
+
+	mov	counterI, origM
+	asr 	counterI, counterI, #2					// counterI = counterI / 4
+	cmp	counterI,#0
+	ble	sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+	INIT4x2
+
+	mov	pB, origPB
+	asr	counterL , origK, #3					// counterL = counterL / 8
+	cmp	counterL,#0
+	ble	sgemm_kernel_L2_M4_40
+	.align 5
+
+sgemm_kernel_L2_M4_22:
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+	ands	counterL , origK, #7					// counterL = counterL % 8
+	ble	sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+	KERNEL4x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+	SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+	subs	counterI, counterI, #1
+	bgt	sgemm_kernel_L2_M4_20
+
+
+sgemm_kernel_L2_M2_BEGIN:					// M remainder after the 4-row blocks
+
+	mov	counterI, origM
+	tst	counterI , #3
+	ble	sgemm_kernel_L2_END				// after tst, ble acts as beq: M % 4 == 0, nothing left
+
+	tst	counterI, #2					// is bit 1 of M set (a 2-row block left)?
+	ble	sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+	INIT2x2
+
+	mov	pB, origPB
+	asr	counterL , origK, #3					// counterL = counterL / 8
+        cmp	counterL,#0
+	ble	sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+	ands	counterL , origK, #7					// counterL = counterL % 8
+	ble	sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+	KERNEL2x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+	SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+
+sgemm_kernel_L2_M1_BEGIN:
+
+	tst	counterI, #1					// counterI = counterI % 2
+	ble	sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+	INIT1x2
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = counterL / 8
+        cmp     counterL, #0
+	ble	sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+	ands	counterL , origK, #7					// counterL = counterL % 8
+	ble	sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+	KERNEL1x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+	SAVE1x2
+
+
+sgemm_kernel_L2_END:
+	add	origPB, origPB, origK, lsl #3					// B = B + K * 2 * 4
+
+/*********************************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+	mov	counterJ , origN
+	tst	counterJ , #1
+	ble	sgemm_kernel_L999 // done
+
+
+	mov	pCRow0, pC						// pCRow0 = C
+	add	pC , pCRow0 , LDC                                 // C01 is the current line, update pC to point to next
+
+	mov	pA, origPA						// pA = A
+
+
+
+sgemm_kernel_L1_M4_BEGIN:
+
+	mov	counterI, origM
+	asr 	counterI, counterI, #2					// counterI = counterI / 4
+	cmp	counterI, #0
+	ble	sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+	INIT4x1
+
+	mov	pB, origPB
+	asr	counterL , origK, #3					// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L1_M4_40
+	.align 5
+
+sgemm_kernel_L1_M4_22:
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+	ands	counterL , origK, #7					// counterL = counterL % 8
+	ble	sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+	KERNEL4x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+	SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+	subs	counterI, counterI, #1
+	bgt	sgemm_kernel_L1_M4_20
+
+
+sgemm_kernel_L1_M2_BEGIN:					// M remainder after the 4-row blocks
+
+	mov	counterI, origM
+	tst	counterI , #3
+	ble	sgemm_kernel_L1_END				// after tst, ble acts as beq: M % 4 == 0, nothing left
+
+	tst	counterI, #2					// is bit 1 of M set (a 2-row block left)?
+	ble	sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+	INIT2x1
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+	ands	counterL , origK, #7					// counterL = counterL % 8
+	ble	sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+	KERNEL2x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+	SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+
+sgemm_kernel_L1_M1_BEGIN:
+
+	tst	counterI, #1					// counterI = counterI % 2
+	ble	sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+	INIT1x1
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+	ands	counterL , origK, #7					// counterL = counterL % 8
+	ble	sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+	KERNEL1x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+	SAVE1x1
+
+
+sgemm_kernel_L1_END:
+
+
+sgemm_kernel_L999:
+	mov	x0, #0						// set return value
+        ldp     d8,d9,[sp,#(0*16)]
+        ldp     d10,d11,[sp,#(1*16)]
+        ldp     d12,d13,[sp,#(2*16)]
+        ldp     d14,d15,[sp,#(3*16)]
+        ldp     d16,d17,[sp,#(4*16)]
+        add     sp,sp,#(5*16)
+	ret
+
+	EPILOGUE
+
diff --git a/kernel/generic/trmmkernel_4x4.c b/kernel/generic/trmmkernel_4x4.c
new file mode 100644
index 00000000..a85828ca
--- /dev/null
+++ b/kernel/generic/trmmkernel_4x4.c
@@ -0,0 +1,875 @@
+#include "common.h"
+#include <stdbool.h>
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+   // TRMM kernel: for each tile of C, sums products of values read from ba and bb over a k range derived from 'offset', scales by alpha and stores (overwrites) the tile. Always returns 0.
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+   // res<c>_<r> accumulates the tile entry that is finally stored to C<c>[r].
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+
+   FLOAT res2_0;
+   FLOAT res2_1;
+   FLOAT res2_2;
+   FLOAT res2_3;
+
+   FLOAT res3_0;
+   FLOAT res3_1;
+   FLOAT res3_2;
+   FLOAT res3_3;
+   // a0/a1: values loaded from ptrba for the current k step.
+   FLOAT a0;
+   FLOAT a1;
+   // b0..b3: values loaded from ptrbb for the current k step.
+   FLOAT b0;
+   FLOAT b1;
+   FLOAT b2;
+   FLOAT b3;
+   // off: running offset into the k range; temp: k-loop trip count or skip length.
+   BLASLONG off, temp;
+   // LEFT / TRANSA build macros mirrored into runtime flags; they are read by the Mx4 panel setup and the 4x4 tile, all other tiles test the macros directly.
+   bool left;
+   bool transposed;
+   bool backwards;
+
+#ifdef LEFT
+   left = true;
+#else
+   left = false;
+#endif
+
+#ifdef TRANSA
+   transposed = true;
+#else
+   transposed = false;
+#endif
+
+   backwards = left != transposed; // true for (LEFT, !TRANSA) and (!LEFT, TRANSA)
+   // Non-LEFT builds start from the negated offset.
+   if (!left) {
+      off = -offset;
+   }
+   // NOTE(review): for LEFT builds 'off' is only assigned inside the column-panel loops, and the Mx2/Mx1 loops assign it only when TRMMKERNEL is defined - presumably this file is always built with TRMMKERNEL; confirm.
+
+   for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+        // C0..C3 address four consecutive columns of C, ldc elements apart.
+
+        if (left) {
+            off = offset;
+        }
+        // Every column panel restarts at the beginning of ba.
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x4
+	{
+		// 'backwards' tiles skip the first 'off' k steps of both buffers (4 values per step each).
+		ptrbb = bb;
+                if (backwards)
+                {
+		   ptrba += off*4; // number of values in A
+		   ptrbb += off*4; // number of values in B
+                }
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		// Trip count: the remaining bk-off steps when backwards, otherwise the first off+4 steps (both non-backwards branches are identical for a 4x4 tile).
+                temp = backwards ? bk-off :
+                             left ? off + 4 : // number of values in A
+                                    off + 4;  // number of values in B
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+4;
+                }
+		// Scale the accumulated tile by alpha.
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		// Store the tile; the previous contents of C are overwritten, not accumulated into.
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		// Non-backwards tiles advance both pointers past the bk-off-4 k steps that were not multiplied.
+		if (!backwards) {
+                    temp = bk-off;
+                    temp = left ? temp - 4 : // number of values in A
+                                  temp - 4;  // number of values in B
+
+                    ptrba += temp*4; // number of values in A
+		    ptrbb += temp*4; // number of values in B
+                }
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x4 loop
+	{
+		// Same scheme as the 4x4 tile, selected by the preprocessor: 2 values from ptrba and 4 from ptrbb per k step.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x4 loop
+	{
+		// 1 value from ptrba and 4 from ptrbb per k step.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		res2_0 *= alpha;
+
+		res3_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+		C2[0] = res2_0;
+
+		C3[0] = res3_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 4;
+#endif
+
+        k = (bk<<2); // bk*4 values: the part of bb consumed by this 4-column panel
+        bb = bb+k;
+        i = (ldc<<2); // four columns of C
+        C = C+i;
+    }
+
+   for (j=0; j<(bn&2); j+=2) // do the Mx2 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+		off = offset;
+#endif
+        // This loop body runs at most once (bn&2 is 0 or 2) and covers a leftover pair of columns.
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x2
+	{
+		// 4 values from ptrba and 2 from ptrbb per k step.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x2 loop
+	{
+		// 2 values from ptrba and 2 from ptrbb per k step.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x2 loop
+	{
+		// 1 value from ptrba and 2 from ptrbb per k step.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+
+		res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 2;
+#endif
+
+        k = (bk<<1); // bk*2 values: the part of bb consumed by this 2-column panel
+        bb = bb+k;
+        i = (ldc<<1); // two columns of C
+        C = C+i;
+    }
+
+
+
+
+
+
+
+   for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
+   {
+        C0 = C;
+        // This loop body runs at most once (bn&1 is 0 or 1) and covers a single leftover column.
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+	off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
+	{
+		// 4 values from ptrba and 1 from ptrbb per k step.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x1 loop
+	{
+		// 2 values from ptrba and 1 from ptrbb per k step.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x1 loop
+	{
+		// 1 value from ptrba and 1 from ptrbb per k step.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+
+		C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+
+	}
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 1;
+#endif
+
+        k = (bk<<0); // bk values: the part of bb consumed by this single column
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}
diff --git a/param.h b/param.h
index 3e20f588..d7a427b6 100644
--- a/param.h
+++ b/param.h
@@ -2039,8 +2039,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M  2
-#define SGEMM_DEFAULT_UNROLL_N  2
+#define SGEMM_DEFAULT_UNROLL_M  4
+#define SGEMM_DEFAULT_UNROLL_N  4
 
 #define DGEMM_DEFAULT_UNROLL_M  2
 #define DGEMM_DEFAULT_UNROLL_N  2
-- 
2.34.1