--- /dev/null
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/23 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+*
+* 2013/11/02 Saar
+* UNROLL_N 4
+* UNROLL_M 4
+* DGEMM_P 128
+* DGEMM_Q 240
+* DGEMM_R 12288
+* A_PRE 128
+* B_PRE 128
+* C_PRE 32
+*
+* Performance on Odroid U2:
+*
+* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS
+* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS
+* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS
+* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6*/
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
+
+
+// Symbolic names for the general-purpose registers used by this kernel.
+// x0-x6 carry the incoming integer/pointer arguments in the order given by
+// the prototype comment above; x8-x16 are working registers (loop counters
+// and moving pointers). "offset" (x7) is named here but is not referenced by
+// any instruction visible in this file.
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define pB x10
+#define counterJ x11
+#define tempALPHA x12
+#define pCRow0 x13
+#define pCRow1 x14
+#define pCRow2 x15
+#define pA x16
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 pB
+// 11 counterJ
+// 12 tempALPHA
+// 13 pCRow0
+// 14 pCRow1
+// 15 pCRow2
+// 16 pA
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 orig ALPHA -> a00
+//v01 a01
+//v02 a02
+//v03 a03
+//v04 a10
+//v05 a11
+//v06 a12
+//v07 a13
+//v08 must save b00
+//v09 must save b01
+//v10 must save b02
+//v11 must save b03
+//v12 must save b10
+//v13 must save b11
+//v14 must save b12
+//v15 must save b13
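+//v16 C00 (caller-saved under AAPCS64; the prologue spills d16 anyway)
+//v17 C01 (caller-saved under AAPCS64; the prologue spills d17 anyway)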
+//v18 C02
+//v19 C03
+//v20 C10
+//v21 C11
+//v22 C12
+//v23 C13
+//v24 C20
+//v25 C21
+//v26 C22
+//v27 C23
+//v28 C30
+//v29 C31
+//v30 C32
+//v31 C33
+
+// add sp,sp,#-(6*16)
+// stp x18,x19,[sp,#(0*16)]
+// stp x20,x21,[sp,#(1*16)]
+
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// INIT4x4: clear the four 4-lane accumulators (v16, v20, v24, v28) of the
+// 4x4 tile. MOVI is used rather than "fsub v, v, v": x - x evaluates to NaN
+// whenever the stale register content is NaN or +/-Inf, which would poison
+// every value later accumulated into the tile.
+.macro INIT4x4
+
+	movi	v16.4s, #0
+	movi	v20.4s, #0
+	movi	v24.4s, #0
+	movi	v28.4s, #0
+
+.endm
+
+// KERNEL4x4_I: first step of the software-pipelined 4x4 loop.
+// Loads one 4-float vector from pA and four floats from pB (two per
+// register, in v8 and v10), INITIALISES the accumulators v16/v20/v24/v28
+// with their products (so no separate INIT4x4 is needed on this path), and
+// then preloads v4, v12 and v14 for the following KERNEL4x4_M2/KERNEL4x4_E.
+//
+// Plain FMUL is used: FMULX differs from FMUL only in returning +/-2.0 for
+// 0 x Inf, where an IEEE product (and the FMLA steps that follow) gives NaN.
+// Lanes are written in the standard "vN.s[i]" form, matching the other
+// macros in this file.
+.macro KERNEL4x4_I
+
+	ld1	{v8.2s},[pB],#8
+	ld1	{v10.2s},[pB],#8
+	ld1	{v0.4s},[pA],#16
+
+	fmul	v16.4s, v0.4s, v8.s[0]
+	fmul	v20.4s, v0.4s, v8.s[1]
+	fmul	v24.4s, v0.4s, v10.s[0]
+	fmul	v28.4s, v0.4s, v10.s[1]
+
+	ld1	{v12.2s},[pB],#8	// for next round
+	ld1	{v14.2s},[pB],#8	// for next round
+	ld1	{v4.4s},[pA],#16	// for next round
+
+
+.endm
+
+
+// KERNEL4x4_M2: pipelined step that consumes the operands preloaded into
+// v4 (from pA) and v12/v14 (from pB), then fetches the next operand set
+// into v0 and v8/v10.
+.macro KERNEL4x4_M2
+
+	fmla	v16.4s, v4.4s, v12.s[0]	// v16 += v4 * first B value
+	fmla	v20.4s, v4.4s, v12.s[1]	// v20 += v4 * second B value
+	fmla	v24.4s, v4.4s, v14.s[0]	// v24 += v4 * third B value
+	fmla	v28.4s, v4.4s, v14.s[1]	// v28 += v4 * fourth B value
+
+	ld1	{v0.4s},[pA],#16	// next four A values
+	ld1	{v8.2s},[pB],#8		// next B values 0-1
+	ld1	{v10.2s},[pB],#8	// next B values 2-3
+
+.endm
+
+
+// KERNEL4x4_M1: pipelined step that consumes the operands held in v0
+// (from pA) and v8/v10 (from pB), then fetches the next operand set into
+// v4 and v12/v14.
+.macro KERNEL4x4_M1
+
+	fmla	v16.4s, v0.4s, v8.s[0]	// v16 += v0 * first B value
+	fmla	v20.4s, v0.4s, v8.s[1]	// v20 += v0 * second B value
+	fmla	v24.4s, v0.4s, v10.s[0]	// v24 += v0 * third B value
+	fmla	v28.4s, v0.4s, v10.s[1]	// v28 += v0 * fourth B value
+
+	ld1	{v4.4s},[pA],#16	// next four A values
+	ld1	{v12.2s},[pB],#8	// next B values 0-1
+	ld1	{v14.2s},[pB],#8	// next B values 2-3
+
+.endm
+
+
+
+// KERNEL4x4_E: final pipelined step. Accumulates the operands already
+// held in v4 and v12/v14 and loads nothing further.
+.macro KERNEL4x4_E
+
+	fmla	v28.4s, v4.4s, v14.s[1]
+	fmla	v24.4s, v4.4s, v14.s[0]
+	fmla	v20.4s, v4.4s, v12.s[1]
+	fmla	v16.4s, v4.4s, v12.s[0]
+
+.endm
+
+
+
+
+// KERNEL4x4_SUB: one self-contained (non-pipelined) step of the 4x4 tile.
+// Loads four values from pA and four from pB, advancing both pointers by
+// 16 bytes, and accumulates into v16/v20/v24/v28.
+.macro KERNEL4x4_SUB
+
+	ld1	{v0.4s},[pA],#16	// four A values
+	ld1	{v8.2s},[pB],#8		// B values 0-1
+	ld1	{v10.2s},[pB],#8	// B values 2-3
+
+	fmla	v16.4s, v0.4s, v8.s[0]
+	fmla	v20.4s, v0.4s, v8.s[1]
+	fmla	v24.4s, v0.4s, v10.s[0]
+	fmla	v28.4s, v0.4s, v10.s[1]
+
+.endm
+
+
+
+
+// SAVE4x4: C += alpha * accumulators for a 4x4 tile.
+// Four groups of four floats, LDC bytes apart starting at pCRow0, are
+// updated from v16, v20, v24 and v28 respectively. On exit pCRow0 has
+// advanced by 16 bytes; pCRow1 and pCRow2 are left pointing at the fourth
+// and third group.
+.macro SAVE4x4
+
+	mov	v0.d[0], tempALPHA		// alpha back into lane s[0] of v0
+	add	pCRow1, pCRow0, LDC		// second group
+	add	pCRow2, pCRow1, LDC		// third group
+
+	ld1	{v8.4s},[pCRow0]
+	fmla	v8.4s, v16.4s, v0.s[0]
+	st1	{v8.4s},[pCRow0],#16		// store and step pCRow0 to the next tile
+
+	ld1	{v12.4s},[pCRow1]
+	fmla	v12.4s, v20.4s, v0.s[0]
+	st1	{v12.4s},[pCRow1]
+
+	ld1	{v8.4s},[pCRow2]
+	fmla	v8.4s, v24.4s, v0.s[0]
+	st1	{v8.4s},[pCRow2]
+
+	add	pCRow1, pCRow2, LDC		// fourth group
+
+	ld1	{v12.4s},[pCRow1]
+	fmla	v12.4s, v28.4s, v0.s[0]
+	st1	{v12.4s},[pCRow1]
+
+.endm
+
+/******************************************************************************/
+
+// INIT2x4: zero the eight scalar accumulators of the 2x4 tile
+// (s16/s17, s20/s21, s24/s25, s28/s29).
+// Each register is loaded from wzr instead of computing "fsub s16, s16, s16":
+// x - x is NaN when the stale register holds NaN or +/-Inf, and that NaN
+// would then be copied into every accumulator.
+.macro INIT2x4
+
+	fmov	s16, wzr
+	fmov	s17, wzr
+	fmov	s20, wzr
+	fmov	s21, wzr
+	fmov	s24, wzr
+	fmov	s25, wzr
+	fmov	s28, wzr
+	fmov	s29, wzr
+
+.endm
+
+
+
+// KERNEL2x4_SUB: one step of the 2x4 tile using scalar FMAs.
+// Reads two floats from pA and four from pB, advances pA by 8 and pB by
+// 16 bytes, and accumulates the eight products.
+.macro KERNEL2x4_SUB
+
+	ldr	s0, [pA]
+	ldr	s1, [pA, #4]
+	add	pA, pA, #8
+
+	ldr	s8,  [pB]
+	ldr	s9,  [pB, #4]
+	ldr	s10, [pB, #8]
+	ldr	s11, [pB, #12]
+	add	pB, pB, #16
+
+	fmadd	s16, s0, s8,  s16	// products with the first A value
+	fmadd	s20, s0, s9,  s20
+	fmadd	s24, s0, s10, s24
+	fmadd	s28, s0, s11, s28
+
+	fmadd	s17, s1, s8,  s17	// products with the second A value
+	fmadd	s21, s1, s9,  s21
+	fmadd	s25, s1, s10, s25
+	fmadd	s29, s1, s11, s29
+
+.endm
+
+// Preprocessor helpers used by the SAVE macros below.
+// F1ST(op1, op2, op3): op1 = op1 + op2 * op3 (scalar fused multiply-add).
+// L1ST(op1, op2, op3): load op1 from the address op2 + op3.
+ #define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1
+ #define L1ST( op1, op2, op3) ldr op1, [op2, op3]
+
+// SAVE2x4: C += alpha * accumulators for a 2x4 tile.
+// Two floats are updated at each of pCRow0, +LDC, +2*LDC and +3*LDC.
+// On exit pCRow0 has advanced by 8 bytes; pCRow1 and pCRow2 are left at the
+// fourth and third group. The load/FMA steps are written out directly
+// instead of going through the L1ST/F1ST helpers.
+.macro SAVE2x4
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha
+	add	pCRow1, pCRow0, LDC
+	add	pCRow2, pCRow1, LDC
+
+	// first group
+	ldr	s8, [pCRow0, #0]
+	ldr	s9, [pCRow0, #4]
+	fmadd	s8, s0, s16, s8
+	fmadd	s9, s0, s17, s9
+	str	s8, [pCRow0, #0]
+	str	s9, [pCRow0, #4]
+
+	// second group
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4]
+	fmadd	s12, s0, s20, s12
+	fmadd	s13, s0, s21, s13
+	str	s12, [pCRow1, #0]
+	str	s13, [pCRow1, #4]
+
+	// third group
+	ldr	s8, [pCRow2, #0]
+	ldr	s9, [pCRow2, #4]
+	fmadd	s8, s0, s24, s8
+	fmadd	s9, s0, s25, s9
+	str	s8, [pCRow2, #0]
+	str	s9, [pCRow2, #4]
+
+	// fourth group, reached by reusing pCRow1
+	add	pCRow1, pCRow2, LDC
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4]
+	fmadd	s12, s0, s28, s12
+	fmadd	s13, s0, s29, s13
+	str	s12, [pCRow1, #0]
+	str	s13, [pCRow1, #4]
+
+	add	pCRow0, pCRow0, #8	// step to the next tile
+
+.endm
+
+
+/******************************************************************************/
+
+// INIT1x4: zero the four scalar accumulators of the 1x4 tile
+// (s16, s20, s24, s28). Loaded from wzr rather than "fsub s16, s16, s16",
+// which yields NaN if the stale register holds NaN or +/-Inf.
+.macro INIT1x4
+
+	fmov	s16, wzr
+	fmov	s20, wzr
+	fmov	s24, wzr
+	fmov	s28, wzr
+
+.endm
+
+
+
+// KERNEL1x4_SUB: one step of the 1x4 tile.
+// Reads one float from pA and four from pB, advances pA by 4 and pB by
+// 16 bytes, and accumulates the four products.
+.macro KERNEL1x4_SUB
+
+	ldr	s0, [pA]
+	add	pA, pA, #4
+
+	ldr	s8,  [pB]
+	ldr	s9,  [pB, #4]
+	ldr	s10, [pB, #8]
+	ldr	s11, [pB, #12]
+	add	pB, pB, #16
+
+	fmadd	s16, s0, s8,  s16
+	fmadd	s20, s0, s9,  s20
+	fmadd	s24, s0, s10, s24
+	fmadd	s28, s0, s11, s28
+
+.endm
+
+// SAVE1x4: C += alpha * accumulators for a 1x4 tile.
+// One float is updated at each of pCRow0, +LDC, +2*LDC and +3*LDC.
+// On exit pCRow0 has advanced by 4 bytes. The L1ST/F1ST helpers are
+// written out as plain ldr/fmadd.
+.macro SAVE1x4
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha
+	add	pCRow1, pCRow0, LDC
+	add	pCRow2, pCRow1, LDC
+
+	ldr	s8, [pCRow0, #0]
+	fmadd	s8, s0, s16, s8
+	str	s8, [pCRow0, #0]
+
+	ldr	s12, [pCRow1, #0]
+	fmadd	s12, s0, s20, s12
+	str	s12, [pCRow1, #0]
+
+	ldr	s8, [pCRow2, #0]
+	fmadd	s8, s0, s24, s8
+	str	s8, [pCRow2, #0]
+
+	add	pCRow1, pCRow2, LDC	// fourth group, reusing pCRow1
+
+	ldr	s12, [pCRow1, #0]
+	fmadd	s12, s0, s28, s12
+	str	s12, [pCRow1, #0]
+
+	add	pCRow0, pCRow0, #4	// step to the next tile
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+// INIT4x2: zero the eight scalar accumulators of the 4x2 tile
+// (s16-s19 and s20-s23). Loaded from wzr rather than
+// "fsub s16, s16, s16", which yields NaN if the stale register holds NaN
+// or +/-Inf and would then be copied into every accumulator.
+.macro INIT4x2
+
+	fmov	s16, wzr
+	fmov	s17, wzr
+	fmov	s18, wzr
+	fmov	s19, wzr
+	fmov	s20, wzr
+	fmov	s21, wzr
+	fmov	s22, wzr
+	fmov	s23, wzr
+
+.endm
+
+
+
+// KERNEL4x2_SUB: one step of the 4x2 tile.
+// Reads four floats from pA and two from pB, advances pA by 16 and pB by
+// 8 bytes, and accumulates the eight products.
+.macro KERNEL4x2_SUB
+
+	ldr	s0, [pA]
+	ldr	s1, [pA, #4]
+	ldr	s2, [pA, #8]
+	ldr	s3, [pA, #12]
+	add	pA, pA, #16
+
+	ldr	s8, [pB]
+	ldr	s9, [pB, #4]
+	add	pB, pB, #8
+
+	fmadd	s16, s0, s8, s16	// products with the first B value
+	fmadd	s17, s1, s8, s17
+	fmadd	s18, s2, s8, s18
+	fmadd	s19, s3, s8, s19
+
+	fmadd	s20, s0, s9, s20	// products with the second B value
+	fmadd	s21, s1, s9, s21
+	fmadd	s22, s2, s9, s22
+	fmadd	s23, s3, s9, s23
+
+.endm
+
+// SAVE4x2: C += alpha * accumulators for a 4x2 tile.
+// Four floats are updated at pCRow0 and four at pCRow0 + LDC.
+// On exit pCRow0 has advanced by 16 bytes. The L1ST/F1ST helpers are
+// written out as plain ldr/fmadd.
+.macro SAVE4x2
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha
+	add	pCRow1, pCRow0, LDC
+
+	// first group
+	ldr	s8,  [pCRow0, #0]
+	ldr	s9,  [pCRow0, #4]
+	ldr	s10, [pCRow0, #8]
+	ldr	s11, [pCRow0, #12]
+
+	fmadd	s8,  s0, s16, s8
+	fmadd	s9,  s0, s17, s9
+	fmadd	s10, s0, s18, s10
+	fmadd	s11, s0, s19, s11
+
+	str	s8,  [pCRow0]
+	str	s9,  [pCRow0, #4]
+	str	s10, [pCRow0, #8]
+	str	s11, [pCRow0, #12]
+
+	// second group
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4]
+	ldr	s14, [pCRow1, #8]
+	ldr	s15, [pCRow1, #12]
+
+	fmadd	s12, s0, s20, s12
+	fmadd	s13, s0, s21, s13
+	fmadd	s14, s0, s22, s14
+	fmadd	s15, s0, s23, s15
+
+	str	s12, [pCRow1]
+	str	s13, [pCRow1, #4]
+	str	s14, [pCRow1, #8]
+	str	s15, [pCRow1, #12]
+
+	add	pCRow0, pCRow0, #16	// step to the next tile
+
+.endm
+
+
+/******************************************************************************/
+
+// INIT2x2: zero the four scalar accumulators of the 2x2 tile
+// (s16, s17, s20, s21). Loaded from wzr rather than "fsub s16, s16, s16",
+// which yields NaN if the stale register holds NaN or +/-Inf.
+.macro INIT2x2
+
+	fmov	s16, wzr
+	fmov	s17, wzr
+	fmov	s20, wzr
+	fmov	s21, wzr
+
+.endm
+
+
+
+// KERNEL2x2_SUB: one step of the 2x2 tile.
+// Reads two floats from pA and two from pB, advances both pointers by
+// 8 bytes, and accumulates the four products.
+.macro KERNEL2x2_SUB
+
+	ldr	s0, [pA]
+	ldr	s1, [pA, #4]
+	add	pA, pA, #8
+
+	ldr	s8, [pB]
+	ldr	s9, [pB, #4]
+	add	pB, pB, #8
+
+	fmadd	s16, s0, s8, s16
+	fmadd	s20, s0, s9, s20
+
+	fmadd	s17, s1, s8, s17
+	fmadd	s21, s1, s9, s21
+
+.endm
+
+// SAVE2x2: C += alpha * accumulators for a 2x2 tile.
+// Two floats are updated at pCRow0 and two at pCRow0 + LDC.
+// On exit pCRow0 has advanced by 8 bytes. The L1ST/F1ST helpers are
+// written out as plain ldr/fmadd.
+.macro SAVE2x2
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha
+	add	pCRow1, pCRow0, LDC
+
+	ldr	s8, [pCRow0, #0]
+	ldr	s9, [pCRow0, #4]
+	fmadd	s8, s0, s16, s8
+	fmadd	s9, s0, s17, s9
+	str	s8, [pCRow0]
+	str	s9, [pCRow0, #4]
+
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4]
+	fmadd	s12, s0, s20, s12
+	fmadd	s13, s0, s21, s13
+	str	s12, [pCRow1]
+	str	s13, [pCRow1, #4]
+
+	add	pCRow0, pCRow0, #8	// step to the next tile
+
+.endm
+
+/******************************************************************************/
+
+// INIT1x2: zero the two scalar accumulators of the 1x2 tile (s16, s20).
+// Loaded from wzr rather than "fsub s16, s16, s16", which yields NaN if
+// the stale register holds NaN or +/-Inf.
+.macro INIT1x2
+
+	fmov	s16, wzr
+	fmov	s20, wzr
+
+.endm
+
+
+
+// KERNEL1x2_SUB: one step of the 1x2 tile.
+// Reads one float from pA and two from pB, advances pA by 4 and pB by
+// 8 bytes, and accumulates the two products.
+.macro KERNEL1x2_SUB
+
+	ldr	s0, [pA]
+	add	pA, pA, #4
+
+	ldr	s8, [pB]
+	ldr	s9, [pB, #4]
+	add	pB, pB, #8
+
+	fmadd	s16, s0, s8, s16
+	fmadd	s20, s0, s9, s20
+
+.endm
+
+// SAVE1x2: C += alpha * accumulators for a 1x2 tile.
+// One float is updated at pCRow0 and one at pCRow0 + LDC.
+// On exit pCRow0 has advanced by 4 bytes. The L1ST/F1ST helpers are
+// written out as plain ldr/fmadd.
+.macro SAVE1x2
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha
+	add	pCRow1, pCRow0, LDC
+
+	ldr	s8, [pCRow0, #0]
+	fmadd	s8, s0, s16, s8
+	str	s8, [pCRow0]
+
+	ldr	s12, [pCRow1, #0]
+	fmadd	s12, s0, s20, s12
+	str	s12, [pCRow1]
+
+	add	pCRow0, pCRow0, #4	// step to the next tile
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+// INIT4x1: zero the four scalar accumulators of the 4x1 tile (s16-s19).
+// Loaded from wzr rather than "fsub s16, s16, s16", which yields NaN if
+// the stale register holds NaN or +/-Inf.
+.macro INIT4x1
+
+	fmov	s16, wzr
+	fmov	s17, wzr
+	fmov	s18, wzr
+	fmov	s19, wzr
+
+.endm
+
+
+
+// KERNEL4x1_SUB: one step of the 4x1 tile.
+// Reads four floats from pA and one from pB, advances pA by 16 and pB by
+// 4 bytes, and accumulates the four products.
+.macro KERNEL4x1_SUB
+
+	ldr	s0, [pA]
+	ldr	s1, [pA, #4]
+	ldr	s2, [pA, #8]
+	ldr	s3, [pA, #12]
+	add	pA, pA, #16
+
+	ldr	s8, [pB]
+	add	pB, pB, #4
+
+	fmadd	s16, s0, s8, s16
+	fmadd	s17, s1, s8, s17
+	fmadd	s18, s2, s8, s18
+	fmadd	s19, s3, s8, s19
+
+.endm
+
+// SAVE4x1: C += alpha * accumulators for a 4x1 tile.
+// Four consecutive floats at pCRow0 are updated from s16-s19.
+// On exit pCRow0 has advanced by 16 bytes. The L1ST/F1ST helpers are
+// written out as plain ldr/fmadd.
+.macro SAVE4x1
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha
+
+	ldr	s8,  [pCRow0, #0]
+	ldr	s9,  [pCRow0, #4]
+	ldr	s10, [pCRow0, #8]
+	ldr	s11, [pCRow0, #12]
+
+	fmadd	s8,  s0, s16, s8
+	fmadd	s9,  s0, s17, s9
+	fmadd	s10, s0, s18, s10
+	fmadd	s11, s0, s19, s11
+
+	str	s8,  [pCRow0]
+	str	s9,  [pCRow0, #4]
+	str	s10, [pCRow0, #8]
+	str	s11, [pCRow0, #12]
+
+	add	pCRow0, pCRow0, #16	// step to the next tile
+
+.endm
+
+
+
+
+/******************************************************************************/
+
+// INIT2x1: zero the two scalar accumulators of the 2x1 tile (s16, s17).
+// Loaded from wzr rather than "fsub s16, s16, s16", which yields NaN if
+// the stale register holds NaN or +/-Inf.
+.macro INIT2x1
+
+	fmov	s16, wzr
+	fmov	s17, wzr
+
+.endm
+
+
+
+// KERNEL2x1_SUB: one step of the 2x1 tile.
+// Reads two floats from pA and one from pB, advances pA by 8 and pB by
+// 4 bytes, and accumulates the two products.
+.macro KERNEL2x1_SUB
+
+	ldr	s0, [pA]
+	ldr	s1, [pA, #4]
+	add	pA, pA, #8
+
+	ldr	s8, [pB]
+	add	pB, pB, #4
+
+	fmadd	s16, s0, s8, s16
+	fmadd	s17, s1, s8, s17
+
+.endm
+
+// SAVE2x1: C += alpha * accumulators for a 2x1 tile.
+// Two consecutive floats at pCRow0 are updated from s16 and s17.
+// On exit pCRow0 has advanced by 8 bytes. The L1ST/F1ST helpers are
+// written out as plain ldr/fmadd.
+.macro SAVE2x1
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha
+
+	ldr	s8, [pCRow0, #0]
+	ldr	s9, [pCRow0, #4]
+
+	fmadd	s8, s0, s16, s8
+	fmadd	s9, s0, s17, s9
+
+	str	s8, [pCRow0]
+	str	s9, [pCRow0, #4]
+
+	add	pCRow0, pCRow0, #8	// step to the next tile
+
+.endm
+
+/******************************************************************************/
+
+// INIT1x1: zero the single scalar accumulator s16.
+// Loaded from wzr rather than "fsub s16, s16, s16", which yields NaN if
+// the stale register holds NaN or +/-Inf.
+.macro INIT1x1
+
+	fmov	s16, wzr
+
+.endm
+
+
+
+// KERNEL1x1_SUB: one step of the 1x1 tile.
+// Reads one float from pA and one from pB, advances both pointers by
+// 4 bytes, and accumulates their product into s16.
+.macro KERNEL1x1_SUB
+
+	ldr	s0, [pA]
+	add	pA, pA, #4
+
+	ldr	s8, [pB]
+	add	pB, pB, #4
+
+	fmadd	s16, s0, s8, s16
+
+.endm
+
+// SAVE1x1: C += alpha * s16 for a single element at pCRow0.
+// On exit pCRow0 has advanced by 4 bytes. The L1ST/F1ST helpers are
+// written out as plain ldr/fmadd.
+.macro SAVE1x1
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha
+
+	ldr	s8, [pCRow0, #0]
+	fmadd	s8, s0, s16, s8
+	str	s8, [pCRow0]
+
+	add	pCRow0, pCRow0, #4	// step to the next element
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+// Function entry. Per the prototype comment near the top of the file the
+// arguments arrive as x0 = bm, x1 = bn, x2 = bk, s0 = alpha, x3 = ba,
+// x4 = bb, x5 = C, x6 = ldc.
+ PROLOGUE
+
+ .align 5
+ add sp,sp,#-(5*16) // reserve 80 bytes of stack for the FP register spills
+ stp d8,d9,[sp,#(0*16)]
+ stp d10,d11,[sp,#(1*16)]
+ stp d12,d13,[sp,#(2*16)]
+ stp d14,d15,[sp,#(3*16)]
+ stp d16,d17,[sp,#(4*16)] // NOTE(review): d16/d17 are not callee-saved under AAPCS64; this spill looks unnecessary - confirm (the frame size at sgemm_kernel_L999 must stay in sync)
+
+ mov tempALPHA, v0.d[0] // park alpha in a GPR: v0 is overwritten with A data by the kernels
+ lsl LDC, LDC, #2 // ldc = ldc * 4
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble sgemm_kernel_L2_BEGIN // no full group of 4 in N: go to the remainders
+
+// Outer loop over groups of 4 in the N direction (counterJ = N / 4).
+// Each pass walks M in tiles of 4, then an optional 2 and an optional 1,
+// restarting pB from origPB for every tile while pA keeps advancing.
+sgemm_kernel_L4_BEGIN:
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC,pC,LDC, lsl #2 // pC += 4 * LDC for the next pass of this loop
+
+ mov pA, origPA // pA = start of A array
+
+
+
+sgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+ mov pB, origPB
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M4_32
+
+
+
+ // Pipelined path: _I + _M2 handle the first two K steps, each loop pass
+ // of _M1 + _M2 handles two more, and _M1 + _E drain the last two.
+ KERNEL4x4_I //do one in the K
+ KERNEL4x4_M2 //do another in the K
+
+ subs counterL, counterL, #2 // subtract 2, since one is always done at the tail
+ ble sgemm_kernel_L4_M4_22a
+ .align 5
+
+sgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32: // less than 4 to do in the K direction
+
+ tst counterL, #1 // K/2 is 0 or 1 here; after tst, ble is taken only when the result is zero
+ ble sgemm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+
+sgemm_kernel_L4_M4_40:
+
+ INIT4x4 // no KERNEL4x4_I was run, so the accumulators must be cleared explicitly
+
+
+sgemm_kernel_L4_M4_44:
+
+ ands counterL , origK, #1 // one K step left over when K is odd
+ ble sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bne sgemm_kernel_L4_M4_46
+
+sgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+ subs counterI, counterI, #1
+ bne sgemm_kernel_L4_M4_20
+
+
+sgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3 // any remainder of M modulo 4?
+ ble sgemm_kernel_L4_END
+
+ tst counterI, #2 // is bit 1 of M set (a remainder of 2)?
+ ble sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+
+sgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+
+sgemm_kernel_L4_END:
+
+ add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+
+ subs counterJ, counterJ , #1 // j--
+ bgt sgemm_kernel_L4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+// Remainder of 2 in the N direction (taken when bit 1 of N is set).
+// Same M walk as the 4-wide loop, using the Nx2 macros.
+sgemm_kernel_L2_BEGIN: // handle the N & 2 remainder
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble sgemm_kernel_L999 // N is a multiple of 4: nothing left to do
+
+ tst counterJ , #2
+ ble sgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+ add pC , pC, LDC, lsl #1 // pC += 2 * LDC
+
+ mov pA, origPA // pA = A
+
+
+
+sgemm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI,#0
+ ble sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M4_40
+ .align 5
+
+sgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L2_M4_20
+
+
+sgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3 // any remainder of M modulo 4?
+ ble sgemm_kernel_L2_END
+
+ tst counterI, #2 // is bit 1 of M set (a remainder of 2)?
+ ble sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+
+sgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+
+sgemm_kernel_L2_END:
+ add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
+
+/*********************************************************************************************/
+
+// Remainder of 1 in the N direction (taken when bit 0 of N is set).
+// Same M walk as the wider sections, using the Nx1 macros.
+sgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble sgemm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pCRow0 , LDC // C01 is the current line, update pC to point to next
+
+ mov pA, origPA // pA = A
+
+
+
+sgemm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M4_40
+ .align 5
+
+sgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L1_M4_20
+
+
+sgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3 // any remainder of M modulo 4?
+ ble sgemm_kernel_L1_END
+
+ tst counterI, #2 // is bit 1 of M set (a remainder of 2)?
+ ble sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+
+sgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+sgemm_kernel_L1_END:
+
+
+// Common exit: return 0, reload the FP registers spilled at entry and
+// release the 80-byte frame.
+sgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8,d9,[sp,#(0*16)]
+ ldp d10,d11,[sp,#(1*16)]
+ ldp d12,d13,[sp,#(2*16)]
+ ldp d14,d15,[sp,#(3*16)]
+ ldp d16,d17,[sp,#(4*16)]
+ add sp,sp,#(5*16) // must match the reservation made at function entry
+ ret
+
+ EPILOGUE
+
--- /dev/null
+#include "common.h"
+#include <stdbool.h>
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+ BLASLONG i,j,k;
+ FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+
+ FLOAT res0_0;
+ FLOAT res0_1;
+ FLOAT res0_2;
+ FLOAT res0_3;
+
+ FLOAT res1_0;
+ FLOAT res1_1;
+ FLOAT res1_2;
+ FLOAT res1_3;
+
+ FLOAT res2_0;
+ FLOAT res2_1;
+ FLOAT res2_2;
+ FLOAT res2_3;
+
+ FLOAT res3_0;
+ FLOAT res3_1;
+ FLOAT res3_2;
+ FLOAT res3_3;
+
+ FLOAT a0;
+ FLOAT a1;
+
+ FLOAT b0;
+ FLOAT b1;
+ FLOAT b2;
+ FLOAT b3;
+
+ BLASLONG off, temp;
+
+ bool left;
+ bool transposed;
+ bool backwards;
+
+#ifdef LEFT
+ left = true;
+#else
+ left = false;
+#endif
+
+#ifdef TRANSA
+ transposed = true;
+#else
+ transposed = false;
+#endif
+
+ backwards = left != transposed;
+
+ if (!left) {
+ off = -offset;
+ }
+
+
+ for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
+ {
+ C0 = C;
+ C1 = C0+ldc;
+ C2 = C1+ldc;
+ C3 = C2+ldc;
+
+
+ if (left) {
+ off = offset;
+ }
+
+ ptrba = ba;
+
+ for (i=0; i<bm/4; i+=1) // do blocks of 4x4
+ {
+
+ ptrbb = bb;
+ if (backwards)
+ {
+ ptrba += off*4; // number of values in A
+ ptrbb += off*4; // number of values in B
+ }
+
+ res0_0 = 0;
+ res0_1 = 0;
+ res0_2 = 0;
+ res0_3 = 0;
+
+ res1_0 = 0;
+ res1_1 = 0;
+ res1_2 = 0;
+ res1_3 = 0;
+
+ res2_0 = 0;
+ res2_1 = 0;
+ res2_2 = 0;
+ res2_3 = 0;
+
+ res3_0 = 0;
+ res3_1 = 0;
+ res3_2 = 0;
+ res3_3 = 0;
+
+ temp = backwards ? bk-off :
+ left ? off + 4 : // number of values in A
+ off + 4; // number of values in B
+
+ for (k=0; k<temp; k++)
+ {
+ b0 = ptrbb[0];
+ b1 = ptrbb[1];
+ b2 = ptrbb[2];
+ b3 = ptrbb[3];
+
+ a0 = ptrba[0];
+ res0_0 += a0*b0;
+ res1_0 += a0*b1;
+ res2_0 += a0*b2;
+ res3_0 += a0*b3;
+
+ a1 = ptrba[1];
+ res0_1 += a1*b0;
+ res1_1 += a1*b1;
+ res2_1 += a1*b2;
+ res3_1 += a1*b3;
+
+ a0 = ptrba[2];
+ res0_2 += a0*b0;
+ res1_2 += a0*b1;
+ res2_2 += a0*b2;
+ res3_2 += a0*b3;
+
+ a1 = ptrba[3];
+ res0_3 += a1*b0;
+ res1_3 += a1*b1;
+ res2_3 += a1*b2;
+ res3_3 += a1*b3;
+
+ ptrba = ptrba+4;
+ ptrbb = ptrbb+4;
+ }
+
+ res0_0 *= alpha;
+ res0_1 *= alpha;
+ res0_2 *= alpha;
+ res0_3 *= alpha;
+
+ res1_0 *= alpha;
+ res1_1 *= alpha;
+ res1_2 *= alpha;
+ res1_3 *= alpha;
+
+ res2_0 *= alpha;
+ res2_1 *= alpha;
+ res2_2 *= alpha;
+ res2_3 *= alpha;
+
+ res3_0 *= alpha;
+ res3_1 *= alpha;
+ res3_2 *= alpha;
+ res3_3 *= alpha;
+
+ C0[0] = res0_0;
+ C0[1] = res0_1;
+ C0[2] = res0_2;
+ C0[3] = res0_3;
+
+ C1[0] = res1_0;
+ C1[1] = res1_1;
+ C1[2] = res1_2;
+ C1[3] = res1_3;
+
+ C2[0] = res2_0;
+ C2[1] = res2_1;
+ C2[2] = res2_2;
+ C2[3] = res2_3;
+
+ C3[0] = res3_0;
+ C3[1] = res3_1;
+ C3[2] = res3_2;
+ C3[3] = res3_3;
+
+ if (!backwards) {
+ temp = bk-off;
+ temp = left ? temp - 4 : // number of values in A
+ temp - 4; // number of values in B
+
+ ptrba += temp*4; // number of values in A
+ ptrbb += temp*4; // number of values in B
+ }
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+
+ C0 = C0+4;
+ C1 = C1+4;
+ C2 = C2+4;
+ C3 = C3+4;
+
+ }
+
+ if ( bm & 2 ) // do any 2x4 loop
+ {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ ptrbb = bb;
+#else
+ ptrba += off*2;
+ ptrbb = bb + off*4;
+#endif
+
+ res0_0 = 0;
+ res0_1 = 0;
+
+ res1_0 = 0;
+ res1_1 = 0;
+
+ res2_0 = 0;
+ res2_1 = 0;
+
+ res3_0 = 0;
+ res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = bk-off;
+#elif defined(LEFT)
+ temp = off+2; // number of values in A
+#else
+ temp = off+4; // number of values in B
+#endif
+
+ for (k=0; k<temp; k++)
+ {
+ b0 = ptrbb[0];
+ b1 = ptrbb[1];
+ b2 = ptrbb[2];
+ b3 = ptrbb[3];
+
+ a0 = ptrba[0];
+ res0_0 += a0*b0;
+ res1_0 += a0*b1;
+ res2_0 += a0*b2;
+ res3_0 += a0*b3;
+
+ a1 = ptrba[1];
+ res0_1 += a1*b0;
+ res1_1 += a1*b1;
+ res2_1 += a1*b2;
+ res3_1 += a1*b3;
+
+ ptrba = ptrba+2;
+ ptrbb = ptrbb+4;
+ }
+
+ res0_0 *= alpha;
+ res0_1 *= alpha;
+
+ res1_0 *= alpha;
+ res1_1 *= alpha;
+
+ res2_0 *= alpha;
+ res2_1 *= alpha;
+
+ res3_0 *= alpha;
+ res3_1 *= alpha;
+
+ C0[0] = res0_0;
+ C0[1] = res0_1;
+
+ C1[0] = res1_0;
+ C1[1] = res1_1;
+
+ C2[0] = res2_0;
+ C2[1] = res2_1;
+
+ C3[0] = res3_0;
+ C3[1] = res3_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = bk - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ ptrba += temp*2;
+ ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+
+ C0 = C0+2;
+ C1 = C1+2;
+ C2 = C2+2;
+ C3 = C3+2;
+
+ }
+
+ if ( bm & 1 ) // do any 1x4 loop
+ {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ ptrbb = bb;
+#else
+ ptrba += off*1;
+ ptrbb = bb + off*4;
+#endif
+
+ res0_0 = 0;
+ res1_0 = 0;
+ res2_0 = 0;
+ res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = bk-off;
+#elif defined(LEFT)
+ temp = off+1; // number of values in A
+#else
+ temp = off+4; // number of values in B
+#endif
+
+ for (k=0; k<temp; k++)
+ {
+ b0 = ptrbb[0];
+ b1 = ptrbb[1];
+ b2 = ptrbb[2];
+ b3 = ptrbb[3];
+
+ a0 = ptrba[0];
+ res0_0 += a0*b0;
+ res1_0 += a0*b1;
+ res2_0 += a0*b2;
+ res3_0 += a0*b3;
+
+ ptrba = ptrba+1;
+ ptrbb = ptrbb+4;
+ }
+
+ res0_0 *= alpha;
+
+ res1_0 *= alpha;
+
+ res2_0 *= alpha;
+
+ res3_0 *= alpha;
+
+ C0[0] = res0_0;
+
+ C1[0] = res1_0;
+
+ C2[0] = res2_0;
+
+ C3[0] = res3_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = bk - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ ptrba += temp*1;
+ ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+
+ C0 = C0+1;
+ C1 = C1+1;
+ C2 = C2+1;
+ C3 = C3+1;
+
+ }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4;
+#endif
+
+ k = (bk<<2);
+ bb = bb+k;
+ i = (ldc<<2);
+ C = C+i;
+ }
+
+ for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
+ {
+ C0 = C;
+ C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+
+ ptrba = ba;
+
+ for (i=0; i<bm/4; i+=1) // do blocks of 4x2
+ {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ ptrbb = bb;
+#else
+ ptrba += off*4;
+ ptrbb = bb + off*2;
+#endif
+
+ res0_0 = 0;
+ res0_1 = 0;
+ res0_2 = 0;
+ res0_3 = 0;
+
+ res1_0 = 0;
+ res1_1 = 0;
+ res1_2 = 0;
+ res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = bk-off;
+#elif defined(LEFT)
+ temp = off+4; // number of values in A
+#else
+ temp = off+2; // number of values in B
+#endif
+
+ for (k=0; k<temp; k++)
+ {
+ b0 = ptrbb[0];
+ b1 = ptrbb[1];
+
+ a0 = ptrba[0];
+ res0_0 += a0*b0;
+ res1_0 += a0*b1;
+
+ a1 = ptrba[1];
+ res0_1 += a1*b0;
+ res1_1 += a1*b1;
+
+ a0 = ptrba[2];
+ res0_2 += a0*b0;
+ res1_2 += a0*b1;
+
+ a1 = ptrba[3];
+ res0_3 += a1*b0;
+ res1_3 += a1*b1;
+
+ ptrba = ptrba+4;
+ ptrbb = ptrbb+2;
+ }
+
+ res0_0 *= alpha;
+ res0_1 *= alpha;
+ res0_2 *= alpha;
+ res0_3 *= alpha;
+
+ res1_0 *= alpha;
+ res1_1 *= alpha;
+ res1_2 *= alpha;
+ res1_3 *= alpha;
+
+ C0[0] = res0_0;
+ C0[1] = res0_1;
+ C0[2] = res0_2;
+ C0[3] = res0_3;
+
+ C1[0] = res1_0;
+ C1[1] = res1_1;
+ C1[2] = res1_2;
+ C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = bk - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ ptrba += temp*4;
+ ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+
+ C0 = C0+4;
+ C1 = C1+4;
+
+ }
+
+ if ( bm & 2 ) // do any 2x2 loop
+ {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ ptrbb = bb;
+#else
+ ptrba += off*2;
+ ptrbb = bb + off*2;
+#endif
+
+ res0_0 = 0;
+ res0_1 = 0;
+
+ res1_0 = 0;
+ res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = bk-off;
+#elif defined(LEFT)
+ temp = off+2; // number of values in A
+#else
+ temp = off+2; // number of values in B
+#endif
+
+ for (k=0; k<temp; k++)
+ {
+ b0 = ptrbb[0];
+ b1 = ptrbb[1];
+
+ a0 = ptrba[0];
+ res0_0 += a0*b0;
+ res1_0 += a0*b1;
+
+ a1 = ptrba[1];
+ res0_1 += a1*b0;
+ res1_1 += a1*b1;
+
+ ptrba = ptrba+2;
+ ptrbb = ptrbb+2;
+ }
+
+ res0_0 *= alpha;
+ res0_1 *= alpha;
+
+ res1_0 *= alpha;
+ res1_1 *= alpha;
+
+ C0[0] = res0_0;
+ C0[1] = res0_1;
+
+ C1[0] = res1_0;
+ C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = bk - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ ptrba += temp*2;
+ ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+
+ C0 = C0+2;
+ C1 = C1+2;
+
+ }
+
+ if ( bm & 1 ) // do any 1x2 loop
+ {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ ptrbb = bb;
+#else
+ ptrba += off*1;
+ ptrbb = bb + off*2;
+#endif
+
+ res0_0 = 0;
+
+ res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = bk-off;
+#elif defined(LEFT)
+ temp = off+1; // number of values in A
+#else
+ temp = off+2; // number of values in B
+#endif
+
+ for (k=0; k<temp; k++)
+ {
+ b0 = ptrbb[0];
+ b1 = ptrbb[1];
+
+ a0 = ptrba[0];
+ res0_0 += a0*b0;
+ res1_0 += a0*b1;
+
+ ptrba = ptrba+1;
+ ptrbb = ptrbb+2;
+ }
+
+ res0_0 *= alpha;
+
+ res1_0 *= alpha;
+
+ C0[0] = res0_0;
+
+ C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = bk - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ ptrba += temp*1;
+ ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+
+ C0 = C0+1;
+ C1 = C1+1;
+
+ }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2;
+#endif
+
+ k = (bk<<1);
+ bb = bb+k;
+ i = (ldc<<1);
+ C = C+i;
+ }
+
+
+
+
+
+
+
+ for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
+ {
+ C0 = C;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ ptrba = ba;
+
+ for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
+ {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ ptrbb = bb;
+#else
+ ptrba += off*4;
+ ptrbb = bb + off*1;
+#endif
+
+ res0_0 = 0;
+ res0_1 = 0;
+ res0_2 = 0;
+ res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = bk-off;
+#elif defined(LEFT)
+ temp = off+4; // number of values in A
+#else
+ temp = off+1; // number of values in B
+#endif
+
+ for (k=0; k<temp; k++)
+ {
+ b0 = ptrbb[0];
+
+ a0 = ptrba[0];
+ res0_0 += a0*b0;
+
+ a1 = ptrba[1];
+ res0_1 += a1*b0;
+
+ a0 = ptrba[2];
+ res0_2 += a0*b0;
+
+ a1 = ptrba[3];
+ res0_3 += a1*b0;
+
+ ptrba = ptrba+4;
+ ptrbb = ptrbb+1;
+ }
+
+ res0_0 *= alpha;
+ res0_1 *= alpha;
+ res0_2 *= alpha;
+ res0_3 *= alpha;
+
+ C0[0] = res0_0;
+ C0[1] = res0_1;
+ C0[2] = res0_2;
+ C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = bk - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ ptrba += temp*4;
+ ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+
+ C0 = C0+4;
+
+ }
+
+ if ( bm & 2 ) // do any 2x1 loop
+ {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ ptrbb = bb;
+#else
+ ptrba += off*2;
+ ptrbb = bb + off*1;
+#endif
+
+ res0_0 = 0;
+ res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = bk-off;
+#elif defined(LEFT)
+ temp = off+2; // number of values in A
+#else
+ temp = off+1; // number of values in B
+#endif
+
+ for (k=0; k<temp; k++)
+ {
+ b0 = ptrbb[0];
+
+ a0 = ptrba[0];
+ res0_0 += a0*b0;
+
+ a1 = ptrba[1];
+ res0_1 += a1*b0;
+
+ ptrba = ptrba+2;
+ ptrbb = ptrbb+1;
+ }
+
+ res0_0 *= alpha;
+ res0_1 *= alpha;
+
+ C0[0] = res0_0;
+ C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = bk - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ ptrba += temp*2;
+ ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+
+ C0 = C0+2;
+
+ }
+
+ if ( bm & 1 ) // do any 1x1 loop
+ {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ ptrbb = bb;
+#else
+ ptrba += off*1;
+ ptrbb = bb + off*1;
+#endif
+
+ res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = bk-off;
+#elif defined(LEFT)
+ temp = off+1; // number of values in A
+#else
+ temp = off+1; // number of values in B
+#endif
+
+ for (k=0; k<temp; k++)
+ {
+ b0 = ptrbb[0];
+
+ a0 = ptrba[0];
+ res0_0 += a0*b0;
+
+ ptrba = ptrba+1;
+ ptrbb = ptrbb+1;
+ }
+
+ res0_0 *= alpha;
+
+ C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = bk - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ ptrba += temp*1;
+ ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+
+ C0 = C0+1;
+
+ }
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1;
+#endif
+
+ k = (bk<<0);
+ bb = bb+k;
+ C = C+ldc;
+ }
+ return 0;
+}