added optimized sgemm and strmm kernels for ARMV6
author: wernsaar <wernsaar@googlemail.com>
Sat, 23 Nov 2013 17:09:41 +0000 (18:09 +0100)
committer: wernsaar <wernsaar@googlemail.com>
Sat, 23 Nov 2013 17:09:41 +0000 (18:09 +0100)
kernel/arm/KERNEL.ARMV6
kernel/arm/sgemm_kernel_4x2_vfp.S [new file with mode: 0644]
kernel/arm/strmm_kernel_4x2_vfp.S [new file with mode: 0644]
param.h

index 7a58fec..b192b20 100644 (file)
@@ -80,12 +80,16 @@ DGEMVTKERNEL = gemv_t.c
 CGEMVTKERNEL = zgemv_t.c
 ZGEMVTKERNEL = zgemv_t.c
 
-STRMMKERNEL    = ../generic/trmmkernel_2x2.c
+STRMMKERNEL    = strmm_kernel_4x2_vfp.S
 DTRMMKERNEL    = dtrmm_kernel_4x2_vfp.S
 CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 ZTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 
-SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c          
+SGEMMKERNEL    =  sgemm_kernel_4x2_vfp.S               
+SGEMMINCOPY    = ../generic/gemm_ncopy_4.c
+SGEMMITCOPY    = ../generic/gemm_tcopy_4.c
+SGEMMINCOPYOBJ = sgemm_incopy.o
+SGEMMITCOPYOBJ = sgemm_itcopy.o
 SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S
new file mode 100644 (file)
index 0000000..3e20f86
--- /dev/null
@@ -0,0 +1,798 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/23 Saar
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define        OLD_M   r0
+#define        OLD_N   r1
+#define        OLD_K   r2
+#define        OLD_A   r3
+#define OLD_ALPHA s0
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define LDC    [fp, #-252 ]
+#define M      [fp, #-256 ]
+#define N      [fp, #-260 ]
+#define K      [fp, #-264 ]
+#define A      [fp, #-268 ]
+
+#define ALPHA  [fp, #-280]
+
+#define B      [fp, #4 ]
+#define C      [fp, #8 ]
+#define OLD_LDC        [fp, #12 ]
+
+#define I      r0
+#define J      r1
+#define L      r2
+
+#define        AO      r5
+#define        BO      r6
+
+#define        CO1     r8
+#define        CO2     r9
+
+#define K1     r7
+#define BC     r12
+
+#define A_PRE  96
+#define B_PRE  96
+#define C_PRE  64
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x2
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s9, s8
+       vmov.f32                s10, s8
+       vmov.f32                s11, s8
+       vmov.f32                s12, s8
+       vmov.f32                s13, s8
+       vmov.f32                s14, s8
+       vmov.f32                s15, s8
+
+.endm
+
+
+
+.macro KERNEL4x2_SUB
+
+       flds    s4 , [ BO ]
+       flds    s5 , [ BO, #4 ]
+
+       flds    s0 , [ AO ]
+       flds    s1 , [ AO, #4 ]
+       flds    s2 , [ AO, #8 ]
+       flds    s3 , [ AO, #12 ]
+
+       fmacs   s8  , s0,  s4
+       fmacs   s9  , s1,  s4
+       fmacs   s10  , s2,  s4
+       fmacs   s11  , s3,  s4
+
+       fmacs   s12  , s0,  s5
+       fmacs   s13  , s1,  s5
+       fmacs   s14  , s2,  s5
+       fmacs   s15  , s3,  s5
+
+       add     AO , AO, #16
+       add     BO , BO, #8
+
+.endm
+
+.macro SAVE4x2
+
+       ldr     r3  , LDC
+       add     CO2 , CO1, r3
+
+       flds            s0, ALPHA
+
+       flds    s4 , [CO1]
+       flds    s5 , [CO1, #4 ]
+       flds    s6 , [CO1, #8 ]
+       flds    s7 , [CO1, #12 ]
+               
+       fmacs   s4 , s0 , s8
+       fmacs   s5 , s0 , s9
+       fmacs   s6 , s0 , s10
+       fmacs   s7 , s0 , s11
+
+       fsts    s4 , [CO1]
+       fsts    s5 , [CO1, #4 ]
+       fsts    s6 , [CO1, #8 ]
+       fsts    s7 , [CO1, #12 ]
+
+       flds    s4 , [CO2]
+       flds    s5 , [CO2, #4 ]
+       flds    s6 , [CO2, #8 ]
+       flds    s7 , [CO2, #12 ]
+
+       fmacs   s4 , s0 , s12
+       fmacs   s5 , s0 , s13
+       fmacs   s6 , s0 , s14
+       fmacs   s7 , s0 , s15
+
+       fsts    s4 , [CO2]
+       fsts    s5 , [CO2, #4 ]
+       fsts    s6 , [CO2, #8 ]
+       fsts    s7 , [CO2, #12 ]
+
+       add     CO1, CO1, #16
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x2
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s9, s8
+       vmov.f32                s12, s8
+       vmov.f32                s13, s8
+
+.endm
+
+.macro KERNEL2x2_SUB
+
+       flds    s4 , [ BO ]
+       flds    s5 , [ BO, #4 ]
+
+       flds    s0 , [ AO ]
+       flds    s1 , [ AO, #4 ]
+
+       fmacs   s8  , s0,  s4
+       fmacs   s9  , s1,  s4
+
+       fmacs   s12  , s0,  s5
+       fmacs   s13  , s1,  s5
+
+       add     AO , AO, #8
+       add     BO , BO, #8
+
+.endm
+
+.macro SAVE2x2
+
+       ldr     r3  , LDC
+       add     CO2 , CO1, r3
+
+       flds            s0, ALPHA
+
+       flds    s4 , [CO1]
+       flds    s5 , [CO1, #4 ]
+               
+       fmacs   s4 , s0 , s8
+       fmacs   s5 , s0 , s9
+
+       fsts    s4 , [CO1]
+       fsts    s5 , [CO1, #4 ]
+
+       flds    s4 , [CO2]
+       flds    s5 , [CO2, #4 ]
+
+       fmacs   s4 , s0 , s12
+       fmacs   s5 , s0 , s13
+
+       fsts    s4 , [CO2]
+       fsts    s5 , [CO2, #4 ]
+
+       add     CO1, CO1, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x2
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s12, s8
+
+.endm
+
+.macro KERNEL1x2_SUB
+
+       flds    s4 , [ BO ]
+       flds    s5 , [ BO, #4 ]
+
+       flds    s0 , [ AO ]
+
+       fmacs   s8  , s0,  s4
+
+       fmacs   s12  , s0,  s5
+
+       add     AO , AO, #4
+       add     BO , BO, #8
+
+.endm
+
+.macro SAVE1x2
+
+       ldr     r3  , LDC
+       add     CO2 , CO1, r3
+
+       flds            s0, ALPHA
+
+       flds    s4 , [CO1]
+               
+       fmacs   s4 , s0 , s8
+
+       fsts    s4 , [CO1]
+
+       flds    s4 , [CO2]
+
+       fmacs   s4 , s0 , s12
+
+       fsts    s4 , [CO2]
+
+       add     CO1, CO1, #4
+
+.endm
+
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s9, s8
+       vmov.f32                s10, s8
+       vmov.f32                s11, s8
+
+.endm
+
+
+
+.macro KERNEL4x1_SUB
+
+       flds    s4 , [ BO ]
+
+       flds    s0 , [ AO ]
+       flds    s1 , [ AO, #4 ]
+       flds    s2 , [ AO, #8 ]
+       flds    s3 , [ AO, #12 ]
+
+       fmacs   s8  , s0,  s4
+       fmacs   s9  , s1,  s4
+       fmacs   s10 , s2,  s4
+       fmacs   s11 , s3,  s4
+
+       add     AO , AO, #16
+       add     BO , BO, #4
+
+.endm
+
+.macro SAVE4x1
+
+       flds            s0, ALPHA
+
+       flds    s4 , [CO1]
+       flds    s5 , [CO1, #4 ]
+       flds    s6 , [CO1, #8 ]
+       flds    s7 , [CO1, #12 ]
+               
+       fmacs   s4 , s0 , s8
+       fmacs   s5 , s0 , s9
+       fmacs   s6 , s0 , s10
+       fmacs   s7 , s0 , s11
+
+       fsts    s4 , [CO1]
+       fsts    s5 , [CO1, #4 ]
+       fsts    s6 , [CO1, #8 ]
+       fsts    s7 , [CO1, #12 ]
+
+       add     CO1, CO1, #16
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s9 , s8
+
+.endm
+
+.macro KERNEL2x1_SUB
+
+       flds    s4 , [ BO ]
+
+       flds    s0 , [ AO ]
+       flds    s1 , [ AO, #4 ]
+
+       fmacs   s8  , s0,  s4
+       fmacs   s9  , s1,  s4
+
+       add     AO , AO, #8
+       add     BO , BO, #4
+
+.endm
+
+.macro SAVE2x1
+
+       flds            s0, ALPHA
+
+       flds    s4 , [CO1]
+       flds    s5 , [CO1, #4 ]
+               
+       fmacs   s4 , s0 , s8
+       fmacs   s5 , s0 , s9
+
+       fsts    s4 , [CO1]
+       fsts    s5 , [CO1, #4 ]
+
+       add     CO1, CO1, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x1
+
+       vsub.f32                s8 , s8 , s8
+
+.endm
+
+.macro KERNEL1x1_SUB
+
+       flds    s4 , [ BO ]
+
+       flds    s0 , [ AO ]
+
+       fmacs   s8  , s0,  s4
+
+       add     AO , AO, #4
+       add     BO , BO, #4
+
+.endm
+
+.macro SAVE1x1
+
+       flds            s0, ALPHA
+
+       flds    s4 , [CO1]
+               
+       fmacs   s4 , s0 , s8
+
+       fsts    s4 , [CO1]
+
+       add     CO1, CO1, #4
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+       PROLOGUE
+
+       .align 5
+
+       push    {r4 - r9, fp}
+       add     fp, sp, #24
+       sub     sp, sp, #STACKSIZE                              // reserve stack
+
+       str     OLD_M, M
+       str     OLD_N, N
+       str     OLD_K, K
+       str     OLD_A, A
+       vstr    OLD_ALPHA, ALPHA
+
+       sub     r3, fp, #128
+       vstm    r3, { s8 - s15}                                 // store floating point registers
+
+       ldr     r3, OLD_LDC
+       lsl     r3, r3, #2                                      // ldc = ldc * 4
+       str     r3, LDC
+
+       ldr     K1, K
+       ldr     BC, B
+
+       ldr     J, N
+       asrs    J, J, #1                                        // J = J / 2
+       ble     sgemm_kernel_L1_BEGIN
+
+
+/*********************************************************************************************/
+
+sgemm_kernel_L2_BEGIN:
+
+       ldr     CO1, C                                          // CO1 = C
+       ldr     r4 , LDC
+       lsl     r4 , r4 , #1                                    // LDC * 2
+       add     r3 , r4, CO1
+       str     r3 , C                                          // store C
+
+       ldr     AO, A                                           // AO = A
+
+sgemm_kernel_L2_M4_BEGIN:
+
+       ldr     I, M
+       asrs    I, I, #2                                        // I = I / 4
+       ble     sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+       INIT4x2
+
+       mov     BO, BC
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     sgemm_kernel_L2_M4_40
+       .align 5
+
+sgemm_kernel_L2_M4_22:
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L2_M4_22
+       
+
+sgemm_kernel_L2_M4_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+       KERNEL4x2_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L2_M4_42
+       
+sgemm_kernel_L2_M4_100:
+
+       SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+       subs    I, I, #1
+       bgt     sgemm_kernel_L2_M4_20
+
+
+sgemm_kernel_L2_M2_BEGIN:
+
+       ldr     I, M
+       tst     I , #3
+       ble     sgemm_kernel_L2_END
+
+       tst     I, #2                                   // I = I / 2
+       ble     sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+       INIT2x2
+
+       mov     BO, BC
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L2_M2_22
+       
+
+sgemm_kernel_L2_M2_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+       KERNEL2x2_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L2_M2_42
+       
+sgemm_kernel_L2_M2_100:
+
+       SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+
+sgemm_kernel_L2_M1_BEGIN:
+
+       tst     I, #1                                   // I = I % 2
+       ble     sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+       INIT1x2
+
+       mov     BO, BC
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L2_M1_22
+       
+
+sgemm_kernel_L2_M1_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+       KERNEL1x2_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L2_M1_42
+       
+sgemm_kernel_L2_M1_100:
+
+       SAVE1x2
+
+
+sgemm_kernel_L2_END:
+
+       mov     r3, BC
+       mov     r4, K1
+       lsl     r4, r4, #3                                      // k * 2 * 4
+       add     r3, r3, r4                                      // B = B + K * 2 * 4
+       mov     BC, r3
+
+       subs    J , #1                                          // j--
+       bgt     sgemm_kernel_L2_BEGIN
+
+/*********************************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+       ldr     J , N
+       tst     J , #1
+       ble     sgemm_kernel_L999
+
+       
+       ldr     CO1, C                                          // CO1 = C
+       ldr     r4 , LDC
+       add     r3 , r4, CO1
+       str     r3 , C                                          // store C
+
+       ldr     AO, A                                           // AO = A
+
+
+
+sgemm_kernel_L1_M4_BEGIN:
+
+       ldr     I, M
+       asrs    I, I, #2                                        // I = I / 4
+       ble     sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+       INIT4x1
+
+       mov     BO, BC
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     sgemm_kernel_L1_M4_40
+       .align 5
+
+sgemm_kernel_L1_M4_22:
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L1_M4_22
+       
+
+sgemm_kernel_L1_M4_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+       KERNEL4x1_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L1_M4_42
+       
+sgemm_kernel_L1_M4_100:
+
+       SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+       subs    I, I, #1
+       bgt     sgemm_kernel_L1_M4_20
+
+
+sgemm_kernel_L1_M2_BEGIN:
+
+       ldr     I, M
+       tst     I , #3
+       ble     sgemm_kernel_L1_END
+
+       tst     I, #2                                   // I = I / 2
+       ble     sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+       INIT2x1
+
+       mov     BO, BC
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L1_M2_22
+       
+
+sgemm_kernel_L1_M2_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+       KERNEL2x1_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L1_M2_42
+       
+sgemm_kernel_L1_M2_100:
+
+       SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+
+sgemm_kernel_L1_M1_BEGIN:
+
+       tst     I, #1                                   // I = I % 2
+       ble     sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+       INIT1x1
+
+       mov     BO, BC
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L1_M1_22
+       
+
+sgemm_kernel_L1_M1_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+       KERNEL1x1_SUB
+
+       subs    L, L, #1
+       bgt     sgemm_kernel_L1_M1_42
+       
+sgemm_kernel_L1_M1_100:
+
+       SAVE1x1
+
+
+sgemm_kernel_L1_END:
+
+
+sgemm_kernel_L999:
+
+       sub     r3, fp, #128
+       vldm    r3, { s8 - s15}                                 // restore floating point registers
+
+       movs    r0, #0                                          // set return value
+       sub     sp, fp, #24
+       pop     {r4 - r9, fp}
+       bx      lr
+
+       EPILOGUE
+
diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S
new file mode 100644 (file)
index 0000000..5394a64
--- /dev/null
@@ -0,0 +1,1083 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/23 Saar
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 252
+
+#define        OLD_M   r0
+#define        OLD_N   r1
+#define        OLD_K   r2
+#define        OLD_A   r3
+#define OLD_ALPHA s0
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define KK      [fp, #-240 ]
+#define KKK     [fp, #-244]
+#define C      [fp, #-248 ]
+#define LDC    [fp, #-252 ]
+#define M      [fp, #-256 ]
+#define N      [fp, #-260 ]
+#define K      [fp, #-264 ]
+#define A      [fp, #-268 ]
+
+#define ALPHA  [fp, #-276 ]
+
+#define B      [fp, #4 ]
+#define OLD_C  [fp, #8 ]
+#define OLD_LDC        [fp, #12 ]
+#define OFFSET  [fp, #16 ]
+
+#define I      r0
+#define J      r1
+#define L      r2
+
+#define        AO      r5
+#define        BO      r6
+
+#define        CO1     r8
+#define        CO2     r9
+
+#define K1     r7
+#define BC     r12
+
+#define A_PRE  64
+#define B_PRE  64
+#define C_PRE  64
+
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x2
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s9, s8
+       vmov.f32                s10, s8
+       vmov.f32                s11, s8
+       vmov.f32                s12, s8
+       vmov.f32                s13, s8
+       vmov.f32                s14, s8
+       vmov.f32                s15, s8
+
+.endm
+
+
+
+.macro KERNEL4x2_SUB
+
+       flds    s4 , [ BO ]
+       flds    s5 , [ BO, #4 ]
+
+       flds    s0 , [ AO ]
+       flds    s1 , [ AO, #4 ]
+       flds    s2 , [ AO, #8 ]
+       flds    s3 , [ AO, #12 ]
+
+       fmacs   s8  , s0,  s4
+       fmacs   s9  , s1,  s4
+       fmacs   s10  , s2,  s4
+       fmacs   s11  , s3,  s4
+
+       fmacs   s12  , s0,  s5
+       fmacs   s13  , s1,  s5
+       fmacs   s14  , s2,  s5
+       fmacs   s15  , s3,  s5
+
+       add     AO , AO, #16
+       add     BO , BO, #8
+
+.endm
+
+.macro SAVE4x2
+
+       ldr     r3  , LDC
+       add     CO2 , CO1, r3
+
+       flds            s0, ALPHA
+
+       fmuls   s4 , s0 , s8
+       fmuls   s5 , s0 , s9
+       fmuls   s6 , s0 , s10
+       fmuls   s7 , s0 , s11
+
+       fsts    s4 , [CO1]
+       fsts    s5 , [CO1, #4 ]
+       fsts    s6 , [CO1, #8 ]
+       fsts    s7 , [CO1, #12 ]
+
+       fmuls   s4 , s0 , s12
+       fmuls   s5 , s0 , s13
+       fmuls   s6 , s0 , s14
+       fmuls   s7 , s0 , s15
+
+       fsts    s4 , [CO2]
+       fsts    s5 , [CO2, #4 ]
+       fsts    s6 , [CO2, #8 ]
+       fsts    s7 , [CO2, #12 ]
+
+       add     CO1, CO1, #16
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x2
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s9, s8
+       vmov.f32                s12, s8
+       vmov.f32                s13, s8
+
+.endm
+
+.macro KERNEL2x2_SUB
+
+       flds    s4 , [ BO ]
+       flds    s5 , [ BO, #4 ]
+
+       flds    s0 , [ AO ]
+       flds    s1 , [ AO, #4 ]
+
+       fmacs   s8  , s0,  s4
+       fmacs   s9  , s1,  s4
+
+       fmacs   s12  , s0,  s5
+       fmacs   s13  , s1,  s5
+
+       add     AO , AO, #8
+       add     BO , BO, #8
+
+.endm
+
+.macro SAVE2x2
+
+       ldr     r3  , LDC
+       add     CO2 , CO1, r3
+
+       flds            s0, ALPHA
+
+               
+       fmuls   s4 , s0 , s8
+       fmuls   s5 , s0 , s9
+
+       fsts    s4 , [CO1]
+       fsts    s5 , [CO1, #4 ]
+
+       fmuls   s4 , s0 , s12
+       fmuls   s5 , s0 , s13
+
+       fsts    s4 , [CO2]
+       fsts    s5 , [CO2, #4 ]
+
+       add     CO1, CO1, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x2
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s12, s8
+
+.endm
+
+.macro KERNEL1x2_SUB
+
+       flds    s4 , [ BO ]
+       flds    s5 , [ BO, #4 ]
+
+       flds    s0 , [ AO ]
+
+       fmacs   s8  , s0,  s4
+
+       fmacs   s12  , s0,  s5
+
+       add     AO , AO, #4
+       add     BO , BO, #8
+
+.endm
+
+.macro SAVE1x2
+
+       ldr     r3  , LDC
+       add     CO2 , CO1, r3
+
+       flds            s0, ALPHA
+
+               
+       fmuls   s4 , s0 , s8
+
+       fsts    s4 , [CO1]
+
+
+       fmuls   s4 , s0 , s12
+
+       fsts    s4 , [CO2]
+
+       add     CO1, CO1, #4
+
+.endm
+
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s9, s8
+       vmov.f32                s10, s8
+       vmov.f32                s11, s8
+
+.endm
+
+
+
+.macro KERNEL4x1_SUB
+
+       flds    s4 , [ BO ]
+
+       flds    s0 , [ AO ]
+       flds    s1 , [ AO, #4 ]
+       flds    s2 , [ AO, #8 ]
+       flds    s3 , [ AO, #12 ]
+
+       fmacs   s8  , s0,  s4
+       fmacs   s9  , s1,  s4
+       fmacs   s10 , s2,  s4
+       fmacs   s11 , s3,  s4
+
+       add     AO , AO, #16
+       add     BO , BO, #4
+
+.endm
+
+.macro SAVE4x1
+
+       flds            s0, ALPHA
+
+       fmuls   s4 , s0 , s8
+       fmuls   s5 , s0 , s9
+       fmuls   s6 , s0 , s10
+       fmuls   s7 , s0 , s11
+
+       fsts    s4 , [CO1]
+       fsts    s5 , [CO1, #4 ]
+       fsts    s6 , [CO1, #8 ]
+       fsts    s7 , [CO1, #12 ]
+
+       add     CO1, CO1, #16
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+
+       vsub.f32                s8 , s8 , s8
+       vmov.f32                s9 , s8
+
+.endm
+
+.macro KERNEL2x1_SUB
+
+       flds    s4 , [ BO ]
+
+       flds    s0 , [ AO ]
+       flds    s1 , [ AO, #4 ]
+
+       fmacs   s8  , s0,  s4
+       fmacs   s9  , s1,  s4
+
+       add     AO , AO, #8
+       add     BO , BO, #4
+
+.endm
+
+.macro SAVE2x1
+
+       flds            s0, ALPHA
+
+       fmuls   s4 , s0 , s8
+       fmuls   s5 , s0 , s9
+
+       fsts    s4 , [CO1]
+       fsts    s5 , [CO1, #4 ]
+
+       add     CO1, CO1, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x1
+
+       vsub.f32                s8 , s8 , s8
+
+.endm
+
+.macro KERNEL1x1_SUB
+
+       flds    s4 , [ BO ]
+
+       flds    s0 , [ AO ]
+
+       fmacs   s8  , s0,  s4
+
+       add     AO , AO, #4
+       add     BO , BO, #4
+
+.endm
+
+.macro SAVE1x1
+
+       flds            s0, ALPHA
+
+       fmuls   s4 , s0 , s8
+
+       fsts    s4 , [CO1]
+
+       add     CO1, CO1, #4
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+
+       PROLOGUE
+
+       .align 5
+
+       push    {r4 - r9, fp}
+       add     fp, sp, #24
+       sub     sp, sp, #STACKSIZE                              // reserve stack
+
+       str     OLD_M, M
+       str     OLD_N, N
+       str     OLD_K, K
+       str     OLD_A, A
+       vstr    OLD_ALPHA, ALPHA
+
+       sub     r3, fp, #128
+       vstm    r3, { s8 - s15}                                 // store floating point registers
+
+       ldr     r3, OLD_LDC
+       lsl     r3, r3, #2                                      // ldc = ldc * 4
+       str     r3, LDC
+
+       ldr     r3, OLD_C
+       str     r3, C
+
+       ldr     BC, B
+
+        ldr     r3, OFFSET
+#ifndef LEFT
+        neg     r3 , r3
+#endif
+        str     r3 , KK
+
+       ldr     J, N
+       asrs    J, J, #1                                        // J = J / 2
+       ble     _L1_BEGIN
+
+_L2_BEGIN:
+
+       ldr     CO1, C                                          // CO1 = C
+       ldr     r4 , LDC
+       lsl     r4 , r4 , #1                                    // LDC * 2
+       add     r3 , r4, CO1
+       str     r3 , C                                          // store C
+
+#if defined(LEFT)
+        ldr     r3 , OFFSET
+        str     r3 , KK
+#endif
+
+       ldr     AO, A                                           // AO = A
+
+
+_L2_M4_BEGIN:
+
+       ldr     I, M
+       asrs    I, I, #2                                        // I = I / 4
+       ble     _L2_M2_BEGIN
+
+_L2_M4_20:
+
+       INIT4x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+
+        mov     BO, BC
+#else
+        mov     BO, BC
+        ldr     r3 , KK
+        lsls    r4 , r3 , #3                                    // 2 float values
+        add     BO , BO , r4
+        lsls    r4 , r3 , #4                                    // 4 float values
+        add     AO , AO , r4
+
+#endif
+
+#ifndef TRMMKERNEL
+        ldr     L , K
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        ldr     L , K
+        ldr     r3, KK
+        sub     L , L, r3
+        str     L , KKK
+#else
+        ldr     L , KK
+#ifdef LEFT
+        add     L , L , #4        // number of values in AO
+#else
+        add     L , L , #2        // number of values in BO
+#endif
+        str     L , KKK
+#endif
+
+       mov     K1, L
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     _L2_M4_40
+       .align 5
+
+_L2_M4_22:
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+       KERNEL4x2_SUB
+
+       subs    L, L, #1
+       bgt     _L2_M4_22
+       
+
+_L2_M4_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     _L2_M4_100
+
+_L2_M4_42:
+
+       KERNEL4x2_SUB
+
+       subs    L, L, #1
+       bgt     _L2_M4_42
+       
+_L2_M4_100:
+
+       SAVE4x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+        ldr     r3 , K
+        ldr     r4 , KKK
+        sub     r3 , r3 , r4
+        lsls    r4 , r3 , #3                    // 2 float values
+        add     BO , BO , r4
+        lsls    r4 , r3 , #4                    // 4 float values
+        add     AO , AO , r4
+#endif
+
+#if defined(LEFT)
+        ldr     r3 , KK
+        add     r3 , r3 , #4                    // number of values in AO
+        str     r3 , KK
+#endif
+
+
+
+_L2_M4_END:
+
+       subs    I, I, #1
+       bgt     _L2_M4_20
+
+
+_L2_M2_BEGIN:
+
+       ldr     I, M
+       tst     I , #3
+       ble     _L2_END
+
+       tst     I, #2                                   // I = I / 2
+       ble     _L2_M1_BEGIN
+
+_L2_M2_20:
+
+       INIT2x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+
+        mov     BO, BC
+#else
+        mov     BO, BC
+        ldr     r3 , KK
+        lsls    r4 , r3 , #3                                    // 2 float values
+        add     BO , BO , r4
+        lsls    r4 , r3 , #3                                    // 2 float values
+        add     AO , AO , r4
+
+#endif
+
+#ifndef TRMMKERNEL
+        ldr     L , K
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        ldr     L , K
+        ldr     r3, KK
+        sub     L , L, r3
+        str     L , KKK
+#else
+        ldr     L , KK
+#ifdef LEFT
+        add     L , L , #2        // number of values in AO
+#else
+        add     L , L , #2        // number of values in BO
+#endif
+        str     L , KKK
+#endif
+
+       mov     K1, L
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     _L2_M2_40
+
+_L2_M2_22:
+
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+       KERNEL2x2_SUB
+
+       subs    L, L, #1
+       bgt     _L2_M2_22
+       
+
+_L2_M2_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     _L2_M2_100
+
+_L2_M2_42:
+
+       KERNEL2x2_SUB
+
+       subs    L, L, #1
+       bgt     _L2_M2_42
+       
+_L2_M2_100:
+
+       SAVE2x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+        ldr     r3 , K
+        ldr     r4 , KKK
+        sub     r3 , r3 , r4
+        lsls    r4 , r3 , #3                    // 2 float values
+        add     BO , BO , r4
+        lsls    r4 , r3 , #3                    // 2 float values
+        add     AO , AO , r4
+#endif
+
+#if defined(LEFT)
+        ldr     r3 , KK
+        add     r3 , r3 , #2                    // number of values in AO
+        str     r3 , KK
+#endif
+
+
+
+_L2_M2_END:
+
+
+_L2_M1_BEGIN:
+
+       tst     I, #1                                   // I = I % 2
+       ble     _L2_END
+
+_L2_M1_20:
+
+       INIT1x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+
+        mov     BO, BC
+#else
+        mov     BO, BC
+        ldr     r3 , KK
+        lsls    r4 , r3 , #3                                    // 2 float values
+        add     BO , BO , r4
+        lsls    r4 , r3 , #2                                    // 1 float value
+        add     AO , AO , r4
+
+#endif
+
+#ifndef TRMMKERNEL
+        ldr     L , K
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        ldr     L , K
+        ldr     r3, KK
+        sub     L , L, r3
+        str     L , KKK
+#else
+        ldr     L , KK
+#ifdef LEFT
+        add     L , L , #1        // number of values in AO
+#else
+        add     L , L , #2        // number of values in BO
+#endif
+        str     L , KKK
+#endif
+
+       mov     K1, L
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     _L2_M1_40
+
+_L2_M1_22:
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+       KERNEL1x2_SUB
+
+       subs    L, L, #1
+       bgt     _L2_M1_22
+       
+
+_L2_M1_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     _L2_M1_100
+
+_L2_M1_42:
+
+       KERNEL1x2_SUB
+
+       subs    L, L, #1
+       bgt     _L2_M1_42
+       
+_L2_M1_100:
+
+       SAVE1x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+        ldr     r3 , K
+        ldr     r4 , KKK
+        sub     r3 , r3 , r4
+        lsls    r4 , r3 , #3                    // 2 float values
+        add     BO , BO , r4
+        lsls    r4 , r3 , #2                    // 1 float value
+        add     AO , AO , r4
+#endif
+
+#if defined(LEFT)
+        ldr     r3 , KK
+        add     r3 , r3 , #1                    // number of values in AO
+        str     r3 , KK
+#endif
+
+
+
+
+_L2_END:
+
+       mov     r3, BC
+       ldr     r4, K
+       lsl     r4, r4, #3                                      // k * 2 * 4
+       add     r3, r3, r4                                      // B = B + K * 2 * 4
+       mov     BC, r3
+
+#if !defined(LEFT)
+        ldr     r3 , KK
+        add     r3 , r3 , #2                                    // number of values in BO
+        str     r3 , KK
+#endif
+
+
+       subs    J , #1                                          // j--
+       bgt     _L2_BEGIN
+
+/*********************************************************************************************/
+
+_L1_BEGIN:
+
+       ldr     J , N
+       tst     J , #1
+       ble     _L999
+
+       
+       ldr     CO1, C                                          // CO1 = C
+       ldr     r4 , LDC
+       add     r3 , r4, CO1
+       str     r3 , C                                          // store C
+
+#if defined(LEFT)
+        ldr     r3 , OFFSET
+        str     r3 , KK
+#endif
+
+       ldr     AO, A                                           // AO = A
+        //pld     [AO , #A_PRE-96]
+        //pld     [AO , #A_PRE-64]
+        //pld     [AO , #A_PRE-32]
+
+
+
+_L1_M4_BEGIN:
+
+       ldr     I, M
+       asrs    I, I, #2                                        // I = I / 4
+       ble     _L1_M2_BEGIN
+
+_L1_M4_20:
+
+       INIT4x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+
+        mov     BO, BC
+#else
+        mov     BO, BC
+        ldr     r3 , KK
+        lsls    r4 , r3 , #2                                    // 1 float value
+        add     BO , BO , r4
+        lsls    r4 , r3 , #4                                    // 4 float values
+        add     AO , AO , r4
+
+#endif
+
+#ifndef TRMMKERNEL
+        ldr     L , K
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        ldr     L , K
+        ldr     r3, KK
+        sub     L , L, r3
+        str     L , KKK
+#else
+        ldr     L , KK
+#ifdef LEFT
+        add     L , L , #4        // number of values in AO
+#else
+        add     L , L , #1        // number of values in BO
+#endif
+        str     L , KKK
+#endif
+
+       mov     K1, L
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     _L1_M4_40
+       .align 5
+
+_L1_M4_22:
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+       KERNEL4x1_SUB
+
+       subs    L, L, #1
+       bgt     _L1_M4_22
+       
+
+_L1_M4_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     _L1_M4_100
+
+_L1_M4_42:
+
+       KERNEL4x1_SUB
+
+       subs    L, L, #1
+       bgt     _L1_M4_42
+       
+_L1_M4_100:
+
+       SAVE4x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+        ldr     r3 , K
+        ldr     r4 , KKK
+        sub     r3 , r3 , r4
+        lsls    r4 , r3 , #2                    // 1 float value
+        add     BO , BO , r4
+        lsls    r4 , r3 , #4                    // 4 float values
+        add     AO , AO , r4
+#endif
+
+#if defined(LEFT)
+        ldr     r3 , KK
+        add     r3 , r3 , #4                    // number of values in AO
+        str     r3 , KK
+#endif
+
+
+
+_L1_M4_END:
+
+       subs    I, I, #1
+       bgt     _L1_M4_20
+
+
+_L1_M2_BEGIN:
+
+       ldr     I, M
+       tst     I , #3
+       ble     _L1_END
+
+       tst     I, #2                                   // I = I / 2
+       ble     _L1_M1_BEGIN
+
+_L1_M2_20:
+
+       INIT2x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+
+        mov     BO, BC
+#else
+        mov     BO, BC
+        ldr     r3 , KK
+        lsls    r4 , r3 , #2                                    // 1 float value
+        add     BO , BO , r4
+        lsls    r4 , r3 , #3                                    // 2 float values
+        add     AO , AO , r4
+
+#endif
+
+#ifndef TRMMKERNEL
+        ldr     L , K
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        ldr     L , K
+        ldr     r3, KK
+        sub     L , L, r3
+        str     L , KKK
+#else
+        ldr     L , KK
+#ifdef LEFT
+        add     L , L , #2        // number of values in AO
+#else
+        add     L , L , #1        // number of values in BO
+#endif
+        str     L , KKK
+#endif
+
+       mov     K1, L
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     _L1_M2_40
+
+_L1_M2_22:
+
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+       KERNEL2x1_SUB
+
+       subs    L, L, #1
+       bgt     _L1_M2_22
+       
+
+_L1_M2_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     _L1_M2_100
+
+_L1_M2_42:
+
+       KERNEL2x1_SUB
+
+       subs    L, L, #1
+       bgt     _L1_M2_42
+       
+_L1_M2_100:
+
+       SAVE2x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+        ldr     r3 , K
+        ldr     r4 , KKK
+        sub     r3 , r3 , r4
+        lsls    r4 , r3 , #2                    // 1 float value
+        add     BO , BO , r4
+        lsls    r4 , r3 , #3                    // 2 float values
+        add     AO , AO , r4
+#endif
+
+#if defined(LEFT)
+        ldr     r3 , KK
+        add     r3 , r3 , #2                    // number of values in AO
+        str     r3 , KK
+#endif
+
+
+
+_L1_M2_END:
+
+
+_L1_M1_BEGIN:
+
+       tst     I, #1                                   // I = I % 2
+       ble     _L1_END
+
+_L1_M1_20:
+
+       INIT1x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+
+        mov     BO, BC
+#else
+        mov     BO, BC
+        ldr     r3 , KK
+        lsls    r4 , r3 , #2                                    // 1 float value
+        add     BO , BO , r4
+        lsls    r4 , r3 , #2                                    // 1 float value
+        add     AO , AO , r4
+
+#endif
+
+#ifndef TRMMKERNEL
+        ldr     L , K
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        ldr     L , K
+        ldr     r3, KK
+        sub     L , L, r3
+        str     L , KKK
+#else
+        ldr     L , KK
+#ifdef LEFT
+        add     L , L , #1        // number of values in AO
+#else
+        add     L , L , #1        // number of values in BO
+#endif
+        str     L , KKK
+#endif
+
+       mov     K1, L
+       asrs    L , K1, #3                                      // L = L / 8
+       ble     _L1_M1_40
+
+_L1_M1_22:
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+       KERNEL1x1_SUB
+
+       subs    L, L, #1
+       bgt     _L1_M1_22
+       
+
+_L1_M1_40:
+       
+       ands    L , K1, #7                                      // L = L % 8
+       ble     _L1_M1_100
+
+_L1_M1_42:
+
+       KERNEL1x1_SUB
+
+       subs    L, L, #1
+       bgt     _L1_M1_42
+       
+_L1_M1_100:
+
+       SAVE1x1
+
+
+_L1_END:
+
+
+_L999:
+
+       sub     r3, fp, #128
+       vldm    r3, { s8 - s15}                                 // restore floating point registers
+
+       movs    r0, #0                                          // set return value
+       sub     sp, fp, #24
+       pop     {r4 - r9, fp}
+       bx      lr
+
+       EPILOGUE
+
diff --git a/param.h b/param.h
index 568605b..ec1767d 100644 (file)
--- a/param.h
+++ b/param.h
@@ -1843,7 +1843,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M  2
+#define SGEMM_DEFAULT_UNROLL_M  4
 #define SGEMM_DEFAULT_UNROLL_N  2
 
 #define DGEMM_DEFAULT_UNROLL_M  4