From ac50bccbd250a2222e7aa0222c383187886267be Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Nov 2013 20:21:35 +0100 Subject: [PATCH] added cgemm_ncopy_2_vfpv3.S and made assembler labels unique --- kernel/arm/KERNEL.ARMV7 | 2 +- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 150 ++++++++++----------- kernel/arm/cgemm_ncopy_2_vfpv3.S | 258 +++++++++++++++++++++++++++++++++++ kernel/arm/sgemm_kernel_4x4_vfpv3.S | 262 ++++++++++++++++++------------------ kernel/arm/sgemm_ncopy_4_vfpv3.S | 78 +++++------ 5 files changed, 504 insertions(+), 246 deletions(-) create mode 100644 kernel/arm/cgemm_ncopy_2_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index e302616..cdf3707 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -109,7 +109,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMONCOPY = cgemm_ncopy_2_vfpv3.S CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index 4cebcab..3aba68d 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/01 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -888,9 +888,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr J, N asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN + ble cgemm_kernel_L1_BEGIN -_L2_BEGIN: +cgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -904,19 +904,19 @@ _L2_BEGIN: -_L2_M2_BEGIN: +cgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L2_M1_BEGIN + ble cgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +cgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L2_M2_30 + blt cgemm_kernel_L2_M2_30 .align 5 @@ -933,7 +933,7 @@ _L2_M2_20: sub L, L, #2 -_L2_M2_22: +cgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 @@ -946,7 +946,7 @@ _L2_M2_22: KERNEL2x2_M2 subs L, L, #1 - bgt _L2_M2_22 + bgt cgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 @@ -958,15 +958,15 @@ _L2_M2_22: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b cgemm_kernel_L2_M2_44 -_L2_M2_30: +cgemm_kernel_L2_M2_30: tst L, #3 - ble _L2_M2_40 + ble cgemm_kernel_L2_M2_40 tst L, #2 - ble _L2_M2_32 + ble cgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 @@ -989,12 +989,12 @@ _L2_M2_30: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b cgemm_kernel_L2_M2_44 -_L2_M2_32: +cgemm_kernel_L2_M2_32: tst L, #1 - ble _L2_M2_40 + ble cgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 @@ -1006,51 +1006,51 @@ _L2_M2_32: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b cgemm_kernel_L2_M2_44 -_L2_M2_40: +cgemm_kernel_L2_M2_40: INIT2x2 -_L2_M2_44: +cgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble cgemm_kernel_L2_M2_100 -_L2_M2_46: +cgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 - bne _L2_M2_46 + bne cgemm_kernel_L2_M2_46 -_L2_M2_100: +cgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +cgemm_kernel_L2_M2_END: subs I, I, #1 - bne _L2_M2_20 + bne cgemm_kernel_L2_M2_20 -_L2_M1_BEGIN: +cgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L2_END + ble cgemm_kernel_L2_END -_L2_M1_20: +cgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, 
#3 // L = L / 8 - ble _L2_M1_40 + ble cgemm_kernel_L2_M1_40 -_L2_M1_22: +cgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB @@ -1063,27 +1063,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt cgemm_kernel_L2_M1_22 -_L2_M1_40: +cgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble cgemm_kernel_L2_M1_100 -_L2_M1_42: +cgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt cgemm_kernel_L2_M1_42 -_L2_M1_100: +cgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +cgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1092,17 +1092,17 @@ _L2_END: mov BC, r3 subs J , #1 // j-- - bgt _L2_BEGIN + bgt cgemm_kernel_L2_BEGIN /*********************************************************************************************/ -_L1_BEGIN: +cgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble cgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1112,19 +1112,19 @@ _L1_BEGIN: ldr AO, A // AO = A -_L1_M2_BEGIN: +cgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L1_M1_BEGIN + ble cgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +cgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L1_M2_30 + blt cgemm_kernel_L1_M2_30 .align 5 @@ -1141,7 +1141,7 @@ _L1_M2_20: sub L, L, #2 -_L1_M2_22: +cgemm_kernel_L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 @@ -1154,7 +1154,7 @@ _L1_M2_22: KERNEL2x1_M2 subs L, L, #1 - bgt _L1_M2_22 + bgt cgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 @@ -1166,15 +1166,15 @@ _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b cgemm_kernel_L1_M2_44 -_L1_M2_30: +cgemm_kernel_L1_M2_30: tst L, #3 - ble _L1_M2_40 + ble cgemm_kernel_L1_M2_40 tst L, #2 - ble _L1_M2_32 + ble cgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 @@ -1197,12 +1197,12 @@ _L1_M2_30: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b cgemm_kernel_L1_M2_44 -_L1_M2_32: +cgemm_kernel_L1_M2_32: tst L, #1 - ble _L1_M2_40 + ble cgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 @@ -1214,51 +1214,51 @@ _L1_M2_32: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b cgemm_kernel_L1_M2_44 -_L1_M2_40: +cgemm_kernel_L1_M2_40: INIT2x1 -_L1_M2_44: +cgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble cgemm_kernel_L1_M2_100 -_L1_M2_46: +cgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 - bne _L1_M2_46 + bne cgemm_kernel_L1_M2_46 -_L1_M2_100: +cgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +cgemm_kernel_L1_M2_END: subs I, I, #1 - bne _L1_M2_20 + bne cgemm_kernel_L1_M2_20 -_L1_M1_BEGIN: +cgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L1_END + ble cgemm_kernel_L1_END -_L1_M1_20: +cgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble cgemm_kernel_L1_M1_40 -_L1_M1_22: +cgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB @@ -1271,31 +1271,31 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt cgemm_kernel_L1_M1_22 -_L1_M1_40: +cgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble cgemm_kernel_L1_M1_100 -_L1_M1_42: +cgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt cgemm_kernel_L1_M1_42 -_L1_M1_100: +cgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +cgemm_kernel_L1_END: -_L999: +cgemm_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s31} // restore floating point registers diff --git a/kernel/arm/cgemm_ncopy_2_vfpv3.S b/kernel/arm/cgemm_ncopy_2_vfpv3.S new file mode 100644 index 0000000..08fbd55 --- /dev/null +++ b/kernel/arm/cgemm_ncopy_2_vfpv3.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All 
rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s4 , [ AO1, #8 ] + flds s5 , [ AO1, #12 ] + + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s6 , [ AO2, #8 ] + flds s7 , [ AO2, #12 ] + + fstmias BO!, { s0 - s7 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + + add AO1, AO1, #8 + fstmias BO!, { s0 - s3 } + add AO2, AO2, #8 + +.endm + +.macro COPY2x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + fstmias BO!, { s0 - s3 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + + fstmias BO!, { s0 - s1 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + 
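+/**************************************************************************************
+* In C terms, the COPY macros above pack column pairs of the complex matrix A
+* row by row. For example COPY2x2 moves two complex rows from two adjacent
+* columns (a1, a2 are illustrative names for AO1/AO2, b for the packed
+* buffer BO; this is a sketch of the store order of fstmias BO!, { s0 - s7 }):
+*
+*     b[0] = a1[0]; b[1] = a1[1];    // row 0, column 1 (real, imag)
+*     b[2] = a2[0]; b[3] = a2[1];    // row 0, column 2
+*     b[4] = a1[2]; b[5] = a1[3];    // row 1, column 1
+*     b[6] = a2[2]; b[7] = a2[3];    // row 1, column 2
+*
+**************************************************************************************/
+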
+ PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #3 // lda = lda * 4 * 2 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + ldr BO, B + +/*********************************************************************************************/ + +cgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble cgemm_ncopy_L1_BEGIN + +cgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L2_M2_40 + +cgemm_ncopy_L2_M2_20: + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + + COPY2x2 + subs I , I , #1 + ble cgemm_ncopy_L2_M2_40 + + COPY2x2 + subs I , I , #1 + bne cgemm_ncopy_L2_M2_20 + + +cgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble cgemm_ncopy_L2_M2_END + +cgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne cgemm_ncopy_L2_M2_60 + + +cgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne cgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +cgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble cgemm_ncopy_L999 + + +cgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L1_M2_40 + +cgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_20 + + +cgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble cgemm_ncopy_L1_M2_END + +cgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_60 + + +cgemm_ncopy_L1_M2_END: + + + +cgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 8bc3e53..4031c28 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/02 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -865,9 +865,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr J, N asrs J, J, #2 // J = J / 4 - ble _L2_BEGIN + ble sgemm_kernel_L2_BEGIN -_L4_BEGIN: +sgemm_kernel_L4_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -881,19 +881,19 @@ _L4_BEGIN: -_L4_M4_BEGIN: +sgemm_kernel_L4_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L4_M2_BEGIN + ble sgemm_kernel_L4_M2_BEGIN -_L4_M4_20: +sgemm_kernel_L4_M4_20: mov BO, BC asrs L , K1, #1 // L = L / 8 cmp L , #2 - blt _L4_M4_32 + blt sgemm_kernel_L4_M4_32 @@ -901,81 +901,81 @@ _L4_M4_20: KERNEL4x4_M2 subs L, L, #2 - ble _L4_M4_22a + ble sgemm_kernel_L4_M4_22a .align 5 -_L4_M4_22: +sgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs L, L, #1 - bgt _L4_M4_22 + bgt sgemm_kernel_L4_M4_22 -_L4_M4_22a: +sgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b _L4_M4_44 + b sgemm_kernel_L4_M4_44 -_L4_M4_32: +sgemm_kernel_L4_M4_32: tst L, #1 - ble _L4_M4_40 + ble sgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b _L4_M4_44 + b sgemm_kernel_L4_M4_44 -_L4_M4_40: +sgemm_kernel_L4_M4_40: INIT4x4 -_L4_M4_44: +sgemm_kernel_L4_M4_44: ands L , K1, #1 // L = L % 8 - ble _L4_M4_100 + ble sgemm_kernel_L4_M4_100 -_L4_M4_46: +sgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs L, L, #1 - bne _L4_M4_46 + bne sgemm_kernel_L4_M4_46 -_L4_M4_100: +sgemm_kernel_L4_M4_100: SAVE4x4 -_L4_M4_END: +sgemm_kernel_L4_M4_END: subs I, I, #1 - bne _L4_M4_20 + bne sgemm_kernel_L4_M4_20 -_L4_M2_BEGIN: +sgemm_kernel_L4_M2_BEGIN: ldr I, M tst I , #3 - ble _L4_END + ble sgemm_kernel_L4_END tst I, #2 // I = I / 2 - ble _L4_M1_BEGIN + ble sgemm_kernel_L4_M1_BEGIN -_L4_M2_20: +sgemm_kernel_L4_M2_20: INIT2x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M2_40 + ble sgemm_kernel_L4_M2_40 -_L4_M2_22: +sgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -988,42 +988,42 @@ _L4_M2_22: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_22 + bgt sgemm_kernel_L4_M2_22 -_L4_M2_40: +sgemm_kernel_L4_M2_40: ands L , K1, #7 // L = L % 8 - ble _L4_M2_100 + ble sgemm_kernel_L4_M2_100 -_L4_M2_42: +sgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_42 + bgt sgemm_kernel_L4_M2_42 -_L4_M2_100: +sgemm_kernel_L4_M2_100: SAVE2x4 -_L4_M2_END: +sgemm_kernel_L4_M2_END: -_L4_M1_BEGIN: +sgemm_kernel_L4_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L4_END + ble sgemm_kernel_L4_END -_L4_M1_20: +sgemm_kernel_L4_M1_20: INIT1x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M1_40 + ble sgemm_kernel_L4_M1_40 -_L4_M1_22: +sgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1035,27 +1035,27 @@ _L4_M1_22: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_22 + bgt sgemm_kernel_L4_M1_22 -_L4_M1_40: +sgemm_kernel_L4_M1_40: ands L , K1, #7 // L = L % 8 - ble _L4_M1_100 + ble sgemm_kernel_L4_M1_100 -_L4_M1_42: +sgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_42 + bgt sgemm_kernel_L4_M1_42 -_L4_M1_100: +sgemm_kernel_L4_M1_100: SAVE1x4 -_L4_END: +sgemm_kernel_L4_END: mov r3, BC mov r4, K1 @@ -1064,20 +1064,20 @@ _L4_END: mov BC, r3 subs J , #1 // j-- - bgt _L4_BEGIN + bgt sgemm_kernel_L4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +sgemm_kernel_L2_BEGIN: ldr J , N tst J , #3 - ble _L999 + ble sgemm_kernel_L999 tst J , #2 - ble _L1_BEGIN + ble sgemm_kernel_L1_BEGIN ldr CO1, C // CO1 = C ldr r4 , LDC @@ -1092,22 +1092,22 @@ _L2_BEGIN: -_L2_M4_BEGIN: +sgemm_kernel_L2_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L2_M2_BEGIN + ble sgemm_kernel_L2_M2_BEGIN -_L2_M4_20: +sgemm_kernel_L2_M4_20: INIT4x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M4_40 + ble sgemm_kernel_L2_M4_40 .align 5 -_L2_M4_22: 
+sgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1119,49 +1119,49 @@ _L2_M4_22: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_22 + bgt sgemm_kernel_L2_M4_22 -_L2_M4_40: +sgemm_kernel_L2_M4_40: ands L , K1, #7 // L = L % 8 - ble _L2_M4_100 + ble sgemm_kernel_L2_M4_100 -_L2_M4_42: +sgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_42 + bgt sgemm_kernel_L2_M4_42 -_L2_M4_100: +sgemm_kernel_L2_M4_100: SAVE4x2 -_L2_M4_END: +sgemm_kernel_L2_M4_END: subs I, I, #1 - bgt _L2_M4_20 + bgt sgemm_kernel_L2_M4_20 -_L2_M2_BEGIN: +sgemm_kernel_L2_M2_BEGIN: ldr I, M tst I , #3 - ble _L2_END + ble sgemm_kernel_L2_END tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN + ble sgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +sgemm_kernel_L2_M2_20: INIT2x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M2_40 + ble sgemm_kernel_L2_M2_40 -_L2_M2_22: +sgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1174,42 +1174,42 @@ _L2_M2_22: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_22 + bgt sgemm_kernel_L2_M2_22 -_L2_M2_40: +sgemm_kernel_L2_M2_40: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble sgemm_kernel_L2_M2_100 -_L2_M2_42: +sgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_42 + bgt sgemm_kernel_L2_M2_42 -_L2_M2_100: +sgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +sgemm_kernel_L2_M2_END: -_L2_M1_BEGIN: +sgemm_kernel_L2_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L2_END + ble sgemm_kernel_L2_END -_L2_M1_20: +sgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble sgemm_kernel_L2_M1_40 -_L2_M1_22: +sgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1221,27 +1221,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt sgemm_kernel_L2_M1_22 -_L2_M1_40: +sgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble sgemm_kernel_L2_M1_100 -_L2_M1_42: +sgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt sgemm_kernel_L2_M1_42 -_L2_M1_100: +sgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +sgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1251,11 +1251,11 @@ _L2_END: /*********************************************************************************************/ -_L1_BEGIN: +sgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble sgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1270,22 +1270,22 @@ _L1_BEGIN: -_L1_M4_BEGIN: +sgemm_kernel_L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L1_M2_BEGIN + ble sgemm_kernel_L1_M2_BEGIN -_L1_M4_20: +sgemm_kernel_L1_M4_20: INIT4x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M4_40 + ble sgemm_kernel_L1_M4_40 .align 5 -_L1_M4_22: +sgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1297,49 +1297,49 @@ _L1_M4_22: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_22 + bgt sgemm_kernel_L1_M4_22 -_L1_M4_40: +sgemm_kernel_L1_M4_40: ands L , K1, #7 // L = L % 8 - ble _L1_M4_100 + ble sgemm_kernel_L1_M4_100 -_L1_M4_42: +sgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_42 + bgt sgemm_kernel_L1_M4_42 -_L1_M4_100: +sgemm_kernel_L1_M4_100: SAVE4x1 -_L1_M4_END: +sgemm_kernel_L1_M4_END: subs I, I, #1 - bgt _L1_M4_20 + bgt sgemm_kernel_L1_M4_20 -_L1_M2_BEGIN: +sgemm_kernel_L1_M2_BEGIN: ldr I, M tst I , #3 - ble _L1_END + ble sgemm_kernel_L1_END tst I, #2 // I = I / 2 - ble _L1_M1_BEGIN + ble sgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +sgemm_kernel_L1_M2_20: INIT2x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M2_40 + ble sgemm_kernel_L1_M2_40 -_L1_M2_22: +sgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1352,42 +1352,42 @@ _L1_M2_22: KERNEL2x1_SUB 
subs L, L, #1 - bgt _L1_M2_22 + bgt sgemm_kernel_L1_M2_22 -_L1_M2_40: +sgemm_kernel_L1_M2_40: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble sgemm_kernel_L1_M2_100 -_L1_M2_42: +sgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_42 + bgt sgemm_kernel_L1_M2_42 -_L1_M2_100: +sgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +sgemm_kernel_L1_M2_END: -_L1_M1_BEGIN: +sgemm_kernel_L1_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L1_END + ble sgemm_kernel_L1_END -_L1_M1_20: +sgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble sgemm_kernel_L1_M1_40 -_L1_M1_22: +sgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1399,30 +1399,30 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt sgemm_kernel_L1_M1_22 -_L1_M1_40: +sgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble sgemm_kernel_L1_M1_100 -_L1_M1_42: +sgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt sgemm_kernel_L1_M1_42 -_L1_M1_100: +sgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +sgemm_kernel_L1_END: -_L999: +sgemm_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s31} // restore floating point registers diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S b/kernel/arm/sgemm_ncopy_4_vfpv3.S index 8af7ed8..2d8fa2e 100644 --- a/kernel/arm/sgemm_ncopy_4_vfpv3.S +++ b/kernel/arm/sgemm_ncopy_4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/02 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -68,7 +68,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I r3 #define J r12 -#define A_PRE 96 +#define A_PRE 192 /************************************************************************************** * Macro definitions @@ -199,12 +199,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr BO, B -_L4_BEGIN: +sgemm_ncopy_L4_BEGIN: asrs J, N, #2 // J = N / 4 - ble _L2_BEGIN + ble sgemm_ncopy_L2_BEGIN -_L4_M4_BEGIN: +sgemm_ncopy_L4_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -214,9 +214,9 @@ _L4_M4_BEGIN: add A , AO4, r4 // A = A + 4 * LDA asrs I, M, #2 // I = M / 4 - ble _L4_M4_40 + ble sgemm_ncopy_L4_M4_40 -_L4_M4_20: +sgemm_ncopy_L4_M4_20: pld [ AO1, #A_PRE ] pld [ AO2, #A_PRE ] @@ -225,45 +225,45 @@ _L4_M4_20: COPY4x4 subs I , I , #1 - ble _L4_M4_40 + ble sgemm_ncopy_L4_M4_40 COPY4x4 subs I , I , #1 - bne _L4_M4_20 + bne sgemm_ncopy_L4_M4_20 -_L4_M4_40: +sgemm_ncopy_L4_M4_40: ands I, M , #3 - ble _L4_M4_END + ble sgemm_ncopy_L4_M4_END -_L4_M4_60: +sgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 - bne _L4_M4_60 + bne sgemm_ncopy_L4_M4_60 -_L4_M4_END: +sgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- - bne _L4_M4_BEGIN + bne sgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +sgemm_ncopy_L2_BEGIN: tst N, #3 - ble _L999 + ble sgemm_ncopy_L999 tst N, #2 - ble _L1_BEGIN + ble sgemm_ncopy_L1_BEGIN -_L2_M4_BEGIN: +sgemm_ncopy_L2_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -271,75 +271,75 @@ _L2_M4_BEGIN: add A , AO2, r4 // A = A + 2 * LDA asrs I, M, #2 // I = M / 4 - ble _L2_M4_40 + ble sgemm_ncopy_L2_M4_40 -_L2_M4_20: +sgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne _L2_M4_20 + bne sgemm_ncopy_L2_M4_20 -_L2_M4_40: +sgemm_ncopy_L2_M4_40: ands I, M , #3 - ble _L2_M4_END + ble sgemm_ncopy_L2_M4_END -_L2_M4_60: +sgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 - bne _L2_M4_60 + bne sgemm_ncopy_L2_M4_60 -_L2_M4_END: +sgemm_ncopy_L2_M4_END: /*********************************************************************************************/ -_L1_BEGIN: +sgemm_ncopy_L1_BEGIN: tst N, #1 - ble _L999 + ble sgemm_ncopy_L999 -_L1_M4_BEGIN: +sgemm_ncopy_L1_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add A , AO1, r4 // A = A + 1 * LDA asrs I, M, #2 // I = M / 4 - ble _L1_M4_40 + ble sgemm_ncopy_L1_M4_40 -_L1_M4_20: +sgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne _L1_M4_20 + bne sgemm_ncopy_L1_M4_20 -_L1_M4_40: +sgemm_ncopy_L1_M4_40: ands I, M , #3 - ble _L1_M4_END + ble sgemm_ncopy_L1_M4_END -_L1_M4_60: +sgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 - bne _L1_M4_60 + bne sgemm_ncopy_L1_M4_60 -_L1_M4_END: +sgemm_ncopy_L1_M4_END: -_L999: +sgemm_ncopy_L999: sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers -- 2.7.4
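
Note on the new copy kernel: cgemm_ncopy_2_vfpv3.S packs two columns of the
complex matrix A at a time, interleaving them row by row, which is the layout
the 2x2 cgemm kernel expects for its packed operands. A minimal C sketch of
the same packing, equivalent in spirit to the generic
../generic/zgemm_ncopy_2.c it replaces (the function name, int types and
signature here are illustrative assumptions, not taken from the patch):

    /* Pack column-major complex A (m rows, n cols, leading dimension lda,
       counted in complex elements) into b, two columns at a time. */
    int cgemm_ncopy_2(int m, int n, float *a, int lda, float *b)
    {
        float *a1, *a2;
        int i, j;

        for (j = 0; j + 1 < n; j += 2) {   /* column pairs: COPY2x2 / COPY1x2 */
            a1 = a + 2 * (j + 0) * lda;    /* 2 floats per complex element    */
            a2 = a + 2 * (j + 1) * lda;
            for (i = 0; i < m; i++) {      /* interleave the two columns      */
                *b++ = a1[2 * i];  *b++ = a1[2 * i + 1];  /* row i, col j     */
                *b++ = a2[2 * i];  *b++ = a2[2 * i + 1];  /* row i, col j + 1 */
            }
        }
        if (n & 1) {                       /* leftover column: COPY2x1 / COPY1x1 */
            a1 = a + 2 * j * lda;
            for (i = 0; i < m; i++) {
                *b++ = a1[2 * i];  *b++ = a1[2 * i + 1];
            }
        }
        return 0;
    }

The label renaming in the same patch gives every jump target a file-specific
prefix (e.g. _L999 becomes cgemm_kernel_L999 or sgemm_ncopy_L999), so the
same label name no longer appears in several kernel sources.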