From 2a1515c9dd561fe8c3dca9dcae9efc22a0c56644 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sat, 12 Oct 2013 16:48:29 +0200
Subject: [PATCH] added dgemm_ncopy_4_vfpv3.S

---
Review note: line structure restored. Branches that followed tst/ands/asrs
with ble now use beq or an explicit cmp, because those instructions do not
write the V flag that ble reads. The blob id on the index line below is the
one from the original submission and was not recomputed.

 kernel/arm/dgemm_ncopy_4_vfpv3.S | 355 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 355 insertions(+)
 create mode 100644 kernel/arm/dgemm_ncopy_4_vfpv3.S

diff --git a/kernel/arm/dgemm_ncopy_4_vfpv3.S b/kernel/arm/dgemm_ncopy_4_vfpv3.S
new file mode 100644
index 0000000..bdb63bf
--- /dev/null
+++ b/kernel/arm/dgemm_ncopy_4_vfpv3.S
@@ -0,0 +1,355 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/10/11 Saar
+* 	BLASTEST 		: xOK
+* 	CTEST			: xOK
+* 	TEST			: xOK
+*
+**************************************************************************************/
+
+/**************************************************************************************
+* Packs M x N 8-byte elements from A into the contiguous buffer B.
+*
+* A is read as N vectors of M elements; consecutive vectors are lda elements
+* apart. Vectors are consumed four at a time, then two, then one. Within a
+* group, B receives element i of every vector of the group before element i+1.
+*
+* In:       r0 = M, r1 = N, r2 = A, r3 = lda (in elements), first stack word = B
+* Out:      r0 = 0
+* Clobbers: r2, r3, r12, d0 - d7, flags (r4 - r9, fp and d8 - d15 are saved here)
+*
+* Branches only read flags that the preceding instruction writes: tst, ands and
+* asrs do not write V and nothing here sets V before the first test, so the
+* zero tests use beq and the signed trip-count tests use an explicit cmp.
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_M	r0
+#define	OLD_N	r1
+#define	OLD_A	r2
+#define	OLD_LDA	r3
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+/* lda scaled to bytes, spilled to the local frame */
+#define LDA	[fp, #-260 ]
+
+/* first stack argument: fp sits 4 bytes below the entry sp */
+#define B	[fp, #4 ]
+
+#define M	r0
+#define N	r1
+#define A	r2
+
+/* write cursor into B */
+#define	BO	r5
+
+/* read cursors, one per source vector of the current group */
+#define	AO1	r6
+#define	AO2	r7
+#define	AO3	r8
+#define	AO4	r9
+
+/* I counts copy steps within a group, J counts groups of four vectors */
+#define I	r3
+#define	J	r12
+
+#define A_PRE	96
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+/* Four elements from each of AO1..AO4 -> 16 elements at BO; all four cursors advance by 32. */
+.macro COPY4x4
+
+	fldd	d0 , [ AO1, #0 ]
+	fldd	d1 , [ AO2, #0 ]
+	fldd	d2 , [ AO3, #0 ]
+	fldd	d3 , [ AO4, #0 ]
+
+	fldd	d4 , [ AO1, #8 ]
+	fldd	d8 , [ AO1, #16 ]
+	fldd	d12, [ AO1, #24 ]
+
+	fldd	d5 , [ AO2, #8 ]
+	add	AO1, AO1, #32
+	fldd	d9 , [ AO2, #16 ]
+	fldd	d13, [ AO2, #24 ]
+
+	fldd	d6 , [ AO3, #8 ]
+	add	AO2, AO2, #32
+	fldd	d10, [ AO3, #16 ]
+	fldd	d14, [ AO3, #24 ]
+
+	fldd	d7 , [ AO4, #8 ]
+	add	AO3, AO3, #32
+	fldd	d11, [ AO4, #16 ]
+	fldd	d15, [ AO4, #24 ]
+
+	fstmiad	BO!, { d0 - d3 }
+	add	AO4, AO4, #32
+	fstmiad	BO!, { d4 - d7 }
+	fstmiad	BO!, { d8 - d15 }
+
+.endm
+
+/* One element from each of AO1..AO4 -> 4 elements at BO; all four cursors advance by 8. */
+.macro COPY1x4
+
+	fldd	d0 , [ AO1, #0 ]
+	fldd	d1 , [ AO2, #0 ]
+	add	AO1, AO1, #8
+	fldd	d2 , [ AO3, #0 ]
+	add	AO2, AO2, #8
+	fldd	d3 , [ AO4, #0 ]
+
+	add	AO3, AO3, #8
+	fstmiad	BO!, { d0 - d3 }
+	add	AO4, AO4, #8
+
+.endm
+
+/* Four elements from each of AO1 and AO2 -> 8 elements at BO, interleaved pairwise. */
+.macro COPY4x2
+
+	fldd	d0 , [ AO1, #0 ]
+	fldd	d2 , [ AO1, #8 ]
+	fldd	d4 , [ AO1, #16 ]
+	fldd	d6 , [ AO1, #24 ]
+
+	fldd	d1 , [ AO2, #0 ]
+	fldd	d3 , [ AO2, #8 ]
+	add	AO1, AO1, #32
+	fldd	d5 , [ AO2, #16 ]
+	fldd	d7 , [ AO2, #24 ]
+
+	fstmiad	BO!, { d0 - d7 }
+	add	AO2, AO2, #32
+
+.endm
+
+/* One element from each of AO1 and AO2 -> 2 elements at BO. */
+.macro COPY1x2
+
+	fldd	d0 , [ AO1, #0 ]
+	fldd	d1 , [ AO2, #0 ]
+	add	AO1, AO1, #8
+
+	fstmiad	BO!, { d0 - d1 }
+	add	AO2, AO2, #8
+
+.endm
+
+/* Four consecutive elements from AO1 -> 4 elements at BO. */
+.macro COPY4x1
+
+	fldd	d0 , [ AO1, #0 ]
+	fldd	d1 , [ AO1, #8 ]
+	fldd	d2 , [ AO1, #16 ]
+	fldd	d3 , [ AO1, #24 ]
+
+	fstmiad	BO!, { d0 - d3 }
+	add	AO1, AO1, #32
+
+.endm
+
+/* One element from AO1 -> 1 element at BO. */
+.macro COPY1x1
+
+	fldd	d0 , [ AO1, #0 ]
+
+	fstmiad	BO!, { d0 }
+	add	AO1, AO1, #8
+
+.endm
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24
+	sub	sp, sp, #STACKSIZE			// reserve stack
+
+	lsl	r3, r3, #3				// lda = lda * 8
+	str	r3, LDA
+
+	sub	r4, fp, #128
+	vstm	r4, { d8 - d15}				// store floating point registers
+
+	ldr	BO, B
+
+_L4_BEGIN:
+
+	asr	J, N, #2				// J = N / 4
+	cmp	J, #0					// cmp writes N, Z, C and V for the signed test
+	ble	_L2_BEGIN
+
+_L4_M4_BEGIN:
+
+	mov	AO1, A					// AO1 = A
+	ldr	r4 , LDA
+	add	AO2, AO1, r4
+	add	AO3, AO2, r4
+	add	AO4, AO3, r4
+	add	A  , AO4, r4				// A = A + 4 * LDA
+
+	asr	I, M, #2				// I = M / 4
+	cmp	I, #0
+	ble	_L4_M4_40
+
+_L4_M4_20:
+
+	COPY4x4
+
+	subs	I , I , #1
+	bne	_L4_M4_20
+
+_L4_M4_40:
+
+	ands	I, M , #3				// I = M % 4, result is 0..3 so only Z matters
+	beq	_L4_M4_END
+
+_L4_M4_60:
+
+	COPY1x4
+
+	subs	I , I , #1
+	bne	_L4_M4_60
+
+_L4_M4_END:
+
+	subs	J , J, #1				// j--
+	bne	_L4_M4_BEGIN
+
+/*********************************************************************************************/
+
+_L2_BEGIN:
+
+	tst	N, #3					// N % 4 == 0: nothing left to pack
+	beq	_L999
+
+	tst	N, #2					// is there a group of two?
+	beq	_L1_BEGIN
+
+_L2_M4_BEGIN:
+
+	mov	AO1, A					// AO1 = A
+	ldr	r4 , LDA
+	add	AO2, AO1, r4
+	add	A  , AO2, r4				// A = A + 2 * LDA
+
+	asr	I, M, #2				// I = M / 4
+	cmp	I, #0
+	ble	_L2_M4_40
+
+_L2_M4_20:
+
+	COPY4x2
+
+	subs	I , I , #1
+	bne	_L2_M4_20
+
+_L2_M4_40:
+
+	ands	I, M , #3				// I = M % 4
+	beq	_L2_M4_END
+
+_L2_M4_60:
+
+	COPY1x2
+
+	subs	I , I , #1
+	bne	_L2_M4_60
+
+_L2_M4_END:
+
+/*********************************************************************************************/
+
+_L1_BEGIN:
+
+	tst	N, #1					// is there a single trailing vector?
+	beq	_L999
+
+_L1_M4_BEGIN:
+
+	mov	AO1, A					// AO1 = A
+	ldr	r4 , LDA
+	add	A  , AO1, r4				// A = A + 1 * LDA
+
+	asr	I, M, #2				// I = M / 4
+	cmp	I, #0
+	ble	_L1_M4_40
+
+_L1_M4_20:
+
+	COPY4x1
+
+	subs	I , I , #1
+	bne	_L1_M4_20
+
+_L1_M4_40:
+
+	ands	I, M , #3				// I = M % 4
+	beq	_L1_M4_END
+
+_L1_M4_60:
+
+	COPY1x1
+
+	subs	I , I , #1
+	bne	_L1_M4_60
+
+_L1_M4_END:
+
+_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}				// restore floating point registers
+
+	movs	r0, #0					// set return value
+	sub	sp, fp, #24
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
-- 
2.7.4