From 12e02a00e06b61e1825ac48d9071fce262042998 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 08:46:47 +0100 Subject: [PATCH] added ncopy kernels for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 14 +-- kernel/arm/dgemm_ncopy_2_vfp.S | 225 +++++++++++++++++++++++++++++++++++++++++ kernel/arm/sgemm_ncopy_2_vfp.S | 225 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 457 insertions(+), 7 deletions(-) create mode 100644 kernel/arm/dgemm_ncopy_2_vfp.S create mode 100644 kernel/arm/sgemm_ncopy_2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index b192b20..1f2510b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -85,22 +85,22 @@ DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = sgemm_kernel_4x2_vfp.S -SGEMMINCOPY = ../generic/gemm_ncopy_4.c -SGEMMITCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S +SGEMMINCOPY = sgemm_ncopy_4_vfp.S +SGEMMITCOPY = sgemm_tcopy_4_vfp.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMONCOPY = sgemm_ncopy_2_vfp.S SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_4x2_vfp.S -DGEMMINCOPY = ../generic/gemm_ncopy_4.c -DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPY = dgemm_ncopy_4_vfp.S +DGEMMITCOPY = dgemm_tcopy_4_vfp.S DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMONCOPY = dgemm_ncopy_2_vfp.S DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S new file mode 100644 index 0000000..763c032 --- /dev/null +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright 
(c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/24 Saar
+*       BLASTEST                : OK
+*       CTEST                   : OK
+*       TEST                    : OK
+*
+**************************************************************************************/
+
+/**************************************************************************************
+* Packs an M x N panel of doubles from A into B, two columns at a time.
+*
+* In  : r0 = M, r1 = N, r2 = A, r3 = LDA (in elements), first stack word = B
+* Out : r0 = 0
+* Uses: r3, r5 - r8, r12, d0 - d3, flags (r4 - r9 and fp are saved and restored)
+*
+* For every column pair the output holds a(i,j), a(i,j+1) for i = 0 .. M-1;
+* a trailing odd column is copied as is.
+*
+* ASRS, TST and ANDS do not write V, so LE right after them would read a stale V.
+* Counts are therefore tested with CMP + LE, or with EQ/NE after TST/SUBS.
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_M   r0
+#define OLD_N   r1
+#define OLD_A   r2
+#define OLD_LDA r3
+
+#define B       [fp, #4 ]       /* first stack argument, one word above the saved fp */
+
+#define M       r0
+#define N       r1
+#define A       r2              /* start of the next unprocessed column */
+
+#define BO      r5              /* write cursor in B */
+
+#define AO1     r6              /* read cursor, first column of the pair */
+#define AO2     r7              /* read cursor, second column of the pair */
+#define LDA     r8              /* column stride in bytes */
+
+#define I       r3              /* row-pair counter, reuses the OLD_LDA register */
+#define J       r12             /* column-pair counter */
+
+#define A_PRE   256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+/* two rows of two columns: stores a(i,j), a(i,j+1), a(i+1,j), a(i+1,j+1) */
+.macro COPY2x2
+
+        fldd    d0 , [ AO1, #0 ]
+        fldd    d2 , [ AO1, #8 ]
+
+        fldd    d1 , [ AO2, #0 ]
+        fldd    d3 , [ AO2, #8 ]
+
+        add     AO1, AO1, #16
+        fstmiad BO!, { d0 - d3 }
+        add     AO2, AO2, #16
+
+.endm
+
+/* one row of two columns: stores a(i,j), a(i,j+1) */
+.macro COPY1x2
+
+        fldd    d0 , [ AO1, #0 ]
+        fldd    d1 , [ AO2, #0 ]
+        add     AO1, AO1, #8
+
+        fstmiad BO!, { d0 - d1 }
+        add     AO2, AO2, #8
+
+.endm
+
+/* two rows of one column: stores a(i,j), a(i+1,j) */
+.macro COPY2x1
+
+        fldd    d0 , [ AO1, #0 ]
+        fldd    d1 , [ AO1, #8 ]
+
+        fstmiad BO!, { d0 - d1 }
+        add     AO1, AO1, #16
+
+.endm
+
+/* one element of one column */
+.macro COPY1x1
+
+        fldd    d0 , [ AO1, #0 ]
+
+        fstmiad BO!, { d0 }
+        add     AO1, AO1, #8
+
+.endm
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        PROLOGUE
+
+        .align 5
+
+        push    {r4 - r9, fp}
+        add     fp, sp, #24                     // fp = address of the saved fp slot
+
+        lsl     LDA, OLD_LDA, #3                // lda = lda * 8
+
+        ldr     BO, B
+
+/*********************************************************************************************/
+
+dgemm_ncopy_L2_BEGIN:
+
+        asr     J, N, #1                        // J = N / 2
+        cmp     J, #0                           // CMP writes V, so LE is well defined
+        ble     dgemm_ncopy_L1_BEGIN
+
+dgemm_ncopy_L2_M2_BEGIN:
+
+        mov     AO1, A                          // AO1 = A
+        add     AO2, AO1, LDA
+        add     A  , AO2, LDA                   // A = A + 2 * LDA
+
+        asr     I, M, #1                        // I = M / 2
+        cmp     I, #0
+        ble     dgemm_ncopy_L2_M2_40
+
+dgemm_ncopy_L2_M2_20:
+
+        COPY2x2
+
+        subs    I , I , #1
+        bne     dgemm_ncopy_L2_M2_20
+
+dgemm_ncopy_L2_M2_40:
+
+        tst     M , #1                          // odd M: exactly one row is left
+        beq     dgemm_ncopy_L2_M2_END
+
+dgemm_ncopy_L2_M2_60:
+
+        COPY1x2
+
+dgemm_ncopy_L2_M2_END:
+
+        subs    J , J, #1                       // j--
+        bne     dgemm_ncopy_L2_M2_BEGIN
+
+/*********************************************************************************************/
+
+dgemm_ncopy_L1_BEGIN:
+
+        tst     N, #1                           // odd N: exactly one column is left
+        beq     dgemm_ncopy_L999
+
+dgemm_ncopy_L1_M2_BEGIN:
+
+        mov     AO1, A                          // AO1 = A
+        add     A  , AO1, LDA                   // A = A + 1 * LDA
+
+        asr     I, M, #1                        // I = M / 2
+        cmp     I, #0
+        ble     dgemm_ncopy_L1_M2_40
+
+dgemm_ncopy_L1_M2_20:
+
+        COPY2x1
+
+        subs    I , I , #1
+        bne     dgemm_ncopy_L1_M2_20
+
+dgemm_ncopy_L1_M2_40:
+
+        tst     M , #1                          // odd M: exactly one element is left
+        beq     dgemm_ncopy_L1_M2_END
+
+dgemm_ncopy_L1_M2_60:
+
+        COPY1x1
+
+dgemm_ncopy_L1_M2_END:
+
+dgemm_ncopy_L999:
+
+        movs    r0, #0                          // set return value
+        sub     sp, fp, #24
+        pop     {r4 - r9, fp}
+        bx      lr
+
+        EPILOGUE
+
diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S
new file mode 100644
index 0000000..0546f1d
--- /dev/null
+++ b/kernel/arm/sgemm_ncopy_2_vfp.S
@@ -0,0 +1,225 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. 
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/24 Saar
+*       BLASTEST                : OK
+*       CTEST                   : OK
+*       TEST                    : OK
+*
+**************************************************************************************/
+
+/**************************************************************************************
+* Packs an M x N panel of floats from A into B, two columns at a time.
+*
+* In  : r0 = M, r1 = N, r2 = A, r3 = LDA (in elements), first stack word = B
+* Out : r0 = 0
+* Uses: r3, r5 - r8, r12, s0 - s3, flags (r4 - r9 and fp are saved and restored)
+*
+* For every column pair the output holds a(i,j), a(i,j+1) for i = 0 .. M-1;
+* a trailing odd column is copied as is.
+*
+* ASRS, TST and ANDS do not write V, so LE right after them would read a stale V.
+* Counts are therefore tested with CMP + LE, or with EQ/NE after TST/SUBS.
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_M   r0
+#define OLD_N   r1
+#define OLD_A   r2
+#define OLD_LDA r3
+
+#define B       [fp, #4 ]       /* first stack argument, one word above the saved fp */
+
+#define M       r0
+#define N       r1
+#define A       r2              /* start of the next unprocessed column */
+
+#define BO      r5              /* write cursor in B */
+
+#define AO1     r6              /* read cursor, first column of the pair */
+#define AO2     r7              /* read cursor, second column of the pair */
+#define LDA     r8              /* column stride in bytes */
+
+#define I       r3              /* row-pair counter, reuses the OLD_LDA register */
+#define J       r12             /* column-pair counter */
+
+#define A_PRE   256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+/* two rows of two columns: stores a(i,j), a(i,j+1), a(i+1,j), a(i+1,j+1) */
+.macro COPY2x2
+
+        flds    s0 , [ AO1, #0 ]
+        flds    s2 , [ AO1, #4 ]
+
+        flds    s1 , [ AO2, #0 ]
+        flds    s3 , [ AO2, #4 ]
+
+        add     AO1, AO1, #8
+        fstmias BO!, { s0 - s3 }
+        add     AO2, AO2, #8
+
+.endm
+
+/* one row of two columns: stores a(i,j), a(i,j+1) */
+.macro COPY1x2
+
+        flds    s0 , [ AO1, #0 ]
+        flds    s1 , [ AO2, #0 ]
+        add     AO1, AO1, #4
+
+        fstmias BO!, { s0 - s1 }
+        add     AO2, AO2, #4
+
+.endm
+
+/* two rows of one column: stores a(i,j), a(i+1,j) */
+.macro COPY2x1
+
+        flds    s0 , [ AO1, #0 ]
+        flds    s1 , [ AO1, #4 ]
+
+        fstmias BO!, { s0 - s1 }
+        add     AO1, AO1, #8
+
+.endm
+
+/* one element of one column */
+.macro COPY1x1
+
+        flds    s0 , [ AO1, #0 ]
+
+        fstmias BO!, { s0 }
+        add     AO1, AO1, #4
+
+.endm
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        PROLOGUE
+
+        .align 5
+
+        push    {r4 - r9, fp}
+        add     fp, sp, #24                     // fp = address of the saved fp slot
+
+        lsl     LDA, OLD_LDA, #2                // lda = lda * 4
+
+        ldr     BO, B
+
+/*********************************************************************************************/
+
+sgemm_ncopy_L2_BEGIN:
+
+        asr     J, N, #1                        // J = N / 2
+        cmp     J, #0                           // CMP writes V, so LE is well defined
+        ble     sgemm_ncopy_L1_BEGIN
+
+sgemm_ncopy_L2_M2_BEGIN:
+
+        mov     AO1, A                          // AO1 = A
+        add     AO2, AO1, LDA
+        add     A  , AO2, LDA                   // A = A + 2 * LDA
+
+        asr     I, M, #1                        // I = M / 2
+        cmp     I, #0
+        ble     sgemm_ncopy_L2_M2_40
+
+sgemm_ncopy_L2_M2_20:
+
+        COPY2x2
+
+        subs    I , I , #1
+        bne     sgemm_ncopy_L2_M2_20
+
+sgemm_ncopy_L2_M2_40:
+
+        tst     M , #1                          // odd M: exactly one row is left
+        beq     sgemm_ncopy_L2_M2_END
+
+sgemm_ncopy_L2_M2_60:
+
+        COPY1x2
+
+sgemm_ncopy_L2_M2_END:
+
+        subs    J , J, #1                       // j--
+        bne     sgemm_ncopy_L2_M2_BEGIN
+
+/*********************************************************************************************/
+
+sgemm_ncopy_L1_BEGIN:
+
+        tst     N, #1                           // odd N: exactly one column is left
+        beq     sgemm_ncopy_L999
+
+sgemm_ncopy_L1_M2_BEGIN:
+
+        mov     AO1, A                          // AO1 = A
+        add     A  , AO1, LDA                   // A = A + 1 * LDA
+
+        asr     I, M, #1                        // I = M / 2
+        cmp     I, #0
+        ble     sgemm_ncopy_L1_M2_40
+
+sgemm_ncopy_L1_M2_20:
+
+        COPY2x1
+
+        subs    I , I , #1
+        bne     sgemm_ncopy_L1_M2_20
+
+sgemm_ncopy_L1_M2_40:
+
+        tst     M , #1                          // odd M: exactly one element is left
+        beq     sgemm_ncopy_L1_M2_END
+
+sgemm_ncopy_L1_M2_60:
+
+        COPY1x1
+
+sgemm_ncopy_L1_M2_END:
+
+sgemm_ncopy_L999:
+
+        movs    r0, #0                          // set return value
+        sub     sp, fp, #24
+        pop     {r4 - r9, fp}
+        bx      lr
+
+        EPILOGUE
+
-- 
2.7.4