From 85484a42df31257592161ebac7cda80f25133547 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 16 Oct 2013 18:00:41 +0200 Subject: [PATCH] added kernels for cgemm, ctrmm, zgemm and ztrmm --- kernel/arm/KERNEL.ARMV7 | 37 +- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 1293 +++++++++++++++++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 1476 +++++++++++++++++++++++++++++++++ kernel/arm/zgemm_kernel_2x2_vfpv3.S | 1329 ++++++++++++++++++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 1538 +++++++++++++++++++++++++++++++++++ 5 files changed, 5657 insertions(+), 16 deletions(-) create mode 100644 kernel/arm/cgemm_kernel_2x2_vfpv3.S create mode 100644 kernel/arm/ctrmm_kernel_2x2_vfpv3.S create mode 100644 kernel/arm/zgemm_kernel_2x2_vfpv3.S create mode 100644 kernel/arm/ztrmm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 8c69ad5..4315379 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -80,36 +80,41 @@ DGEMVTKERNEL = gemv_t.c CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S + +#SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o #DGEMMKERNEL = ../generic/gemmkernel_2x2.c #DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMINCOPY = -DGEMMITCOPY = -DGEMMONCOPY = ../generic/gemm_ncopy_4.c 
-DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = +DGEMMINCOPY = dgemm_ncopy_4_vfpv3.S +DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S new file mode 100644 index 0000000..abbbac8 --- /dev/null +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1293 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) 
|| defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fnmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 } + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , 
s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + 
fldmias CO2, { s8 - s11 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , 
AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + fldmias CO2, { s8 - s9 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , 
[ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , 
s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, 
s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble 
_L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + 
KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S new file mode 100644 index 0000000..28e555c --- /dev/null +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1476 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fnmacs + 
+#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 } + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + 
fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE 
] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 
+ fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + 
flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, 
ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 
+ KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + 
SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + 
KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values 
in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S new file mode 100644 index 0000000..4b01f04 --- /dev/null +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1329 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I 
fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fnmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 
+ pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd 
d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + fldmiad CO2, { d8 - d11 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + 
add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + fldmiad CO2, { d8 - d9 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , 
d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + 
fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd 
d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + 
KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + 
tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + 
+_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S new file mode 100644 index 0000000..917ce61 --- /dev/null +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1538 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* ztrmm_kernel_2x2_vfpv3.S
+* Double-precision complex TRMM micro-kernel, 2x2 tile, for ARM VFPv3.
+* 2013/10/16 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+// Bytes reserved on the stack below fp for spilled parameters and the VFP save area.
+#define STACKSIZE 256
+
+// Incoming arguments per the ARM procedure-call standard: m, n, k in r0-r2,
+// the A pointer in r3, alpha (real, imag) in d0/d1.  B, C, ldc and the TRMM
+// offset arrive on the caller's stack (see the fp-positive defines below).
+#define OLD_M r0
+#define OLD_N r1
+#define OLD_K r2
+#define OLD_A r3
+#define OLD_ALPHA_R d0
+#define OLD_ALPHA_I d1
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of the callee-saved VFP
+* registers d8 - d15 (8 x 8 bytes, written by the
+* vstm in the PROLOGUE and reloaded at _L999)
+*******************************************************/
+
+// fp-relative spill slots for loop bounds and pointers (argument registers are
+// reused as loop counters after the PROLOGUE stores them here).
+#define KKK [fp, #-240]
+#define KK [fp, #-244 ]
+#define A [fp, #-248 ]
+#define LDC [fp, #-252 ]
+#define M [fp, #-256 ]
+#define N [fp, #-260 ]
+#define K [fp, #-264 ]
+
+// KK is the running TRMM offset; KKK is the effective k trip count of the
+// current tile (recomputed from K and KK at every tile — see the TRMMKERNEL
+// blocks in the main routine).
+
+#define ALPHA_I [fp, #-272]
+#define ALPHA_R [fp, #-280]
+
+// Stack-passed arguments (above the saved fp).
+#define B [fp, #4 ]
+#define C [fp, #8 ]
+#define OLD_LDC [fp, #12 ]
+#define OFFSET [fp, #16 ]
+
+// Register roles inside the compute loops.
+#define I r0
+#define J r1
+#define L r2
+
+#define AO r5
+#define BO r6
+
+#define CO1 r8
+#define CO2 r9
+
+#define K1 r7
+#define BC r12
+
+// Prefetch distances in bytes for the pld hints in the kernel macros.
+#define A_PRE 96
+#define B_PRE 96
+#define C_PRE 64
+
+// Sign conventions for assembling the complex product and applying alpha.
+// Each complex output element is accumulated as four partial sums
+// (ar*br, ai*bi, ar*bi, ai*br); in the SAVE macros, FADD_R/FADD_I combine
+// them into real/imaginary parts with signs chosen by the conjugation
+// variant (NN..CC build defines), then FMAC_R1/R2 and FMAC_I1/I2 scale the
+// result by alpha = (ALPHA_R, ALPHA_I).
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ // No conjugation: real = ar*br - ai*bi, imag = ar*bi + ai*br.
+ // Note the operand order in SAVE makes FADD_R yield the NEGATED real part
+ // (ai*bi - ar*br); the fnmuld in FMAC_R1 restores the sign while scaling.
+ #define FADD_R fsubd
+ #define FADD_I faddd
+
+ #define FMAC_R1 fnmuld
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmuld
+ #define FMAC_I2 fnmacd
+ 
#elif defined(CN) || defined(CT)
+
+ // conj(A)*B: real = ar*br + ai*bi, imag = ar*bi - ai*br.
+ // FADD_I's operand order in SAVE yields the negated imaginary part
+ // (ai*br - ar*bi); the fnmuld in FMAC_I1 restores the sign under alpha.
+ #define FADD_R faddd
+ #define FADD_I fsubd
+
+ #define FMAC_R1 fmuld
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmuld
+ #define FMAC_I2 fmacd
+
+#elif defined(NC) || defined(TC)
+
+ // A*conj(B): real = ar*br + ai*bi, imag = ai*br - ar*bi
+ // (here FADD_I's operand order produces the imaginary part directly).
+ #define FADD_R faddd
+ #define FADD_I fsubd
+
+ #define FMAC_R1 fmuld
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmuld
+ #define FMAC_I2 fmacd
+
+#else
+
+ // conj(A)*conj(B) (CC): real = ar*br - ai*bi, imag = -(ar*bi + ai*br).
+ // Both FADD results come out negated; the fnmuld forms in FMAC_R1/FMAC_I1
+ // compensate while applying alpha.
+ #define FADD_R fsubd
+ #define FADD_I faddd
+
+ #define FMAC_R1 fnmuld
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmuld
+ #define FMAC_I2 fnmacd
+
+#endif
+
+
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Zero the 16 accumulators for a 2x2 complex tile: each of the 4 complex
+// outputs keeps 4 partial sums (d16-d31).  vsub x,x,x is the VFP zeroing
+// idiom; the remaining registers are copied from the zeroed d16.
+.macro INIT2x2
+
+ vsub.f64 d16 , d16 , d16
+ vmov.f64 d17, d16
+ vmov.f64 d18, d16
+ vmov.f64 d19, d16
+ vmov.f64 d20, d16
+ vmov.f64 d21, d16
+ vmov.f64 d22, d16
+ vmov.f64 d23, d16
+ vmov.f64 d24, d16
+ vmov.f64 d25, d16
+ vmov.f64 d26, d16
+ vmov.f64 d27, d16
+ vmov.f64 d28, d16
+ vmov.f64 d29, d16
+ vmov.f64 d30, d16
+ vmov.f64 d31, d16
+
+.endm
+
+// First iteration of the software-pipelined 2x2 k-loop: uses fmuld (no
+// accumulate) to initialize d16-d31 from the first A/B element pair
+// (d0-d3 / d8-d11), while already loading the second pair into d4-d7 /
+// d12-d15 for the following KERNEL2x2_M2.  Loads and multiplies are
+// interleaved deliberately to hide VFP latency — do not reorder.
+// Advances AO and BO by 2 complex doubles (32 bytes) per half-iteration.
+.macro KERNEL2x2_I
+ pld [ AO , #A_PRE ]
+ pld [ BO , #B_PRE ]
+ fldd d0 , [ AO ]
+ fldd d1 , [ AO, #8 ]
+ fldd d8 , [ BO ]
+ fldd d9 , [ BO, #8 ]
+
+ fmuld d16 , d0, d8
+ fldd d2 , [ AO, #16 ]
+ fmuld d24 , d1, d9
+ fldd d3 , [ AO, #24 ]
+ fmuld d17 , d0, d9
+ fldd d10, [ BO, #16 ]
+ fmuld d25 , d1, d8
+
+ fldd d11, [ BO, #24 ]
+ fmuld d18 , d2, d8
+ add BO , BO, #32
+ fmuld d26 , d3, d9
+ add AO , AO, #32
+ fmuld d19 , d2, d9
+ pld [ BO , #B_PRE ]
+ fmuld d27 , d3, d8
+
+ pld [ AO , #A_PRE ]
+ fmuld d20 , d0, d10
+ fldd d4 , [ AO, #0 ]
+ fmuld d28 , d1, d11
+ fldd d5 , [ AO, #8 ]
+ fmuld d21 , d0, d11
+ fldd d12, [ BO ]
+ fmuld d29 , d1, d10
+
+ fldd d13, [ BO, #8 ]
+ fmuld d22 , d2, d10
+ fldd d6 , [ AO, #16 ]
+ fmuld d30 , d3, d11
+ fldd d7 , [ AO, #24 ]
+ fmuld d23 , d2, d11
+ fldd d14, [ BO, #16 ]
+ fmuld d31 , d3, d10
+ fldd d15, [ BO, #24 ]
+
+ add BO , BO, #32
+ add AO , AO, #32
+.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, 
#16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + 
+ add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, 
#16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, 
d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add 
AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 
double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , 
#2 // number of values in AO + str r3 , KK +#endif + + + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 
+ + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + 
ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK 
+#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + -- 2.7.4