From 2b801a00a56c8e270e0de628c5e55e93adaeb1b3 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 2 Nov 2013 13:06:11 +0100 Subject: [PATCH] small optimizations on sgemm_kernel for ARMV7 --- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 131 +++++++++++------------------------- kernel/arm/sgemm_ncopy_4_vfpv3.S | 17 +++-- param.h | 6 +- 3 files changed, 54 insertions(+), 100 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 4746a58..8bc3e53 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -26,28 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/13 Saar +* 2013/11/02 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/10/13 Saar +* 2013/11/02 Saar * UNROLL_N 4 * UNROLL_M 4 * DGEMM_P 128 * DGEMM_Q 240 -* DGEMM_R 4096 -* A_PRE 96 -* B_PRE 96 -* C_PRE 64 +* DGEMM_R 12288 +* A_PRE 128 +* B_PRE 128 +* C_PRE 32 * -* Performance on Odroid U2: +* Performance on Odroid U2: * -* 1 Core: 2.60 GFLOPS ATLAS: 2.67 GFLOPS -* 2 Cores: 5.17 GFLOPS ATLAS: 5.25 GFLOPS -* 3 Cores: 7.60 GFLOPS ATLAS: 7.82 GFLOPS -* 4 Cores: 9.98 GFLOPS ATLAS: 9.95 GFLOPS +* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS +* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS +* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS +* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS **************************************************************************************/ #define ASSEMBLER @@ -92,9 +92,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define K1 r7 #define BC r12 -#define A_PRE 96 -#define B_PRE 96 -#define C_PRE 64 +#define A_PRE 128 +#define B_PRE 128 +#define C_PRE 32 /************************************************************************************** * Macro definitions @@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I + pld [ AO , #A_PRE ] fldmias AO!, { s0 - s1 } - pld [ AO , #A_PRE-8 ] + pld [ BO , #B_PRE ] fldmias BO!, { s8 - s9 } - pld [ BO , #B_PRE-8 ] fmuls s16 , s0, s8 fldmias AO!, { s2 - s3 } @@ -162,20 +162,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s1 } + fldmias AO!, { s0 - s3 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias AO!, { s2 - s3 } + fldmias BO!, { s8 - s11 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - fldmias BO!, { s8 - s9 } + //fldmias AO!, { s2 - s3 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - fldmias BO!, { s10 - s11 } + //fldmias BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -191,17 +191,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + fldmias AO!, { s4 - s7 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + fldmias BO!, { s12 - s15 } + //fldmias AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 - fldmias BO!, { s12 - s13 } fmacs s21 , s1, s9 fmacs s22 , s2, s9 - fldmias BO!, { s14 - s15 } + //fldmias BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -248,10 +248,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_SUB flds s8 , [ BO ] - pld [ BO , #B_PRE ] flds s0 , [ AO ] - pld [ AO , #A_PRE ] flds s1 , [ AO, #4 ] fmacs s16 , s0, s8 @@ -284,16 +282,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x4 - pld [ CO1 , #C_PRE ] ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA add r4 , CO2, r3 - pld [ CO2 , #C_PRE ] fldmias CO1, { s8 - s11 } - pld [ r4 , #C_PRE ] fmacs s8 , s0 , s16 flds s12, [CO2] @@ -313,6 +308,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s15, s0 , s23 fsts s11, [CO1, #12 ] + pld [ CO1 , #C_PRE ] + fldmias r4, { s8 - s11 } fmacs s8 , s0 , s24 @@ -324,9 +321,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s11, s0 , s27 fsts s15, [CO2, #12 ] + pld [ CO2 , #C_PRE ] + add CO2, r4 , r3 - pld [ CO2 , #C_PRE ] fldmias CO2, { s12 - s15 } @@ -339,7 +337,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s11, [r4 , #12 ] fmacs s15, s0 , s31 + pld [ r4 , #C_PRE ] fstmias CO2, { s12 - s15 } + pld [ CO2 , #C_PRE ] add CO1, CO1, #16 @@ -891,79 +891,30 @@ _L4_M4_20: mov BO, BC - asrs L , K1, #3 // L = L / 8 - cmp L , #3 - blt _L4_M4_30 - .align 5 + asrs L , K1, #1 // L = L / 2 + cmp L , #2 + blt _L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - sub L, L, #2 + subs L, L, #2 + ble _L4_M4_22a + .align 5 _L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 subs L, L, #1 bgt _L4_M4_22 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 +_L4_M4_22a: KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - b _L4_M4_44 - - -_L4_M4_30: - tst L, #3 - ble _L4_M4_40 - - tst L, #2 - ble _L4_M4_32 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 KERNEL4x4_E b _L4_M4_44 @@ -974,13 +925,7 @@ _L4_M4_32: ble _L4_M4_40 KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - 
KERNEL4x4_M1 KERNEL4x4_E b _L4_M4_44 @@ -993,7 +938,7 @@ _L4_M4_40: _L4_M4_44: - ands L , K1, #7 // L = L % 8 + ands L , K1, #1 // L = L % 2 ble _L4_M4_100 _L4_M4_46: diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S b/kernel/arm/sgemm_ncopy_4_vfpv3.S index 34fbb32..8af7ed8 100644 --- a/kernel/arm/sgemm_ncopy_4_vfpv3.S +++ b/kernel/arm/sgemm_ncopy_4_vfpv3.S @@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/11 Saar -* BLASTEST : xOK -* CTEST : xOK -* TEST : xOK +* 2013/11/02 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ @@ -218,6 +218,15 @@ _L4_M4_BEGIN: _L4_M4_20: + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + pld [ AO3, #A_PRE ] + pld [ AO4, #A_PRE ] + COPY4x4 + + subs I , I , #1 + ble _L4_M4_40 + COPY4x4 subs I , I , #1 diff --git a/param.h b/param.h index f6895d9..ab0ed91 100644 --- a/param.h +++ b/param.h @@ -1814,17 +1814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 192 +#define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 -#define SGEMM_DEFAULT_Q 120 +#define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 -#define SGEMM_DEFAULT_R 16384 +#define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -- 2.7.4