From 58c90d5937cc5cc225e96cc60457401c07e07165 Mon Sep 17 00:00:00 2001 From: Benedikt Huber Date: Thu, 9 Oct 2014 06:52:10 -0700 Subject: [PATCH] # The first commit's message is: Optimizations for APM's xgene-1 (aarch64). 1) general system updates to support armv8 better. `make all` did not work; one needed to supply TARGET=ARMV8. 2) sgemm 4x4 kernel in assembler using SIMD, and configuration changes to use it. 3) strmm 4x4 kernel in C. Since the sgemm kernel does 4x4, the trmm kernel must also do 4xN. Added Dave Nuechterlein to the contributors list. --- CONTRIBUTORS.md | 4 + common_arm64.h | 5 +- cpuid_arm64.c | 217 +++++++ getarch.c | 18 +- kernel/arm64/KERNEL.ARMV8 | 8 +- kernel/arm64/sgemm_kernel_4x4.S | 1327 +++++++++++++++++++++++++++++++++++++++ kernel/generic/trmmkernel_4x4.c | 875 ++++++++++++++++++++++++++ param.h | 4 +- 8 files changed, 2442 insertions(+), 16 deletions(-) create mode 100644 cpuid_arm64.c create mode 100644 kernel/arm64/sgemm_kernel_4x4.S create mode 100644 kernel/generic/trmmkernel_4x4.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 18a218c..02d15b7 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -117,5 +117,9 @@ In chronological order: * Isaac Dunham * [2014-08-03] Fixed link error on Linux/musl +* Dave Nuechterlein + * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1). + ARMv8 support.
+ * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/common_arm64.h b/common_arm64.h index 8a66a17..4855493 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -119,9 +119,9 @@ static inline int blas_quickdivide(blasint x, blasint y){ } #if defined(DOUBLE) -#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory") #else -#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory") #endif #define GET_IMAGE_CANCEL @@ -138,7 +138,6 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ - .arm ;\ .global REALNAME ;\ .func REALNAME ;\ REALNAME: diff --git a/cpuid_arm64.c b/cpuid_arm64.c new file mode 100644 index 0000000..c7a27f8 --- /dev/null +++ b/cpuid_arm64.c @@ -0,0 +1,217 @@ +/************************************************************************** + Copyright (c) 2013, The OpenBLAS Project + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include + +#define CPU_UNKNOWN 0 +#define CPU_ARMV8 1 + +static char *cpuname[] = { + "UNKOWN", + "ARMV8" +}; + + +int get_feature(char *search) +{ + +#ifdef linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + + if( p == NULL ) return; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (!strcmp(t, search)) { return(1); } + } + +#endif + return(0); +} + + +int detect(void) +{ + +#ifdef linux + + FILE *infile; + char buffer[512], *p; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9))) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if(p != NULL) + { + + if (strstr(p, "AArch64")) + { + return CPU_ARMV8; + + } + + + } +#endif + + return CPU_UNKNOWN; +} + +char *get_corename(void) +{ + return cpuname[detect()]; 
+} + +void get_architecture(void) +{ + printf("ARM"); +} + +void get_subarchitecture(void) +{ + int d = detect(); + switch (d) + { + + case CPU_ARMV8: + printf("ARMV8"); + break; + + default: + printf("UNKNOWN"); + break; + } +} + +void get_subdirname(void) +{ + printf("arm64"); +} + +void get_cpuconfig(void) +{ + + int d = detect(); + switch (d) + { + + case CPU_ARMV8: + printf("#define ARMV8\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + break; + + + } +} + + +void get_libname(void) +{ + + int d = detect(); + switch (d) + { + + case CPU_ARMV8: + printf("armv8\n"); + break; + + } +} + + +void get_features(void) +{ + +#ifdef linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + + if( p == NULL ) return; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + } + +#endif + return; +} + + diff --git a/getarch.c b/getarch.c index 3e99142..ded347e 100644 --- a/getarch.c +++ b/getarch.c @@ -746,12 +746,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SUBARCHITECTURE "ARMV8" #define SUBDIRNAME "arm64" #define ARCHCONFIG "-DARMV8 " \ - "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ - "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4" + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " #define LIBNAME "armv8" -#define CORENAME "ARMV8" +#define CORENAME "XGENE1" #else #endif @@ -801,6 +800,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __aarch64__ +#include "cpuid_arm64.c" +#define OPENBLAS_SUPPORTED +#endif + #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." @@ -856,7 +860,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -956,7 +960,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 27157da..4fc0968 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -80,14 +80,14 @@ DGEMVTKERNEL = ../arm/gemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c +STRMMKERNEL = 
../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S new file mode 100644 index 0000000..7863329 --- /dev/null +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -0,0 +1,1327 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/11/02 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 240 +* DGEMM_R 12288 +* A_PRE 128 +* B_PRE 128 +* C_PRE 32 +* +* Performance on Odroid U2: +* +* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS +* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS +* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS +* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc*/ + + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define pB x10 +#define counterJ x11 +#define tempALPHA x12 +#define pCRow0 x13 +#define pCRow1 x14 +#define pCRow2 x15 +#define pA x16 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 pB +// 11 counterJ +// 12 tempALPHA +// 13 pCRow0 +// 14 pCRow1 +// 15 pCRow2 +// 16 pA +// 17 +// 18 must save 
+// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 orig ALPHA -> a00 +//v01 a01 +//v02 a02 +//v03 a03 +//v04 a10 +//v05 a11 +//v06 a12 +//v07 a13 +//v08 must save b00 +//v09 must save b01 +//v10 must save b02 +//v11 must save b03 +//v12 must save b10 +//v13 must save b11 +//v14 must save b12 +//v15 must save b13 +//v16 must save C00 +//v17 must save C01 +//v18 C02 +//v19 C03 +//v20 C10 +//v21 C11 +//v22 C12 +//v23 C13 +//v24 C20 +//v25 C21 +//v26 C22 +//v27 C23 +//v28 C30 +//v29 C31 +//v30 C32 +//v31 C33 + +// add sp,sp,#-(6*16) +// stp x18,x19,[sp,#(0*16)] +// stp x20,x21,[sp,#(1*16)] + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + fsub v16.4s , v16.4s , v16.4s + fsub v20.4s , v20.4s , v20.4s + fsub v24.4s , v24.4s , v24.4s + fsub v28.4s , v28.4s , v28.4s + +.endm + +.macro KERNEL4x4_I + + ld1 {v8.2s},[pB],#8 + ld1 {v10.2s},[pB],#8 + ld1 {v0.4s},[pA],#16 + + fmulx v16.4s, v0.4s, v8.4s[0] + fmulx v20.4s, v0.4s, v8.4s[1] + fmulx v24.4s, v0.4s, v10.4s[0] + fmulx v28.4s, v0.4s, v10.4s[1] + + ld1 {v12.2s},[pB],#8 // for next round + ld1 {v14.2s},[pB],#8 // for next round + ld1 {v4.4s},[pA],#16 // for next round + + +.endm + + +.macro KERNEL4x4_M2 + + fmla v16.4s, v4.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v14.s[1] + + ld1 {v8.2s},[pB],#8 + ld1 {v10.2s},[pB],#8 + ld1 {v0.4s},[pA],#16 + +.endm + + +.macro KERNEL4x4_M1 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v10.s[1] + + ld1 {v12.2s},[pB],#8 + ld1 {v14.2s},[pB],#8 + ld1 {v4.4s},[pA],#16 + +.endm + + + +.macro KERNEL4x4_E + + fmla v16.4s, v4.4s, v12.s[0] + 
fmla v20.4s, v4.4s, v12.s[1] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v14.s[1] + +.endm + + + + +.macro KERNEL4x4_SUB + + ld1 {v8.2s},[pB],#8 + ld1 {v10.2s},[pB],#8 + ld1 {v0.4s} , [pA],#16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v10.s[1] + +.endm + + + + +.macro SAVE4x4 + + add pCRow1, pCRow0, LDC // create a second row pointer from the first row pointer + mov v0.d[0], tempALPHA + + ld1 {v8.4s},[pCRow0] // load 4 values of C from first row + fmla v8.4s ,v16.4s,v0.s[0] + st1 {v8.4s},[pCRow0],#16 // store C from first row + + ld1 {v12.4s},[pCRow1] // load 4 values of C from second row + fmla v12.4s ,v20.4s,v0.s[0] + st1 {v12.4s},[pCRow1] // store C from second row + + add pCRow2, pCRow1, LDC // Row2 points to third row + + ld1 {v8.4s},[pCRow2] // load 4 values of C from third row + fmla v8.4s ,v24.4s,v0.s[0] + st1 {v8.4s} ,[pCRow2] // store C from third row + + add pCRow1, pCRow2 , LDC // row1 points to fourth row + + ld1 {v12.4s},[pCRow1] // load 4 values of C from fourth row + fmla v12.4s ,v28.4s,v0.s[0] + st1 {v12.4s},[pCRow1] // store fourth row + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + fsub s16 , s16 , s16 + fmov s17, s16 + fmov s20, s16 + fmov s21, s16 + fmov s24, s16 + fmov s25, s16 + fmov s28, s16 + fmov s29, s16 + +.endm + + + +.macro KERNEL2x4_SUB + + ldr s8 , [ pB ] + ldr s9 , [ pB, #4 ] + ldr s10, [ pB, #8 ] + ldr s11, [ pB, #12 ] + + ldr s0 , [ pA ] + ldr s1 , [ pA, #4 ] + + fmadd s16 , s0, s8, s16 + fmadd s17 , s1, s8, s17 + + fmadd s20 , s0, s9, s20 + fmadd s21 , s1, s9, s21 + + fmadd s24 , s0, s10, s24 + fmadd s25 , s1, s10, s25 + + fmadd s28 , s0, s11, s28 + fmadd s29 , s1, s11, s29 + add pA , pA, #8 + add pB , pB, #16 + +.endm + + #define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1 + #define L1ST( op1, op2, op3) ldr op1, [op2, op3] + +.macro SAVE2x4 + + add pCRow1 , pCRow0, LDC + add 
pCRow2 , pCRow1, LDC + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0) + L1ST ( s9,pCRow0, #4 ) + + F1ST ( s8 , s0 , s16) + F1ST ( s9 , s0 , s17) + + str s8 , [pCRow0, #0] + str s9 , [pCRow0, #4 ] + + ldr s12, [pCRow1, #0] + ldr s13, [pCRow1, #4 ] + + F1ST ( s12, s0 , s20) + F1ST ( s13, s0 , s21) + + str s12, [pCRow1, #0] + str s13, [pCRow1, #4 ] + + L1ST ( s8,pCRow2 , #0) + L1ST ( s9,pCRow2 , #4 ) + + F1ST ( s8 , s0 , s24) + F1ST ( s9 , s0 , s25) + + str s8 , [pCRow2 , #0] + str s9 , [pCRow2 , #4 ] + + add pCRow1, pCRow2 , LDC + + ldr s12, [pCRow1, #0] + ldr s13, [pCRow1, #4 ] + + F1ST ( s12, s0 , s28) + F1ST ( s13, s0 , s29) + + str s12, [pCRow1, #0] + str s13, [pCRow1, #4 ] + + add pCRow0, pCRow0, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + fsub s16 , s16 , s16 + fmov s20, s16 + fmov s24, s16 + fmov s28, s16 + +.endm + + + +.macro KERNEL1x4_SUB + + ldr s8 , [ pB ] + ldr s9 , [ pB, #4 ] + ldr s10, [ pB, #8 ] + ldr s11, [ pB, #12 ] + + ldr s0 , [ pA ] + + fmadd s16 , s0, s8, s16 + fmadd s20 , s0, s9, s20 + fmadd s24 , s0, s10, s24 + fmadd s28 , s0, s11, s28 + + add pA , pA, #4 + add pB , pB, #16 + +.endm + +.macro SAVE1x4 + + add pCRow1 , pCRow0, LDC + add pCRow2 , pCRow1, LDC + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0) + F1ST ( s8 , s0 , s16) + str s8 , [pCRow0, #0] + + L1ST ( s12,pCRow1, #0) + F1ST ( s12, s0 , s20) + str s12, [pCRow1, #0] + + L1ST ( s8,pCRow2 , #0) + F1ST ( s8 , s0 , s24) + str s8 , [pCRow2 , #0] + + add pCRow1, pCRow2 , LDC + + L1ST ( s12,pCRow1, #0) + F1ST ( s12, s0 , s28) + str s12, [pCRow1, #0] + + add pCRow0, pCRow0, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + fsub s16 , s16 , s16 + fmov s17, s16 + fmov s18, s16 + fmov s19, s16 + fmov s20, s16 + fmov s21, s16 + fmov s22, s16 + fmov s23, s16 + +.endm + + 
+ +.macro KERNEL4x2_SUB + + ldr s8 , [ pB ] + ldr s9 , [ pB, #4 ] + + ldr s0 , [ pA ] + ldr s1 , [ pA, #4 ] + ldr s2 , [ pA, #8 ] + ldr s3 , [ pA, #12 ] + + fmadd s16 , s0, s8, s16 + fmadd s17 , s1, s8, s17 + fmadd s18 , s2, s8, s18 + fmadd s19 , s3, s8, s19 + + fmadd s20 , s0, s9, s20 + fmadd s21 , s1, s9, s21 + fmadd s22 , s2, s9, s22 + fmadd s23 , s3, s9, s23 + + add pA , pA, #16 + add pB , pB, #8 + +.endm + +.macro SAVE4x2 + + add pCRow1 , pCRow0, LDC + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0) + L1ST ( s9,pCRow0, #4 ) + L1ST ( s10,pCRow0, #8 ) + L1ST ( s11,pCRow0, #12 ) + + F1ST ( s8 , s0 , s16) + F1ST ( s9 , s0 , s17) + F1ST ( s10, s0 , s18) + F1ST ( s11, s0 , s19) + + str s8 , [pCRow0] + str s9 , [pCRow0, #4 ] + str s10, [pCRow0, #8 ] + str s11, [pCRow0, #12 ] + + L1ST ( s12,pCRow1, #0) + L1ST ( s13,pCRow1, #4 ) + L1ST ( s14,pCRow1, #8 ) + L1ST ( s15,pCRow1, #12 ) + + F1ST ( s12, s0 , s20) + F1ST ( s13, s0 , s21) + F1ST ( s14, s0 , s22) + F1ST ( s15, s0 , s23) + + str s12, [pCRow1] + str s13, [pCRow1, #4 ] + str s14, [pCRow1, #8 ] + str s15, [pCRow1, #12 ] + + add pCRow0, pCRow0, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + fsub s16 , s16 , s16 + fmov s17, s16 + fmov s20, s16 + fmov s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + ldr s8 , [ pB ] + ldr s9 , [ pB, #4 ] + + ldr s0 , [ pA ] + ldr s1 , [ pA, #4 ] + + fmadd s16 , s0, s8, s16 + fmadd s17 , s1, s8, s17 + + fmadd s20 , s0, s9, s20 + fmadd s21 , s1, s9, s21 + + add pA , pA, #8 + add pB , pB, #8 + +.endm + +.macro SAVE2x2 + + add pCRow1 , pCRow0, LDC + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0 ) + L1ST ( s9,pCRow0, #4 ) + + F1ST ( s8 , s0 , s16) + F1ST ( s9 , s0 , s17) + + str s8 , [pCRow0] + str s9 , [pCRow0, #4 ] + + L1ST ( s12,pCRow1, #0 ) + L1ST ( s13,pCRow1, #4 ) + + F1ST ( s12, s0 , s20) + F1ST ( s13, s0 , s21) + + str s12, [pCRow1] + str s13, [pCRow1, #4 ] + + add pCRow0, pCRow0, #8 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + fsub s16 , s16 , s16 + fmov s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + ldr s8 , [ pB ] + ldr s9 , [ pB, #4 ] + + ldr s0 , [ pA ] + fmadd s16 , s0, s8, s16 + fmadd s20 , s0, s9, s20 + + add pA , pA, #4 + add pB , pB, #8 + +.endm + +.macro SAVE1x2 + + add pCRow1 , pCRow0, LDC + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0) + F1ST ( s8 , s0 , s16) + str s8 , [pCRow0] + + L1ST ( s12,pCRow1, #0) + F1ST ( s12, s0 , s20) + str s12, [pCRow1] + + add pCRow0, pCRow0, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + fsub s16 , s16 , s16 + fmov s17, s16 + fmov s18, s16 + fmov s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + ldr s8 , [ pB ] + + ldr s0 , [ pA ] + ldr s1 , [ pA, #4 ] + ldr s2 , [ pA, #8 ] + ldr s3 , [ pA, #12 ] + + fmadd s16 , s0, s8, s16 + fmadd s17 , s1, s8, s17 + fmadd s18 , s2, s8, s18 + fmadd s19 , s3, s8, s19 + + add pA , pA, #16 + add pB , pB, #4 + +.endm + +.macro SAVE4x1 + + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0 ) + L1ST ( s9,pCRow0, #4 ) + L1ST ( s10,pCRow0, #8 ) + L1ST ( s11,pCRow0, #12 ) + + F1ST ( s8 , s0 , s16) + F1ST ( s9 , s0 , s17) + F1ST ( s10, s0 , s18) + F1ST ( s11, s0 , s19) + + str s8 , [pCRow0] + str s9 , [pCRow0, #4 ] + str s10, [pCRow0, #8 ] + str s11, [pCRow0, #12 ] + + add pCRow0, pCRow0, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + fsub s16 , s16 , s16 + fmov s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + ldr s8 , [ pB ] + + ldr s0 , [ pA ] + ldr s1 , [ pA, #4 ] + + fmadd s16 , s0, s8, s16 + fmadd s17 , s1, s8, s17 + + add pA , pA, #8 + add pB , pB, #4 + +.endm + +.macro SAVE2x1 + + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0 ) + L1ST ( s9,pCRow0, #4 ) + + F1ST ( s8 , s0 , 
s16) + F1ST ( s9 , s0 , s17) + + str s8 , [pCRow0] + str s9 , [pCRow0, #4 ] + + add pCRow0, pCRow0, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + fsub s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + ldr s8 , [ pB ] + + ldr s0 , [ pA ] + + fmadd s16 , s0, s8, s16 + + add pA , pA, #4 + add pB , pB, #4 + +.endm + +.macro SAVE1x1 + + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0 ) + F1ST ( s8 , s0 , s16) + str s8 , [pCRow0] + + add pCRow0, pCRow0, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + add sp,sp,#-(5*16) + stp d8,d9,[sp,#(0*16)] + stp d10,d11,[sp,#(1*16)] + stp d12,d13,[sp,#(2*16)] + stp d14,d15,[sp,#(3*16)] + stp d16,d17,[sp,#(4*16)] + + mov tempALPHA, v0.d[0] + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble sgemm_kernel_L2_BEGIN + +sgemm_kernel_L4_BEGIN: + + mov pCRow0, pC // pCRow0 = C + add pC,pC,LDC, lsl #2 + + mov pA, origPA // pA = start of A array + + + +sgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble sgemm_kernel_L4_M2_BEGIN + +sgemm_kernel_L4_M4_20: + + mov pB, origPB + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L4_M4_32 + + + + KERNEL4x4_I //do one in the K + KERNEL4x4_M2 //do another in the K + + subs counterL, counterL, #2 // subtract 2, since one is always done at the tail + ble sgemm_kernel_L4_M4_22a + .align 5 + +sgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M4_22 + +sgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_32: // less than 4 to do in the K direction + + tst counterL, #1 + ble sgemm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + + +sgemm_kernel_L4_M4_40: + + INIT4x4 + + +sgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bne sgemm_kernel_L4_M4_46 + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L4_M4_END: + + subs counterI, counterI, #1 + bne sgemm_kernel_L4_M4_20 + + +sgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L4_M1_BEGIN + +sgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M2_40 + +sgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_22 + + +sgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M2_100 + +sgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_42 + +sgemm_kernel_L4_M2_100: + + SAVE2x4 + +sgemm_kernel_L4_M2_END: + + +sgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L4_END + +sgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // 
counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M1_40 + +sgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_22 + + +sgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M1_100 + +sgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_42 + +sgemm_kernel_L4_M1_100: + + SAVE1x4 + + +sgemm_kernel_L4_END: + + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt sgemm_kernel_L4_BEGIN + + + +/*********************************************************************************************/ + +sgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble sgemm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble sgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pC , pC, LDC, lsl #1 + + mov pA, origPA // pA = A + + + +sgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L2_M4_20 + + 
+sgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + + +sgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/*********************************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble sgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pCRow0 , LDC // C01 is the current line, update pC to point to next + + mov pA, origPA // pA = A 
+ + + +sgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L1_M4_20 + + +sgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M1_40 + 
+sgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + + +sgemm_kernel_L1_END: + + +sgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8,d9,[sp,#(0*16)] + ldp d10,d11,[sp,#(1*16)] + ldp d12,d13,[sp,#(2*16)] + ldp d14,d15,[sp,#(3*16)] + ldp d16,d17,[sp,#(4*16)] + add sp,sp,#(5*16) + ret + + EPILOGUE + diff --git a/kernel/generic/trmmkernel_4x4.c b/kernel/generic/trmmkernel_4x4.c new file mode 100644 index 0000000..a85828c --- /dev/null +++ b/kernel/generic/trmmkernel_4x4.c @@ -0,0 +1,875 @@ +#include "common.h" +#include + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + + for (j=0; j