THUNDERX2T99: Add Optimized SNRM2 Implementation
author Ashwin Sekhar T K <ashwin.sekhar@cavium.com>
Thu, 19 Jan 2017 08:57:02 +0000 (00:57 -0800)
committer Ashwin Sekhar T K <ashwin.sekhar@cavium.com>
Tue, 24 Jan 2017 04:53:21 +0000 (10:23 +0530)
benchmark/nrm2.c
kernel/arm64/KERNEL.THUNDERX2T99
kernel/arm64/snrm2_thunderx2t99.S [new file with mode: 0644]

index 691f28c..d3718f9 100644 (file)
@@ -149,7 +149,7 @@ int main(int argc, char *argv[]){
   srandom(getpid());
 #endif
 
-  fprintf(stderr, "   SIZE       Time\n");
+  fprintf(stderr, "   SIZE       Flops\n");
 
   for(m = from; m <= to; m += step)
   {
@@ -180,7 +180,10 @@ int main(int argc, char *argv[]){
 
     timeg /= loops;
 
-    fprintf(stderr, " %10.6f secs\n", timeg);
+    fprintf(stderr,
+           " %10.2f MFlops %10.6f sec\n",
+           COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
+
 
   }
 
index c9d3132..b604a7e 100644 (file)
@@ -1,17 +1,18 @@
 include $(KERNELDIR)/KERNEL.CORTEXA57
 
-DAXPYKERNEL=daxpy_thunderx2t99.S
+SNRM2KERNEL    = snrm2_thunderx2t99.S
+DAXPYKERNEL    = daxpy_thunderx2t99.S
 
 ifndef SMP
-DDOTKERNEL=ddot_thunderx2t99.S
+DDOTKERNEL     = ddot_thunderx2t99.S
 else
-DDOTKERNEL=ddot_thunderx2t99.c
+DDOTKERNEL     = ddot_thunderx2t99.c
 endif
 
 ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
-DGEMMKERNEL    =  dgemm_kernel_8x4_thunderx2t99.S
+DGEMMKERNEL    = dgemm_kernel_8x4_thunderx2t99.S
 else
-DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DGEMMKERNEL    = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
 endif
 
 ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
diff --git a/kernel/arm64/snrm2_thunderx2t99.S b/kernel/arm64/snrm2_thunderx2t99.S
new file mode 100644 (file)
index 0000000..d69441d
--- /dev/null
@@ -0,0 +1,228 @@
+/*******************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define        N       x0      /* number of elements to process */
+#define        X       x1      /* address of the current element of X */
+#define        INC_X   x2      /* X stride in elements; INIT_S rescales it to bytes */
+#define I      x5      /* loop counter */
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+#define TMPF   s16     /* single-precision scratch for one loaded element */
+#define TMPFD  d17     /* double-precision copy of TMPF */
+#define SSQ    s0      /* single-precision view of the result register */
+#define SSQD   d0      /* double-precision running sum of squares */
+#define TMPVF  {v16.s}[0]      /* not referenced anywhere in this file */
+#define TMPVFD {v17.s}[0]      /* not referenced anywhere in this file */
+#define SZ     4       /* byte step between contiguous elements */
+
+/******************************************************************************/
+
+.macro INIT    /* Zero d0-d7, the registers KERNEL_F32 accumulates into */
+       fmov    SSQD, xzr       /* d0 is also the scalar sum of squares */
+       fmov    d1, xzr         /* scalar writes on AArch64 clear the upper lane too, */
+       fmov    d2, xzr         /* so the .2d accumulation starts from zero in both lanes */
+       fmov    d3, xzr
+       fmov    d4, xzr
+       fmov    d5, xzr
+       fmov    d6, xzr
+       fmov    d7, xzr
+.endm
+
+.macro KERNEL_F1       /* Add the square of one contiguous element to SSQD */
+       ldr     TMPF, [X], #SZ  /* load one float, then advance X by SZ bytes */
+       fcvt    TMPFD, TMPF     /* widen to double before squaring */
+       fmadd   SSQD, TMPFD, TMPFD, SSQD        /* SSQD += TMPFD * TMPFD */
+.endm
+
+.macro KERNEL_F32      /* Add the squares of 32 contiguous floats into v0-v7 */
+       ldur    q16, [X]        /* eight 16-byte loads = 32 floats */
+       ldur    q18, [X, #16]
+       ldur    q20, [X, #32]
+       ldur    q22, [X, #48]
+       ldur    q24, [X, #64]
+       ldur    q26, [X, #80]
+       ldur    q28, [X, #96]
+       ldur    q30, [X, #112]
+
+       add     X, X, #128      /* step past the 128 bytes just loaded */
+
+       fcvtl2  v17.2d, v16.4s  /* upper two floats -> doubles; must precede the next line, which overwrites v16 */
+       fcvtl   v16.2d, v16.2s  /* lower two floats -> doubles, in place */
+       fcvtl2  v19.2d, v18.4s  /* same upper/lower split for each remaining load */
+       fcvtl   v18.2d, v18.2s
+       fcvtl2  v21.2d, v20.4s
+       fcvtl   v20.2d, v20.2s
+       fcvtl2  v23.2d, v22.4s
+       fcvtl   v22.2d, v22.2s
+       fcvtl2  v25.2d, v24.4s
+       fcvtl   v24.2d, v24.2s
+       fcvtl2  v27.2d, v26.4s
+       fcvtl   v26.2d, v26.2s
+       fcvtl2  v29.2d, v28.4s
+       fcvtl   v28.2d, v28.2s
+       fcvtl2  v31.2d, v30.4s
+       fcvtl   v30.2d, v30.2s
+
+       fmla    v0.2d, v16.2d, v16.2d   /* first 16 elements: one fmla per accumulator v0-v7 */
+       fmla    v1.2d, v17.2d, v17.2d
+       fmla    v2.2d, v18.2d, v18.2d
+       fmla    v3.2d, v19.2d, v19.2d
+       fmla    v4.2d, v20.2d, v20.2d
+       fmla    v5.2d, v21.2d, v21.2d
+       fmla    v6.2d, v22.2d, v22.2d
+       fmla    v7.2d, v23.2d, v23.2d
+
+       fmla    v0.2d, v24.2d, v24.2d   /* second 16 elements go into the same accumulators */
+       fmla    v1.2d, v25.2d, v25.2d
+       fmla    v2.2d, v26.2d, v26.2d
+       fmla    v3.2d, v27.2d, v27.2d
+       fmla    v4.2d, v28.2d, v28.2d
+       fmla    v5.2d, v29.2d, v29.2d
+       fmla    v6.2d, v30.2d, v30.2d
+       fmla    v7.2d, v31.2d, v31.2d
+
+       prfm    PLDL1KEEP, [X, #1024]   /* prefetch 1024 bytes beyond the already-advanced X */
+       prfm    PLDL1KEEP, [X, #1024+64]        /* and a second address 64 bytes further on */
+.endm
+
+.macro KERNEL_F32_FINALIZE     /* Reduce the accumulators v0-v7 to the scalar SSQD */
+       fadd    v0.2d, v0.2d, v1.2d     /* pairwise: 8 vectors -> 4 */
+       fadd    v2.2d, v2.2d, v3.2d
+       fadd    v4.2d, v4.2d, v5.2d
+       fadd    v6.2d, v6.2d, v7.2d
+
+       fadd    v0.2d, v0.2d, v2.2d     /* 4 -> 2 */
+       fadd    v4.2d, v4.2d, v6.2d
+
+       fadd    v0.2d, v0.2d, v4.2d     /* 2 -> 1 */
+       faddp   SSQD, v0.2d             /* add the two lanes of v0 into d0 */
+.endm
+
+.macro INIT_S  /* Prepare INC_X for the strided loop */
+       lsl     INC_X, INC_X, #2        /* multiply by 4 so KERNEL_S1 can add it to X as a byte offset */
+.endm
+
+.macro KERNEL_S1       /* Add the square of one strided element to SSQD */
+       ldr     TMPF, [X]       /* load one float */
+       add     X, X, INC_X     /* advance by the stride rescaled in INIT_S */
+       fcvt    TMPFD, TMPF     /* widen to double before squaring */
+       fmadd   SSQD, TMPFD, TMPFD, SSQD        /* SSQD += TMPFD * TMPFD */
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+       PROLOGUE        /* entry macro; presumably provided via common.h, not visible here */
+
+       INIT            /* zero the accumulators d0-d7 */
+
+       cmp     N, xzr
+       ble     nrm2_kernel_zero        /* N <= 0: result is zero */
+       cmp     INC_X, xzr
+       ble     nrm2_kernel_zero        /* INC_X <= 0: result is zero */
+       cmp     INC_X, #1
+       bne     nrm2_kernel_S_BEGIN     /* stride other than 1: strided scalar path */
+
+nrm2_kernel_F_BEGIN:   /* contiguous (stride 1) path */
+
+       asr     I, N, #6        /* I = N / 64, the number of 64-element iterations */
+       cmp     I, xzr
+       beq     nrm2_kernel_S_BEGIN     /* fewer than 64 elements: reuse the strided path (INIT_S turns stride 1 into 4 bytes) */
+
+       .align 5
+nrm2_kernel_F64:       /* vector loop: 64 elements per iteration */
+
+       KERNEL_F32
+       KERNEL_F32
+
+       subs    I, I, #1
+       bne     nrm2_kernel_F64
+
+       KERNEL_F32_FINALIZE     /* fold v0-v7 into SSQD before the scalar tail */
+
+nrm2_kernel_F1:
+
+       ands    I, N, #63       /* I = N mod 64, elements left after the vector loop */
+       ble     nrm2_kernel_L999        /* masked value is never negative, so this is taken only when I == 0 */
+
+nrm2_kernel_F10:       /* scalar tail, one element per iteration */
+
+       KERNEL_F1
+
+       subs    I, I, #1
+        bne     nrm2_kernel_F10
+
+       b       nrm2_kernel_L999
+
+nrm2_kernel_S_BEGIN:   /* strided path; also entered for stride 1 when N < 64 */
+
+       INIT_S          /* convert INC_X to a byte offset */
+
+       asr     I, N, #2        /* I = N / 4, the number of 4-element iterations */
+       cmp     I, xzr
+       ble     nrm2_kernel_S1  /* fewer than 4 elements: go straight to the remainder loop */
+
+nrm2_kernel_S4:        /* four elements per iteration */
+
+       KERNEL_S1
+       KERNEL_S1
+       KERNEL_S1
+       KERNEL_S1
+
+       subs    I, I, #1
+       bne     nrm2_kernel_S4
+
+nrm2_kernel_S1:
+
+       ands    I, N, #3        /* I = N mod 4, elements left over */
+       ble     nrm2_kernel_L999        /* taken only when I == 0 */
+
+nrm2_kernel_S10:       /* remainder loop, one element per iteration */
+
+       KERNEL_S1
+
+       subs    I, I, #1
+       bne     nrm2_kernel_S10
+
+nrm2_kernel_L999:      /* common exit: SSQD holds the sum of squares */
+       fsqrt   SSQD, SSQD      /* square root taken in double precision */
+       fcvt    SSQ, SSQD       /* narrow the result to single precision in s0 */
+       ret
+
+nrm2_kernel_zero:      /* exit for N <= 0 or INC_X <= 0 */
+       fmov    SSQ, wzr        /* s0 = +0.0 */
+
+       ret
+
+       EPILOGUE        /* closing macro; presumably provided via common.h, not visible here */