[WIP] Use arm neon instructions to optimize tcopy operation
author zq <zuoqian3@huawei.com>
Tue, 31 Dec 2019 02:21:23 +0000 (10:21 +0800)
committer zq <zuoqian3@huawei.com>
Tue, 31 Dec 2019 02:21:23 +0000 (10:21 +0800)
kernel/arm64/KERNEL.ARMV8
kernel/arm64/KERNEL.TSV110
kernel/arm64/sgemm_tcopy_16.S [new file with mode: 0644]

index b90dd22..28eff77 100644 (file)
@@ -108,12 +108,20 @@ SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
 SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+# An M unroll factor of 16 selects the assembly copy routine sgemm_tcopy_16.S;
+# every other factor keeps the generic C implementation.
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
 SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+# An N unroll factor of 16 selects the assembly copy routine sgemm_tcopy_16.S;
+# every other factor keeps the generic C implementation.
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
index 04d6940..8c31f83 100644 (file)
@@ -110,12 +110,20 @@ SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
 SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+# An M unroll factor of 16 selects the assembly copy routine sgemm_tcopy_16.S;
+# every other factor keeps the generic C implementation.
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
 SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+# An N unroll factor of 16 selects the assembly copy routine sgemm_tcopy_16.S;
+# every other factor keeps the generic C implementation.
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S
new file mode 100644 (file)
index 0000000..12b80bd
--- /dev/null
@@ -0,0 +1,824 @@
+/***************************************************************************\r
+Copyright (c) 2019, The OpenBLAS Project\r
+All rights reserved.\r
+*****************************************************************************/\r
+\r
+#define ASSEMBLER\r
+#include "common.h"\r
+\r
+/*\r
+ * Register assignments.\r
+ *\r
+ * x0-x4 carry the five arguments in the order M, N, A, LDA, B. The main\r
+ * routine walks M rows that are LDA elements apart and copies N floats\r
+ * out of each of them.\r
+ */\r
+#define        M               x0\r
+#define        N               x1\r
+#define        A               x2\r
+#define        LDA             x3\r
+#define        B               x4\r
+\r
+/* Byte distance between two consecutive 16-column tiles in B (M * 16 * SIZE). */\r
+#define M8             x5\r
+\r
+/* Read pointers, one per source row of the current row group. */\r
+#define        A01             x6\r
+#define        A02             x7\r
+#define        A03             x8\r
+#define        A04             x9\r
+#define        A05             x10\r
+#define        A06             x11\r
+#define        A07             x12\r
+#define        A08             x13\r
+\r
+/*\r
+ * Write cursors. B00 is used by the 16-column macros; B01, B02, B03 and\r
+ * B04 are used by the 8-, 4-, 2- and 1-column macros respectively.\r
+ */\r
+#define        B01             x14\r
+#define        B02             x15\r
+#define        B03             x16\r
+#define        B04             x17\r
+#define        B00             x22\r
+\r
+\r
+/*\r
+ * Loop counters. I previously lived in x18, which AAPCS64 designates as\r
+ * the platform register (reserved on Apple and Windows targets). x21 is\r
+ * not used for anything else here and is already spilled by SAVE_REGS.\r
+ */\r
+#define I              x21\r
+#define        J               x19\r
+\r
+/* Scratch store pointer used inside the 16-column macros. */\r
+#define TEMP1          x20\r
+\r
+/* Prefetch distance in bytes ahead of each row pointer. */\r
+#define A_PREFETCH     256\r
+\r
+/**************************************************************************************\r
+* Macro definitions\r
+**************************************************************************************/\r
+/*\r
+ * SAVE_REGS: open an 11 * 16 byte stack frame and spill d8-d17 and\r
+ * x18-x28 into it. RESTORE_REGS reads the same slots back.\r
+ */\r
+.macro SAVE_REGS\r
+       sub     sp, sp, #(11 * 16)\r
+\r
+       // general-purpose registers: slots 5 to 10\r
+       stp     x18, x19, [sp, #(5 * 16)]\r
+       stp     x20, x21, [sp, #(6 * 16)]\r
+       stp     x22, x23, [sp, #(7 * 16)]\r
+       stp     x24, x25, [sp, #(8 * 16)]\r
+       stp     x26, x27, [sp, #(9 * 16)]\r
+       str     x28, [sp, #(10 * 16)]\r
+\r
+       // low halves of the SIMD registers: slots 0 to 4\r
+       stp     d8, d9, [sp, #(0 * 16)]\r
+       stp     d10, d11, [sp, #(1 * 16)]\r
+       stp     d12, d13, [sp, #(2 * 16)]\r
+       stp     d14, d15, [sp, #(3 * 16)]\r
+       stp     d16, d17, [sp, #(4 * 16)]\r
+.endm\r
+\r
+/*\r
+ * RESTORE_REGS: reload d8-d17 and x18-x28 from the frame laid out by\r
+ * SAVE_REGS, then release the 11 * 16 byte frame.\r
+ */\r
+.macro RESTORE_REGS\r
+       // general-purpose registers: slots 5 to 10\r
+       ldp     x18, x19, [sp, #(5 * 16)]\r
+       ldp     x20, x21, [sp, #(6 * 16)]\r
+       ldp     x22, x23, [sp, #(7 * 16)]\r
+       ldp     x24, x25, [sp, #(8 * 16)]\r
+       ldp     x26, x27, [sp, #(9 * 16)]\r
+       ldr     x28, [sp, #(10 * 16)]\r
+\r
+       // low halves of the SIMD registers: slots 0 to 4\r
+       ldp     d8, d9, [sp, #(0 * 16)]\r
+       ldp     d10, d11, [sp, #(1 * 16)]\r
+       ldp     d12, d13, [sp, #(2 * 16)]\r
+       ldp     d14, d15, [sp, #(3 * 16)]\r
+       ldp     d16, d17, [sp, #(4 * 16)]\r
+\r
+       add     sp, sp, #(11 * 16)\r
+.endm\r
+\r
+/*************************************************************************************************************************/\r
+\r
+/*\r
+ * COPY16x8: copy 16 floats from each of the eight rows A01-A08 into one\r
+ * contiguous 512-byte tile starting at B00.\r
+ * On exit every row pointer has advanced by 64 bytes, B00 has advanced by\r
+ * M8 and TEMP1 points just past the tile. Overwrites v0-v31.\r
+ */\r
+.macro COPY16x8\r
+       prfm    PLDL1KEEP, [A01, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A02, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A03, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A04, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A05, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A06, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A07, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A08, #A_PREFETCH]\r
+\r
+       mov     TEMP1, B00                      // TEMP1 walks through the tile\r
+\r
+       // one load/store pair per row, post-incrementing both pointers\r
+       ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64\r
+       st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [TEMP1], #64\r
+\r
+       ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64\r
+       st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64\r
+\r
+       ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [A03], #64\r
+       st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64\r
+\r
+       ld1     {v12.4s, v13.4s, v14.4s, v15.4s}, [A04], #64\r
+       st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1], #64\r
+\r
+       ld1     {v16.4s, v17.4s, v18.4s, v19.4s}, [A05], #64\r
+       st1     {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1], #64\r
+\r
+       ld1     {v20.4s, v21.4s, v22.4s, v23.4s}, [A06], #64\r
+       st1     {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1], #64\r
+\r
+       ld1     {v24.4s, v25.4s, v26.4s, v27.4s}, [A07], #64\r
+       st1     {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1], #64\r
+\r
+       ld1     {v28.4s, v29.4s, v30.4s, v31.4s}, [A08], #64\r
+       st1     {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1], #64\r
+\r
+       add     B00, B00, M8                    // step to the next 16-column tile\r
+\r
+.endm\r
+\r
+/*\r
+ * COPY8x8: copy 8 floats from each of the eight rows A01-A08 to B01,\r
+ * row after row (256 bytes in total).\r
+ * Row pointers advance by 32 bytes, B01 by 256. Overwrites v0-v15.\r
+ */\r
+.macro COPY8x8\r
+       prfm    PLDL1KEEP, [A01, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A02, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A03, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A04, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A05, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A06, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A07, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A08, #A_PREFETCH]\r
+\r
+       // rows 1 and 2\r
+       ldp     q0, q1, [A01], #32\r
+       ldp     q2, q3, [A02], #32\r
+       st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64\r
+\r
+       // rows 3 and 4\r
+       ldp     q4, q5, [A03], #32\r
+       ldp     q6, q7, [A04], #32\r
+       st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64\r
+\r
+       // rows 5 and 6\r
+       ldp     q8, q9, [A05], #32\r
+       ldp     q10, q11, [A06], #32\r
+       st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [B01], #64\r
+\r
+       // rows 7 and 8\r
+       ldp     q12, q13, [A07], #32\r
+       ldp     q14, q15, [A08], #32\r
+       st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [B01], #64\r
+.endm\r
+\r
+/*\r
+ * COPY4x8: copy 4 floats from each of the eight rows A01-A08 to B02,\r
+ * row after row (128 bytes in total). No prefetch is issued.\r
+ * Row pointers advance by 16 bytes, B02 by 128. Overwrites v0-v7.\r
+ */\r
+.macro COPY4x8\r
+       // rows 1 to 4\r
+       ldr     q0, [A01], #16\r
+       ldr     q1, [A02], #16\r
+       ldr     q2, [A03], #16\r
+       ldr     q3, [A04], #16\r
+       st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B02], #64\r
+\r
+       // rows 5 to 8\r
+       ldr     q4, [A05], #16\r
+       ldr     q5, [A06], #16\r
+       ldr     q6, [A07], #16\r
+       ldr     q7, [A08], #16\r
+       st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [B02], #64\r
+.endm\r
+\r
+/*\r
+ * COPY2x8: copy 2 floats from each of the eight rows A01-A08 to B03,\r
+ * row after row (64 bytes in total). No prefetch is issued.\r
+ * Row pointers advance by 8 bytes, B03 by 64. Overwrites v0-v7.\r
+ */\r
+.macro COPY2x8\r
+       // rows 1 to 4\r
+       ldr     d0, [A01], #8\r
+       ldr     d1, [A02], #8\r
+       ldr     d2, [A03], #8\r
+       ldr     d3, [A04], #8\r
+       stp     d0, d1, [B03], #16\r
+       stp     d2, d3, [B03], #16\r
+\r
+       // rows 5 to 8\r
+       ldr     d4, [A05], #8\r
+       ldr     d5, [A06], #8\r
+       ldr     d6, [A07], #8\r
+       ldr     d7, [A08], #8\r
+       stp     d4, d5, [B03], #16\r
+       stp     d6, d7, [B03], #16\r
+\r
+.endm\r
+\r
+/*\r
+ * COPY1x8: copy 1 float from each of the eight rows A01-A08 to B04,\r
+ * row after row (32 bytes in total). No prefetch is issued.\r
+ * Row pointers advance by 4 bytes, B04 by 32. Overwrites v0-v7.\r
+ *\r
+ * Rows 5 to 8 used to be reloaded with 8-byte post-indexed loads after\r
+ * the 4-byte loads. That read 4 bytes past the element being copied and\r
+ * advanced those pointers by 8 instead of 4. Only the 4-byte loads\r
+ * remain, followed by plain pointer increments as for rows 1 to 4.\r
+ */\r
+.macro COPY1x8\r
+       ldr     s0, [A01]\r
+       ldr     s1, [A02]\r
+       ldr     s2, [A03]\r
+       ldr     s3, [A04]\r
+\r
+       add     A01, A01, #4\r
+       add     A02, A02, #4\r
+       add     A03, A03, #4\r
+       add     A04, A04, #4\r
+\r
+       stp     s0, s1, [B04]\r
+       add     B04, B04, #8\r
+       stp     s2, s3, [B04]\r
+       add     B04, B04, #8\r
+\r
+       ldr     s4, [A05]\r
+       ldr     s5, [A06]\r
+       ldr     s6, [A07]\r
+       ldr     s7, [A08]\r
+\r
+       add     A05, A05, #4\r
+       add     A06, A06, #4\r
+       add     A07, A07, #4\r
+       add     A08, A08, #4\r
+\r
+       stp     s4, s5, [B04]\r
+       add     B04, B04, #8\r
+       stp     s6, s7, [B04]\r
+       add     B04, B04, #8\r
+\r
+.endm\r
+\r
+/*************************************************************************************************************************/\r
+/*\r
+ * COPY16x4: copy 16 floats from each of the four rows A01-A04 into one\r
+ * contiguous 256-byte tile starting at B00.\r
+ * Row pointers advance by 64 bytes and B00 by M8. TEMP1 is left pointing\r
+ * at the last 64-byte row of the tile. Overwrites v0-v15.\r
+ */\r
+.macro COPY16x4\r
+       prfm    PLDL1KEEP, [A01, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A02, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A03, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A04, #A_PREFETCH]\r
+\r
+       mov     TEMP1, B00                      // TEMP1 walks through the tile\r
+\r
+       ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64\r
+       st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [TEMP1], #64\r
+\r
+       ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64\r
+       st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64\r
+\r
+       ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [A03], #64\r
+       st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64\r
+\r
+       ld1     {v12.4s, v13.4s, v14.4s, v15.4s}, [A04], #64\r
+       st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1]       // last row: no increment\r
+\r
+       add     B00, B00, M8                    // step to the next 16-column tile\r
+.endm\r
+\r
+/*\r
+ * COPY8x4: copy 8 floats from each of the four rows A01-A04 to B01,\r
+ * row after row (128 bytes in total).\r
+ * Row pointers advance by 32 bytes, B01 by 128. Overwrites v0-v7.\r
+ */\r
+.macro COPY8x4\r
+       prfm    PLDL1KEEP, [A01, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A02, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A03, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A04, #A_PREFETCH]\r
+\r
+       // rows 1 and 2\r
+       ldp     q0, q1, [A01], #32\r
+       ldp     q2, q3, [A02], #32\r
+       st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64\r
+\r
+       // rows 3 and 4\r
+       ldp     q4, q5, [A03], #32\r
+       ldp     q6, q7, [A04], #32\r
+       st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64\r
+.endm\r
+\r
+/*\r
+ * COPY4x4: copy 4 floats from each of the four rows A01-A04 to B02,\r
+ * row after row (64 bytes in total). No prefetch is issued.\r
+ * Row pointers advance by 16 bytes, B02 by 64. Overwrites v0-v3.\r
+ */\r
+.macro COPY4x4\r
+       ldr     q0, [A01], #16\r
+       ldr     q1, [A02], #16\r
+       ldr     q2, [A03], #16\r
+       ldr     q3, [A04], #16\r
+\r
+       st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B02], #64\r
+.endm\r
+\r
+/*\r
+ * COPY2x4: copy 2 floats from each of the four rows A01-A04 to B03,\r
+ * row after row (32 bytes in total). No prefetch is issued.\r
+ * Row pointers advance by 8 bytes, B03 by 32. Overwrites v0-v3.\r
+ */\r
+.macro COPY2x4\r
+       ldr     d0, [A01], #8\r
+       ldr     d1, [A02], #8\r
+       ldr     d2, [A03], #8\r
+       ldr     d3, [A04], #8\r
+\r
+       stp     d0, d1, [B03], #16\r
+       stp     d2, d3, [B03], #16\r
+.endm\r
+\r
+/*\r
+ * COPY1x4: copy 1 float from each of the four rows A01-A04 to B04,\r
+ * row after row (16 bytes in total). No prefetch is issued.\r
+ * Row pointers advance by 4 bytes, B04 by 16. Overwrites v0-v3.\r
+ */\r
+.macro COPY1x4\r
+       ldr     s0, [A01], #4\r
+       ldr     s1, [A02], #4\r
+       ldr     s2, [A03], #4\r
+       ldr     s3, [A04], #4\r
+\r
+       stp     s0, s1, [B04], #8\r
+       stp     s2, s3, [B04], #8\r
+\r
+.endm\r
+\r
+/*************************************************************************************************************************/\r
+\r
+/*\r
+ * COPY16x2: copy 16 floats from each of the two rows A01 and A02 into one\r
+ * contiguous 128-byte tile starting at B00. Both rows are loaded before\r
+ * anything is stored.\r
+ * Row pointers advance by 64 bytes and B00 by M8. TEMP1 is left pointing\r
+ * at the second row of the tile. Overwrites v0-v7.\r
+ */\r
+.macro COPY16x2\r
+       prfm    PLDL1KEEP, [A01, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A02, #A_PREFETCH]\r
+\r
+       ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64\r
+       ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64\r
+\r
+       mov     TEMP1, B00                      // TEMP1 walks through the tile\r
+       st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [TEMP1], #64\r
+       st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]\r
+\r
+       add     B00, B00, M8                    // step to the next 16-column tile\r
+.endm\r
+\r
+/*\r
+ * COPY8x2: copy 8 floats from each of the two rows A01 and A02 to B01,\r
+ * row after row (64 bytes in total).\r
+ * Row pointers advance by 32 bytes, B01 by 64. Overwrites v0-v3.\r
+ */\r
+.macro COPY8x2\r
+       prfm    PLDL1KEEP, [A01, #A_PREFETCH]\r
+       prfm    PLDL1KEEP, [A02, #A_PREFETCH]\r
+\r
+       ld1     {v0.4s, v1.4s}, [A01], #32\r
+       ld1     {v2.4s, v3.4s}, [A02], #32\r
+\r
+       st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64\r
+.endm\r
+\r
+/*\r
+ * COPY4x2: copy 4 floats from each of the two rows A01 and A02 to B02,\r
+ * row after row (32 bytes in total). No prefetch is issued.\r
+ * Row pointers advance by 16 bytes, B02 by 32. Overwrites v0 and v1.\r
+ */\r
+.macro COPY4x2\r
+       ldr     q0, [A01], #16\r
+       ldr     q1, [A02], #16\r
+\r
+       stp     q0, q1, [B02], #32\r
+.endm\r
+\r
+/*\r
+ * COPY2x2: copy 2 floats from each of the two rows A01 and A02 to B03,\r
+ * row after row (16 bytes in total). No prefetch is issued.\r
+ * Row pointers advance by 8 bytes, B03 by 16. Overwrites v0 and v1.\r
+ */\r
+.macro COPY2x2\r
+       ldr     d0, [A01], #8\r
+       ldr     d1, [A02], #8\r
+\r
+       stp     d0, d1, [B03], #16\r
+.endm\r
+\r
+/*\r
+ * COPY1x2: copy 1 float from each of the two rows A01 and A02 to B04\r
+ * (8 bytes in total). No prefetch is issued.\r
+ * Row pointers advance by 4 bytes, B04 by 8. Overwrites v0 and v1.\r
+ */\r
+.macro COPY1x2\r
+       ldr     s0, [A01], #4\r
+       ldr     s1, [A02], #4\r
+\r
+       stp     s0, s1, [B04], #8\r
+.endm\r
+\r
+/*************************************************************************************************************************/\r
+\r
+/*\r
+ * COPY16x1: copy 16 floats from the single row A01 to B00.\r
+ * A01 advances by 64 bytes and B00 by M8. Overwrites v0-v3.\r
+ */\r
+.macro COPY16x1\r
+       prfm    PLDL1KEEP, [A01, #A_PREFETCH]\r
+\r
+       ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64\r
+       st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B00], M8        // post-increment by the tile stride\r
+.endm\r
+\r
+/*\r
+ * COPY8x1: copy 8 floats from the single row A01 to B01.\r
+ * A01 and B01 both advance by 32 bytes. Overwrites v0 and v1.\r
+ */\r
+.macro COPY8x1\r
+       prfm    PLDL1KEEP, [A01, #A_PREFETCH]\r
+\r
+       ldp     q0, q1, [A01], #32\r
+       stp     q0, q1, [B01], #32\r
+.endm\r
+\r
+/*\r
+ * COPY4x1: copy 4 floats from the single row A01 to B02.\r
+ * A01 and B02 both advance by 16 bytes. No prefetch is issued.\r
+ * Overwrites v0.\r
+ */\r
+.macro COPY4x1\r
+       ldr     q0, [A01], #16\r
+       str     q0, [B02], #16\r
+.endm\r
+\r
+/*\r
+ * COPY2x1: copy 2 floats from the single row A01 to B03.\r
+ * A01 and B03 both advance by 8 bytes. No prefetch is issued.\r
+ * Overwrites v0.\r
+ */\r
+.macro COPY2x1\r
+       ldr     d0, [A01], #8\r
+       str     d0, [B03], #8\r
+.endm\r
+\r
+/*\r
+ * COPY1x1: copy 1 float from the single row A01 to B04.\r
+ * A01 and B04 both advance by 4 bytes. No prefetch is issued.\r
+ * Overwrites v0.\r
+ */\r
+.macro COPY1x1\r
+       ldr     s0, [A01], #4\r
+       str     s0, [B04], #4\r
+.endm\r
+\r
+/**************************************************************************************\r
+* End of macro definitions\r
+**************************************************************************************/\r
+\r
+/*\r
+ * Entry point. Arguments: M (x0), N (x1), A (x2), LDA (x3), B (x4).\r
+ *\r
+ * Rows of A are LDA floats apart. They are consumed in groups of 8, then\r
+ * 4, 2 and 1. Within a group every full run of 16 columns is written as a\r
+ * tile at B00 (consecutive tiles are M8 bytes apart); the 8-, 4-, 2- and\r
+ * 1-column remainders are appended at B01, B02, B03 and B04.\r
+ * Always returns 0 in x0.\r
+ *\r
+ * Single-bit and mask tests use tst followed by beq. The earlier ble only\r
+ * worked because tst clears V and none of the masks can set the sign bit.\r
+ */\r
+       PROLOGUE\r
+\r
+       .align 5\r
+\r
+       SAVE_REGS\r
+\r
+       lsl     LDA, LDA, #2                                    // LDA = LDA * SIZE\r
+\r
+       lsl     TEMP1, M, #2                                    // TEMP1 = M * SIZE\r
+\r
+       // Column counts already covered by the wider copies\r
+       and     B01 , N , #-16\r
+       and     B02 , N , #-8\r
+       and     B03 , N , #-4\r
+       and     B04 , N , #-2\r
+\r
+       // Convert those column counts into byte offsets into B\r
+       mul     B01, B01, TEMP1\r
+       mul     B02, B02, TEMP1\r
+       mul     B03, B03, TEMP1\r
+       mul     B04, B04, TEMP1\r
+\r
+       // Start addresses of the 8-, 4-, 2- and 1-column remainder areas\r
+       add     B01 , B01, B\r
+       add     B02 , B02, B\r
+       add     B03 , B03, B\r
+       add     B04 , B04, B\r
+\r
+       lsl     M8, M, #6                                       // M8 = M * 16 * SIZE\r
+\r
+/* Groups of 8 rows */\r
+.Lsgemm_tcopy_L8_BEGIN:\r
+       asr     J, M, #3                                        // J = M / 8\r
+       cmp     J, #0\r
+       ble     .Lsgemm_tcopy_L4_BEGIN\r
+\r
+       .align  5\r
+.Lsgemm_tcopy_L8_M16_BEGIN:\r
+\r
+       // Eight row pointers, LDA bytes apart; A moves past the group\r
+       mov     A01, A\r
+       add     A02, A01, LDA\r
+       add     A03, A02, LDA\r
+       add     A04, A03, LDA\r
+       add     A05, A04, LDA\r
+       add     A06, A05, LDA\r
+       add     A07, A06, LDA\r
+       add     A08, A07, LDA\r
+       add     A, A08, LDA\r
+\r
+       mov     B00, B\r
+       add     B, B00, #512                                    // B = B + 8 * 16 * SIZE\r
+\r
+       asr     I, N, #4                                        // I = N / 16\r
+       cmp     I, #0\r
+       ble     .Lsgemm_tcopy_L8_M16_40\r
+\r
+       .align  5\r
+.Lsgemm_tcopy_L8_M16_20:\r
+\r
+       COPY16x8\r
+\r
+       subs    I , I , #1\r
+       bne     .Lsgemm_tcopy_L8_M16_20\r
+\r
+.Lsgemm_tcopy_L8_M16_40:\r
+       tst     N , #8\r
+       beq     .Lsgemm_tcopy_L8_M16_60\r
+\r
+       COPY8x8\r
+\r
+.Lsgemm_tcopy_L8_M16_60:\r
+       tst     N , #4\r
+       beq     .Lsgemm_tcopy_L8_M16_80\r
+\r
+       COPY4x8\r
+\r
+.Lsgemm_tcopy_L8_M16_80:\r
+\r
+       tst     N , #2\r
+       beq     .Lsgemm_tcopy_L8_M16_100\r
+\r
+       COPY2x8\r
+\r
+.Lsgemm_tcopy_L8_M16_100:\r
+\r
+       tst     N, #1\r
+       beq     .Lsgemm_tcopy_L8_M16_END\r
+\r
+       COPY1x8\r
+\r
+.Lsgemm_tcopy_L8_M16_END:\r
+\r
+       subs    J , J, #1                                               // j--\r
+       bne     .Lsgemm_tcopy_L8_M16_BEGIN\r
+\r
+/*********************************************************************************************/\r
+\r
+/* Remaining group of 4 rows */\r
+.Lsgemm_tcopy_L4_BEGIN:\r
+       tst     M, #7                                           // no rows left over\r
+       beq     .Lsgemm_tcopy_L999\r
+\r
+       tst     M, #4\r
+       beq     .Lsgemm_tcopy_L2_BEGIN\r
+\r
+.Lsgemm_tcopy_L4_M16_BEGIN:\r
+\r
+       mov     A01, A\r
+       add     A02, A01, LDA\r
+       add     A03, A02, LDA\r
+       add     A04, A03, LDA\r
+       add     A, A04, LDA\r
+\r
+       mov     B00, B\r
+       add     B, B00, #256                                    // B = B + 4 * 16 * SIZE\r
+\r
+       asr     I, N, #4                                        // I = N / 16\r
+       cmp     I, #0\r
+       ble     .Lsgemm_tcopy_L4_M16_40\r
+\r
+       .align  5\r
+.Lsgemm_tcopy_L4_M16_20:\r
+\r
+       COPY16x4\r
+\r
+       subs    I , I , #1\r
+       bne     .Lsgemm_tcopy_L4_M16_20\r
+\r
+.Lsgemm_tcopy_L4_M16_40:\r
+       tst     N , #8\r
+       beq     .Lsgemm_tcopy_L4_M16_60\r
+\r
+       COPY8x4\r
+\r
+.Lsgemm_tcopy_L4_M16_60:\r
+       tst     N , #4\r
+       beq     .Lsgemm_tcopy_L4_M16_80\r
+\r
+       COPY4x4\r
+\r
+.Lsgemm_tcopy_L4_M16_80:\r
+\r
+       tst     N , #2\r
+       beq     .Lsgemm_tcopy_L4_M16_100\r
+\r
+       COPY2x4\r
+\r
+\r
+.Lsgemm_tcopy_L4_M16_100:\r
+\r
+       tst     N, #1\r
+       beq     .Lsgemm_tcopy_L4_M16_END\r
+\r
+       COPY1x4\r
+\r
+\r
+.Lsgemm_tcopy_L4_M16_END:\r
+\r
+/*********************************************************************************************/\r
+\r
+/* Remaining group of 2 rows */\r
+.Lsgemm_tcopy_L2_BEGIN:\r
+\r
+       tst     M, #3                                           // no rows left over\r
+       beq     .Lsgemm_tcopy_L999\r
+\r
+       tst     M, #2\r
+       beq     .Lsgemm_tcopy_L1_BEGIN\r
+\r
+.Lsgemm_tcopy_L2_M16_BEGIN:\r
+       mov     A01, A\r
+       add     A02, A01, LDA\r
+       add     A, A02, LDA\r
+\r
+       mov     B00, B\r
+       add     B, B00, #128                                    // B = B + 2 * 16 * SIZE\r
+\r
+       asr     I, N, #4                                        // I = N / 16\r
+       cmp     I, #0\r
+       ble     .Lsgemm_tcopy_L2_M16_40\r
+\r
+       .align  5\r
+.Lsgemm_tcopy_L2_M16_20:\r
+\r
+       COPY16x2\r
+\r
+       subs    I , I , #1\r
+       bne     .Lsgemm_tcopy_L2_M16_20\r
+\r
+.Lsgemm_tcopy_L2_M16_40:\r
+       tst     N , #8\r
+       beq     .Lsgemm_tcopy_L2_M16_60\r
+\r
+       COPY8x2\r
+\r
+.Lsgemm_tcopy_L2_M16_60:\r
+       tst     N , #4\r
+       beq     .Lsgemm_tcopy_L2_M16_80\r
+\r
+       COPY4x2\r
+\r
+.Lsgemm_tcopy_L2_M16_80:\r
+\r
+       tst     N , #2\r
+       beq     .Lsgemm_tcopy_L2_M16_100\r
+\r
+       COPY2x2\r
+\r
+.Lsgemm_tcopy_L2_M16_100:\r
+\r
+       tst     N , #1\r
+       beq     .Lsgemm_tcopy_L2_M16_END\r
+\r
+       COPY1x2\r
+\r
+.Lsgemm_tcopy_L2_M16_END:\r
+\r
+/*********************************************************************************************/\r
+\r
+/* Last single row */\r
+.Lsgemm_tcopy_L1_BEGIN:\r
+\r
+       tst     M, #1\r
+       beq     .Lsgemm_tcopy_L999\r
+\r
+\r
+.Lsgemm_tcopy_L1_M16_BEGIN:\r
+\r
+       mov     A01, A                                          // A01 = A\r
+       mov     B00, B\r
+\r
+       asr     I, N, #4                                        // I = N / 16\r
+       cmp     I, #0\r
+       ble     .Lsgemm_tcopy_L1_M16_40\r
+\r
+       .align  5\r
+.Lsgemm_tcopy_L1_M16_20:\r
+\r
+       COPY16x1\r
+\r
+       subs    I , I , #1\r
+       bne     .Lsgemm_tcopy_L1_M16_20\r
+\r
+.Lsgemm_tcopy_L1_M16_40:\r
+       tst     N , #8\r
+       beq     .Lsgemm_tcopy_L1_M16_60\r
+\r
+       COPY8x1\r
+\r
+.Lsgemm_tcopy_L1_M16_60:\r
+       tst     N , #4\r
+       beq     .Lsgemm_tcopy_L1_M16_80\r
+\r
+       COPY4x1\r
+\r
+.Lsgemm_tcopy_L1_M16_80:\r
+\r
+       tst     N , #2\r
+       beq     .Lsgemm_tcopy_L1_M16_100\r
+\r
+       COPY2x1\r
+\r
+.Lsgemm_tcopy_L1_M16_100:\r
+\r
+       tst     N , #1\r
+       beq     .Lsgemm_tcopy_L1_M16_END\r
+\r
+       COPY1x1\r
+\r
+\r
+.Lsgemm_tcopy_L1_M16_END:\r
+\r
+.Lsgemm_tcopy_L999:\r
+       mov     x0, #0                                          // set return value\r
+       RESTORE_REGS\r
+       ret\r
+\r
+       EPILOGUE\r
+\r
+\r