arm64: arch/arm/lib: Add optimized memset/memcpy/memmove functions

author Stefan Roese <sr@denx.de>

Thu, 2 Sep 2021 15:00:17 +0000 (17:00 +0200)

committer Tom Rini <trini@konsulko.com>

Fri, 24 Sep 2021 11:48:06 +0000 (07:48 -0400)
author Stefan Roese <sr@denx.de>
Thu, 2 Sep 2021 15:00:17 +0000 (17:00 +0200)
committer Tom Rini <trini@konsulko.com>
Fri, 24 Sep 2021 11:48:06 +0000 (07:48 -0400)
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile

index 7f66332..c48e1f6 100644 (file)
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -39,8 +39,13 @@ obj-$(CONFIG_$(SPL_TPL_)FRAMEWORK) += spl.o
  obj-$(CONFIG_SPL_FRAMEWORK) += zimage.o
  obj-$(CONFIG_OF_LIBFDT) += bootm-fdt.o
  endif
+ifdef CONFIG_ARM64
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset-arm64.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy-arm64.o
+else
  obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset.o
  obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy.o
+endif
  obj-$(CONFIG_SEMIHOSTING) += semihosting.o
  
  obj-y  += bdinfo.o
diff --git a/arch/arm/lib/asmdefs.h b/arch/arm/lib/asmdefs.h

new file mode 100644 (file)

index 0000000..d307a3a
--- /dev/null
+++ b/arch/arm/lib/asmdefs.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Macros for asm code.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+#if defined(__aarch64__)
+
+/* Branch Target Identitication support.  */
+#define BTI_C          hint    34
+#define BTI_J          hint    36
+/* Return address signing support (pac-ret).  */
+#define PACIASP                hint    25; .cfi_window_save
+#define AUTIASP                hint    29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h.  */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note.  */
+#define GNU_PROPERTY(type, value)      \
+  .section .note.gnu.property, "a";    \
+  .p2align 3;                          \
+  .word 4;                             \
+  .word 16;                            \
+  .word 5;                             \
+  .asciz "GNU";                                \
+  .word type;                          \
+  .word 4;                             \
+  .word value;                         \
+  .word 0;                             \
+  .text
+
+/* If set then the GNU Property Note section will be added to
+   mark objects to support BTI and PAC-RET.  */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files.  */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment)   \
+  .global name;                \
+  .type name,%function;        \
+  .align alignment;            \
+  name:                        \
+  .cfi_startproc;      \
+  BTI_C;
+
+#else
+
+#define END_FILE
+
+#define ENTRY_ALIGN(name, alignment)   \
+  .global name;                \
+  .type name,%function;        \
+  .align alignment;            \
+  name:                        \
+  .cfi_startproc;
+
+#endif
+
+#define ENTRY(name)    ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name)      \
+  .global name;                \
+  .type name,%function;        \
+  name:
+
+#define END(name)      \
+  .cfi_endproc;                \
+  .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n)  mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n)  mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+#endif
diff --git a/arch/arm/lib/memcpy-arm64.S b/arch/arm/lib/memcpy-arm64.S

new file mode 100644 (file)

index 0000000..507054d
--- /dev/null
+++ b/arch/arm/lib/memcpy-arm64.S
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+#define dstin  x0
+#define src    x1
+#define count  x2
+#define dst    x3
+#define srcend x4
+#define dstend x5
+#define A_l    x6
+#define A_lw   w6
+#define A_h    x7
+#define B_l    x8
+#define B_lw   w8
+#define B_h    x9
+#define C_l    x10
+#define C_lw   w10
+#define C_h    x11
+#define D_l    x12
+#define D_h    x13
+#define E_l    x14
+#define E_h    x15
+#define F_l    x16
+#define F_h    x17
+#define G_l    count
+#define G_h    dst
+#define H_l    src
+#define H_h    srcend
+#define tmp1   x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (memmove)
+ENTRY (memcpy)
+       PTR_ARG (0)
+       PTR_ARG (1)
+       SIZE_ARG (2)
+       add     srcend, src, count
+       add     dstend, dstin, count
+       cmp     count, 128
+       b.hi    L(copy_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)
+
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
+       ldp     A_l, A_h, [src]
+       ldp     D_l, D_h, [srcend, -16]
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       /* Copy 8-15 bytes.  */
+L(copy16):
+       tbz     count, 3, L(copy8)
+       ldr     A_l, [src]
+       ldr     A_h, [srcend, -8]
+       str     A_l, [dstin]
+       str     A_h, [dstend, -8]
+       ret
+
+       .p2align 3
+       /* Copy 4-7 bytes.  */
+L(copy8):
+       tbz     count, 2, L(copy4)
+       ldr     A_lw, [src]
+       ldr     B_lw, [srcend, -4]
+       str     A_lw, [dstin]
+       str     B_lw, [dstend, -4]
+       ret
+
+       /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+       cbz     count, L(copy0)
+       lsr     tmp1, count, 1
+       ldrb    A_lw, [src]
+       ldrb    C_lw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    C_lw, [dstend, -1]
+L(copy0):
+       ret
+
+       .p2align 4
+       /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+       ldp     A_l, A_h, [src]
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       ldp     D_l, D_h, [srcend, -16]
+       cmp     count, 64
+       b.hi    L(copy128)
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
+       /* Copy 65..128 bytes.  */
+L(copy128):
+       ldp     E_l, E_h, [src, 32]
+       ldp     F_l, F_h, [src, 48]
+       cmp     count, 96
+       b.ls    L(copy96)
+       ldp     G_l, G_h, [srcend, -64]
+       ldp     H_l, H_h, [srcend, -48]
+       stp     G_l, G_h, [dstend, -64]
+       stp     H_l, H_h, [dstend, -48]
+L(copy96):
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     E_l, E_h, [dstin, 32]
+       stp     F_l, F_h, [dstin, 48]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
+       /* Copy more than 128 bytes.  */
+L(copy_long):
+       /* Use backwards copy if there is an overlap.  */
+       sub     tmp1, dstin, src
+       cbz     tmp1, L(copy0)
+       cmp     tmp1, count
+       b.lo    L(copy_long_backwards)
+
+       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+       ldp     D_l, D_h, [src]
+       and     tmp1, dstin, 15
+       bic     dst, dstin, 15
+       sub     src, src, tmp1
+       add     count, count, tmp1      /* Count is now 16 too large.  */
+       ldp     A_l, A_h, [src, 16]
+       stp     D_l, D_h, [dstin]
+       ldp     B_l, B_h, [src, 32]
+       ldp     C_l, C_h, [src, 48]
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 128 + 16  /* Test and readjust count.  */
+       b.ls    L(copy64_from_end)
+
+L(loop64):
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [src, 16]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [src, 32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [src, 48]
+       stp     D_l, D_h, [dst, 64]!
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 64
+       b.hi    L(loop64)
+
+       /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+       ldp     E_l, E_h, [srcend, -64]
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [srcend, -48]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [srcend, -16]
+       stp     D_l, D_h, [dst, 64]
+       stp     E_l, E_h, [dstend, -64]
+       stp     A_l, A_h, [dstend, -48]
+       stp     B_l, B_h, [dstend, -32]
+       stp     C_l, C_h, [dstend, -16]
+       ret
+
+       .p2align 4
+
+       /* Large backwards copy for overlapping copies.
+          Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+       ldp     D_l, D_h, [srcend, -16]
+       and     tmp1, dstend, 15
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       ldp     A_l, A_h, [srcend, -16]
+       stp     D_l, D_h, [dstend, -16]
+       ldp     B_l, B_h, [srcend, -32]
+       ldp     C_l, C_h, [srcend, -48]
+       ldp     D_l, D_h, [srcend, -64]!
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [srcend, -16]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [srcend, -48]
+       stp     D_l, D_h, [dstend, -64]!
+       ldp     D_l, D_h, [srcend, -64]!
+       subs    count, count, 64
+       b.hi    L(loop64_backwards)
+
+       /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+       ldp     G_l, G_h, [src, 48]
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [src, 32]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [src, 16]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [src]
+       stp     D_l, D_h, [dstend, -64]
+       stp     G_l, G_h, [dstin, 48]
+       stp     A_l, A_h, [dstin, 32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin]
+       ret
+
+END (memcpy)
diff --git a/arch/arm/lib/memset-arm64.S b/arch/arm/lib/memset-arm64.S

new file mode 100644 (file)

index 0000000..710f6f5
--- /dev/null
+++ b/arch/arm/lib/memset-arm64.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+#define dstin  x0
+#define val    x1
+#define valw   w1
+#define count  x2
+#define dst    x3
+#define dstend x4
+#define zva_val        x5
+
+ENTRY (memset)
+       PTR_ARG (0)
+       SIZE_ARG (2)
+
+       dup     v0.16B, valw
+       add     dstend, dstin, count
+
+       cmp     count, 96
+       b.hi    L(set_long)
+       cmp     count, 16
+       b.hs    L(set_medium)
+       mov     val, v0.D[0]
+
+       /* Set 0..15 bytes.  */
+       tbz     count, 3, 1f
+       str     val, [dstin]
+       str     val, [dstend, -8]
+       ret
+       .p2align 4
+1:     tbz     count, 2, 2f
+       str     valw, [dstin]
+       str     valw, [dstend, -4]
+       ret
+2:     cbz     count, 3f
+       strb    valw, [dstin]
+       tbz     count, 1, 3f
+       strh    valw, [dstend, -2]
+3:     ret
+
+       /* Set 17..96 bytes.  */
+L(set_medium):
+       str     q0, [dstin]
+       tbnz    count, 6, L(set96)
+       str     q0, [dstend, -16]
+       tbz     count, 5, 1f
+       str     q0, [dstin, 16]
+       str     q0, [dstend, -32]
+1:     ret
+
+       .p2align 4
+       /* Set 64..96 bytes.  Write 64 bytes from the start and
+          32 bytes from the end.  */
+L(set96):
+       str     q0, [dstin, 16]
+       stp     q0, q0, [dstin, 32]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+       .p2align 4
+L(set_long):
+       and     valw, valw, 255
+       bic     dst, dstin, 15
+       str     q0, [dstin]
+       cmp     count, 160
+       ccmp    valw, 0, 0, hs
+       b.ne    L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+       mrs     zva_val, dczid_el0
+       and     zva_val, zva_val, 31
+       cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
+       b.ne    L(no_zva)
+#endif
+       str     q0, [dst, 16]
+       stp     q0, q0, [dst, 32]
+       bic     dst, dst, 63
+       sub     count, dstend, dst      /* Count is now 64 too large.  */
+       sub     count, count, 128       /* Adjust count and bias for loop.  */
+
+       .p2align 4
+L(zva_loop):
+       add     dst, dst, 64
+       dc      zva, dst
+       subs    count, count, 64
+       b.hi    L(zva_loop)
+       stp     q0, q0, [dstend, -64]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+L(no_zva):
+       sub     count, dstend, dst      /* Count is 16 too large.  */
+       sub     dst, dst, 16            /* Dst is biased by -32.  */
+       sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
+L(no_zva_loop):
+       stp     q0, q0, [dst, 32]
+       stp     q0, q0, [dst, 64]!
+       subs    count, count, 64
+       b.hi    L(no_zva_loop)
+       stp     q0, q0, [dstend, -64]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+END (memset)
author	Stefan Roese <sr@denx.de>
	Thu, 2 Sep 2021 15:00:17 +0000 (17:00 +0200)
committer	Tom Rini <trini@konsulko.com>
	Fri, 24 Sep 2021 11:48:06 +0000 (07:48 -0400)
arch/arm/lib/Makefile		patch \| blob \| history
arch/arm/lib/asmdefs.h	[new file with mode: 0644]	patch \| blob
arch/arm/lib/memcpy-arm64.S	[new file with mode: 0644]	patch \| blob
arch/arm/lib/memset-arm64.S	[new file with mode: 0644]	patch \| blob