arm64: Import latest memcpy()/memmove() implementation

author Robin Murphy <robin.murphy@arm.com>

Thu, 27 May 2021 15:34:46 +0000 (16:34 +0100)

committer Will Deacon <will@kernel.org>

Tue, 1 Jun 2021 17:34:38 +0000 (18:34 +0100)
author Robin Murphy <robin.murphy@arm.com>
Thu, 27 May 2021 15:34:46 +0000 (16:34 +0100)
committer Will Deacon <will@kernel.org>
Tue, 1 Jun 2021 17:34:38 +0000 (18:34 +0100)
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile

index d31e116..01c596a 100644 (file)
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,7 +1,7 @@
  # SPDX-License-Identifier: GPL-2.0
  lib-y          := clear_user.o delay.o copy_from_user.o                \
                    copy_to_user.o copy_in_user.o copy_page.o            \
-                  clear_page.o csum.o memchr.o memcpy.o memmove.o      \
+                  clear_page.o csum.o memchr.o memcpy.o                \
                    memset.o memcmp.o strcmp.o strncmp.o strlen.o        \
                    strnlen.o strchr.o strrchr.o tishift.o
  
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S

index dc8d2a2..31073a8 100644 (file)
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,66 +1,252 @@
  /* SPDX-License-Identifier: GPL-2.0-only */
  /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012-2020, Arm Limited.
   *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy.S
   */
  
  #include <linux/linkage.h>
  #include <asm/assembler.h>
-#include <asm/cache.h>
  
-/*
- * Copy a buffer from src to dest (alignment handled by the hardware)
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
   *
- * Parameters:
- *     x0 - dest
- *     x1 - src
- *     x2 - n
- * Returns:
- *     x0 - dest
   */
-       .macro ldrb1 reg, ptr, val
-       ldrb  \reg, [\ptr], \val
-       .endm
-
-       .macro strb1 reg, ptr, val
-       strb \reg, [\ptr], \val
-       .endm
  
-       .macro ldrh1 reg, ptr, val
-       ldrh  \reg, [\ptr], \val
-       .endm
+#define L(label) .L ## label
  
-       .macro strh1 reg, ptr, val
-       strh \reg, [\ptr], \val
-       .endm
+#define dstin  x0
+#define src    x1
+#define count  x2
+#define dst    x3
+#define srcend x4
+#define dstend x5
+#define A_l    x6
+#define A_lw   w6
+#define A_h    x7
+#define B_l    x8
+#define B_lw   w8
+#define B_h    x9
+#define C_l    x10
+#define C_lw   w10
+#define C_h    x11
+#define D_l    x12
+#define D_h    x13
+#define E_l    x14
+#define E_h    x15
+#define F_l    x16
+#define F_h    x17
+#define G_l    count
+#define G_h    dst
+#define H_l    src
+#define H_h    srcend
+#define tmp1   x14
  
-       .macro ldr1 reg, ptr, val
-       ldr \reg, [\ptr], \val
-       .endm
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
  
-       .macro str1 reg, ptr, val
-       str \reg, [\ptr], \val
-       .endm
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
  
-       .macro ldp1 reg1, reg2, ptr, val
-       ldp \reg1, \reg2, [\ptr], \val
-       .endm
-
-       .macro stp1 reg1, reg2, ptr, val
-       stp \reg1, \reg2, [\ptr], \val
-       .endm
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
  
+SYM_FUNC_START_ALIAS(__memmove)
+SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
  SYM_FUNC_START_ALIAS(__memcpy)
  SYM_FUNC_START_WEAK_PI(memcpy)
-#include "copy_template.S"
+       add     srcend, src, count
+       add     dstend, dstin, count
+       cmp     count, 128
+       b.hi    L(copy_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)
+
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
+       ldp     A_l, A_h, [src]
+       ldp     D_l, D_h, [srcend, -16]
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       /* Copy 8-15 bytes.  */
+L(copy16):
+       tbz     count, 3, L(copy8)
+       ldr     A_l, [src]
+       ldr     A_h, [srcend, -8]
+       str     A_l, [dstin]
+       str     A_h, [dstend, -8]
+       ret
+
+       .p2align 3
+       /* Copy 4-7 bytes.  */
+L(copy8):
+       tbz     count, 2, L(copy4)
+       ldr     A_lw, [src]
+       ldr     B_lw, [srcend, -4]
+       str     A_lw, [dstin]
+       str     B_lw, [dstend, -4]
+       ret
+
+       /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+       cbz     count, L(copy0)
+       lsr     tmp1, count, 1
+       ldrb    A_lw, [src]
+       ldrb    C_lw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    C_lw, [dstend, -1]
+L(copy0):
+       ret
+
+       .p2align 4
+       /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+       ldp     A_l, A_h, [src]
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       ldp     D_l, D_h, [srcend, -16]
+       cmp     count, 64
+       b.hi    L(copy128)
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
         ret
+
+       .p2align 4
+       /* Copy 65..128 bytes.  */
+L(copy128):
+       ldp     E_l, E_h, [src, 32]
+       ldp     F_l, F_h, [src, 48]
+       cmp     count, 96
+       b.ls    L(copy96)
+       ldp     G_l, G_h, [srcend, -64]
+       ldp     H_l, H_h, [srcend, -48]
+       stp     G_l, G_h, [dstend, -64]
+       stp     H_l, H_h, [dstend, -48]
+L(copy96):
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     E_l, E_h, [dstin, 32]
+       stp     F_l, F_h, [dstin, 48]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
+       /* Copy more than 128 bytes.  */
+L(copy_long):
+       /* Use backwards copy if there is an overlap.  */
+       sub     tmp1, dstin, src
+       cbz     tmp1, L(copy0)
+       cmp     tmp1, count
+       b.lo    L(copy_long_backwards)
+
+       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+       ldp     D_l, D_h, [src]
+       and     tmp1, dstin, 15
+       bic     dst, dstin, 15
+       sub     src, src, tmp1
+       add     count, count, tmp1      /* Count is now 16 too large.  */
+       ldp     A_l, A_h, [src, 16]
+       stp     D_l, D_h, [dstin]
+       ldp     B_l, B_h, [src, 32]
+       ldp     C_l, C_h, [src, 48]
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 128 + 16  /* Test and readjust count.  */
+       b.ls    L(copy64_from_end)
+
+L(loop64):
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [src, 16]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [src, 32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [src, 48]
+       stp     D_l, D_h, [dst, 64]!
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 64
+       b.hi    L(loop64)
+
+       /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+       ldp     E_l, E_h, [srcend, -64]
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [srcend, -48]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [srcend, -16]
+       stp     D_l, D_h, [dst, 64]
+       stp     E_l, E_h, [dstend, -64]
+       stp     A_l, A_h, [dstend, -48]
+       stp     B_l, B_h, [dstend, -32]
+       stp     C_l, C_h, [dstend, -16]
+       ret
+
+       .p2align 4
+
+       /* Large backwards copy for overlapping copies.
+          Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+       ldp     D_l, D_h, [srcend, -16]
+       and     tmp1, dstend, 15
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       ldp     A_l, A_h, [srcend, -16]
+       stp     D_l, D_h, [dstend, -16]
+       ldp     B_l, B_h, [srcend, -32]
+       ldp     C_l, C_h, [srcend, -48]
+       ldp     D_l, D_h, [srcend, -64]!
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [srcend, -16]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [srcend, -48]
+       stp     D_l, D_h, [dstend, -64]!
+       ldp     D_l, D_h, [srcend, -64]!
+       subs    count, count, 64
+       b.hi    L(loop64_backwards)
+
+       /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+       ldp     G_l, G_h, [src, 48]
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [src, 32]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [src, 16]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [src]
+       stp     D_l, D_h, [dstend, -64]
+       stp     G_l, G_h, [dstin, 48]
+       stp     A_l, A_h, [dstin, 32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin]
+       ret
+
  SYM_FUNC_END_PI(memcpy)
  EXPORT_SYMBOL(memcpy)
  SYM_FUNC_END_ALIAS(__memcpy)
  EXPORT_SYMBOL(__memcpy)
+SYM_FUNC_END_ALIAS_PI(memmove)
+EXPORT_SYMBOL(memmove)
+SYM_FUNC_END_ALIAS(__memmove)
+EXPORT_SYMBOL(__memmove)
+\ No newline at end of file
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S

deleted file mode 100644 (file)

index 1035dce..0000000
--- a/arch/arm64/lib/memmove.S
+++ /dev/null
@@ -1,189 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
- *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
-/*
- * Move a buffer from src to test (alignment handled by the hardware).
- * If dest <= src, call memcpy, otherwise copy in reverse order.
- *
- * Parameters:
- *     x0 - dest
- *     x1 - src
- *     x2 - n
- * Returns:
- *     x0 - dest
- */
-dstin  .req    x0
-src    .req    x1
-count  .req    x2
-tmp1   .req    x3
-tmp1w  .req    w3
-tmp2   .req    x4
-tmp2w  .req    w4
-tmp3   .req    x5
-tmp3w  .req    w5
-dst    .req    x6
-
-A_l    .req    x7
-A_h    .req    x8
-B_l    .req    x9
-B_h    .req    x10
-C_l    .req    x11
-C_h    .req    x12
-D_l    .req    x13
-D_h    .req    x14
-
-SYM_FUNC_START_ALIAS(__memmove)
-SYM_FUNC_START_WEAK_PI(memmove)
-       cmp     dstin, src
-       b.lo    __memcpy
-       add     tmp1, src, count
-       cmp     dstin, tmp1
-       b.hs    __memcpy                /* No overlap.  */
-
-       add     dst, dstin, count
-       add     src, src, count
-       cmp     count, #16
-       b.lo    .Ltail15  /*probably non-alignment accesses.*/
-
-       ands    tmp2, src, #15     /* Bytes to reach alignment.  */
-       b.eq    .LSrcAligned
-       sub     count, count, tmp2
-       /*
-       * process the aligned offset length to make the src aligned firstly.
-       * those extra instructions' cost is acceptable. It also make the
-       * coming accesses are based on aligned address.
-       */
-       tbz     tmp2, #0, 1f
-       ldrb    tmp1w, [src, #-1]!
-       strb    tmp1w, [dst, #-1]!
-1:
-       tbz     tmp2, #1, 2f
-       ldrh    tmp1w, [src, #-2]!
-       strh    tmp1w, [dst, #-2]!
-2:
-       tbz     tmp2, #2, 3f
-       ldr     tmp1w, [src, #-4]!
-       str     tmp1w, [dst, #-4]!
-3:
-       tbz     tmp2, #3, .LSrcAligned
-       ldr     tmp1, [src, #-8]!
-       str     tmp1, [dst, #-8]!
-
-.LSrcAligned:
-       cmp     count, #64
-       b.ge    .Lcpy_over64
-
-       /*
-       * Deal with small copies quickly by dropping straight into the
-       * exit block.
-       */
-.Ltail63:
-       /*
-       * Copy up to 48 bytes of data. At this point we only need the
-       * bottom 6 bits of count to be accurate.
-       */
-       ands    tmp1, count, #0x30
-       b.eq    .Ltail15
-       cmp     tmp1w, #0x20
-       b.eq    1f
-       b.lt    2f
-       ldp     A_l, A_h, [src, #-16]!
-       stp     A_l, A_h, [dst, #-16]!
-1:
-       ldp     A_l, A_h, [src, #-16]!
-       stp     A_l, A_h, [dst, #-16]!
-2:
-       ldp     A_l, A_h, [src, #-16]!
-       stp     A_l, A_h, [dst, #-16]!
-
-.Ltail15:
-       tbz     count, #3, 1f
-       ldr     tmp1, [src, #-8]!
-       str     tmp1, [dst, #-8]!
-1:
-       tbz     count, #2, 2f
-       ldr     tmp1w, [src, #-4]!
-       str     tmp1w, [dst, #-4]!
-2:
-       tbz     count, #1, 3f
-       ldrh    tmp1w, [src, #-2]!
-       strh    tmp1w, [dst, #-2]!
-3:
-       tbz     count, #0, .Lexitfunc
-       ldrb    tmp1w, [src, #-1]
-       strb    tmp1w, [dst, #-1]
-
-.Lexitfunc:
-       ret
-
-.Lcpy_over64:
-       subs    count, count, #128
-       b.ge    .Lcpy_body_large
-       /*
-       * Less than 128 bytes to copy, so handle 64 bytes here and then jump
-       * to the tail.
-       */
-       ldp     A_l, A_h, [src, #-16]
-       stp     A_l, A_h, [dst, #-16]
-       ldp     B_l, B_h, [src, #-32]
-       ldp     C_l, C_h, [src, #-48]
-       stp     B_l, B_h, [dst, #-32]
-       stp     C_l, C_h, [dst, #-48]
-       ldp     D_l, D_h, [src, #-64]!
-       stp     D_l, D_h, [dst, #-64]!
-
-       tst     count, #0x3f
-       b.ne    .Ltail63
-       ret
-
-       /*
-       * Critical loop. Start at a new cache line boundary. Assuming
-       * 64 bytes per line this ensures the entire loop is in one line.
-       */
-       .p2align        L1_CACHE_SHIFT
-.Lcpy_body_large:
-       /* pre-load 64 bytes data. */
-       ldp     A_l, A_h, [src, #-16]
-       ldp     B_l, B_h, [src, #-32]
-       ldp     C_l, C_h, [src, #-48]
-       ldp     D_l, D_h, [src, #-64]!
-1:
-       /*
-       * interlace the load of next 64 bytes data block with store of the last
-       * loaded 64 bytes data.
-       */
-       stp     A_l, A_h, [dst, #-16]
-       ldp     A_l, A_h, [src, #-16]
-       stp     B_l, B_h, [dst, #-32]
-       ldp     B_l, B_h, [src, #-32]
-       stp     C_l, C_h, [dst, #-48]
-       ldp     C_l, C_h, [src, #-48]
-       stp     D_l, D_h, [dst, #-64]!
-       ldp     D_l, D_h, [src, #-64]!
-       subs    count, count, #64
-       b.ge    1b
-       stp     A_l, A_h, [dst, #-16]
-       stp     B_l, B_h, [dst, #-32]
-       stp     C_l, C_h, [dst, #-48]
-       stp     D_l, D_h, [dst, #-64]!
-
-       tst     count, #0x3f
-       b.ne    .Ltail63
-       ret
-SYM_FUNC_END_PI(memmove)
-EXPORT_SYMBOL(memmove)
-SYM_FUNC_END_ALIAS(__memmove)
-EXPORT_SYMBOL(__memmove)
author	Robin Murphy <robin.murphy@arm.com>
	Thu, 27 May 2021 15:34:46 +0000 (16:34 +0100)
committer	Will Deacon <will@kernel.org>
	Tue, 1 Jun 2021 17:34:38 +0000 (18:34 +0100)
arch/arm64/lib/Makefile		patch \| blob \| history
arch/arm64/lib/memcpy.S		patch \| blob \| history
arch/arm64/lib/memmove.S	[deleted file]	patch \| blob \| history