aarch64: Use memcpy_simd as the default memcpy

author Wilco Dijkstra <wdijkstr@arm.com>
Wed, 26 Oct 2022 13:16:50 +0000 (14:16 +0100)
committer Wilco Dijkstra <wdijkstr@arm.com>
Wed, 26 Oct 2022 13:16:50 +0000 (14:16 +0100)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S

index 98d4e2c0e202eca13e1fd19ad8046cf61ad280ff..7b396b202fabf01b6ff2adc71a1038148e0b1054 100644 (file)
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2022 Free Software Foundation, Inc.
+/* Generic optimized memcpy using SIMD.
+   Copyright (C) 2012-2022 Free Software Foundation, Inc.
  
     This file is part of the GNU C Library.
  
@@ -20,7 +21,7 @@
  
  /* Assumptions:
   *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
   *
   */
  
@@ -36,21 +37,18 @@
  #define B_l    x8
  #define B_lw   w8
  #define B_h    x9
-#define C_l    x10
  #define C_lw   w10
-#define C_h    x11
-#define D_l    x12
-#define D_h    x13
-#define E_l    x14
-#define E_h    x15
-#define F_l    x16
-#define F_h    x17
-#define G_l    count
-#define G_h    dst
-#define H_l    src
-#define H_h    srcend
  #define tmp1   x14
  
+#define A_q    q0
+#define B_q    q1
+#define C_q    q2
+#define D_q    q3
+#define E_q    q4
+#define F_q    q5
+#define G_q    q6
+#define H_q    q7
+
  #ifndef MEMMOVE
  # define MEMMOVE memmove
  #endif
@@ -69,10 +67,9 @@
     Large copies use a software pipelined loop processing 64 bytes per
-   iteration.  The destination pointer is 16-byte aligned to minimize
+   iteration.  The source pointer is 16-byte aligned to minimize
     unaligned accesses.  The loop tail is handled by always copying 64 bytes
-   from the end.
-*/
+   from the end.  */
  
-ENTRY_ALIGN (MEMCPY, 6)
+ENTRY (MEMCPY)
         PTR_ARG (0)
         PTR_ARG (1)
         SIZE_ARG (2)
@@ -87,10 +84,10 @@ ENTRY_ALIGN (MEMCPY, 6)
         /* Small copies: 0..32 bytes.  */
         cmp     count, 16
         b.lo    L(copy16)
-       ldp     A_l, A_h, [src]
-       ldp     D_l, D_h, [srcend, -16]
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
+       ldr     A_q, [src]
+       ldr     B_q, [srcend, -16]
+       str     A_q, [dstin]
+       str     B_q, [dstend, -16]
         ret
  
         /* Copy 8-15 bytes.  */
@@ -102,7 +99,6 @@ L(copy16):
         str     A_h, [dstend, -8]
         ret
  
-       .p2align 3
         /* Copy 4-7 bytes.  */
  L(copy8):
         tbz     count, 2, L(copy4)
@@ -128,87 +124,69 @@ L(copy0):
         .p2align 4
         /* Medium copies: 33..128 bytes.  */
  L(copy32_128):
-       ldp     A_l, A_h, [src]
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       ldp     D_l, D_h, [srcend, -16]
+       ldp     A_q, B_q, [src]
+       ldp     C_q, D_q, [srcend, -32]
         cmp     count, 64
         b.hi    L(copy128)
-       stp     A_l, A_h, [dstin]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-       stp     D_l, D_h, [dstend, -16]
+       stp     A_q, B_q, [dstin]
+       stp     C_q, D_q, [dstend, -32]
         ret
  
         .p2align 4
         /* Copy 65..128 bytes.  */
  L(copy128):
-       ldp     E_l, E_h, [src, 32]
-       ldp     F_l, F_h, [src, 48]
+       ldp     E_q, F_q, [src, 32]
         cmp     count, 96
         b.ls    L(copy96)
-       ldp     G_l, G_h, [srcend, -64]
-       ldp     H_l, H_h, [srcend, -48]
-       stp     G_l, G_h, [dstend, -64]
-       stp     H_l, H_h, [dstend, -48]
+       ldp     G_q, H_q, [srcend, -64]
+       stp     G_q, H_q, [dstend, -64]
  L(copy96):
-       stp     A_l, A_h, [dstin]
-       stp     B_l, B_h, [dstin, 16]
-       stp     E_l, E_h, [dstin, 32]
-       stp     F_l, F_h, [dstin, 48]
-       stp     C_l, C_h, [dstend, -32]
-       stp     D_l, D_h, [dstend, -16]
+       stp     A_q, B_q, [dstin]
+       stp     E_q, F_q, [dstin, 32]
+       stp     C_q, D_q, [dstend, -32]
         ret
  
-       .p2align 4
+       /* Align loop64 below to 16 bytes.  */
+       nop
+
         /* Copy more than 128 bytes.  */
  L(copy_long):
-       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
-       ldp     D_l, D_h, [src]
-       and     tmp1, dstin, 15
-       bic     dst, dstin, 15
-       sub     src, src, tmp1
+       /* Copy 16 bytes and then align src to 16-byte alignment.  */
+       ldr     D_q, [src]
+       and     tmp1, src, 15
+       bic     src, src, 15
+       sub     dst, dstin, tmp1
         add     count, count, tmp1      /* Count is now 16 too large.  */
-       ldp     A_l, A_h, [src, 16]
-       stp     D_l, D_h, [dstin]
-       ldp     B_l, B_h, [src, 32]
-       ldp     C_l, C_h, [src, 48]
-       ldp     D_l, D_h, [src, 64]!
+       ldp     A_q, B_q, [src, 16]
+       str     D_q, [dstin]
+       ldp     C_q, D_q, [src, 48]
         subs    count, count, 128 + 16  /* Test and readjust count.  */
         b.ls    L(copy64_from_end)
-
  L(loop64):
-       stp     A_l, A_h, [dst, 16]
-       ldp     A_l, A_h, [src, 16]
-       stp     B_l, B_h, [dst, 32]
-       ldp     B_l, B_h, [src, 32]
-       stp     C_l, C_h, [dst, 48]
-       ldp     C_l, C_h, [src, 48]
-       stp     D_l, D_h, [dst, 64]!
-       ldp     D_l, D_h, [src, 64]!
+       stp     A_q, B_q, [dst, 16]
+       ldp     A_q, B_q, [src, 80]
+       stp     C_q, D_q, [dst, 48]
+       ldp     C_q, D_q, [src, 112]
+       add     src, src, 64
+       add     dst, dst, 64
         subs    count, count, 64
         b.hi    L(loop64)
  
         /* Write the last iteration and copy 64 bytes from the end.  */
  L(copy64_from_end):
-       ldp     E_l, E_h, [srcend, -64]
-       stp     A_l, A_h, [dst, 16]
-       ldp     A_l, A_h, [srcend, -48]
-       stp     B_l, B_h, [dst, 32]
-       ldp     B_l, B_h, [srcend, -32]
-       stp     C_l, C_h, [dst, 48]
-       ldp     C_l, C_h, [srcend, -16]
-       stp     D_l, D_h, [dst, 64]
-       stp     E_l, E_h, [dstend, -64]
-       stp     A_l, A_h, [dstend, -48]
-       stp     B_l, B_h, [dstend, -32]
-       stp     C_l, C_h, [dstend, -16]
+       ldp     E_q, F_q, [srcend, -64]
+       stp     A_q, B_q, [dst, 16]
+       ldp     A_q, B_q, [srcend, -32]
+       stp     C_q, D_q, [dst, 48]
+       stp     E_q, F_q, [dstend, -64]
+       stp     A_q, B_q, [dstend, -32]
         ret
  
  END (MEMCPY)
  libc_hidden_builtin_def (MEMCPY)
  
-ENTRY_ALIGN (MEMMOVE, 4)
+
+ENTRY (MEMMOVE)
         PTR_ARG (0)
         PTR_ARG (1)
         SIZE_ARG (2)
@@ -220,64 +198,56 @@ ENTRY_ALIGN (MEMMOVE, 4)
         cmp     count, 32
         b.hi    L(copy32_128)
  
-       /* Small copies: 0..32 bytes.  */
+       /* Small moves: 0..32 bytes.  */
         cmp     count, 16
         b.lo    L(copy16)
-       ldp     A_l, A_h, [src]
-       ldp     D_l, D_h, [srcend, -16]
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
+       ldr     A_q, [src]
+       ldr     B_q, [srcend, -16]
+       str     A_q, [dstin]
+       str     B_q, [dstend, -16]
         ret
  
-       .p2align 4
  L(move_long):
         /* Only use backward copy if there is an overlap.  */
         sub     tmp1, dstin, src
-       cbz     tmp1, L(copy0)
+       cbz     tmp1, L(move0)
         cmp     tmp1, count
         b.hs    L(copy_long)
  
         /* Large backwards copy for overlapping copies.
-          Copy 16 bytes and then align dst to 16-byte alignment.  */
-       ldp     D_l, D_h, [srcend, -16]
-       and     tmp1, dstend, 15
-       sub     srcend, srcend, tmp1
+          Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+       ldr     D_q, [srcend, -16]
+       and     tmp1, srcend, 15
+       bic     srcend, srcend, 15
         sub     count, count, tmp1
-       ldp     A_l, A_h, [srcend, -16]
-       stp     D_l, D_h, [dstend, -16]
-       ldp     B_l, B_h, [srcend, -32]
-       ldp     C_l, C_h, [srcend, -48]
-       ldp     D_l, D_h, [srcend, -64]!
+       ldp     A_q, B_q, [srcend, -32]
+       str     D_q, [dstend, -16]
+       ldp     C_q, D_q, [srcend, -64]
         sub     dstend, dstend, tmp1
         subs    count, count, 128
         b.ls    L(copy64_from_start)
  
  L(loop64_backwards):
-       stp     A_l, A_h, [dstend, -16]
-       ldp     A_l, A_h, [srcend, -16]
-       stp     B_l, B_h, [dstend, -32]
-       ldp     B_l, B_h, [srcend, -32]
-       stp     C_l, C_h, [dstend, -48]
-       ldp     C_l, C_h, [srcend, -48]
-       stp     D_l, D_h, [dstend, -64]!
-       ldp     D_l, D_h, [srcend, -64]!
+       str     B_q, [dstend, -16]
+       str     A_q, [dstend, -32]
+       ldp     A_q, B_q, [srcend, -96]
+       str     D_q, [dstend, -48]
+       str     C_q, [dstend, -64]!
+       ldp     C_q, D_q, [srcend, -128]
+       sub     srcend, srcend, 64
         subs    count, count, 64
         b.hi    L(loop64_backwards)
  
         /* Write the last iteration and copy 64 bytes from the start.  */
  L(copy64_from_start):
-       ldp     G_l, G_h, [src, 48]
-       stp     A_l, A_h, [dstend, -16]
-       ldp     A_l, A_h, [src, 32]
-       stp     B_l, B_h, [dstend, -32]
-       ldp     B_l, B_h, [src, 16]
-       stp     C_l, C_h, [dstend, -48]
-       ldp     C_l, C_h, [src]
-       stp     D_l, D_h, [dstend, -64]
-       stp     G_l, G_h, [dstin, 48]
-       stp     A_l, A_h, [dstin, 32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstin]
+       ldp     E_q, F_q, [src, 32]
+       stp     A_q, B_q, [dstend, -32]
+       ldp     A_q, B_q, [src]
+       stp     C_q, D_q, [dstend, -64]
+       stp     E_q, F_q, [dstin, 32]
+       stp     A_q, B_q, [dstin]
+L(move0):
         ret
  
  END (MEMMOVE)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile

index 16297192ee8375ea1470d7b1f7e728a6dd0fd704..223777d94e350fdfd1bb82a4b38eea4653d63057 100644 (file)
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -3,7 +3,6 @@ sysdep_routines += \
    memchr_generic \
    memchr_nosimd \
    memcpy_a64fx \
-  memcpy_advsimd \
    memcpy_falkor \
    memcpy_generic \
    memcpy_sve \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c

index c6fe149a7cb63056fcc519d51256e48785917277..ac8980288ebd22d10bfedb3d81a9fa647f137ca8 100644 (file)
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -37,7 +37,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
               IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
-             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
  #if HAVE_AARCH64_SVE_ASM
               IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
               IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
@@ -47,7 +46,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
               IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
-             IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
  #if HAVE_AARCH64_SVE_ASM
               IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
               IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c

index 0486213f08db8ab08a903c4b9e6bdd19df6dccec..21d954e7f3b0071d02f407e7c5564780bfd4e896 100644 (file)
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,7 +29,6 @@
  extern __typeof (__redirect_memcpy) __libc_memcpy;
  
  extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
  extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
  extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
  extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
@@ -41,9 +40,6 @@ select_memcpy_ifunc (void)
  {
    INIT_ARCH ();
  
-  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
-    return __memcpy_simd;
-
    if (sve && HAVE_AARCH64_SVE_ASM)
      {
        if (IS_A64FX (midr))
diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S

deleted file mode 100644 (file)

index fe9beaf..0000000
--- a/sysdeps/aarch64/multiarch/memcpy_advsimd.S
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Generic optimized memcpy using SIMD.
-   Copyright (C) 2020-2022 Free Software Foundation, Inc.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
- *
- */
-
-#define dstin  x0
-#define src    x1
-#define count  x2
-#define dst    x3
-#define srcend x4
-#define dstend x5
-#define A_l    x6
-#define A_lw   w6
-#define A_h    x7
-#define B_l    x8
-#define B_lw   w8
-#define B_h    x9
-#define C_lw   w10
-#define tmp1   x14
-
-#define A_q    q0
-#define B_q    q1
-#define C_q    q2
-#define D_q    q3
-#define E_q    q4
-#define F_q    q5
-#define G_q    q6
-#define H_q    q7
-
-
-/* This implementation supports both memcpy and memmove and shares most code.
-   It uses unaligned accesses and branchless sequences to keep the code small,
-   simple and improve performance.
-
-   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
-   copies of up to 128 bytes, and large copies.  The overhead of the overlap
-   check in memmove is negligible since it is only required for large copies.
-
-   Large copies use a software pipelined loop processing 64 bytes per
-   iteration.  The destination pointer is 16-byte aligned to minimize
-   unaligned accesses.  The loop tail is handled by always copying 64 bytes
-   from the end.  */
-
-ENTRY (__memcpy_simd)
-       PTR_ARG (0)
-       PTR_ARG (1)
-       SIZE_ARG (2)
-
-       add     srcend, src, count
-       add     dstend, dstin, count
-       cmp     count, 128
-       b.hi    L(copy_long)
-       cmp     count, 32
-       b.hi    L(copy32_128)
-
-       /* Small copies: 0..32 bytes.  */
-       cmp     count, 16
-       b.lo    L(copy16)
-       ldr     A_q, [src]
-       ldr     B_q, [srcend, -16]
-       str     A_q, [dstin]
-       str     B_q, [dstend, -16]
-       ret
-
-       /* Copy 8-15 bytes.  */
-L(copy16):
-       tbz     count, 3, L(copy8)
-       ldr     A_l, [src]
-       ldr     A_h, [srcend, -8]
-       str     A_l, [dstin]
-       str     A_h, [dstend, -8]
-       ret
-
-       /* Copy 4-7 bytes.  */
-L(copy8):
-       tbz     count, 2, L(copy4)
-       ldr     A_lw, [src]
-       ldr     B_lw, [srcend, -4]
-       str     A_lw, [dstin]
-       str     B_lw, [dstend, -4]
-       ret
-
-       /* Copy 0..3 bytes using a branchless sequence.  */
-L(copy4):
-       cbz     count, L(copy0)
-       lsr     tmp1, count, 1
-       ldrb    A_lw, [src]
-       ldrb    C_lw, [srcend, -1]
-       ldrb    B_lw, [src, tmp1]
-       strb    A_lw, [dstin]
-       strb    B_lw, [dstin, tmp1]
-       strb    C_lw, [dstend, -1]
-L(copy0):
-       ret
-
-       .p2align 4
-       /* Medium copies: 33..128 bytes.  */
-L(copy32_128):
-       ldp     A_q, B_q, [src]
-       ldp     C_q, D_q, [srcend, -32]
-       cmp     count, 64
-       b.hi    L(copy128)
-       stp     A_q, B_q, [dstin]
-       stp     C_q, D_q, [dstend, -32]
-       ret
-
-       .p2align 4
-       /* Copy 65..128 bytes.  */
-L(copy128):
-       ldp     E_q, F_q, [src, 32]
-       cmp     count, 96
-       b.ls    L(copy96)
-       ldp     G_q, H_q, [srcend, -64]
-       stp     G_q, H_q, [dstend, -64]
-L(copy96):
-       stp     A_q, B_q, [dstin]
-       stp     E_q, F_q, [dstin, 32]
-       stp     C_q, D_q, [dstend, -32]
-       ret
-
-       /* Align loop64 below to 16 bytes.  */
-       nop
-
-       /* Copy more than 128 bytes.  */
-L(copy_long):
-       /* Copy 16 bytes and then align src to 16-byte alignment.  */
-       ldr     D_q, [src]
-       and     tmp1, src, 15
-       bic     src, src, 15
-       sub     dst, dstin, tmp1
-       add     count, count, tmp1      /* Count is now 16 too large.  */
-       ldp     A_q, B_q, [src, 16]
-       str     D_q, [dstin]
-       ldp     C_q, D_q, [src, 48]
-       subs    count, count, 128 + 16  /* Test and readjust count.  */
-       b.ls    L(copy64_from_end)
-L(loop64):
-       stp     A_q, B_q, [dst, 16]
-       ldp     A_q, B_q, [src, 80]
-       stp     C_q, D_q, [dst, 48]
-       ldp     C_q, D_q, [src, 112]
-       add     src, src, 64
-       add     dst, dst, 64
-       subs    count, count, 64
-       b.hi    L(loop64)
-
-       /* Write the last iteration and copy 64 bytes from the end.  */
-L(copy64_from_end):
-       ldp     E_q, F_q, [srcend, -64]
-       stp     A_q, B_q, [dst, 16]
-       ldp     A_q, B_q, [srcend, -32]
-       stp     C_q, D_q, [dst, 48]
-       stp     E_q, F_q, [dstend, -64]
-       stp     A_q, B_q, [dstend, -32]
-       ret
-
-END (__memcpy_simd)
-libc_hidden_builtin_def (__memcpy_simd)
-
-
-ENTRY (__memmove_simd)
-       PTR_ARG (0)
-       PTR_ARG (1)
-       SIZE_ARG (2)
-
-       add     srcend, src, count
-       add     dstend, dstin, count
-       cmp     count, 128
-       b.hi    L(move_long)
-       cmp     count, 32
-       b.hi    L(copy32_128)
-
-       /* Small moves: 0..32 bytes.  */
-       cmp     count, 16
-       b.lo    L(copy16)
-       ldr     A_q, [src]
-       ldr     B_q, [srcend, -16]
-       str     A_q, [dstin]
-       str     B_q, [dstend, -16]
-       ret
-
-L(move_long):
-       /* Only use backward copy if there is an overlap.  */
-       sub     tmp1, dstin, src
-       cbz     tmp1, L(move0)
-       cmp     tmp1, count
-       b.hs    L(copy_long)
-
-       /* Large backwards copy for overlapping copies.
-          Copy 16 bytes and then align srcend to 16-byte alignment.  */
-L(copy_long_backwards):
-       ldr     D_q, [srcend, -16]
-       and     tmp1, srcend, 15
-       bic     srcend, srcend, 15
-       sub     count, count, tmp1
-       ldp     A_q, B_q, [srcend, -32]
-       str     D_q, [dstend, -16]
-       ldp     C_q, D_q, [srcend, -64]
-       sub     dstend, dstend, tmp1
-       subs    count, count, 128
-       b.ls    L(copy64_from_start)
-
-L(loop64_backwards):
-       str     B_q, [dstend, -16]
-       str     A_q, [dstend, -32]
-       ldp     A_q, B_q, [srcend, -96]
-       str     D_q, [dstend, -48]
-       str     C_q, [dstend, -64]!
-       ldp     C_q, D_q, [srcend, -128]
-       sub     srcend, srcend, 64
-       subs    count, count, 64
-       b.hi    L(loop64_backwards)
-
-       /* Write the last iteration and copy 64 bytes from the start.  */
-L(copy64_from_start):
-       ldp     E_q, F_q, [src, 32]
-       stp     A_q, B_q, [dstend, -32]
-       ldp     A_q, B_q, [src]
-       stp     C_q, D_q, [dstend, -64]
-       stp     E_q, F_q, [dstin, 32]
-       stp     A_q, B_q, [dstin]
-L(move0):
-       ret
-
-END (__memmove_simd)
-libc_hidden_builtin_def (__memmove_simd)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c

index 261996ecc4b36bdaf3c50598d910e9c7dcace08b..70e8eaef7c5010e32e55c4036e8bc915fa69ddc8 100644 (file)
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,7 +29,6 @@
  extern __typeof (__redirect_memmove) __libc_memmove;
  
  extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
@@ -41,9 +40,6 @@ select_memmove_ifunc (void)
  {
    INIT_ARCH ();
  
-  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
-    return __memmove_simd;
-
    if (sve && HAVE_AARCH64_SVE_ASM)
      {
        if (IS_A64FX (midr))
author	Wilco Dijkstra <wdijkstr@arm.com>
	Wed, 26 Oct 2022 13:16:50 +0000 (14:16 +0100)
committer	Wilco Dijkstra <wdijkstr@arm.com>
	Wed, 26 Oct 2022 13:16:50 +0000 (14:16 +0100)
sysdeps/aarch64/memcpy.S		patch | blob | history
sysdeps/aarch64/multiarch/Makefile		patch | blob | history
sysdeps/aarch64/multiarch/ifunc-impl-list.c		patch | blob | history
sysdeps/aarch64/multiarch/memcpy.c		patch | blob | history
sysdeps/aarch64/multiarch/memcpy_advsimd.S	[deleted file]	patch | blob | history
sysdeps/aarch64/multiarch/memmove.c		patch | blob | history