aarch64: thunderx2 memmove performance improvements

author Anton Youdkevitch <anton.youdkevitch@bell-sw.com>

Fri, 3 May 2019 18:01:34 +0000 (11:01 -0700)

committer Steve Ellcey <sellcey@caviumnetworks.com>

Fri, 3 May 2019 18:01:34 +0000 (11:01 -0700)
author Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
Fri, 3 May 2019 18:01:34 +0000 (11:01 -0700)
committer Steve Ellcey <sellcey@caviumnetworks.com>
Fri, 3 May 2019 18:01:34 +0000 (11:01 -0700)
diff --git a/ChangeLog b/ChangeLog

index 3c4dcb3..c71f249 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2019-05-03  Anton Youdkevitch  <anton.youdkevitch@bell-sw.com>
+
+       * sysdeps/aarch64/multiarch/ifunc-impl-list.c: Added
+       __memmove_thunderx2 to the list of implementations
+       * sysdeps/aarch64/multiarch/memmove.c: Likewise
+       * sysdeps/aarch64/multiarch/memcpy_thunderx2.S:
+       (__memmove_thunderx2): Rewritten using SIMD ld/st
+       (__memcpy_thunderx2): Fixed handling overlapping cases.
+       Used ldp/stp instead of ldr/str if possible. Made loops
+       tails branchless.
+
  2019-05-03  Florian Weimer  <fweimer@redhat.com>
  
         * misc/tst-tsearch.c (walk_tree): Add more error checking.
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c

index 5f43362..10ff7d4 100644 (file)
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -45,6 +45,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
    IFUNC_IMPL (i, name, memmove,
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+             IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx2)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
    IFUNC_IMPL (i, name, memset,
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S

index 45e9a29..058db48 100644 (file)
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -31,8 +31,8 @@
  #define dst    x3
  #define srcend x4
  #define dstend x5
-#define tmp2    x6
-#define tmp3    x7
+#define tmp2   x6
+#define tmp3   x7
  #define tmp3w   w7
  #define A_l    x6
  #define A_lw   w6
@@ -53,26 +53,26 @@
  #define G_h    dst
  #define tmp1   x14
  
-#define A_q     q0
-#define B_q     q1
-#define C_q     q2
-#define D_q     q3
-#define E_q     q4
-#define F_q     q5
-#define G_q     q6
+#define A_q    q0
+#define B_q    q1
+#define C_q    q2
+#define D_q    q3
+#define E_q    q4
+#define F_q    q5
+#define G_q    q6
  #define H_q    q7
  #define I_q    q16
  #define J_q    q17
  
-#define A_v     v0
-#define B_v     v1
-#define C_v     v2
-#define D_v     v3
-#define E_v     v4
-#define F_v     v5
-#define G_v     v6
-#define H_v     v7
-#define I_v     v16
+#define A_v    v0
+#define B_v    v1
+#define C_v    v2
+#define D_v    v3
+#define E_v    v4
+#define F_v    v5
+#define G_v    v6
+#define H_v    v7
+#define I_v    v16
  #define J_v    v17
  
  #ifndef MEMMOVE
@@ -85,16 +85,14 @@
  #if IS_IN (libc)
  
  #undef MEMCPY
-#undef MEMMOVE
  #define MEMCPY __memcpy_thunderx2
+#undef MEMMOVE
  #define MEMMOVE __memmove_thunderx2
  
  
-/* Moves are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..96 bytes which are fully unrolled. Large copies
-   of more than 96 bytes align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-   Overlapping large forward memmoves use a loop that copies backwards.
+/* Overlapping large forward memmoves use a loop that copies backwards.
+   Otherwise memcpy is used. Small moves branch to memcopy16 directly.
+   The longer memcpy cases fall through to the memcpy head.
  */
  
  ENTRY_ALIGN (MEMMOVE, 6)
@@ -103,188 +101,14 @@ ENTRY_ALIGN (MEMMOVE, 6)
         DELOUSE (1)
         DELOUSE (2)
  
+       add     srcend, src, count
+       cmp     count, 16
+       b.ls    L(memcopy16)
         sub     tmp1, dstin, src
         cmp     count, 96
         ccmp    tmp1, count, 2, hi
         b.lo    L(move_long)
  
-       prfm    PLDL1KEEP, [src]
-       add     srcend, src, count
-       add     dstend, dstin, count
-       cmp     count, 16
-       b.ls    L(copy16)
-       cmp     count, 96
-       b.hi    L(copy_long)
-
-       /* Medium copies: 17..96 bytes.  */
-       sub     tmp1, count, 1
-       ldp     A_l, A_h, [src]
-       tbnz    tmp1, 6, L(copy96)
-       ldp     D_l, D_h, [srcend, -16]
-       tbz     tmp1, 5, 1f
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-1:
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
-       ret
-
-       .p2align 4
-       /* Small copies: 0..16 bytes.  */
-L(copy16):
-       cmp     count, 8
-       b.lo    1f
-       ldr     A_l, [src]
-       ldr     A_h, [srcend, -8]
-       str     A_l, [dstin]
-       str     A_h, [dstend, -8]
-       ret
-       .p2align 4
-1:
-       tbz     count, 2, 1f
-       ldr     A_lw, [src]
-       ldr     A_hw, [srcend, -4]
-       str     A_lw, [dstin]
-       str     A_hw, [dstend, -4]
-       ret
-
-       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-       cbz     count, 2f
-       lsr     tmp1, count, 1
-       ldrb    A_lw, [src]
-       ldrb    A_hw, [srcend, -1]
-       ldrb    B_lw, [src, tmp1]
-       strb    A_lw, [dstin]
-       strb    B_lw, [dstin, tmp1]
-       strb    A_hw, [dstend, -1]
-2:     ret
-
-       .p2align 4
-       /* Copy 64..96 bytes.  Copy 64 bytes from the start and
-          32 bytes from the end.  */
-L(copy96):
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [src, 32]
-       ldp     D_l, D_h, [src, 48]
-       ldp     E_l, E_h, [srcend, -32]
-       ldp     F_l, F_h, [srcend, -16]
-       stp     A_l, A_h, [dstin]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstin, 32]
-       stp     D_l, D_h, [dstin, 48]
-       stp     E_l, E_h, [dstend, -32]
-       stp     F_l, F_h, [dstend, -16]
-       ret
-
-       /* Align DST to 16 byte alignment so that we don't cross cache line
-          boundaries on both loads and stores.  There are at least 96 bytes
-          to copy, so copy 16 bytes unaligned and then align.  The loop
-          copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
-       .p2align 4
-L(copy_long):
-       and     tmp1, dstin, 15
-       bic     dst, dstin, 15
-       ldp     D_l, D_h, [src]
-       sub     src, src, tmp1
-       add     count, count, tmp1      /* Count is now 16 too large.  */
-       ldp     A_l, A_h, [src, 16]
-       stp     D_l, D_h, [dstin]
-       ldp     B_l, B_h, [src, 32]
-       ldp     C_l, C_h, [src, 48]
-       ldp     D_l, D_h, [src, 64]!
-       subs    count, count, 128 + 16  /* Test and readjust count.  */
-       b.ls    L(last64)
-L(loop64):
-       stp     A_l, A_h, [dst, 16]
-       ldp     A_l, A_h, [src, 16]
-       stp     B_l, B_h, [dst, 32]
-       ldp     B_l, B_h, [src, 32]
-       stp     C_l, C_h, [dst, 48]
-       ldp     C_l, C_h, [src, 48]
-       stp     D_l, D_h, [dst, 64]!
-       ldp     D_l, D_h, [src, 64]!
-       subs    count, count, 64
-       b.hi    L(loop64)
-
-       /* Write the last full set of 64 bytes.  The remainder is at most 64
-          bytes, so it is safe to always copy 64 bytes from the end even if
-          there is just 1 byte left.  */
-L(last64):
-       ldp     E_l, E_h, [srcend, -64]
-       stp     A_l, A_h, [dst, 16]
-       ldp     A_l, A_h, [srcend, -48]
-       stp     B_l, B_h, [dst, 32]
-       ldp     B_l, B_h, [srcend, -32]
-       stp     C_l, C_h, [dst, 48]
-       ldp     C_l, C_h, [srcend, -16]
-       stp     D_l, D_h, [dst, 64]
-       stp     E_l, E_h, [dstend, -64]
-       stp     A_l, A_h, [dstend, -48]
-       stp     B_l, B_h, [dstend, -32]
-       stp     C_l, C_h, [dstend, -16]
-       ret
-
-       .p2align 4
-L(move_long):
-       cbz     tmp1, 3f
-
-       add     srcend, src, count
-       add     dstend, dstin, count
-
-       /* Align dstend to 16 byte alignment so that we don't cross cache line
-          boundaries on both loads and stores.  There are at least 96 bytes
-          to copy, so copy 16 bytes unaligned and then align.  The loop
-          copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
-       and     tmp1, dstend, 15
-       ldp     D_l, D_h, [srcend, -16]
-       sub     srcend, srcend, tmp1
-       sub     count, count, tmp1
-       ldp     A_l, A_h, [srcend, -16]
-       stp     D_l, D_h, [dstend, -16]
-       ldp     B_l, B_h, [srcend, -32]
-       ldp     C_l, C_h, [srcend, -48]
-       ldp     D_l, D_h, [srcend, -64]!
-       sub     dstend, dstend, tmp1
-       subs    count, count, 128
-       b.ls    2f
-
-       nop
-1:
-       stp     A_l, A_h, [dstend, -16]
-       ldp     A_l, A_h, [srcend, -16]
-       stp     B_l, B_h, [dstend, -32]
-       ldp     B_l, B_h, [srcend, -32]
-       stp     C_l, C_h, [dstend, -48]
-       ldp     C_l, C_h, [srcend, -48]
-       stp     D_l, D_h, [dstend, -64]!
-       ldp     D_l, D_h, [srcend, -64]!
-       subs    count, count, 64
-       b.hi    1b
-
-       /* Write the last full set of 64 bytes.  The remainder is at most 64
-          bytes, so it is safe to always copy 64 bytes from the start even if
-          there is just 1 byte left.  */
-2:
-       ldp     G_l, G_h, [src, 48]
-       stp     A_l, A_h, [dstend, -16]
-       ldp     A_l, A_h, [src, 32]
-       stp     B_l, B_h, [dstend, -32]
-       ldp     B_l, B_h, [src, 16]
-       stp     C_l, C_h, [dstend, -48]
-       ldp     C_l, C_h, [src]
-       stp     D_l, D_h, [dstend, -64]
-       stp     G_l, G_h, [dstin, 48]
-       stp     A_l, A_h, [dstin, 32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstin]
-3:     ret
-
  END (MEMMOVE)
  libc_hidden_builtin_def (MEMMOVE)
  
@@ -293,227 +117,217 @@ libc_hidden_builtin_def (MEMMOVE)
     medium copies of 17..96 bytes which are fully unrolled. Large copies
     of more than 96 bytes align the destination and use load-and-merge
     approach in the case src and dst addresses are unaligned not evenly,
-   so that, loads and stores are always aligned.
-   Large copies use an unrolled loop processing 64 bytes per iteration.
-   The current optimized memcpy implementation is not compatible with
-   memmove and is separated from it completely.
-
-   memcpy implementation below is not compatible with memmove
-   because of pipelined loads/stores, which are faster, but they
-   can't be used in the case of overlapping memmove arrays */
+   so that, actual loads and stores are always aligned.
+   Large copies use the loops processing 64 bytes per iteration for
+   unaligned case and 128 bytes per iteration for aligned ones.
+*/
  
  #define MEMCPY_PREFETCH_LDR 640
  
+       .p2align 4
  ENTRY (MEMCPY)
+
         DELOUSE (0)
         DELOUSE (1)
         DELOUSE (2)
  
-       add     srcend, src, count
-       cmp     count, 16
-       b.ls    L(memcopy16)
-       ldr     A_q, [src], #16
-       add     dstend, dstin, count
-       and     tmp1, src, 15
-       cmp     count, 96
-       b.hi    L(memcopy_long)
+       add     srcend, src, count
+       cmp     count, 16
+       b.ls    L(memcopy16)
+       ldr     A_q, [src], #16
+       add     dstend, dstin, count
+       and     tmp1, src, 15
+       cmp     count, 96
+       b.hi    L(memcopy_long)
  
         /* Medium copies: 17..96 bytes.  */
-       ldr     E_q, [srcend, -16]
-       cmp     count, 64
-       b.gt    L(memcpy_copy96)
-       cmp     count, 48
-       b.le    L(bytes_17_to_48)
+       ldr     E_q, [srcend, -16]
+       cmp     count, 64
+       b.gt    L(memcpy_copy96)
+       cmp     count, 48
+       b.le    L(bytes_17_to_48)
         /* 49..64 bytes */
-       ldp     B_q, C_q, [src]
-       str     E_q, [dstend, -16]
-       stp     A_q, B_q, [dstin]
-       str     C_q, [dstin, 32]
+       ldp     B_q, C_q, [src]
+       str     E_q, [dstend, -16]
+       stp     A_q, B_q, [dstin]
+       str     C_q, [dstin, 32]
         ret
  
  L(bytes_17_to_48):
         /* 17..48 bytes*/
-       cmp     count, 32
-       b.gt    L(bytes_32_to_48)
+       cmp     count, 32
+       b.gt    L(bytes_32_to_48)
         /* 17..32 bytes*/
-       str     A_q, [dstin]
-       str     E_q, [dstend, -16]
+       str     A_q, [dstin]
+       str     E_q, [dstend, -16]
         ret
  
  L(bytes_32_to_48):
         /* 32..48 */
-       ldr     B_q, [src]
-       str     A_q, [dstin]
-       str     E_q, [dstend, -16]
-       str     B_q, [dstin, 16]
+       ldr     B_q, [src]
+       str     A_q, [dstin]
+       str     E_q, [dstend, -16]
+       str     B_q, [dstin, 16]
         ret
  
         .p2align 4
         /* Small copies: 0..16 bytes.  */
  L(memcopy16):
-       cmp     count, 8
-       b.lo    L(bytes_0_to_8)
-       ldr     A_l, [src]
-       ldr     A_h, [srcend, -8]
-       add     dstend, dstin, count
-       str     A_l, [dstin]
-       str     A_h, [dstend, -8]
+       cmp     count, 8
+       b.lo    L(bytes_0_to_8)
+       ldr     A_l, [src]
+       ldr     A_h, [srcend, -8]
+       add     dstend, dstin, count
+       str     A_l, [dstin]
+       str     A_h, [dstend, -8]
         ret
         .p2align 4
  
  L(bytes_0_to_8):
-       tbz     count, 2, L(bytes_0_to_3)
-       ldr     A_lw, [src]
-       ldr     A_hw, [srcend, -4]
-       add     dstend, dstin, count
-       str     A_lw, [dstin]
-       str     A_hw, [dstend, -4]
+       tbz     count, 2, L(bytes_0_to_3)
+       ldr     A_lw, [src]
+       ldr     A_hw, [srcend, -4]
+       add     dstend, dstin, count
+       str     A_lw, [dstin]
+       str     A_hw, [dstend, -4]
         ret
  
         /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
            byte 3 times if count==1, or the 2nd byte twice if count==2.  */
  L(bytes_0_to_3):
-       cbz     count, L(end)
-       lsr     tmp1, count, 1
-       ldrb    A_lw, [src]
-       ldrb    A_hw, [srcend, -1]
-       add     dstend, dstin, count
-       ldrb    B_lw, [src, tmp1]
-       strb    A_lw, [dstin]
-       strb    B_lw, [dstin, tmp1]
-       strb    A_hw, [dstend, -1]
-L(end):
+       cbz     count, 1f
+       lsr     tmp1, count, 1
+       ldrb    A_lw, [src]
+       ldrb    A_hw, [srcend, -1]
+       add     dstend, dstin, count
+       ldrb    B_lw, [src, tmp1]
+       strb    B_lw, [dstin, tmp1]
+       strb    A_hw, [dstend, -1]
+       strb    A_lw, [dstin]
+1:
         ret
  
         .p2align 4
  
  L(memcpy_copy96):
         /* Copying 65..96 bytes. A_q (first 16 bytes) and
-          E_q(last 16 bytes) are already loaded.
-
-          The size is large enough to benefit from aligned
-          loads */
-       bic     src, src, 15
-       ldp     B_q, C_q, [src]
-       str     A_q, [dstin]
+          E_q(last 16 bytes) are already loaded. The size
+          is large enough to benefit from aligned loads */
+       bic     src, src, 15
+       ldp     B_q, C_q, [src]
         /* Loaded 64 bytes, second 16-bytes chunk can be
            overlapping with the first chunk by tmp1 bytes.
            Stored 16 bytes. */
-       sub     dst, dstin, tmp1
-       add     count, count, tmp1
+       sub     dst, dstin, tmp1
+       add     count, count, tmp1
         /* The range of count being [65..96] becomes [65..111]
            after tmp [0..15] gets added to it,
            count now is <bytes-left-to-load>+48 */
-       cmp     count, 80
-       b.gt    L(copy96_medium)
-       ldr     D_q, [src, 32]
-       stp     B_q, C_q, [dst, 16]
-       str     E_q, [dstend, -16]
-       str     D_q, [dst, 48]
+       cmp     count, 80
+       b.gt    L(copy96_medium)
+       ldr     D_q, [src, 32]
+       stp     B_q, C_q, [dst, 16]
+       str     D_q, [dst, 48]
+       str     A_q, [dstin]
+       str     E_q, [dstend, -16]
         ret
  
         .p2align 4
  L(copy96_medium):
-       ldp     D_q, A_q, [src, 32]
-       str     B_q, [dst, 16]
-       cmp     count, 96
-       b.gt    L(copy96_large)
-       str     E_q, [dstend, -16]
-       stp     C_q, D_q, [dst, 32]
-       str     A_q, [dst, 64]
+       ldp     D_q, G_q, [src, 32]
+       cmp     count, 96
+       b.gt    L(copy96_large)
+       stp     B_q, C_q, [dst, 16]
+       stp     D_q, G_q, [dst, 48]
+       str     A_q, [dstin]
+       str     E_q, [dstend, -16]
         ret
  
  L(copy96_large):
-       ldr     F_q, [src, 64]
-       stp     C_q, D_q, [dst, 32]
-       str     E_q, [dstend, -16]
-       stp     A_q, F_q, [dst, 64]
+       ldr     F_q, [src, 64]
+       str     B_q, [dst, 16]
+       stp     C_q, D_q, [dst, 32]
+       stp     G_q, F_q, [dst, 64]
+       str     A_q, [dstin]
+       str     E_q, [dstend, -16]
         ret
  
         .p2align 4
  L(memcopy_long):
-       bic     src, src, 15
-       ldp     B_q, C_q, [src], #32
-       str     A_q, [dstin]
-       sub     dst, dstin, tmp1
-       add     count, count, tmp1
-       add     dst, dst, 16
+       bic     src, src, 15
+       ldp     B_q, C_q, [src], #32
+       sub     dst, dstin, tmp1
+       add     count, count, tmp1
+       add     dst, dst, 16
         and     tmp1, dst, 15
-       ldp     D_q, E_q, [src], #32
-       str     B_q, [dst], #16
+       ldp     D_q, E_q, [src], #32
+       str     A_q, [dstin]
  
         /* Already loaded 64+16 bytes. Check if at
            least 64 more bytes left */
-       subs    count, count, 64+64+16
-       b.lt    L(loop128_exit2)
-       cmp     count, MEMCPY_PREFETCH_LDR + 64 + 32
-       b.lt    L(loop128)
+       subs    count, count, 64+64+16
+       b.lt    L(loop128_exit0)
+       cmp     count, MEMCPY_PREFETCH_LDR + 64 + 32
+       b.lt    L(loop128)
         cbnz    tmp1, L(dst_unaligned)
-       sub     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+       sub     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
  
         .p2align 4
  
  L(loop128_prefetch):
-       str     C_q, [dst], #16
-       prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
-       str     D_q, [dst], #16
-       ldp     F_q, G_q, [src], #32
-       str     E_q, [dst], #16
-       ldp     H_q, A_q, [src], #32
-       str     F_q, [dst], #16
-       prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
-       str     G_q, [dst], #16
-       ldp     B_q, C_q, [src], #32
-       str     H_q, [dst], #16
-       ldp     D_q, E_q, [src], #32
-       stp     A_q, B_q, [dst], #32
+       prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+       ldp     F_q, G_q, [src], #32
+       stp     B_q, C_q, [dst], #32
+       ldp     H_q, I_q, [src], #32
+       prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+       ldp     B_q, C_q, [src], #32
+       stp     D_q, E_q, [dst], #32
+       ldp     D_q, E_q, [src], #32
+       stp     F_q, G_q, [dst], #32
+       stp     H_q, I_q, [dst], #32
         subs    count, count, 128
-       b.ge    L(loop128_prefetch)
+       b.ge    L(loop128_prefetch)
  
-L(preloop128):
         add     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
         .p2align 4
  L(loop128):
-       ldp     F_q, G_q, [src], #32
-       str     C_q, [dst], #16
-       ldp     B_q, A_q, [src], #32
-       str     D_q, [dst], #16
-       stp     E_q, F_q, [dst], #32
-       stp     G_q, B_q, [dst], #32
-       subs    count, count, 64
-       b.lt    L(loop128_exit1)
-L(loop128_proceed):
-       ldp     B_q, C_q, [src], #32
-       str     A_q, [dst], #16
-       ldp     D_q, E_q, [src], #32
-       str     B_q, [dst], #16
-       subs    count, count, 64
-       b.ge    L(loop128)
-
-       .p2align 4
-L(loop128_exit2):
-       stp     C_q, D_q, [dst], #32
-       str     E_q, [dst], #16
-       b       L(copy_long_check32);
-
+       ldp     F_q, G_q, [src], #32
+       ldp     H_q, I_q, [src], #32
+       stp     B_q, C_q, [dst], #32
+       stp     D_q, E_q, [dst], #32
+       subs    count, count, 64
+       b.lt    L(loop128_exit1)
+       ldp     B_q, C_q, [src], #32
+       ldp     D_q, E_q, [src], #32
+       stp     F_q, G_q, [dst], #32
+       stp     H_q, I_q, [dst], #32
+       subs    count, count, 64
+       b.ge    L(loop128)
+L(loop128_exit0):
+       ldp     F_q, G_q, [srcend, -64]
+       ldp     H_q, I_q, [srcend, -32]
+       stp     B_q, C_q, [dst], #32
+       stp     D_q, E_q, [dst]
+       stp     F_q, G_q, [dstend, -64]
+       stp     H_q, I_q, [dstend, -32]
+       ret
  L(loop128_exit1):
-       /* A_q is still not stored and 0..63 bytes left,
-          so, count is -64..-1.
-          Check if less than 32 bytes left (count < -32) */
-       str     A_q, [dst], #16
-L(copy_long_check32):
-       cmn     count, 64
-       b.eq    L(copy_long_done)
-       cmn     count, 32
-       b.le    L(copy_long_last32)
-       ldp     B_q, C_q, [src]
-       stp     B_q, C_q, [dst]
-
-L(copy_long_last32):
-       ldp     F_q, G_q, [srcend, -32]
-       stp     F_q, G_q, [dstend, -32]
-
-L(copy_long_done):
+       ldp     B_q, C_q, [srcend, -64]
+       ldp     D_q, E_q, [srcend, -32]
+       stp     F_q, G_q, [dst], #32
+       stp     H_q, I_q, [dst]
+       stp     B_q, C_q, [dstend, -64]
+       stp     D_q, E_q, [dstend, -32]
+       ret
+
+L(dst_unaligned_tail):
+       ldp     C_q, D_q, [srcend, -64]
+       ldp     E_q, F_q, [srcend, -32]
+       stp     A_q, B_q, [dst], #32
+       stp     H_q, I_q, [dst], #16
+       str     G_q, [dst, tmp1]
+       stp     C_q, D_q, [dstend, -64]
+       stp     E_q, F_q, [dstend, -32]
         ret
  
  L(dst_unaligned):
@@ -542,17 +356,20 @@ L(dst_unaligned):
         /* Store the 16 bytes to dst and align dst for further
            operations, several bytes will be stored at this
            address once more */
-       str     C_q, [dst], #16
-       ldp     F_q, G_q, [src], #32
+
+       ldp     F_q, G_q, [src], #32
+       stp     B_q, C_q, [dst], #32
         bic     dst, dst, 15
-       subs    count, count, 32
+       sub     count, count, 32
         adrp    tmp2, L(ext_table)
         add     tmp2, tmp2, :lo12:L(ext_table)
         add     tmp2, tmp2, tmp1, LSL #2
         ldr     tmp3w, [tmp2]
         add     tmp2, tmp2, tmp3w, SXTW
         br      tmp2
-.p2align 4 ;\
+
+.p2align 4
+       /* to make the loop in each chunk 16-bytes aligned */
         nop
  #define EXT_CHUNK(shft) \
  L(ext_size_ ## shft):;\
@@ -573,7 +390,7 @@ L(ext_size_ ## shft):;\
         b.ge    1b;\
  2:;\
         ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-       b       L(ext_tail);
+       b       L(dst_unaligned_tail);
  
  EXT_CHUNK(1)
  EXT_CHUNK(2)
@@ -591,12 +408,45 @@ EXT_CHUNK(13)
  EXT_CHUNK(14)
  EXT_CHUNK(15)
  
-L(ext_tail):
-       stp     A_q, B_q, [dst], #32
-       stp     H_q, I_q, [dst], #16
-       add     dst, dst, tmp1
-       str     G_q, [dst], #16
-       b       L(copy_long_check32)
+L(move_long):
+       .p2align 4
+1:
+       cbz     tmp1, 3f
+
+       add     srcend, src, count
+       add     dstend, dstin, count
+
+       and     tmp1, srcend, 15
+       ldr     D_q, [srcend, -16]
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       ldp     A_q, B_q, [srcend, -32]
+       str     D_q, [dstend, -16]
+       ldp     C_q, D_q, [srcend, -64]!
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    2f
+
+       .p2align 4
+1:
+       subs    count, count, 64
+       stp     A_q, B_q, [dstend, -32]
+       ldp     A_q, B_q, [srcend, -32]
+       stp     C_q, D_q, [dstend, -64]!
+       ldp     C_q, D_q, [srcend, -64]!
+       b.hi    1b
+
+       /* Write the last full set of 64 bytes.  The remainder is at most 64
+          bytes, so it is safe to always copy 64 bytes from the start even if
+          there is just 1 byte left.  */
+2:
+       ldp     E_q, F_q, [src, 32]
+       ldp     G_q, H_q, [src]
+       stp     A_q, B_q, [dstend, -32]
+       stp     C_q, D_q, [dstend, -64]
+       stp     E_q, F_q, [dstin, 32]
+       stp     G_q, H_q, [dstin]
+3:     ret
  
  
  END (MEMCPY)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c

index f58dde3..f3d341b 100644 (file)
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -30,6 +30,7 @@ extern __typeof (__redirect_memmove) __libc_memmove;
  
  extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
  
  libc_ifunc (__libc_memmove,
@@ -37,7 +38,9 @@ libc_ifunc (__libc_memmove,
              ? __memmove_thunderx
              : (IS_FALKOR (midr) || IS_PHECDA (midr)
                 ? __memmove_falkor
-               : __memmove_generic)));
+               : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+                 ? __memmove_thunderx2
+                 : __memmove_generic))));
  
  # undef memmove
  strong_alias (__libc_memmove, memmove);
author	Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
	Fri, 3 May 2019 18:01:34 +0000 (11:01 -0700)
committer	Steve Ellcey <sellcey@caviumnetworks.com>
	Fri, 3 May 2019 18:01:34 +0000 (11:01 -0700)
ChangeLog		patch \| blob \| history
sysdeps/aarch64/multiarch/ifunc-impl-list.c		patch \| blob \| history
sysdeps/aarch64/multiarch/memcpy_thunderx2.S		patch \| blob \| history
sysdeps/aarch64/multiarch/memmove.c		patch \| blob \| history