aarch64: Added optimized memcpy and memmove for A64FX

author Naohiro Tamura <naohirot@jp.fujitsu.com>

Thu, 27 May 2021 07:42:35 +0000 (07:42 +0000)

committer Szabolcs Nagy <szabolcs.nagy@arm.com>

Thu, 27 May 2021 08:47:53 +0000 (09:47 +0100)
author Naohiro Tamura <naohirot@jp.fujitsu.com>
Thu, 27 May 2021 07:42:35 +0000 (07:42 +0000)
committer Szabolcs Nagy <szabolcs.nagy@arm.com>
Thu, 27 May 2021 08:47:53 +0000 (09:47 +0100)
diff --git a/manual/tunables.texi b/manual/tunables.texi

index 6de647b..fe7c131 100644 (file)
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -454,7 +454,8 @@ This tunable is specific to powerpc, powerpc64 and powerpc64le.
  The @code{glibc.cpu.name=xxx} tunable allows the user to tell @theglibc{} to
  assume that the CPU is @code{xxx} where xxx may have one of these values:
  @code{generic}, @code{falkor}, @code{thunderxt88}, @code{thunderx2t99},
-@code{thunderx2t99p1}, @code{ares}, @code{emag}, @code{kunpeng}.
+@code{thunderx2t99p1}, @code{ares}, @code{emag}, @code{kunpeng},
+@code{a64fx}.
  
  This tunable is specific to aarch64.
  @end deftp
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile

index dc3efff..04c3f17 100644 (file)
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,6 +1,6 @@
  ifeq ($(subdir),string)
  sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
-                  memcpy_falkor \
+                  memcpy_falkor memcpy_a64fx \
                    memset_generic memset_falkor memset_emag memset_kunpeng \
                    memchr_generic memchr_nosimd \
                    strlen_mte strlen_asimd
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c

index 99a8c68..9113935 100644 (file)
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
  #include <stdio.h>
  
  /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC      4
+#define MAX_IFUNC      7
  
  size_t
  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -43,12 +43,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
               IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
+#if HAVE_AARCH64_SVE_ASM
+             IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
+#endif
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
    IFUNC_IMPL (i, name, memmove,
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
               IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
+#if HAVE_AARCH64_SVE_ASM
+             IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
+#endif
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
    IFUNC_IMPL (i, name, memset,
               /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h

index a167699..6d92c1b 100644 (file)
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -33,4 +33,6 @@
    bool __attribute__((unused)) bti =                                         \
      HAVE_AARCH64_BTI && GLRO(dl_aarch64_cpu_features).bti;                   \
    bool __attribute__((unused)) mte =                                         \
-    MTE_ENABLED ();
+    MTE_ENABLED ();                                                          \
+  bool __attribute__((unused)) sve =                                         \
+    GLRO(dl_aarch64_cpu_features).sve;
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c

index 0e0a5cb..25e0081 100644 (file)
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -33,6 +33,9 @@ extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
  extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
  extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
  extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
+# endif
  
  libc_ifunc (__libc_memcpy,
              (IS_THUNDERX (midr)
@@ -40,12 +43,17 @@ libc_ifunc (__libc_memcpy,
              : (IS_FALKOR (midr) || IS_PHECDA (midr)
                 ? __memcpy_falkor
                 : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
-                 ? __memcpy_thunderx2
-                 : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
-                    || IS_NEOVERSE_V1 (midr)
-                    ? __memcpy_simd
+                  ? __memcpy_thunderx2
+                  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+                     || IS_NEOVERSE_V1 (midr)
+                     ? __memcpy_simd
+# if HAVE_AARCH64_SVE_ASM
+                    : (IS_A64FX (midr)
+                       ? __memcpy_a64fx
+                       : __memcpy_generic))))));
+# else
                      : __memcpy_generic)))));
-
+# endif
  # undef memcpy
  strong_alias (__libc_memcpy, memcpy);
  #endif
diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S

new file mode 100644 (file)

index 0000000..6552840
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -0,0 +1,406 @@
+/* Optimized memcpy for Fujitsu A64FX processor.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8.2-a, AArch64, unaligned accesses, sve
+ *
+ */
+
+#define L2_SIZE                (8*1024*1024)/2 // L2 8MB/2
+#define CACHE_LINE_SIZE        256
+#define ZF_DIST                (CACHE_LINE_SIZE * 21)  // Zerofill distance
+#define dest           x0
+#define src            x1
+#define n              x2      // size
+#define tmp1           x3
+#define tmp2           x4
+#define tmp3           x5
+#define rest           x6
+#define dest_ptr       x7
+#define src_ptr                x8
+#define vector_length  x9
+#define cl_remainder   x10     // CACHE_LINE_SIZE remainder
+
+#if HAVE_AARCH64_SVE_ASM
+# if IS_IN (libc)
+#  define MEMCPY __memcpy_a64fx
+#  define MEMMOVE __memmove_a64fx
+
+       .arch armv8.2-a+sve
+
+       .macro dc_zva times
+       dc      zva, tmp1
+       add     tmp1, tmp1, CACHE_LINE_SIZE
+       .if \times-1
+       dc_zva "(\times-1)"
+       .endif
+       .endm
+
+       .macro ld1b_unroll8
+       ld1b    z0.b, p0/z, [src_ptr, #0, mul vl]
+       ld1b    z1.b, p0/z, [src_ptr, #1, mul vl]
+       ld1b    z2.b, p0/z, [src_ptr, #2, mul vl]
+       ld1b    z3.b, p0/z, [src_ptr, #3, mul vl]
+       ld1b    z4.b, p0/z, [src_ptr, #4, mul vl]
+       ld1b    z5.b, p0/z, [src_ptr, #5, mul vl]
+       ld1b    z6.b, p0/z, [src_ptr, #6, mul vl]
+       ld1b    z7.b, p0/z, [src_ptr, #7, mul vl]
+       .endm
+
+       .macro stld1b_unroll4a
+       st1b    z0.b, p0,   [dest_ptr, #0, mul vl]
+       st1b    z1.b, p0,   [dest_ptr, #1, mul vl]
+       ld1b    z0.b, p0/z, [src_ptr,  #0, mul vl]
+       ld1b    z1.b, p0/z, [src_ptr,  #1, mul vl]
+       st1b    z2.b, p0,   [dest_ptr, #2, mul vl]
+       st1b    z3.b, p0,   [dest_ptr, #3, mul vl]
+       ld1b    z2.b, p0/z, [src_ptr,  #2, mul vl]
+       ld1b    z3.b, p0/z, [src_ptr,  #3, mul vl]
+       .endm
+
+       .macro stld1b_unroll4b
+       st1b    z4.b, p0,   [dest_ptr, #4, mul vl]
+       st1b    z5.b, p0,   [dest_ptr, #5, mul vl]
+       ld1b    z4.b, p0/z, [src_ptr,  #4, mul vl]
+       ld1b    z5.b, p0/z, [src_ptr,  #5, mul vl]
+       st1b    z6.b, p0,   [dest_ptr, #6, mul vl]
+       st1b    z7.b, p0,   [dest_ptr, #7, mul vl]
+       ld1b    z6.b, p0/z, [src_ptr,  #6, mul vl]
+       ld1b    z7.b, p0/z, [src_ptr,  #7, mul vl]
+       .endm
+
+       .macro stld1b_unroll8
+       stld1b_unroll4a
+       stld1b_unroll4b
+       .endm
+
+       .macro st1b_unroll8
+       st1b    z0.b, p0, [dest_ptr, #0, mul vl]
+       st1b    z1.b, p0, [dest_ptr, #1, mul vl]
+       st1b    z2.b, p0, [dest_ptr, #2, mul vl]
+       st1b    z3.b, p0, [dest_ptr, #3, mul vl]
+       st1b    z4.b, p0, [dest_ptr, #4, mul vl]
+       st1b    z5.b, p0, [dest_ptr, #5, mul vl]
+       st1b    z6.b, p0, [dest_ptr, #6, mul vl]
+       st1b    z7.b, p0, [dest_ptr, #7, mul vl]
+       .endm
+
+       .macro shortcut_for_small_size exit
+       // if rest <= vector_length * 2
+       whilelo p0.b, xzr, n
+       whilelo p1.b, vector_length, n
+       b.last  1f
+       ld1b    z0.b, p0/z, [src, #0, mul vl]
+       ld1b    z1.b, p1/z, [src, #1, mul vl]
+       st1b    z0.b, p0, [dest, #0, mul vl]
+       st1b    z1.b, p1, [dest, #1, mul vl]
+       ret
+1:     // if rest > vector_length * 8
+       cmp     n, vector_length, lsl 3 // vector_length * 8
+       b.hi    \exit
+       // if rest <= vector_length * 4
+       lsl     tmp1, vector_length, 1  // vector_length * 2
+       whilelo p2.b, tmp1, n
+       incb    tmp1
+       whilelo p3.b, tmp1, n
+       b.last  1f
+       ld1b    z0.b, p0/z, [src, #0, mul vl]
+       ld1b    z1.b, p1/z, [src, #1, mul vl]
+       ld1b    z2.b, p2/z, [src, #2, mul vl]
+       ld1b    z3.b, p3/z, [src, #3, mul vl]
+       st1b    z0.b, p0, [dest, #0, mul vl]
+       st1b    z1.b, p1, [dest, #1, mul vl]
+       st1b    z2.b, p2, [dest, #2, mul vl]
+       st1b    z3.b, p3, [dest, #3, mul vl]
+       ret
+1:     // if rest <= vector_length * 8
+       lsl     tmp1, vector_length, 2  // vector_length * 4
+       whilelo p4.b, tmp1, n
+       incb    tmp1
+       whilelo p5.b, tmp1, n
+       b.last  1f
+       ld1b    z0.b, p0/z, [src, #0, mul vl]
+       ld1b    z1.b, p1/z, [src, #1, mul vl]
+       ld1b    z2.b, p2/z, [src, #2, mul vl]
+       ld1b    z3.b, p3/z, [src, #3, mul vl]
+       ld1b    z4.b, p4/z, [src, #4, mul vl]
+       ld1b    z5.b, p5/z, [src, #5, mul vl]
+       st1b    z0.b, p0, [dest, #0, mul vl]
+       st1b    z1.b, p1, [dest, #1, mul vl]
+       st1b    z2.b, p2, [dest, #2, mul vl]
+       st1b    z3.b, p3, [dest, #3, mul vl]
+       st1b    z4.b, p4, [dest, #4, mul vl]
+       st1b    z5.b, p5, [dest, #5, mul vl]
+       ret
+1:     lsl     tmp1, vector_length, 2  // vector_length * 4
+       incb    tmp1                    // vector_length * 5
+       incb    tmp1                    // vector_length * 6
+       whilelo p6.b, tmp1, n
+       incb    tmp1
+       whilelo p7.b, tmp1, n
+       ld1b    z0.b, p0/z, [src, #0, mul vl]
+       ld1b    z1.b, p1/z, [src, #1, mul vl]
+       ld1b    z2.b, p2/z, [src, #2, mul vl]
+       ld1b    z3.b, p3/z, [src, #3, mul vl]
+       ld1b    z4.b, p4/z, [src, #4, mul vl]
+       ld1b    z5.b, p5/z, [src, #5, mul vl]
+       ld1b    z6.b, p6/z, [src, #6, mul vl]
+       ld1b    z7.b, p7/z, [src, #7, mul vl]
+       st1b    z0.b, p0, [dest, #0, mul vl]
+       st1b    z1.b, p1, [dest, #1, mul vl]
+       st1b    z2.b, p2, [dest, #2, mul vl]
+       st1b    z3.b, p3, [dest, #3, mul vl]
+       st1b    z4.b, p4, [dest, #4, mul vl]
+       st1b    z5.b, p5, [dest, #5, mul vl]
+       st1b    z6.b, p6, [dest, #6, mul vl]
+       st1b    z7.b, p7, [dest, #7, mul vl]
+       ret
+       .endm
+
+ENTRY (MEMCPY)
+
+       PTR_ARG (0)
+       PTR_ARG (1)
+       SIZE_ARG (2)
+
+L(memcpy):
+       cntb    vector_length
+       // shortcut for less than vector_length * 8
+       // gives a free ptrue to p0.b for n >= vector_length
+       shortcut_for_small_size L(vl_agnostic)
+       // end of shortcut
+
+L(vl_agnostic): // VL Agnostic
+       mov     rest, n
+       mov     dest_ptr, dest
+       mov     src_ptr, src
+       // if rest >= L2_SIZE && vector_length == 64 then L(L2)
+       mov     tmp1, 64
+       cmp     rest, L2_SIZE
+       ccmp    vector_length, tmp1, 0, cs
+       b.eq    L(L2)
+
+L(unroll8): // unrolling and software pipeline
+       lsl     tmp1, vector_length, 3  // vector_length * 8
+       .p2align 3
+       cmp      rest, tmp1
+       b.cc    L(last)
+       ld1b_unroll8
+       add     src_ptr, src_ptr, tmp1
+       sub     rest, rest, tmp1
+       cmp     rest, tmp1
+       b.cc    2f
+       .p2align 3
+1:     stld1b_unroll8
+       add     dest_ptr, dest_ptr, tmp1
+       add     src_ptr, src_ptr, tmp1
+       sub     rest, rest, tmp1
+       cmp     rest, tmp1
+       b.ge    1b
+2:     st1b_unroll8
+       add     dest_ptr, dest_ptr, tmp1
+
+       .p2align 3
+L(last):
+       whilelo p0.b, xzr, rest
+       whilelo p1.b, vector_length, rest
+       b.last  1f
+       ld1b    z0.b, p0/z, [src_ptr, #0, mul vl]
+       ld1b    z1.b, p1/z, [src_ptr, #1, mul vl]
+       st1b    z0.b, p0, [dest_ptr, #0, mul vl]
+       st1b    z1.b, p1, [dest_ptr, #1, mul vl]
+       ret
+1:     lsl     tmp1, vector_length, 1  // vector_length * 2
+       whilelo p2.b, tmp1, rest
+       incb    tmp1
+       whilelo p3.b, tmp1, rest
+       b.last  1f
+       ld1b    z0.b, p0/z, [src_ptr, #0, mul vl]
+       ld1b    z1.b, p1/z, [src_ptr, #1, mul vl]
+       ld1b    z2.b, p2/z, [src_ptr, #2, mul vl]
+       ld1b    z3.b, p3/z, [src_ptr, #3, mul vl]
+       st1b    z0.b, p0, [dest_ptr, #0, mul vl]
+       st1b    z1.b, p1, [dest_ptr, #1, mul vl]
+       st1b    z2.b, p2, [dest_ptr, #2, mul vl]
+       st1b    z3.b, p3, [dest_ptr, #3, mul vl]
+       ret
+1:     lsl     tmp1, vector_length, 2  // vector_length * 4
+       whilelo p4.b, tmp1, rest
+       incb    tmp1
+       whilelo p5.b, tmp1, rest
+       incb    tmp1
+       whilelo p6.b, tmp1, rest
+       incb    tmp1
+       whilelo p7.b, tmp1, rest
+       ld1b    z0.b, p0/z, [src_ptr, #0, mul vl]
+       ld1b    z1.b, p1/z, [src_ptr, #1, mul vl]
+       ld1b    z2.b, p2/z, [src_ptr, #2, mul vl]
+       ld1b    z3.b, p3/z, [src_ptr, #3, mul vl]
+       ld1b    z4.b, p4/z, [src_ptr, #4, mul vl]
+       ld1b    z5.b, p5/z, [src_ptr, #5, mul vl]
+       ld1b    z6.b, p6/z, [src_ptr, #6, mul vl]
+       ld1b    z7.b, p7/z, [src_ptr, #7, mul vl]
+       st1b    z0.b, p0, [dest_ptr, #0, mul vl]
+       st1b    z1.b, p1, [dest_ptr, #1, mul vl]
+       st1b    z2.b, p2, [dest_ptr, #2, mul vl]
+       st1b    z3.b, p3, [dest_ptr, #3, mul vl]
+       st1b    z4.b, p4, [dest_ptr, #4, mul vl]
+       st1b    z5.b, p5, [dest_ptr, #5, mul vl]
+       st1b    z6.b, p6, [dest_ptr, #6, mul vl]
+       st1b    z7.b, p7, [dest_ptr, #7, mul vl]
+       ret
+
+L(L2):
+       // align dest address at CACHE_LINE_SIZE byte boundary
+       mov     tmp1, CACHE_LINE_SIZE
+       ands    tmp2, dest_ptr, CACHE_LINE_SIZE - 1
+       // if cl_remainder == 0
+       b.eq    L(L2_dc_zva)
+       sub     cl_remainder, tmp1, tmp2
+       // process remainder until the first CACHE_LINE_SIZE boundary
+       whilelo p1.b, xzr, cl_remainder // keep p0.b all true
+       whilelo p2.b, vector_length, cl_remainder
+       b.last  1f
+       ld1b    z1.b, p1/z, [src_ptr, #0, mul vl]
+       ld1b    z2.b, p2/z, [src_ptr, #1, mul vl]
+       st1b    z1.b, p1, [dest_ptr, #0, mul vl]
+       st1b    z2.b, p2, [dest_ptr, #1, mul vl]
+       b       2f
+1:     lsl     tmp1, vector_length, 1  // vector_length * 2
+       whilelo p3.b, tmp1, cl_remainder
+       incb    tmp1
+       whilelo p4.b, tmp1, cl_remainder
+       ld1b    z1.b, p1/z, [src_ptr, #0, mul vl]
+       ld1b    z2.b, p2/z, [src_ptr, #1, mul vl]
+       ld1b    z3.b, p3/z, [src_ptr, #2, mul vl]
+       ld1b    z4.b, p4/z, [src_ptr, #3, mul vl]
+       st1b    z1.b, p1, [dest_ptr, #0, mul vl]
+       st1b    z2.b, p2, [dest_ptr, #1, mul vl]
+       st1b    z3.b, p3, [dest_ptr, #2, mul vl]
+       st1b    z4.b, p4, [dest_ptr, #3, mul vl]
+2:     add     dest_ptr, dest_ptr, cl_remainder
+       add     src_ptr, src_ptr, cl_remainder
+       sub     rest, rest, cl_remainder
+
+L(L2_dc_zva):
+       // zero fill
+       and     tmp1, dest, 0xffffffffffffff
+       and     tmp2, src, 0xffffffffffffff
+       subs    tmp1, tmp1, tmp2        // diff
+       b.ge    1f
+       neg     tmp1, tmp1
+1:     mov     tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
+       cmp     tmp1, tmp3
+       b.lo    L(unroll8)
+       mov     tmp1, dest_ptr
+       dc_zva  (ZF_DIST / CACHE_LINE_SIZE) - 1
+       // unroll
+       ld1b_unroll8    // this line has to be after "b.lo L(unroll8)"
+       add      src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+       sub      rest, rest, CACHE_LINE_SIZE * 2
+       mov      tmp1, ZF_DIST
+       .p2align 3
+1:     stld1b_unroll4a
+       add     tmp2, dest_ptr, tmp1    // dest_ptr + ZF_DIST
+       dc      zva, tmp2
+       stld1b_unroll4b
+       add     tmp2, tmp2, CACHE_LINE_SIZE
+       dc      zva, tmp2
+       add     dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+       add     src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+       sub     rest, rest, CACHE_LINE_SIZE * 2
+       cmp     rest, tmp3      // ZF_DIST + CACHE_LINE_SIZE * 2
+       b.ge    1b
+       st1b_unroll8
+       add     dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+       b       L(unroll8)
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+
+ENTRY (MEMMOVE)
+
+       PTR_ARG (0)
+       PTR_ARG (1)
+       SIZE_ARG (2)
+
+       // remove tag address
+       // dest has to be immutable because it is the return value
+       // src has to be immutable because it is used in L(bwd_last)
+       and     tmp2, dest, 0xffffffffffffff    // save dest_notag into tmp2
+       and     tmp3, src, 0xffffffffffffff     // save src_notag intp tmp3
+       cmp     n, 0
+       ccmp    tmp2, tmp3, 4, ne
+       b.ne    1f
+       ret
+1:     cntb    vector_length
+       // shortcut for less than vector_length * 8
+       // gives a free ptrue to p0.b for n >= vector_length
+       // tmp2 and tmp3 should not be used in this macro to keep
+       // notag addresses
+       shortcut_for_small_size L(dispatch)
+       // end of shortcut
+
+L(dispatch):
+       // tmp2 = dest_notag, tmp3 = src_notag
+       // diff = dest_notag - src_notag
+       sub     tmp1, tmp2, tmp3
+       // if diff <= 0 || diff >= n then memcpy
+       cmp     tmp1, 0
+       ccmp    tmp1, n, 2, gt
+       b.cs    L(vl_agnostic)
+
+L(bwd_start):
+       mov     rest, n
+       add     dest_ptr, dest, n       // dest_end
+       add     src_ptr, src, n         // src_end
+
+L(bwd_unroll8): // unrolling and software pipeline
+       lsl     tmp1, vector_length, 3  // vector_length * 8
+       .p2align 3
+       cmp     rest, tmp1
+       b.cc    L(bwd_last)
+       sub     src_ptr, src_ptr, tmp1
+       ld1b_unroll8
+       sub     rest, rest, tmp1
+       cmp     rest, tmp1
+       b.cc    2f
+       .p2align 3
+1:     sub     src_ptr, src_ptr, tmp1
+       sub     dest_ptr, dest_ptr, tmp1
+       stld1b_unroll8
+       sub     rest, rest, tmp1
+       cmp     rest, tmp1
+       b.ge    1b
+2:     sub     dest_ptr, dest_ptr, tmp1
+       st1b_unroll8
+
+L(bwd_last):
+       mov     dest_ptr, dest
+       mov     src_ptr, src
+       b       L(last)
+
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+# endif /* IS_IN (libc) */
+#endif /* HAVE_AARCH64_SVE_ASM */
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c

index 12d7781..d0adefc 100644 (file)
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -33,6 +33,9 @@ extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
+# endif
  
  libc_ifunc (__libc_memmove,
              (IS_THUNDERX (midr)
@@ -40,12 +43,17 @@ libc_ifunc (__libc_memmove,
              : (IS_FALKOR (midr) || IS_PHECDA (midr)
                 ? __memmove_falkor
                 : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
-                 ? __memmove_thunderx2
-                 : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
-                    || IS_NEOVERSE_V1 (midr)
-                    ? __memmove_simd
+                  ? __memmove_thunderx2
+                  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+                     || IS_NEOVERSE_V1 (midr)
+                     ? __memmove_simd
+# if HAVE_AARCH64_SVE_ASM
+                    : (IS_A64FX (midr)
+                       ? __memmove_a64fx
+                       : __memmove_generic))))));
+# else
                      : __memmove_generic)))));
-
+# endif
  # undef memmove
  strong_alias (__libc_memmove, memmove);
  #endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c

index db6aa35..6206a2f 100644 (file)
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -46,6 +46,7 @@ static struct cpu_list cpu_list[] = {
        {"ares",          0x411FD0C0},
        {"emag",          0x503F0001},
        {"kunpeng920",    0x481FD010},
+      {"a64fx",                 0x460F0010},
        {"generic",       0x0}
  };
  
@@ -116,4 +117,7 @@ init_cpu_features (struct cpu_features *cpu_features)
              (PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_ASYNC | MTE_ALLOWED_TAGS),
              0, 0, 0);
  #endif
+
+  /* Check if SVE is supported.  */
+  cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
  }
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h

index 3b9bfed..2b322e5 100644 (file)
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -65,6 +65,9 @@
  #define IS_KUNPENG920(midr) (MIDR_IMPLEMENTOR(midr) == 'H'                        \
                          && MIDR_PARTNUM(midr) == 0xd01)
  
+#define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F'                        \
+                       && MIDR_PARTNUM(midr) == 0x001)
+
  struct cpu_features
  {
    uint64_t midr_el1;
@@ -72,6 +75,7 @@ struct cpu_features
    bool bti;
    /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
    uint8_t mte_state;
+  bool sve;
  };
  
  #endif /* _CPU_FEATURES_AARCH64_H  */
author	Naohiro Tamura <naohirot@jp.fujitsu.com>
	Thu, 27 May 2021 07:42:35 +0000 (07:42 +0000)
committer	Szabolcs Nagy <szabolcs.nagy@arm.com>
	Thu, 27 May 2021 08:47:53 +0000 (09:47 +0100)
manual/tunables.texi		patch \| blob \| history
sysdeps/aarch64/multiarch/Makefile		patch \| blob \| history
sysdeps/aarch64/multiarch/ifunc-impl-list.c		patch \| blob \| history
sysdeps/aarch64/multiarch/init-arch.h		patch \| blob \| history
sysdeps/aarch64/multiarch/memcpy.c		patch \| blob \| history
sysdeps/aarch64/multiarch/memcpy_a64fx.S	[new file with mode: 0644]	patch \| blob
sysdeps/aarch64/multiarch/memmove.c		patch \| blob \| history
sysdeps/unix/sysv/linux/aarch64/cpu-features.c		patch \| blob \| history
sysdeps/unix/sysv/linux/aarch64/cpu-features.h		patch \| blob \| history