+2018-02-22 Steve Ellcey <sellcey@cavium.com>
+
+ * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
+ Add memcpy_thunderx2.
+ * sysdeps/aarch64/multiarch/ifunc-impl-list.c (MAX_IFUNC):
+ Increment to 4.
+ (__libc_ifunc_impl_list): Add __memcpy_thunderx2.
+ * sysdeps/aarch64/multiarch/memcpy.c (libc_ifunc): Add IS_THUNDERX2
+ and IS_THUNDERX2PA checks.
+ * sysdeps/aarch64/multiarch/memcpy_thunderx.S (USE_THUNDERX2):
+ Define MEMCPY, MEMMOVE and USE_THUNDERX only if this macro is unset.
+ (memcpy): Use USE_THUNDERX2 macro to modify prefetches.
+ * sysdeps/aarch64/multiarch/memcpy_thunderx2.S: New file.
+ * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_THUNDERX2PA):
+ New macro.
+ (IS_THUNDERX2): New macro.
+
2018-02-22 Stefan Liebler <stli@linux.vnet.ibm.com>
* sysdeps/s390/fpu/libm-test-ulps: Regenerated.
ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
- memmove_falkor memset_generic memset_falkor
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+ memcpy_falkor memmove_falkor memset_generic memset_falkor
endif
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 3
+#define MAX_IFUNC 4
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
IFUNC_IMPL (i, name, memcpy,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
libc_ifunc (__libc_memcpy,
? __memcpy_thunderx
: (IS_FALKOR (midr)
? __memcpy_falkor
- : __memcpy_generic)));
+ : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+ ? __memcpy_thunderx2
+ : __memcpy_generic))));
# undef memcpy
strong_alias (__libc_memcpy, memcpy);
#if IS_IN (libc)
-# undef MEMCPY
-# define MEMCPY __memcpy_thunderx
-# undef MEMMOVE
-# define MEMMOVE __memmove_thunderx
-# define USE_THUNDERX
+# ifndef USE_THUNDERX2
+# undef MEMCPY
+# define MEMCPY __memcpy_thunderx
+# undef MEMMOVE
+# define MEMMOVE __memmove_thunderx
+# define USE_THUNDERX
+# endif
ENTRY_ALIGN (MEMMOVE, 6)
.p2align 4
L(copy_long):
-# ifdef USE_THUNDERX
+# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
/* On thunderx, large memcpy's are helped by software prefetching.
This loop is identical to the one below it but with prefetching
bic dst, dstin, 15
ldp D_l, D_h, [src]
sub src, src, tmp1
+# if defined(USE_THUNDERX)
prfm pldl1strm, [src, 384]
+# elif defined(USE_THUNDERX2)
+ prfm pldl1strm, [src, 256]
+# endif
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
subs count, count, 128 + 16 /* Test and readjust count. */
L(prefetch_loop64):
+# if defined(USE_THUNDERX)
tbz src, #6, 1f
prfm pldl1strm, [src, 512]
1:
+# elif defined(USE_THUNDERX2)
+ prfm pldl1strm, [src, 256]
+# endif
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
--- /dev/null
+/* A Thunderx2 Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual code for this memcpy and memmove is in memcpy_thunderx.S,
+ which is included below with USE_THUNDERX2 defined. The only real
+ differences are in the prefetch instructions. */
+
+#define MEMCPY __memcpy_thunderx2
+#define MEMMOVE __memmove_thunderx2
+#define USE_THUNDERX2
+
+#include "memcpy_thunderx.S"
#define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
&& MIDR_PARTNUM(midr) == 0x0a1)
+#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B' \
+ && MIDR_PARTNUM(midr) == 0x516)
+#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
+ && MIDR_PARTNUM(midr) == 0xaf)
+
#define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q' \
&& MIDR_PARTNUM(midr) == 0xc00)