From e9537dddc7c7c7b60b55ed845542c8d586164488 Mon Sep 17 00:00:00 2001 From: Steve Ellcey Date: Thu, 22 Feb 2018 08:38:47 -0800 Subject: [PATCH] IFUNC for Cavium ThunderX2 * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memcpy_thunderx2. * sysdeps/aarch64/multiarch/ifunc-impl-list.c (MAX_IFUNC): Increment to 4. (__libc_ifunc_impl_list): Add __memcpy_thunderx2. * sysdeps/aarch64/multiarch/memcpy.c (libc_ifunc): Add IS_THUNDERX2 and IS_THUNDERX2PA checks. * sysdeps/aarch64/multiarch/memcpy_thunderx.S (USE_THUNDERX2): Use macro to set name appropriately. (memcpy): Use USE_THUNDERX2 macro to modify prefetches. * sysdeps/aarch64/multiarch/memcpy_thunderx2.S: New file. * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_THUNDERX2PA): New macro. (IS_THUNDERX2): New macro. --- ChangeLog | 17 ++++++++++++++++ sysdeps/aarch64/multiarch/Makefile | 4 ++-- sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 ++- sysdeps/aarch64/multiarch/memcpy.c | 5 ++++- sysdeps/aarch64/multiarch/memcpy_thunderx.S | 22 +++++++++++++++------ sysdeps/aarch64/multiarch/memcpy_thunderx2.S | 27 ++++++++++++++++++++++++++ sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 5 +++++ 7 files changed, 73 insertions(+), 10 deletions(-) create mode 100644 sysdeps/aarch64/multiarch/memcpy_thunderx2.S diff --git a/ChangeLog b/ChangeLog index d33b470..b47ef63 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2018-02-22 Steve Ellcey + + * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): + Add memcpy_thunderx2. + * sysdeps/aarch64/multiarch/ifunc-impl-list.c (MAX_IFUNC): + Increment to 4. + (__libc_ifunc_impl_list): Add __memcpy_thunderx2. + * sysdeps/aarch64/multiarch/memcpy.c (libc_ifunc): Add IS_THUNDERX2 + and IS_THUNDERX2PA checks. + * sysdeps/aarch64/multiarch/memcpy_thunderx.S (USE_THUNDERX2): + Use macro to set name appropriately. + (memcpy): Use USE_THUNDERX2 macro to modify prefetches. + * sysdeps/aarch64/multiarch/memcpy_thunderx2.S: New file. + * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_THUNDERX2PA): + New macro. + (IS_THUNDERX2): New macro. + 2018-02-22 Stefan Liebler * sysdeps/s390/fpu/libm-test-ulps: Regenerated. diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index aa179c4..57ffdf7 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -1,4 +1,4 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \ - memmove_falkor memset_generic memset_falkor +sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ + memcpy_falkor memmove_falkor memset_generic memset_falkor endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index f84956c..e55be80 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -25,7 +25,7 @@ #include /* Maximum number of IFUNC implementations. */ -#define MAX_IFUNC 3 +#define MAX_IFUNC 4 size_t __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, @@ -40,6 +40,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */ IFUNC_IMPL (i, name, memcpy, IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic)) IFUNC_IMPL (i, name, memmove, diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c index 3efea2c..b94c655 100644 --- a/sysdeps/aarch64/multiarch/memcpy.c +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -30,6 +30,7 @@ extern __typeof (__redirect_memcpy) __libc_memcpy; extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden; libc_ifunc (__libc_memcpy, @@ -37,7 +38,9 @@ libc_ifunc (__libc_memcpy, ? __memcpy_thunderx : (IS_FALKOR (midr) ? __memcpy_falkor - : __memcpy_generic))); + : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr) + ? __memcpy_thunderx2 + : __memcpy_generic)))); # undef memcpy strong_alias (__libc_memcpy, memcpy); diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S index 4f6921d..de494d9 100644 --- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S @@ -74,11 +74,13 @@ #if IS_IN (libc) -# undef MEMCPY -# define MEMCPY __memcpy_thunderx -# undef MEMMOVE -# define MEMMOVE __memmove_thunderx -# define USE_THUNDERX +# ifndef USE_THUNDERX2 +# undef MEMCPY +# define MEMCPY __memcpy_thunderx +# undef MEMMOVE +# define MEMMOVE __memmove_thunderx +# define USE_THUNDERX +# endif ENTRY_ALIGN (MEMMOVE, 6) @@ -180,7 +182,7 @@ L(copy96): .p2align 4 L(copy_long): -# ifdef USE_THUNDERX +# if defined(USE_THUNDERX) || defined (USE_THUNDERX2) /* On thunderx, large memcpy's are helped by software prefetching. This loop is identical to the one below it but with prefetching @@ -194,7 +196,11 @@ L(copy_long): bic dst, dstin, 15 ldp D_l, D_h, [src] sub src, src, tmp1 +# if defined(USE_THUNDERX) prfm pldl1strm, [src, 384] +# elif defined(USE_THUNDERX2) + prfm pldl1strm, [src, 256] +# endif add count, count, tmp1 /* Count is now 16 too large. */ ldp A_l, A_h, [src, 16] stp D_l, D_h, [dstin] @@ -204,9 +210,13 @@ L(copy_long): subs count, count, 128 + 16 /* Test and readjust count. */ L(prefetch_loop64): +# if defined(USE_THUNDERX) tbz src, #6, 1f prfm pldl1strm, [src, 512] 1: +# elif defined(USE_THUNDERX2) + prfm pldl1strm, [src, 256] +# endif stp A_l, A_h, [dst, 16] ldp A_l, A_h, [src, 16] stp B_l, B_h, [dst, 32] diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S new file mode 100644 index 0000000..8501abf --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S @@ -0,0 +1,27 @@ +/* A Thunderx2 Optimized memcpy implementation for AARCH64. + Copyright (C) 2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* The actual code in this memcpy and memmove is in memcpy_thunderx.S. + The only real differences are with the prefetching instructions. */ + +#define MEMCPY __memcpy_thunderx2 +#define MEMMOVE __memmove_thunderx2 +#define USE_THUNDERX2 + +#include "memcpy_thunderx.S" diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h index c646f9d..cde655b 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h @@ -41,6 +41,11 @@ #define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \ && MIDR_PARTNUM(midr) == 0x0a1) +#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B' \ + && MIDR_PARTNUM(midr) == 0x516) +#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \ + && MIDR_PARTNUM(midr) == 0xaf) + #define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q' \ && MIDR_PARTNUM(midr) == 0xc00) -- 2.7.4