From 1d47c0512a265d4bb3ab9e56259fd1e4f4d42c75 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 17 Mar 2022 18:49:00 +0100 Subject: [PATCH] libatomic: Improve 16-byte atomics on Intel AVX [PR104688] MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit As mentioned in the PR, the latest Intel SDM has added: "Processors that enumerate support for Intel® AVX (by setting the feature flag CPUID.01H:ECX.AVX[bit 28]) guarantee that the 16-byte memory operations performed by the following instructions will always be carried out atomically: • MOVAPD, MOVAPS, and MOVDQA. • VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128. • VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with EVEX.128 and k0 (masking disabled). (Note that these instructions require the linear addresses of their memory operands to be 16-byte aligned.)" The following patch deals with it just on the libatomic library side so far; currently (since ~ 2017) we emit all the __atomic_* 16-byte builtins as library calls, and this is something that we can hopefully backport. The patch simply introduces yet another ifunc variant that takes priority over the pure CMPXCHG16B one, one that checks AVX and CMPXCHG16B bits and on non-Intel clears the AVX bit during detection for now (if AMD comes with the same guarantee, we could revert the config/x86/init.c hunk), which implements 16-byte atomic load as vmovdqa and 16-byte atomic store as vmovdqa followed by mfence. 2022-03-17 Jakub Jelinek PR target/104688 * Makefile.am (IFUNC_OPTIONS): Change on x86_64 to -mcx16 -mcx16. (libatomic_la_LIBADD): Add $(addsuffix _16_2_.lo,$(SIZEOBJS)) for x86_64. * Makefile.in: Regenerated. * config/x86/host-config.h (IFUNC_COND_1): For x86_64 define to both AVX and CMPXCHG16B bits. (IFUNC_COND_2): Define. (IFUNC_NCOND): For x86_64 define to 2 * (N == 16). 
(MAYBE_HAVE_ATOMIC_CAS_16, MAYBE_HAVE_ATOMIC_EXCHANGE_16, MAYBE_HAVE_ATOMIC_LDST_16): Define to IFUNC_COND_2 rather than IFUNC_COND_1. (HAVE_ATOMIC_CAS_16): Redefine to 1 whenever IFUNC_ALT != 0. (HAVE_ATOMIC_LDST_16): Redefine to 1 whenever IFUNC_ALT == 1. (atomic_compare_exchange_n): Define whenever IFUNC_ALT != 0 on x86_64 for N == 16. (__atomic_load_n, __atomic_store_n): Redefine whenever IFUNC_ALT == 1 on x86_64 for N == 16. (atomic_load_n, atomic_store_n): New functions. * config/x86/init.c (__libat_feat1_init): On x86_64 clear bit_AVX if CPU vendor is not Intel. --- libatomic/Makefile.am | 5 +++-- libatomic/Makefile.in | 6 ++++-- libatomic/config/x86/host-config.h | 43 +++++++++++++++++++++++++++++++------- libatomic/config/x86/init.c | 12 +++++++++++ 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am index 389f3dd..d88515e 100644 --- a/libatomic/Makefile.am +++ b/libatomic/Makefile.am @@ -138,8 +138,9 @@ IFUNC_OPTIONS = -march=i586 libatomic_la_LIBADD += $(addsuffix _8_1_.lo,$(SIZEOBJS)) endif if ARCH_X86_64 -IFUNC_OPTIONS = -mcx16 -libatomic_la_LIBADD += $(addsuffix _16_1_.lo,$(SIZEOBJS)) +IFUNC_OPTIONS = -mcx16 -mcx16 +libatomic_la_LIBADD += $(addsuffix _16_1_.lo,$(SIZEOBJS)) \ + $(addsuffix _16_2_.lo,$(SIZEOBJS)) endif endif diff --git a/libatomic/Makefile.in b/libatomic/Makefile.in index 0a51bd5..80d2565 100644 --- a/libatomic/Makefile.in +++ b/libatomic/Makefile.in @@ -96,7 +96,9 @@ target_triplet = @target@ @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ $(addsuffix \ @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ _8_2_.lo,$(SIZEOBJS)) @ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(addsuffix _8_1_.lo,$(SIZEOBJS)) -@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix _16_1_.lo,$(SIZEOBJS)) +@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix _16_1_.lo,$(SIZEOBJS)) \ +@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@ $(addsuffix _16_2_.lo,$(SIZEOBJS)) + subdir = . 
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \ @@ -435,7 +437,7 @@ libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix \ @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv8-a+lse @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv7-a+fp -DHAVE_KERNEL64 @ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=i586 -@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -mcx16 +@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -mcx16 -mcx16 libatomic_convenience_la_SOURCES = $(libatomic_la_SOURCES) libatomic_convenience_la_LIBADD = $(libatomic_la_LIBADD) MULTISRCTOP = diff --git a/libatomic/config/x86/host-config.h b/libatomic/config/x86/host-config.h index f20ce09..007b7e1 100644 --- a/libatomic/config/x86/host-config.h +++ b/libatomic/config/x86/host-config.h @@ -55,31 +55,37 @@ load_feat1 (void) } #ifdef __x86_64__ -# define IFUNC_COND_1 (load_feat1 () & bit_CMPXCHG16B) +# define IFUNC_COND_1 ((load_feat1 () & (bit_AVX | bit_CMPXCHG16B)) \ + == (bit_AVX | bit_CMPXCHG16B)) +# define IFUNC_COND_2 (load_feat1 () & bit_CMPXCHG16B) #else # define IFUNC_COND_1 (load_feat1 () & bit_CMPXCHG8B) #endif #ifdef __x86_64__ -# define IFUNC_NCOND(N) (N == 16) +# define IFUNC_NCOND(N) (2 * (N == 16)) #else # define IFUNC_NCOND(N) (N == 8) #endif #ifdef __x86_64__ # undef MAYBE_HAVE_ATOMIC_CAS_16 -# define MAYBE_HAVE_ATOMIC_CAS_16 IFUNC_COND_1 +# define MAYBE_HAVE_ATOMIC_CAS_16 IFUNC_COND_2 # undef MAYBE_HAVE_ATOMIC_EXCHANGE_16 -# define MAYBE_HAVE_ATOMIC_EXCHANGE_16 IFUNC_COND_1 +# define MAYBE_HAVE_ATOMIC_EXCHANGE_16 IFUNC_COND_2 # undef MAYBE_HAVE_ATOMIC_LDST_16 -# define MAYBE_HAVE_ATOMIC_LDST_16 IFUNC_COND_1 +# define MAYBE_HAVE_ATOMIC_LDST_16 IFUNC_COND_2 /* Since load and store are implemented with CAS, they are not fast. 
*/ # undef FAST_ATOMIC_LDST_16 # define FAST_ATOMIC_LDST_16 0 -# if IFUNC_ALT == 1 +# if IFUNC_ALT != 0 # undef HAVE_ATOMIC_CAS_16 # define HAVE_ATOMIC_CAS_16 1 # endif +# if IFUNC_ALT == 1 +# undef HAVE_ATOMIC_LDST_16 +# define HAVE_ATOMIC_LDST_16 1 +# endif #else # undef MAYBE_HAVE_ATOMIC_CAS_8 # define MAYBE_HAVE_ATOMIC_CAS_8 IFUNC_COND_1 @@ -93,7 +99,7 @@ load_feat1 (void) # endif #endif -#if defined(__x86_64__) && N == 16 && IFUNC_ALT == 1 +#if defined(__x86_64__) && N == 16 && IFUNC_ALT != 0 static inline bool atomic_compare_exchange_n (UTYPE *mptr, UTYPE *eptr, UTYPE newval, bool weak_p UNUSED, int sm UNUSED, int fm UNUSED) @@ -108,6 +114,29 @@ atomic_compare_exchange_n (UTYPE *mptr, UTYPE *eptr, UTYPE newval, # define atomic_compare_exchange_n atomic_compare_exchange_n #endif /* Have CAS 16 */ +#if defined(__x86_64__) && N == 16 && IFUNC_ALT == 1 +#define __atomic_load_n(ptr, model) \ + (sizeof (*ptr) == 16 ? atomic_load_n (ptr, model) \ + : (__atomic_load_n) (ptr, model)) +#define __atomic_store_n(ptr, val, model) \ + (sizeof (*ptr) == 16 ? 
atomic_store_n (ptr, val, model) \ + : (__atomic_store_n) (ptr, val, model)) + +static inline UTYPE +atomic_load_n (UTYPE *ptr, int model UNUSED) +{ + UTYPE ret; + __asm__ ("vmovdqa\t{%1, %0|%0, %1}" : "=x" (ret) : "m" (*ptr)); + return ret; +} + +static inline void +atomic_store_n (UTYPE *ptr, UTYPE val, int model UNUSED) +{ + __asm__ ("vmovdqa\t{%1, %0|%0, %1}\n\tmfence" : "=m" (*ptr) : "x" (val)); +} +#endif + #endif /* HAVE_IFUNC */ #include_next diff --git a/libatomic/config/x86/init.c b/libatomic/config/x86/init.c index 7bdec72..6f6499c 100644 --- a/libatomic/config/x86/init.c +++ b/libatomic/config/x86/init.c @@ -34,6 +34,18 @@ __libat_feat1_init (void) unsigned int eax, ebx, ecx, edx; FEAT1_REGISTER = 0; __get_cpuid (1, &eax, &ebx, &ecx, &edx); +#ifdef __x86_64__ + if ((FEAT1_REGISTER & (bit_AVX | bit_CMPXCHG16B)) + == (bit_AVX | bit_CMPXCHG16B)) + { + /* Intel SDM guarantees that 16-byte VMOVDQA on 16-byte aligned address + is atomic, but so far we don't have this guarantee from AMD. */ + unsigned int ecx2 = 0; + __get_cpuid (0, &eax, &ebx, &ecx2, &edx); + if (ecx2 != signature_INTEL_ecx) + FEAT1_REGISTER &= ~bit_AVX; + } +#endif /* See the load in load_feat1. */ __atomic_store_n (&__libat_feat1, FEAT1_REGISTER, __ATOMIC_RELAXED); return FEAT1_REGISTER; -- 2.7.4