From 93ac449369be8cb085131b6700fc99b2ff2d9c7c Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet
Date: Tue, 13 Dec 2022 15:47:26 +0000
Subject: [PATCH] [libc] Make string functions buildable with GCC

Differential Revision: https://reviews.llvm.org/D139939
---
 libc/src/__support/compiler_features.h             |  8 ++++
 .../src/string/memory_utils/bcmp_implementations.h |  2 +-
 .../string/memory_utils/memcmp_implementations.h   |  3 +-
 .../string/memory_utils/memcpy_implementations.h   |  2 +-
 .../string/memory_utils/memmove_implementations.h  |  4 +-
 .../string/memory_utils/memset_implementations.h   |  2 +-
 libc/src/string/memory_utils/op_builtin.h          |  6 +--
 libc/src/string/memory_utils/op_generic.h          | 32 ++++++++++++++--
 libc/src/string/memory_utils/op_x86.h              | 43 +++++++++++++++++++++-
 9 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/libc/src/__support/compiler_features.h b/libc/src/__support/compiler_features.h
index a30d2a2..fed5759 100644
--- a/libc/src/__support/compiler_features.h
+++ b/libc/src/__support/compiler_features.h
@@ -38,4 +38,12 @@
 #define LLVM_LIBC_HAS_FEATURE(FEATURE) 0
 #endif
 
+#if defined(LLVM_LIBC_COMPILER_CLANG)
+#define LLVM_LIBC_LOOP_NOUNROLL _Pragma("nounroll")
+#elif defined(LLVM_LIBC_COMPILER_GCC)
+#define LLVM_LIBC_LOOP_NOUNROLL _Pragma("GCC unroll 0")
+#else
+#define LLVM_LIBC_LOOP_NOUNROLL
+#endif
+
 #endif // LLVM_LIBC_SUPPORT_COMPILER_FEATURES_H
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h
index e7ded19..18a4ab8 100644
--- a/libc/src/string/memory_utils/bcmp_implementations.h
+++ b/libc/src/string/memory_utils/bcmp_implementations.h
@@ -22,7 +22,7 @@ namespace __llvm_libc {
 
 [[maybe_unused]] static inline BcmpReturnType
 inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
   for (size_t offset = 0; offset < count; ++offset)
     if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset))
       return value;
diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h
index 33fbd7c..1dac6e0 100644
--- a/libc/src/string/memory_utils/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/memcmp_implementations.h
@@ -22,7 +22,7 @@ namespace __llvm_libc {
 
 [[maybe_unused]] static inline MemcmpReturnType
 inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
   for (size_t offset = 0; offset < count; ++offset)
     if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
       return value;
@@ -83,6 +83,7 @@ inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   }
   return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
 }
+
 #endif // defined(LLVM_LIBC_ARCH_X86)
 
 #if defined(LLVM_LIBC_ARCH_AARCH64)
diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
index 8d8ba6f..4372733 100644
--- a/libc/src/string/memory_utils/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -24,7 +24,7 @@ namespace __llvm_libc {
 [[maybe_unused]] static inline void
 inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src,
                             size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
   for (size_t offset = 0; offset < count; ++offset)
     builtin::Memcpy<1>::block(dst + offset, src + offset);
 }
diff --git a/libc/src/string/memory_utils/memmove_implementations.h b/libc/src/string/memory_utils/memmove_implementations.h
index dfea5fa..7e26b36 100644
--- a/libc/src/string/memory_utils/memmove_implementations.h
+++ b/libc/src/string/memory_utils/memmove_implementations.h
@@ -23,11 +23,11 @@ inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) {
   if ((count == 0) || (dst == src))
     return;
   if (dst < src) {
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
     for (size_t offset = 0; offset < count; ++offset)
       builtin::Memcpy<1>::block(dst + offset, src + offset);
   } else {
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
     for (ptrdiff_t offset = count - 1; offset >= 0; --offset)
       builtin::Memcpy<1>::block(dst + offset, src + offset);
   }
diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h
index 58779f7..dbcc356 100644
--- a/libc/src/string/memory_utils/memset_implementations.h
+++ b/libc/src/string/memory_utils/memset_implementations.h
@@ -22,7 +22,7 @@ namespace __llvm_libc {
 
 [[maybe_unused]] inline static void
 inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
   for (size_t offset = 0; offset < count; ++offset)
     generic::Memset<1, 1>::block(dst + offset, value);
 }
diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
index 68ae862..ce33de3 100644
--- a/libc/src/string/memory_utils/op_builtin.h
+++ b/libc/src/string/memory_utils/op_builtin.h
@@ -27,9 +27,9 @@ template <size_t Size> struct Memcpy {
 #ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
     return __builtin_memcpy_inline(dst, src, SIZE);
 #else
-    deferred_static_assert("Missing __builtin_memcpy_inline");
-    (void)dst;
-    (void)src;
+    // The codegen may be suboptimal.
+    for (size_t i = 0; i < Size; ++i)
+      dst[i] = src[i];
 #endif
   }
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index e21ea6c..1603dbf 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -26,6 +26,7 @@
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/common.h"
+#include "src/__support/compiler_features.h"
 #include "src/__support/endian.h"
 #include "src/string/memory_utils/op_builtin.h"
 #include "src/string/memory_utils/utils.h"
@@ -71,9 +72,34 @@ template <typename T> struct ScalarType {
   }
 };
 
+// GCC can only take literals as __vector_size__ argument so we have to use
+// template specialization.
+template <size_t Size> struct VectorValueType {};
+template <> struct VectorValueType<1> {
+  using type = uint8_t __attribute__((__vector_size__(1)));
+};
+template <> struct VectorValueType<2> {
+  using type = uint8_t __attribute__((__vector_size__(2)));
+};
+template <> struct VectorValueType<4> {
+  using type = uint8_t __attribute__((__vector_size__(4)));
+};
+template <> struct VectorValueType<8> {
+  using type = uint8_t __attribute__((__vector_size__(8)));
+};
+template <> struct VectorValueType<16> {
+  using type = uint8_t __attribute__((__vector_size__(16)));
+};
+template <> struct VectorValueType<32> {
+  using type = uint8_t __attribute__((__vector_size__(32)));
+};
+template <> struct VectorValueType<64> {
+  using type = uint8_t __attribute__((__vector_size__(64)));
+};
+
 // Implements load, store and splat for vector types.
 template <size_t Size> struct VectorType {
-  using Type = uint8_t __attribute__((__vector_size__(Size)));
+  using Type = typename VectorValueType<Size>::type;
   static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
   static inline void store(Ptr dst, Type value) {
     ::__llvm_libc::store(dst, value);
@@ -434,7 +460,7 @@ template <typename T> struct Memmove {
     const size_t tail_offset = count - Size;
     const auto tail_value = T::load(src + tail_offset);
     size_t offset = 0;
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
     do {
       block(dst + offset, src + offset);
       offset += Size;
@@ -460,7 +486,7 @@ template <typename T> struct Memmove {
     static_assert(Size > 1, "a loop of size 1 does not need tail");
     const auto head_value = T::load(src);
     ptrdiff_t offset = count - Size;
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
     do {
       block(dst + offset, src + offset);
      offset -= Size;
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index f68af00..b2355d9 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -99,17 +99,24 @@ template struct BcmpImpl {
 
 namespace sse2 {
 static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
+#if defined(__SSE2__)
   using T = char __attribute__((__vector_size__(16)));
   // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
   const int mask =
       _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2)));
   return static_cast<uint32_t>(mask);
+#else
+  (void)p1;
+  (void)p2;
+  return BcmpReturnType::ZERO();
+#endif // defined(__SSE2__)
 }
 template using Bcmp = BcmpImpl;
 } // namespace sse2
 
 namespace avx2 {
 static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
+#if defined(__AVX2__)
   using T = char __attribute__((__vector_size__(32)));
   // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
   const int mask =
@@ -117,17 +124,29 @@ static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
   // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
   // mask.
   return static_cast<uint32_t>(mask);
+#else
+  (void)p1;
+  (void)p2;
+  return BcmpReturnType::ZERO();
+#endif // defined(__AVX2__)
 }
 template using Bcmp = BcmpImpl;
 } // namespace avx2
 
 namespace avx512bw {
 static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
+#if defined(__AVX512BW__)
   using T = char __attribute__((__vector_size__(64)));
   // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
-  const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
+  const uint64_t mask = _mm512_cmpneq_epi8_mask(
+      cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2)));
   const bool mask_is_set = mask != 0;
   return static_cast<uint32_t>(mask_is_set);
+#else
+  (void)p1;
+  (void)p2;
+  return BcmpReturnType::ZERO();
+#endif // defined(__AVX512BW__)
 }
 template using Bcmp = BcmpImpl;
 } // namespace avx512bw
@@ -192,35 +211,55 @@ struct MemcmpImpl {
 
 namespace sse2 {
 static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
+#if defined(__SSE2__)
   using T = char __attribute__((__vector_size__(16)));
   // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
   if (int mask =
           _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2))))
     return char_diff_no_zero(p1, p2, mask);
   return MemcmpReturnType::ZERO();
+#else
+  (void)p1;
+  (void)p2;
+  return MemcmpReturnType::ZERO();
+#endif // defined(__SSE2__)
 }
 template using Memcmp = MemcmpImpl;
 } // namespace sse2
 
 namespace avx2 {
 static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
+#if defined(__AVX2__)
   using T = char __attribute__((__vector_size__(32)));
   // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
   if (int mask = _mm256_movemask_epi8(
           cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2))))
     return char_diff_no_zero(p1, p2, mask);
   return MemcmpReturnType::ZERO();
+#else
+  (void)p1;
+  (void)p2;
+  return MemcmpReturnType::ZERO();
+#endif // defined(__AVX2__)
 }
 template using Memcmp = MemcmpImpl;
 } // namespace avx2
 
 namespace avx512bw {
 static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
+#if defined(__AVX512BW__)
   using T = char __attribute__((__vector_size__(64)));
   // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
-  if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)))
+  if (uint64_t mask =
+          _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)),
+                                  cpp::bit_cast<__m512i>(load<T>(p2))))
     return char_diff_no_zero(p1, p2, mask);
   return MemcmpReturnType::ZERO();
+#else
+  (void)p1;
+  (void)p2;
+  return MemcmpReturnType::ZERO();
+#endif // defined(__AVX512BW__)
 }
 template using Memcmp = MemcmpImpl;
 } // namespace avx512bw
-- 
2.7.4
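
Note: a minimal standalone sketch of how the LLVM_LIBC_LOOP_NOUNROLL macro introduced by this patch is meant to be used. The byte_copy function below is a hypothetical illustration, not part of the patch; it only assumes the include path used by the libc sources.

  // Hypothetical example (not part of the patch). With Clang the macro expands
  // to _Pragma("nounroll"), with GCC to _Pragma("GCC unroll 0"), and with any
  // other compiler to nothing, so the code still compiles everywhere; the loop
  // may simply end up unrolled on unknown compilers.
  #include "src/__support/compiler_features.h"
  #include <stddef.h>

  static void byte_copy(char *dst, const char *src, size_t count) {
    LLVM_LIBC_LOOP_NOUNROLL
    for (size_t i = 0; i < count; ++i)
      dst[i] = src[i];
  }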