From: Guillaume Chatelet Date: Thu, 27 Oct 2022 08:36:04 +0000 (+0000) Subject: Revert D136595 "[libc] Switch to new implementation of mem* functions" X-Git-Tag: upstream/17.0.6~29358 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b6d3ae3d3de0ce8015e422592aa467b99d4b0b6d;p=platform%2Fupstream%2Fllvm.git Revert D136595 "[libc] Switch to new implementation of mem* functions" This patch seems to introduce bugs on aarch64. Reverting while we investigate the root cause. This reverts commit 02841488138160f9064f334a833d4bf3e80385c6. --- diff --git a/libc/src/stdio/printf_core/string_writer.cpp b/libc/src/stdio/printf_core/string_writer.cpp index 472573d..a80df32 100644 --- a/libc/src/stdio/printf_core/string_writer.cpp +++ b/libc/src/stdio/printf_core/string_writer.cpp @@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) { len = available_capacity; if (len > 0) { - inline_memset(cur_buffer, static_cast(new_char), len); + inline_memset(cur_buffer, new_char, len); cur_buffer += len; available_capacity -= len; } diff --git a/libc/src/string/bcmp.cpp b/libc/src/string/bcmp.cpp index 2199130..963a7f5 100644 --- a/libc/src/string/bcmp.cpp +++ b/libc/src/string/bcmp.cpp @@ -14,7 +14,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(int, bcmp, (const void *lhs, const void *rhs, size_t count)) { - return inline_bcmp(lhs, rhs, count); + return inline_bcmp(static_cast(lhs), + static_cast(rhs), count); } } // namespace __llvm_libc diff --git a/libc/src/string/memcmp.cpp b/libc/src/string/memcmp.cpp index 7cf6782..292525e 100644 --- a/libc/src/string/memcmp.cpp +++ b/libc/src/string/memcmp.cpp @@ -15,7 +15,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(int, memcmp, (const void *lhs, const void *rhs, size_t count)) { - return inline_memcmp(lhs, rhs, count); + return inline_memcmp(static_cast(lhs), + static_cast(rhs), count); } } // namespace __llvm_libc diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp index 8504005..ff990f4 100644 --- a/libc/src/string/memcpy.cpp +++ b/libc/src/string/memcpy.cpp @@ -15,7 +15,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(void *, memcpy, (void *__restrict dst, const void *__restrict src, size_t size)) { - inline_memcpy(dst, src, size); + inline_memcpy(reinterpret_cast(dst), + reinterpret_cast(src), size); return dst; } diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp index a42ced3..f242578 100644 --- a/libc/src/string/memmove.cpp +++ b/libc/src/string/memmove.cpp @@ -9,110 +9,42 @@ #include "src/string/memmove.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/__support/integer_operations.h" +#include "src/string/memory_utils/elements.h" #include // size_t, ptrdiff_t -#include - namespace __llvm_libc { -[[maybe_unused]] static inline void -inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) { - if ((count == 0) || (dst == src)) - return; - if (dst < src) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - builtin::Memcpy<1>::block(dst + offset, src + offset); - } else { -#pragma nounroll - for (ptrdiff_t offset = count - 1; offset >= 0; --offset) - builtin::Memcpy<1>::block(dst + offset, src + offset); - } -} - -template -[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src, - size_t count) { +static inline void inline_memmove(char *dst, const char *src, size_t count) { + using namespace __llvm_libc::scalar; if (count == 0) return; if (count == 1) - return generic::Memmove<1, MaxSize>::block(dst, src); + return move<_1>(dst, src); if (count <= 4) - return generic::Memmove<2, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 8) - return generic::Memmove<4, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 16) - return generic::Memmove<8, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 32) - return generic::Memmove<16, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 64) - return generic::Memmove<32, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 128) - return generic::Memmove<64, MaxSize>::head_tail(dst, src, count); - if (dst < src) { - generic::Memmove<32, MaxSize>::template align_forward(dst, src, - count); - return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src, - count); - } else { - generic::Memmove<32, MaxSize>::template align_backward(dst, src, - count); - return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src, - count); - } -} + return move>(dst, src, count); -static inline void inline_memmove(Ptr dst, CPtr src, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) -#if defined(LLVM_LIBC_ARCH_X86) - static constexpr size_t kMaxSize = x86::kAvx512F ? 64 - : x86::kAvx ? 32 - : x86::kSse2 ? 16 - : 8; -#elif defined(LLVM_LIBC_ARCH_AARCH64) - static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8; -#endif - // return inline_memmove_generic(dst, src, count); - if (count == 0) - return; - if (count == 1) - return generic::Memmove<1, kMaxSize>::block(dst, src); - if (count <= 4) - return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count); - if (count <= 8) - return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count); - if (count <= 16) - return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count); - if (count <= 32) - return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count); - if (count <= 64) - return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count); - if (count <= 128) - return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count); - if (dst < src) { - generic::Memmove<32, kMaxSize>::align_forward(dst, src, count); - return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src, - count); - } else { - generic::Memmove<32, kMaxSize>::align_backward(dst, src, count); - return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src, - count); - } -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memmove_embedded_tiny(dst, src, count); -#else -#error "Unsupported platform" -#endif + using AlignedMoveLoop = Align<_16, Arg::Src>::Then>; + if (dst < src) + return move(dst, src, count); + else if (dst > src) + return move_backward(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memmove, (void *dst, const void *src, size_t count)) { - inline_memmove(reinterpret_cast(dst), reinterpret_cast(src), - count); + inline_memmove(reinterpret_cast(dst), + reinterpret_cast(src), count); return dst; } diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h index 2e18ee8..c26e38e 100644 --- a/libc/src/string/memory_utils/bcmp_implementations.h +++ b/libc/src/string/memory_utils/bcmp_implementations.h @@ -11,169 +11,49 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/elements.h" #include // size_t namespace __llvm_libc { -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset)) - return value; - return BcmpReturnType::ZERO(); +// Fixed-size difference between 'lhs' and 'rhs'. +template bool differs(const char *lhs, const char *rhs) { + return !Element::equals(lhs, rhs); } - -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { - if (count < 256) - return generic::Bcmp<16>::loop_and_tail(p1, p2, count); - if (auto value = generic::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - return generic::Bcmp<64>::loop_and_tail(p1, p2, count); +// Runtime-size difference between 'lhs' and 'rhs'. +template +bool differs(const char *lhs, const char *rhs, size_t size) { + return !Element::equals(lhs, rhs, size); } -#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) +static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) { #if defined(LLVM_LIBC_ARCH_X86) -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count < 256) - return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count); - if (auto value = x86::sse2::Bcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count <= 64) - return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx2::Bcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 256)) { - if (auto value = x86::avx2::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count <= 64) - return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 256)) { - if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2, - size_t count) { + using namespace ::__llvm_libc::x86; +#elif defined(LLVM_LIBC_ARCH_AARCH64) + using namespace ::__llvm_libc::aarch64; +#else + using namespace ::__llvm_libc::scalar; +#endif if (count == 0) - return BcmpReturnType::ZERO(); + return 0; if (count == 1) - return generic::Bcmp<1>::block(p1, p2); + return differs<_1>(lhs, rhs); if (count == 2) - return generic::Bcmp<2>::block(p1, p2); - if (count <= 4) - return generic::Bcmp<2>::head_tail(p1, p2, count); + return differs<_2>(lhs, rhs); + if (count == 3) + return differs<_3>(lhs, rhs); if (count <= 8) - return generic::Bcmp<4>::head_tail(p1, p2, count); + return differs>(lhs, rhs, count); if (count <= 16) - return generic::Bcmp<8>::head_tail(p1, p2, count); - if constexpr (x86::kAvx512BW) - return inline_bcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_bcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_bcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_bcmp_generic_gt16(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_aarch64(CPtr p1, CPtr p2, size_t count) { - if (likely(count <= 32)) { - if (unlikely(count >= 16)) { - return generic::Bcmp<16>::head_tail(p1, p2, count); - } - switch (count) { - case 0: - return BcmpReturnType::ZERO(); - case 1: - return generic::Bcmp<1>::block(p1, p2); - case 2: - return generic::Bcmp<2>::block(p1, p2); - case 3: - return generic::Bcmp<2>::head_tail(p1, p2, count); - case 4: - return generic::Bcmp<4>::block(p1, p2); - case 5: - case 6: - case 7: - return generic::Bcmp<4>::head_tail(p1, p2, count); - case 8: - return generic::Bcmp<8>::block(p1, p2); - case 9: - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - return generic::Bcmp<8>::head_tail(p1, p2, count); - } - } - + return differs>(lhs, rhs, count); + if (count <= 32) + return differs>(lhs, rhs, count); if (count <= 64) - return generic::Bcmp<32>::head_tail(p1, p2, count); - - // Aligned loop if > 256, otherwise normal loop - if (count > 256) { - if (auto value = generic::Bcmp<32>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return generic::Bcmp<32>::loop_and_tail(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) - return inline_bcmp_x86(p1, p2, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - return inline_bcmp_aarch64(p1, p2, count); -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_bcmp_embedded_tiny(p1, p2, count); -#else -#error "Unsupported platform" -#endif -} - -static inline int inline_bcmp(const void *p1, const void *p2, size_t count) { - return static_cast(inline_bcmp(reinterpret_cast(p1), - reinterpret_cast(p2), count)); + return differs>(lhs, rhs, count); + if (count <= 128) + return differs>(lhs, rhs, count); + return differs::Then>>(lhs, rhs, count); } } // namespace __llvm_libc diff --git a/libc/src/string/memory_utils/bzero_implementations.h b/libc/src/string/memory_utils/bzero_implementations.h index 550c910..168fdd7 100644 --- a/libc/src/string/memory_utils/bzero_implementations.h +++ b/libc/src/string/memory_utils/bzero_implementations.h @@ -15,14 +15,10 @@ namespace __llvm_libc { -inline static void inline_bzero(Ptr dst, size_t count) { +inline static void inline_bzero(char *dst, size_t count) { inline_memset(dst, 0, count); } -inline static void inline_bzero(void *dst, size_t count) { - inline_bzero(reinterpret_cast(dst), count); -} - } // namespace __llvm_libc #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h index 642c7e9..f207946 100644 --- a/libc/src/string/memory_utils/memcmp_implementations.h +++ b/libc/src/string/memory_utils/memcmp_implementations.h @@ -11,141 +11,95 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" -#include "src/string/memory_utils/utils.h" +#include "src/string/memory_utils/elements.h" #include // size_t namespace __llvm_libc { -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset)) - return value; - return MemcmpReturnType::ZERO(); -} - -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { - if (unlikely(count >= 384)) { - if (auto value = generic::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return generic::Memcmp<16>::loop_and_tail(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) +static inline int inline_memcmp(const char *lhs, const char *rhs, + size_t count) { #if defined(LLVM_LIBC_ARCH_X86) -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { - if (unlikely(count >= 384)) { - if (auto value = x86::sse2::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); - if (count <= 64) - return x86::avx2::Memcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx2::Memcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 384)) { - if (auto value = x86::avx2::Memcmp<32>::block(p1, p2)) - return value; - align_to_next_boundary<32, Arg::P1>(p1, p2, count); - } - return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_X86 + ///////////////////////////////////////////////////////////////////////////// + using namespace __llvm_libc::x86; + if (count == 0) + return 0; + if (count == 1) + return three_way_compare<_1>(lhs, rhs); + if (count == 2) + return three_way_compare<_2>(lhs, rhs); + if (count == 3) + return three_way_compare<_3>(lhs, rhs); + if (count <= 8) + return three_way_compare>(lhs, rhs, count); + if (count <= 16) + return three_way_compare>(lhs, rhs, count); if (count <= 32) - return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); if (count <= 64) - return x86::avx2::Memcmp<32>::head_tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); if (count <= 128) - return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 384)) { - if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) { - if (unlikely(count >= 128)) { // [128, ∞] - if (auto value = generic::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - return generic::Memcmp<32>::loop_and_tail(p1, p2, count); - } + return three_way_compare>(lhs, rhs, count); + return three_way_compare::Then>>(lhs, rhs, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// + using namespace ::__llvm_libc::aarch64; + if (count == 0) // [0, 0] + return 0; + if (count == 1) // [1, 1] + return three_way_compare<_1>(lhs, rhs); + if (count == 2) // [2, 2] + return three_way_compare<_2>(lhs, rhs); + if (count == 3) // [3, 3] + return three_way_compare<_3>(lhs, rhs); + if (count < 8) // [4, 7] + return three_way_compare>(lhs, rhs, count); + if (count < 16) // [8, 15] + return three_way_compare>(lhs, rhs, count); + if (unlikely(count >= 128)) // [128, ∞] + return three_way_compare::Then>>(lhs, rhs, count); + if (!equals<_16>(lhs, rhs)) // [16, 16] + return three_way_compare<_16>(lhs, rhs); if (count < 32) // [17, 31] - return generic::Memcmp<16>::tail(p1, p2, count); - if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32] - return generic::Memcmp<16>::block(p1 + 16, p2 + 16); + return three_way_compare>(lhs, rhs, count); + if (!equals::Then<_16>>(lhs, rhs)) // [32, 32] + return three_way_compare::Then<_16>>(lhs, rhs); if (count < 64) // [33, 63] - return generic::Memcmp<32>::tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); // [64, 127] - return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32); -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) + return three_way_compare::Then>>(lhs, rhs, count); +#else + ///////////////////////////////////////////////////////////////////////////// + // Default + ///////////////////////////////////////////////////////////////////////////// + using namespace ::__llvm_libc::scalar; -static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) if (count == 0) - return MemcmpReturnType::ZERO(); + return 0; if (count == 1) - return generic::Memcmp<1>::block(p1, p2); + return three_way_compare<_1>(lhs, rhs); if (count == 2) - return generic::Memcmp<2>::block(p1, p2); + return three_way_compare<_2>(lhs, rhs); if (count == 3) - return generic::Memcmp<3>::block(p1, p2); + return three_way_compare<_3>(lhs, rhs); if (count <= 8) - return generic::Memcmp<4>::head_tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); if (count <= 16) - return generic::Memcmp<8>::head_tail(p1, p2, count); -#if defined(LLVM_LIBC_ARCH_X86) - if constexpr (x86::kAvx512BW) - return inline_memcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_memcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_memcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_memcmp_generic_gt16(p1, p2, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - if constexpr (aarch64::kNeon) - return inline_memcmp_aarch64_neon_gt16(p1, p2, count); - else - return inline_memcmp_generic_gt16(p1, p2, count); -#endif -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memcmp_embedded_tiny(p1, p2, count); -#else -#error "Unsupported platform" + return three_way_compare>(lhs, rhs, count); + if (count <= 32) + return three_way_compare>(lhs, rhs, count); + if (count <= 64) + return three_way_compare>(lhs, rhs, count); + if (count <= 128) + return three_way_compare>(lhs, rhs, count); + return three_way_compare::Then>>(lhs, rhs, count); #endif } -static inline int inline_memcmp(const void *p1, const void *p2, size_t count) { - return static_cast(inline_memcmp(reinterpret_cast(p1), - reinterpret_cast(p2), count)); -} - } // namespace __llvm_libc #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP_IMPLEMENTATIONS_H diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h index cb9a828..3385d40 100644 --- a/libc/src/string/memory_utils/memcpy_implementations.h +++ b/libc/src/string/memory_utils/memcpy_implementations.h @@ -11,130 +11,145 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/elements.h" #include "src/string/memory_utils/utils.h" #include // size_t -namespace __llvm_libc { +// Design rationale +// ================ +// +// Using a profiler to observe size distributions for calls into libc +// functions, it was found most operations act on a small number of bytes. +// This makes it important to favor small sizes. +// +// The tests for `count` are in ascending order so the cost of branching is +// proportional to the cost of copying. +// +// The function is written in C++ for several reasons: +// - The compiler can __see__ the code, this is useful when performing Profile +// Guided Optimization as the optimized code can take advantage of branching +// probabilities. +// - It also allows for easier customization and favors testing multiple +// implementation parameters. +// - As compilers and processors get better, the generated code is improved +// with little change on the code side. -[[maybe_unused]] static inline void -inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src, - size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - builtin::Memcpy<1>::block(dst + offset, src + offset); -} +namespace __llvm_libc { +static inline void inline_memcpy(char *__restrict dst, + const char *__restrict src, size_t count) { + using namespace __llvm_libc::builtin; #if defined(LLVM_LIBC_ARCH_X86) -[[maybe_unused]] static inline void -inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) { + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_X86 + ///////////////////////////////////////////////////////////////////////////// + + // Whether to use only rep;movsb. + constexpr bool USE_ONLY_REP_MOVSB = + LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB); + + // kRepMovsBSize == -1 : Only CopyAligned is used. + // kRepMovsBSize == 0 : Only RepMovsb is used. + // else CopyAligned is used up to kRepMovsBSize and then RepMovsb. + constexpr size_t REP_MOVS_B_SIZE = +#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE) + LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE; +#else + -1; +#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE + + // Whether target supports AVX instructions. + constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__); + +#if defined(__AVX__) + using LoopBlockSize = _64; +#else + using LoopBlockSize = _32; +#endif + + if (USE_ONLY_REP_MOVSB) + return copy(dst, src, count); + if (count == 0) return; if (count == 1) - return builtin::Memcpy<1>::block(dst, src); + return copy<_1>(dst, src); if (count == 2) - return builtin::Memcpy<2>::block(dst, src); + return copy<_2>(dst, src); if (count == 3) - return builtin::Memcpy<3>::block(dst, src); + return copy<_3>(dst, src); if (count == 4) - return builtin::Memcpy<4>::block(dst, src); + return copy<_4>(dst, src); if (count < 8) - return builtin::Memcpy<4>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 16) - return builtin::Memcpy<8>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 32) - return builtin::Memcpy<16>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 64) - return builtin::Memcpy<32>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 128) - return builtin::Memcpy<64>::head_tail(dst, src, count); - if (x86::kAvx && count < 256) - return builtin::Memcpy<128>::head_tail(dst, src, count); - builtin::Memcpy<32>::block(dst, src); - align_to_next_boundary<32, Arg::Dst>(dst, src, count); - static constexpr size_t kBlockSize = x86::kAvx ? 64 : 32; - return builtin::Memcpy::loop_and_tail(dst, src, count); -} - -[[maybe_unused]] static inline void -inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst, - CPtr __restrict src, size_t count) { - // Whether to use rep;movsb exclusively, not at all, or only above a certain - // threshold. - // TODO: Use only a single preprocessor definition to simplify the code. -#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1 -#endif - - static constexpr bool kUseOnlyRepMovsb = - LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB); - static constexpr size_t kRepMovsbThreshold = - LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE; - if constexpr (kUseOnlyRepMovsb) - return x86::Memcpy::repmovsb(dst, src, count); - else if constexpr (kRepMovsbThreshold >= 0) { - if (unlikely(count >= kRepMovsbThreshold)) - return x86::Memcpy::repmovsb(dst, src, count); - else - return inline_memcpy_x86(dst, src, count); - } else { - return inline_memcpy_x86(dst, src, count); - } -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline void -inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) { + return copy>(dst, src, count); + if (HAS_AVX && count < 256) + return copy>(dst, src, count); + if (count <= REP_MOVS_B_SIZE) + return copy::Then>>(dst, src, + count); + return copy(dst, src, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// if (count == 0) return; if (count == 1) - return builtin::Memcpy<1>::block(dst, src); + return copy<_1>(dst, src); if (count == 2) - return builtin::Memcpy<2>::block(dst, src); + return copy<_2>(dst, src); if (count == 3) - return builtin::Memcpy<3>::block(dst, src); + return copy<_3>(dst, src); if (count == 4) - return builtin::Memcpy<4>::block(dst, src); + return copy<_4>(dst, src); if (count < 8) - return builtin::Memcpy<4>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 16) - return builtin::Memcpy<8>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 32) - return builtin::Memcpy<16>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 64) - return builtin::Memcpy<32>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 128) - return builtin::Memcpy<64>::head_tail(dst, src, count); - builtin::Memcpy<16>::block(dst, src); - align_to_next_boundary<16, Arg::Src>(dst, src, count); - return builtin::Memcpy<64>::loop_and_tail(dst, src, count); -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -static inline void inline_memcpy(Ptr __restrict dst, CPtr __restrict src, - size_t count) { - using namespace __llvm_libc::builtin; -#if defined(LLVM_LIBC_ARCH_X86) - return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - return inline_memcpy_aarch64(dst, src, count); -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memcpy_embedded_tiny(dst, src, count); + return copy>(dst, src, count); + return copy::Then>>(dst, src, count); #else -#error "Unsupported platform" + ///////////////////////////////////////////////////////////////////////////// + // Default + ///////////////////////////////////////////////////////////////////////////// + if (count == 0) + return; + if (count == 1) + return copy<_1>(dst, src); + if (count == 2) + return copy<_2>(dst, src); + if (count == 3) + return copy<_3>(dst, src); + if (count == 4) + return copy<_4>(dst, src); + if (count < 8) + return copy>(dst, src, count); + if (count < 16) + return copy>(dst, src, count); + if (count < 32) + return copy>(dst, src, count); + if (count < 64) + return copy>(dst, src, count); + if (count < 128) + return copy>(dst, src, count); + return copy::Then>>(dst, src, count); #endif } -static inline void inline_memcpy(void *__restrict dst, - const void *__restrict src, size_t count) { - inline_memcpy(reinterpret_cast(dst), reinterpret_cast(src), count); -} - } // namespace __llvm_libc #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h index 75ecf16..d58ed3b 100644 --- a/libc/src/string/memory_utils/memset_implementations.h +++ b/libc/src/string/memory_utils/memset_implementations.h @@ -10,109 +10,127 @@ #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H #include "src/__support/architectures.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/elements.h" #include "src/string/memory_utils/utils.h" #include // size_t namespace __llvm_libc { -[[maybe_unused]] inline static void -inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - generic::Memset<1, 1>::block(dst + offset, value); -} - +// A general purpose implementation assuming cheap unaligned writes for sizes: +// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32 +// or 64 Bytes at a time, the compiler will expand them as needed. +// +// This implementation is subject to change as we benchmark more processors. We +// may also want to customize it for processors with specialized instructions +// that performs better (e.g. `rep stosb`). +// +// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes. +// We want to balance two things here: +// - The number of redundant writes (when using `SetBlockOverlap`), +// - The number of conditionals for sizes <=128 (~90% of memset calls are for +// such sizes). +// +// For the range 64-128: +// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this +// is wasteful near 65 but efficient toward 128. +// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write +// 96 or 128 Bytes. +// - Another approach could be to use an hybrid approach copy<64>+Overlap<32> +// for 65-96 and copy<96>+Overlap<32> for 97-128 +// +// Benchmarks showed that redundant writes were cheap (for Intel X86) but +// conditional were expensive, even on processor that do not support writing 64B +// at a time (pre-AVX512F). We also want to favor short functions that allow +// more hot code to fit in the iL1 cache. +// +// Above 128 we have to use conditionals since we don't know the upper bound in +// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32> +// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not +// superior for sizes that mattered. +inline static void inline_memset(char *dst, unsigned char value, size_t count) { #if defined(LLVM_LIBC_ARCH_X86) -template -[[maybe_unused]] inline static void inline_memset_x86(Ptr dst, uint8_t value, - size_t count) { + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_X86 + ///////////////////////////////////////////////////////////////////////////// + using namespace __llvm_libc::x86; if (count == 0) return; if (count == 1) - return generic::Memset<1, MaxSize>::block(dst, value); + return splat_set<_1>(dst, value); if (count == 2) - return generic::Memset<2, MaxSize>::block(dst, value); + return splat_set<_2>(dst, value); if (count == 3) - return generic::Memset<3, MaxSize>::block(dst, value); + return splat_set<_3>(dst, value); if (count <= 8) - return generic::Memset<4, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 16) - return generic::Memset<8, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 32) - return generic::Memset<16, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 64) - return generic::Memset<32, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 128) - return generic::Memset<64, MaxSize>::head_tail(dst, value, count); - // Aligned loop - generic::Memset<32, MaxSize>::block(dst, value); - align_to_next_boundary<32>(dst, count); - return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -template -[[maybe_unused]] inline static void -inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) { + return splat_set>(dst, value, count); + return splat_set::Then>>(dst, value, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// + using namespace __llvm_libc::aarch64_memset; if (count == 0) return; if (count <= 3) { - generic::Memset<1, MaxSize>::block(dst, value); + splat_set<_1>(dst, value); if (count > 1) - generic::Memset<2, MaxSize>::tail(dst, value, count); + splat_set>(dst, value, count); return; } if (count <= 8) - return generic::Memset<4, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 16) - return generic::Memset<8, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 32) - return generic::Memset<16, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= (32 + 64)) { - generic::Memset<32, MaxSize>::block(dst, value); + splat_set<_32>(dst, value); if (count <= 64) - return generic::Memset<32, MaxSize>::tail(dst, value, count); - generic::Memset<32, MaxSize>::block(dst + 32, value); - generic::Memset<32, MaxSize>::tail(dst, value, count); + return splat_set>(dst, value, count); + splat_set::Then<_32>>(dst, value); + splat_set>(dst, value, count); return; } - if (count >= 448 && value == 0 && aarch64::neon::hasZva()) { - generic::Memset<64, MaxSize>::block(dst, 0); - align_to_next_boundary<64>(dst, count); - return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count); - } else { - generic::Memset<16, MaxSize>::block(dst, value); - align_to_next_boundary<16>(dst, count); - return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count); - } -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -inline static void inline_memset(Ptr dst, uint8_t value, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) - static constexpr size_t kMaxSize = x86::kAvx512F ? 64 - : x86::kAvx ? 32 - : x86::kSse2 ? 16 - : 8; - return inline_memset_x86(dst, value, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8; - return inline_memset_aarch64(dst, value, count); -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memset_embedded_tiny(dst, value, count); + if (count >= 448 && value == 0 && hasZva()) + return splat_set::Then>>(dst, 0, + count); + else + return splat_set::Then>>(dst, value, count); #else -#error "Unsupported platform" -#endif -} + ///////////////////////////////////////////////////////////////////////////// + // Default + ///////////////////////////////////////////////////////////////////////////// + using namespace ::__llvm_libc::scalar; -inline static void inline_memset(void *dst, uint8_t value, size_t count) { - inline_memset(reinterpret_cast(dst), value, count); + if (count == 0) + return; + if (count == 1) + return splat_set<_1>(dst, value); + if (count == 2) + return splat_set<_2>(dst, value); + if (count == 3) + return splat_set<_3>(dst, value); + if (count <= 8) + return splat_set>(dst, value, count); + if (count <= 16) + return splat_set>(dst, value, count); + if (count <= 32) + return splat_set>(dst, value, count); + if (count <= 64) + return splat_set>(dst, value, count); + if (count <= 128) + return splat_set>(dst, value, count); + return splat_set::Then>>(dst, value, count); +#endif } } // namespace __llvm_libc diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h index 8e64322..a4b59a1 100644 --- a/libc/src/string/memory_utils/op_x86.h +++ b/libc/src/string/memory_utils/op_x86.h @@ -42,7 +42,7 @@ static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__); /////////////////////////////////////////////////////////////////////////////// // Memcpy repmovsb implementation struct Memcpy { - static void repmovsb(void *dst, const void *src, size_t count) { + static void repmovsb(char *dst, const char *src, size_t count) { asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); } }; diff --git a/libc/src/string/mempcpy.cpp b/libc/src/string/mempcpy.cpp index dd539eb..f26bd64 100644 --- a/libc/src/string/mempcpy.cpp +++ b/libc/src/string/mempcpy.cpp @@ -15,10 +15,11 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(void *, mempcpy, - (void *__restrict dst, const void *__restrict src, + (void *__restrict dest, const void *__restrict src, size_t count)) { - inline_memcpy(dst, src, count); - return reinterpret_cast(dst) + count; + char *result = reinterpret_cast(dest); + inline_memcpy(result, reinterpret_cast(src), count); + return result + count; } } // namespace __llvm_libc diff --git a/libc/src/string/memset.cpp b/libc/src/string/memset.cpp index b80cfce..549c074 100644 --- a/libc/src/string/memset.cpp +++ b/libc/src/string/memset.cpp @@ -13,7 +13,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) { - inline_memset(dst, static_cast(value), count); + inline_memset(reinterpret_cast(dst), + static_cast(value), count); return dst; }