From 67437dd0147e92fe601ef76d17dbbf171f638580 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet
Date: Wed, 2 Nov 2022 09:03:58 +0000
Subject: [PATCH] [reland][libc] Switch to new implementation of mem* functions

The new framework makes it explicit which processor features are in use
and allows for easier per-platform customization:
 - ARM CPUs now use trivial implementations to reduce code size.
 - Memcmp, Bcmp and Memmove have been optimized for x86.
 - Bcmp has been optimized for aarch64.

This is a reland of https://reviews.llvm.org/D135134 (b3f1d58, 028414881381)

Reviewed By: courbet

Differential Revision: https://reviews.llvm.org/D136595
---
 libc/src/stdio/printf_core/string_writer.cpp        |   2 +-
 libc/src/string/bcmp.cpp                            |   3 +-
 libc/src/string/memcmp.cpp                          |   3 +-
 libc/src/string/memcpy.cpp                          |   3 +-
 libc/src/string/memmove.cpp                         | 104 +++++++++--
 .../src/string/memory_utils/bcmp_implementations.h  | 176 +++++++++++++++---
 .../string/memory_utils/bzero_implementations.h     |   6 +-
 .../string/memory_utils/memcmp_implementations.h    | 182 ++++++++++++-------
 .../string/memory_utils/memcpy_implementations.h    | 197 ++++++++++-----------
 .../string/memory_utils/memset_implementations.h    | 160 ++++++++---------
 libc/src/string/memory_utils/op_x86.h               |   2 +-
 libc/src/string/mempcpy.cpp                         |   7 +-
 libc/src/string/memset.cpp                          |   3 +-
 13 files changed, 525 insertions(+), 323 deletions(-)
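Throughout this patch the fixed-size primitives (`block`, `head_tail`, `loop_and_tail`) are composed to cover arbitrary sizes. The workhorse is `head_tail`: one block at the front plus one block flush with the end covers every count in [N, 2N] with no extra branch, because the two blocks are allowed to overlap. A minimal standalone sketch of the idea (the function name and the use of `memcpy` are illustrative, not the actual `op_generic.h` code):

    #include <cstddef>
    #include <cstring>

    // Copy `count` bytes, N <= count <= 2 * N: one N-byte block at the start
    // and one N-byte block flush with the end. The two blocks may overlap;
    // the redundant bytes cost less than a branch on the exact count.
    template <size_t N>
    void head_tail_copy(char *dst, const char *src, size_t count) {
      std::memcpy(dst, src, N);
      std::memcpy(dst + count - N, src + count - N, N);
    }

The same overlap trick carries over to the compare and set operations, which is why the size ladders below test `count` against successive powers of two.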
"src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/op_generic.h" +#include "src/string/memory_utils/op_x86.h" #include // size_t, ptrdiff_t +#include + namespace __llvm_libc { -static inline void inline_memmove(char *dst, const char *src, size_t count) { - using namespace __llvm_libc::scalar; +[[maybe_unused]] static inline void +inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) { + if ((count == 0) || (dst == src)) + return; + if (dst < src) { +#pragma nounroll + for (size_t offset = 0; offset < count; ++offset) + builtin::Memcpy<1>::block(dst + offset, src + offset); + } else { +#pragma nounroll + for (ptrdiff_t offset = count - 1; offset >= 0; --offset) + builtin::Memcpy<1>::block(dst + offset, src + offset); + } +} + +template +[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src, + size_t count) { if (count == 0) return; if (count == 1) - return move<_1>(dst, src); + return generic::Memmove<1, MaxSize>::block(dst, src); if (count <= 4) - return move>(dst, src, count); + return generic::Memmove<2, MaxSize>::head_tail(dst, src, count); if (count <= 8) - return move>(dst, src, count); + return generic::Memmove<4, MaxSize>::head_tail(dst, src, count); if (count <= 16) - return move>(dst, src, count); + return generic::Memmove<8, MaxSize>::head_tail(dst, src, count); if (count <= 32) - return move>(dst, src, count); + return generic::Memmove<16, MaxSize>::head_tail(dst, src, count); if (count <= 64) - return move>(dst, src, count); + return generic::Memmove<32, MaxSize>::head_tail(dst, src, count); if (count <= 128) - return move>(dst, src, count); + return generic::Memmove<64, MaxSize>::head_tail(dst, src, count); + if (dst < src) { + generic::Memmove<32, MaxSize>::template align_forward(dst, src, + count); + return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src, + count); + } else { + generic::Memmove<32, MaxSize>::template align_backward(dst, src, + count); + return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src, + count); + } +} - using AlignedMoveLoop = Align<_16, Arg::Src>::Then>; - if (dst < src) - return move(dst, src, count); - else if (dst > src) - return move_backward(dst, src, count); +static inline void inline_memmove(Ptr dst, CPtr src, size_t count) { +#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) +#if defined(LLVM_LIBC_ARCH_X86) + static constexpr size_t kMaxSize = x86::kAvx512F ? 64 + : x86::kAvx ? 32 + : x86::kSse2 ? 16 + : 8; +#elif defined(LLVM_LIBC_ARCH_AARCH64) + static constexpr size_t kMaxSize = aarch64::kNeon ? 
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h
index c26e38e..2e18ee8 100644
--- a/libc/src/string/memory_utils/bcmp_implementations.h
+++ b/libc/src/string/memory_utils/bcmp_implementations.h
@@ -11,49 +11,169 @@
 
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
 
-// Fixed-size difference between 'lhs' and 'rhs'.
-template <typename Element> bool differs(const char *lhs, const char *rhs) {
-  return !Element::equals(lhs, rhs);
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+#pragma nounroll
+  for (size_t offset = 0; offset < count; ++offset)
+    if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset))
+      return value;
+  return BcmpReturnType::ZERO();
 }
-// Runtime-size difference between 'lhs' and 'rhs'.
-template <typename Element>
-bool differs(const char *lhs, const char *rhs, size_t size) {
-  return !Element::equals(lhs, rhs, size);
+
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count < 256)
+    return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
+  if (auto value = generic::Bcmp<64>::block(p1, p2))
+    return value;
+  align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
 }
+#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
 
-static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) {
 #if defined(LLVM_LIBC_ARCH_X86)
-  using namespace ::__llvm_libc::x86;
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  using namespace ::__llvm_libc::aarch64;
-#else
-  using namespace ::__llvm_libc::scalar;
-#endif
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count <= 32)
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count < 256)
+    return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
+  if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
+    return value;
+  align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count <= 32)
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count <= 64)
+    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 256)) {
+    if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count <= 32)
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count <= 64)
+    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 256)) {
+    if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
+                                                              size_t count) {
   if (count == 0)
-    return 0;
+    return BcmpReturnType::ZERO();
   if (count == 1)
-    return differs<_1>(lhs, rhs);
+    return generic::Bcmp<1>::block(p1, p2);
   if (count == 2)
-    return differs<_2>(lhs, rhs);
-  if (count == 3)
-    return differs<_3>(lhs, rhs);
+    return generic::Bcmp<2>::block(p1, p2);
+  if (count <= 4)
+    return generic::Bcmp<2>::head_tail(p1, p2, count);
   if (count <= 8)
-    return differs<HeadTail<_4>>(lhs, rhs, count);
+    return generic::Bcmp<4>::head_tail(p1, p2, count);
   if (count <= 16)
-    return differs<HeadTail<_8>>(lhs, rhs, count);
-  if (count <= 32)
-    return differs<HeadTail<_16>>(lhs, rhs, count);
+    return generic::Bcmp<8>::head_tail(p1, p2, count);
+  if constexpr (x86::kAvx512BW)
+    return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
+  else if constexpr (x86::kAvx2)
+    return inline_bcmp_x86_avx2_gt16(p1, p2, count);
+  else if constexpr (x86::kSse2)
+    return inline_bcmp_x86_sse2_gt16(p1, p2, count);
+  else
+    return inline_bcmp_generic_gt16(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_aarch64(CPtr p1, CPtr p2, size_t count) {
+  if (likely(count <= 32)) {
+    if (unlikely(count >= 16)) {
+      return generic::Bcmp<16>::head_tail(p1, p2, count);
+    }
+    switch (count) {
+    case 0:
+      return BcmpReturnType::ZERO();
+    case 1:
+      return generic::Bcmp<1>::block(p1, p2);
+    case 2:
+      return generic::Bcmp<2>::block(p1, p2);
+    case 3:
+      return generic::Bcmp<2>::head_tail(p1, p2, count);
+    case 4:
+      return generic::Bcmp<4>::block(p1, p2);
+    case 5:
+    case 6:
+    case 7:
+      return generic::Bcmp<4>::head_tail(p1, p2, count);
+    case 8:
+      return generic::Bcmp<8>::block(p1, p2);
+    case 9:
+    case 10:
+    case 11:
+    case 12:
+    case 13:
+    case 14:
+    case 15:
+      return generic::Bcmp<8>::head_tail(p1, p2, count);
+    }
+  }
+
   if (count <= 64)
-    return differs<HeadTail<_32>>(lhs, rhs, count);
-  if (count <= 128)
-    return differs<HeadTail<_64>>(lhs, rhs, count);
-  return differs<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
+    return generic::Bcmp<32>::head_tail(p1, p2, count);
+
+  // Aligned loop if > 256, otherwise normal loop.
+  if (count > 256) {
+    if (auto value = generic::Bcmp<32>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  }
+  return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+
+static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86)
+  return inline_bcmp_x86(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  return inline_bcmp_aarch64(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_ARM)
+  return inline_bcmp_embedded_tiny(p1, p2, count);
+#else
+#error "Unsupported platform"
+#endif
+}
+
+static inline int inline_bcmp(const void *p1, const void *p2, size_t count) {
+  return static_cast<int>(inline_bcmp(reinterpret_cast<CPtr>(p1),
+                                      reinterpret_cast<CPtr>(p2), count));
 }
 
 } // namespace __llvm_libc
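Every loop path above first compares one unaligned block, then calls `align_to_next_boundary<N, Arg::P1>` so that the main loop reads `p1` on N-byte boundaries. A sketch of the semantics this code relies on (my reconstruction, not the actual `utils.h` helper):

    #include <cstddef>
    #include <cstdint>

    // Advance p1 to its next N-byte boundary and shift p2/count by the same
    // amount. Skipping up to N bytes is safe only because the caller has
    // already compared the first N bytes unconditionally.
    template <size_t N>
    void align_p1_to_next_boundary(const char *&p1, const char *&p2,
                                   size_t &count) {
      const size_t offset = N - reinterpret_cast<uintptr_t>(p1) % N; // (0, N]
      p1 += offset;
      p2 += offset;
      count -= offset;
    }

Only `p1` ends up aligned; `p2` keeps whatever relative misalignment the caller passed in, so the loop bodies must still tolerate unaligned loads on the second pointer.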
diff --git a/libc/src/string/memory_utils/bzero_implementations.h b/libc/src/string/memory_utils/bzero_implementations.h
index 168fdd7..550c910 100644
--- a/libc/src/string/memory_utils/bzero_implementations.h
+++ b/libc/src/string/memory_utils/bzero_implementations.h
@@ -15,10 +15,14 @@
 
 namespace __llvm_libc {
 
-inline static void inline_bzero(char *dst, size_t count) {
+inline static void inline_bzero(Ptr dst, size_t count) {
   inline_memset(dst, 0, count);
 }
 
+inline static void inline_bzero(void *dst, size_t count) {
+  inline_bzero(reinterpret_cast<Ptr>(dst), count);
+}
+
 } // namespace __llvm_libc
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H
diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h
index f207946..b3258b9 100644
--- a/libc/src/string/memory_utils/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/memcmp_implementations.h
@@ -11,93 +11,141 @@
 
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/utils.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
 
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+#pragma nounroll
+  for (size_t offset = 0; offset < count; ++offset)
+    if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
+      return value;
+  return MemcmpReturnType::ZERO();
+}
+
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (unlikely(count >= 384)) {
+    if (auto value = generic::Memcmp<16>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  }
+  return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
 
-static inline int inline_memcmp(const char *lhs, const char *rhs,
-                                size_t count) {
 #if defined(LLVM_LIBC_ARCH_X86)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_X86
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace __llvm_libc::x86;
-  if (count == 0)
-    return 0;
-  if (count == 1)
-    return three_way_compare<_1>(lhs, rhs);
-  if (count == 2)
-    return three_way_compare<_2>(lhs, rhs);
-  if (count == 3)
-    return three_way_compare<_3>(lhs, rhs);
-  if (count <= 8)
-    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
-  if (count <= 16)
-    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (unlikely(count >= 384)) {
+    if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  }
+  return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
+    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
   if (count <= 64)
-    return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
+    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
   if (count <= 128)
-    return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
-  return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_AARCH64
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace ::__llvm_libc::aarch64;
-  if (count == 0) // [0, 0]
-    return 0;
-  if (count == 1) // [1, 1]
-    return three_way_compare<_1>(lhs, rhs);
-  if (count == 2) // [2, 2]
-    return three_way_compare<_2>(lhs, rhs);
-  if (count == 3) // [3, 3]
-    return three_way_compare<_3>(lhs, rhs);
-  if (count < 8) // [4, 7]
-    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
-  if (count < 16) // [8, 15]
-    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
-  if (unlikely(count >= 128)) // [128, ∞]
-    return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
-  if (!equals<_16>(lhs, rhs)) // [16, 16]
-    return three_way_compare<_16>(lhs, rhs);
+    return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 384)) {
+    if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
+      return value;
+    align_to_next_boundary<32, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count <= 32)
+    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
+  if (count <= 64)
+    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 384)) {
+    if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (unlikely(count >= 128)) { // [128, ∞]
+    if (auto value = generic::Memcmp<16>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+    return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
+  }
+  if (generic::Bcmp<16>::block(p1, p2)) // [16, 16]
+    return generic::Memcmp<16>::block(p1, p2);
   if (count < 32) // [17, 31]
-    return three_way_compare<Tail<_16>>(lhs, rhs, count);
-  if (!equals<Skip<16>::Then<_16>>(lhs, rhs)) // [32, 32]
-    return three_way_compare<Skip<16>::Then<_16>>(lhs, rhs);
+    return generic::Memcmp<16>::tail(p1, p2, count);
+  if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
+    return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
   if (count < 64) // [33, 63]
-    return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
+    return generic::Memcmp<32>::tail(p1, p2, count);
   // [64, 127]
-  return three_way_compare<Skip<32>::Then<Loop<_16>>>(lhs, rhs, count);
-#else
-  /////////////////////////////////////////////////////////////////////////////
-  // Default
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace ::__llvm_libc::scalar;
+  return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
 
+static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
   if (count == 0)
-    return 0;
+    return MemcmpReturnType::ZERO();
   if (count == 1)
-    return three_way_compare<_1>(lhs, rhs);
+    return generic::Memcmp<1>::block(p1, p2);
   if (count == 2)
-    return three_way_compare<_2>(lhs, rhs);
+    return generic::Memcmp<2>::block(p1, p2);
   if (count == 3)
-    return three_way_compare<_3>(lhs, rhs);
+    return generic::Memcmp<3>::block(p1, p2);
   if (count <= 8)
-    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
+    return generic::Memcmp<4>::head_tail(p1, p2, count);
   if (count <= 16)
-    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
-  if (count <= 32)
-    return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
-  if (count <= 64)
-    return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
-  if (count <= 128)
-    return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
-  return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
+    return generic::Memcmp<8>::head_tail(p1, p2, count);
+#if defined(LLVM_LIBC_ARCH_X86)
+  if constexpr (x86::kAvx512BW)
+    return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
+  else if constexpr (x86::kAvx2)
+    return inline_memcmp_x86_avx2_gt16(p1, p2, count);
+  else if constexpr (x86::kSse2)
+    return inline_memcmp_x86_sse2_gt16(p1, p2, count);
+  else
+    return inline_memcmp_generic_gt16(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  if constexpr (aarch64::kNeon)
+    return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
+  else
+    return inline_memcmp_generic_gt16(p1, p2, count);
 #endif
+#elif defined(LLVM_LIBC_ARCH_ARM)
+  return inline_memcmp_embedded_tiny(p1, p2, count);
+#else
+#error "Unsupported platform"
+#endif
+}
+
+static inline int inline_memcmp(const void *p1, const void *p2, size_t count) {
+  return static_cast<int>(inline_memcmp(reinterpret_cast<CPtr>(p1),
+                                        reinterpret_cast<CPtr>(p2), count));
 }
 
 } // namespace __llvm_libc
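The aarch64 [16, 16] and [32, 32] cases run the cheap equality test (`Bcmp`) first and recompute the ordered result (`Memcmp`) only on a mismatch: equality can be evaluated in any order, while the three-way result must honor the first differing byte. A scalar sketch of that split (illustrative, not the NEON code):

    // Compare 16 bytes: branch-free equality first, ordered scan only on the
    // rare mismatch path.
    int memcmp16_sketch(const unsigned char *p1, const unsigned char *p2) {
      bool diff = false;
      for (int i = 0; i < 16; ++i)
        diff |= (p1[i] != p2[i]); // vectorizes to a single compare/reduce
      if (!diff)
        return 0;
      for (int i = 0; i < 16; ++i)
        if (p1[i] != p2[i])
          return p1[i] - p2[i]; // sign of the first differing byte
      return 0;
    }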
diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
index 3385d40..cb9a828 100644
--- a/libc/src/string/memory_utils/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -11,145 +11,130 @@
 
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
 #include "src/string/memory_utils/utils.h"
 
 #include <stddef.h> // size_t
 
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// The tests for `count` are in ascending order so the cost of branching is
-// proportional to the cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-
 namespace __llvm_libc {
 
-static inline void inline_memcpy(char *__restrict dst,
-                                 const char *__restrict src, size_t count) {
-  using namespace __llvm_libc::builtin;
-#if defined(LLVM_LIBC_ARCH_X86)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_X86
-  /////////////////////////////////////////////////////////////////////////////
-
-  // Whether to use only rep;movsb.
-  constexpr bool USE_ONLY_REP_MOVSB =
-      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
-
-  // kRepMovsBSize == -1 : Only CopyAligned is used.
-  // kRepMovsBSize == 0 : Only RepMovsb is used.
-  // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
-  constexpr size_t REP_MOVS_B_SIZE =
-#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
-      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
-#else
-      -1;
-#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-
-  // Whether target supports AVX instructions.
-  constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
-
-#if defined(__AVX__)
-  using LoopBlockSize = _64;
-#else
-  using LoopBlockSize = _32;
-#endif
-
-  if (USE_ONLY_REP_MOVSB)
-    return copy<RepMovsb>(dst, src, count);
+[[maybe_unused]] static inline void
+inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src,
+                            size_t count) {
+#pragma nounroll
+  for (size_t offset = 0; offset < count; ++offset)
+    builtin::Memcpy<1>::block(dst + offset, src + offset);
+}
 
+#if defined(LLVM_LIBC_ARCH_X86)
+[[maybe_unused]] static inline void
+inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
   if (count == 0)
     return;
   if (count == 1)
-    return copy<_1>(dst, src);
+    return builtin::Memcpy<1>::block(dst, src);
   if (count == 2)
-    return copy<_2>(dst, src);
+    return builtin::Memcpy<2>::block(dst, src);
   if (count == 3)
-    return copy<_3>(dst, src);
+    return builtin::Memcpy<3>::block(dst, src);
   if (count == 4)
-    return copy<_4>(dst, src);
+    return builtin::Memcpy<4>::block(dst, src);
   if (count < 8)
-    return copy<HeadTail<_4>>(dst, src, count);
+    return builtin::Memcpy<4>::head_tail(dst, src, count);
   if (count < 16)
-    return copy<HeadTail<_8>>(dst, src, count);
+    return builtin::Memcpy<8>::head_tail(dst, src, count);
   if (count < 32)
-    return copy<HeadTail<_16>>(dst, src, count);
+    return builtin::Memcpy<16>::head_tail(dst, src, count);
   if (count < 64)
-    return copy<HeadTail<_32>>(dst, src, count);
+    return builtin::Memcpy<32>::head_tail(dst, src, count);
   if (count < 128)
-    return copy<HeadTail<_64>>(dst, src, count);
-  if (HAS_AVX && count < 256)
-    return copy<HeadTail<_128>>(dst, src, count);
-  if (count <= REP_MOVS_B_SIZE)
-    return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
-                                                                 count);
-  return copy<RepMovsb>(dst, src, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_AARCH64
-  /////////////////////////////////////////////////////////////////////////////
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  if (x86::kAvx && count < 256)
+    return builtin::Memcpy<128>::head_tail(dst, src, count);
+  builtin::Memcpy<32>::block(dst, src);
+  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+  static constexpr size_t kBlockSize = x86::kAvx ? 64 : 32;
+  return builtin::Memcpy<kBlockSize>::loop_and_tail(dst, src, count);
+}
+
+[[maybe_unused]] static inline void
+inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
+                                           CPtr __restrict src, size_t count) {
+  // Whether to use rep;movsb exclusively, not at all, or only above a certain
+  // threshold.
+  // TODO: Use only a single preprocessor definition to simplify the code.
+#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1
+#endif
+
+  static constexpr bool kUseOnlyRepMovsb =
+      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
+  static constexpr size_t kRepMovsbThreshold =
+      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+  if constexpr (kUseOnlyRepMovsb)
+    return x86::Memcpy::repmovsb(dst, src, count);
+  else if constexpr (kRepMovsbThreshold >= 0) {
+    if (unlikely(count >= kRepMovsbThreshold))
+      return x86::Memcpy::repmovsb(dst, src, count);
+    else
+      return inline_memcpy_x86(dst, src, count);
+  } else {
+    return inline_memcpy_x86(dst, src, count);
+  }
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline void
+inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
   if (count == 0)
     return;
   if (count == 1)
-    return copy<_1>(dst, src);
+    return builtin::Memcpy<1>::block(dst, src);
   if (count == 2)
-    return copy<_2>(dst, src);
+    return builtin::Memcpy<2>::block(dst, src);
   if (count == 3)
-    return copy<_3>(dst, src);
+    return builtin::Memcpy<3>::block(dst, src);
   if (count == 4)
-    return copy<_4>(dst, src);
+    return builtin::Memcpy<4>::block(dst, src);
   if (count < 8)
-    return copy<HeadTail<_4>>(dst, src, count);
+    return builtin::Memcpy<4>::head_tail(dst, src, count);
   if (count < 16)
-    return copy<HeadTail<_8>>(dst, src, count);
+    return builtin::Memcpy<8>::head_tail(dst, src, count);
   if (count < 32)
-    return copy<HeadTail<_16>>(dst, src, count);
+    return builtin::Memcpy<16>::head_tail(dst, src, count);
   if (count < 64)
-    return copy<HeadTail<_32>>(dst, src, count);
+    return builtin::Memcpy<32>::head_tail(dst, src, count);
   if (count < 128)
-    return copy<HeadTail<_64>>(dst, src, count);
-  return copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  builtin::Memcpy<16>::block(dst, src);
+  align_to_next_boundary<16, Arg::Src>(dst, src, count);
+  return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+
+static inline void inline_memcpy(Ptr __restrict dst, CPtr __restrict src,
+                                 size_t count) {
+  using namespace __llvm_libc::builtin;
+#if defined(LLVM_LIBC_ARCH_X86)
+  return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  return inline_memcpy_aarch64(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_ARM)
+  return inline_memcpy_embedded_tiny(dst, src, count);
 #else
-  /////////////////////////////////////////////////////////////////////////////
-  // Default
-  /////////////////////////////////////////////////////////////////////////////
-  if (count == 0)
-    return;
-  if (count == 1)
-    return copy<_1>(dst, src);
-  if (count == 2)
-    return copy<_2>(dst, src);
-  if (count == 3)
-    return copy<_3>(dst, src);
-  if (count == 4)
-    return copy<_4>(dst, src);
-  if (count < 8)
-    return copy<HeadTail<_4>>(dst, src, count);
-  if (count < 16)
-    return copy<HeadTail<_8>>(dst, src, count);
-  if (count < 32)
-    return copy<HeadTail<_16>>(dst, src, count);
-  if (count < 64)
-    return copy<HeadTail<_32>>(dst, src, count);
-  if (count < 128)
-    return copy<HeadTail<_64>>(dst, src, count);
-  return copy<Align<_32>::Then<Loop<_32>>>(dst, src, count);
+#error "Unsupported platform"
 #endif
 }
 
+static inline void inline_memcpy(void *__restrict dst,
+                                 const void *__restrict src, size_t count) {
+  inline_memcpy(reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src),
+                count);
+}
+
 } // namespace __llvm_libc
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
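Two build-time knobs control the `rep;movsb` interposition: `LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB` routes every size through the string instruction, while `LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE` sets a cutover size (it defaults to `-1`, i.e. never). A hypothetical configuration, with an illustrative 1 KiB threshold:

    // Use the block/loop strategy below 1 KiB and `rep;movsb` at and above
    // it. Typically passed as -DLLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=1024;
    // shown here as a #define for clarity.
    #define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE 1024
    #include "src/string/memory_utils/memcpy_implementations.h"

On microarchitectures with enhanced `rep movsb` (ERMSB), the microcoded copy is competitive for large sizes, so the knob lets each port pick its crossover empirically.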
diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h
index d58ed3b..75ecf16 100644
--- a/libc/src/string/memory_utils/memset_implementations.h
+++ b/libc/src/string/memory_utils/memset_implementations.h
@@ -10,129 +10,111 @@
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
 
 #include "src/__support/architectures.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
 #include "src/string/memory_utils/utils.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
 
-// A general purpose implementation assuming cheap unaligned writes for sizes:
-// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32
-// or 64 Bytes at a time, the compiler will expand them as needed.
-//
-// This implementation is subject to change as we benchmark more processors. We
-// may also want to customize it for processors with specialized instructions
-// that performs better (e.g. `rep stosb`).
-//
-// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
-// We want to balance two things here:
-// - The number of redundant writes (when using `SetBlockOverlap`),
-// - The number of conditionals for sizes <=128 (~90% of memset calls are for
-//   such sizes).
-//
-// For the range 64-128:
-// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this
-//   is wasteful near 65 but efficient toward 128.
-// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
-//   96 or 128 Bytes.
-// - Another approach could be to use an hybrid approach copy<64>+Overlap<32>
-//   for 65-96 and copy<96>+Overlap<32> for 97-128
-//
-// Benchmarks showed that redundant writes were cheap (for Intel X86) but
-// conditional were expensive, even on processor that do not support writing
-// 64B at a time (pre-AVX512F). We also want to favor short functions that
-// allow more hot code to fit in the iL1 cache.
-//
-// Above 128 we have to use conditionals since we don't know the upper bound in
-// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
-// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was
-// not superior for sizes that mattered.
-inline static void inline_memset(char *dst, unsigned char value, size_t count) {
+[[maybe_unused]] inline static void
+inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
+#pragma nounroll
+  for (size_t offset = 0; offset < count; ++offset)
+    generic::Memset<1, 1>::block(dst + offset, value);
+}
+
 #if defined(LLVM_LIBC_ARCH_X86)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_X86
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace __llvm_libc::x86;
+template <size_t MaxSize>
+[[maybe_unused]] inline static void inline_memset_x86(Ptr dst, uint8_t value,
+                                                      size_t count) {
   if (count == 0)
     return;
   if (count == 1)
-    return splat_set<_1>(dst, value);
+    return generic::Memset<1, MaxSize>::block(dst, value);
   if (count == 2)
-    return splat_set<_2>(dst, value);
+    return generic::Memset<2, MaxSize>::block(dst, value);
   if (count == 3)
-    return splat_set<_3>(dst, value);
+    return generic::Memset<3, MaxSize>::block(dst, value);
   if (count <= 8)
-    return splat_set<HeadTail<_4>>(dst, value, count);
+    return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
   if (count <= 16)
-    return splat_set<HeadTail<_8>>(dst, value, count);
+    return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
   if (count <= 32)
-    return splat_set<HeadTail<_16>>(dst, value, count);
+    return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
   if (count <= 64)
-    return splat_set<HeadTail<_32>>(dst, value, count);
+    return generic::Memset<32, MaxSize>::head_tail(dst, value, count);
   if (count <= 128)
-    return splat_set<HeadTail<_64>>(dst, value, count);
-  return splat_set<Align<_32>::Then<Loop<_32>>>(dst, value, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_AARCH64
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace __llvm_libc::aarch64_memset;
+    return generic::Memset<64, MaxSize>::head_tail(dst, value, count);
+  // Aligned loop
+  generic::Memset<32, MaxSize>::block(dst, value);
+  align_to_next_boundary<32>(dst, count);
+  return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+template <size_t MaxSize>
+[[maybe_unused]] inline static void
+inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
   if (count == 0)
     return;
   if (count <= 3) {
-    splat_set<_1>(dst, value);
+    generic::Memset<1, MaxSize>::block(dst, value);
     if (count > 1)
-      splat_set<Tail<_2>>(dst, value, count);
+      generic::Memset<2, MaxSize>::tail(dst, value, count);
     return;
   }
   if (count <= 8)
-    return splat_set<HeadTail<_4>>(dst, value, count);
+    return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
   if (count <= 16)
-    return splat_set<HeadTail<_8>>(dst, value, count);
+    return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
   if (count <= 32)
-    return splat_set<HeadTail<_16>>(dst, value, count);
+    return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
   if (count <= (32 + 64)) {
-    splat_set<_32>(dst, value);
+    generic::Memset<32, MaxSize>::block(dst, value);
     if (count <= 64)
-      return splat_set<Tail<_32>>(dst, value, count);
-    splat_set<Skip<32>::Then<_32>>(dst, value);
-    splat_set<Tail<_32>>(dst, value, count);
+      return generic::Memset<32, MaxSize>::tail(dst, value, count);
+    generic::Memset<32, MaxSize>::block(dst + 32, value);
+    generic::Memset<32, MaxSize>::tail(dst, value, count);
     return;
   }
-  if (count >= 448 && value == 0 && hasZva())
-    return splat_set<Align<_64>::Then<Loop<Zva64>>>(dst, 0,
-                                                    count);
-  else
-    return splat_set<Align<_16>::Then<Loop<_64>>>(dst, value, count);
-#else
-  /////////////////////////////////////////////////////////////////////////////
-  // Default
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace ::__llvm_libc::scalar;
+  if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
+    generic::Memset<64, MaxSize>::block(dst, 0);
+    align_to_next_boundary<64>(dst, count);
+    return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
+  } else {
+    generic::Memset<16, MaxSize>::block(dst, value);
+    align_to_next_boundary<16>(dst, count);
+    return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count);
+  }
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
 
-  if (count == 0)
-    return;
-  if (count == 1)
-    return splat_set<_1>(dst, value);
-  if (count == 2)
-    return splat_set<_2>(dst, value);
-  if (count == 3)
-    return splat_set<_3>(dst, value);
-  if (count <= 8)
-    return splat_set<HeadTail<_4>>(dst, value, count);
-  if (count <= 16)
-    return splat_set<HeadTail<_8>>(dst, value, count);
-  if (count <= 32)
-    return splat_set<HeadTail<_16>>(dst, value, count);
-  if (count <= 64)
-    return splat_set<HeadTail<_32>>(dst, value, count);
-  if (count <= 128)
-    return splat_set<HeadTail<_64>>(dst, value, count);
-  return splat_set<Align<_32>::Then<Loop<_32>>>(dst, value, count);
+inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86)
+  static constexpr size_t kMaxSize = x86::kAvx512F ? 64
+                                     : x86::kAvx   ? 32
+                                     : x86::kSse2  ? 16
+                                                   : 8;
+  return inline_memset_x86<kMaxSize>(dst, value, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
+  return inline_memset_aarch64<kMaxSize>(dst, value, count);
+#elif defined(LLVM_LIBC_ARCH_ARM)
+  return inline_memset_embedded_tiny(dst, value, count);
+#else
+#error "Unsupported platform"
 #endif
 }
 
+inline static void inline_memset(void *dst, uint8_t value, size_t count) {
+  inline_memset(reinterpret_cast<Ptr>(dst), value, count);
+}
+
 } // namespace __llvm_libc
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
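`generic::Memset` needs the byte value replicated across a full block before storing. One classic way to build that pattern, offered here as an assumption about the approach rather than a quote of `op_generic.h`, is a multiply splat:

    #include <cstdint>
    #include <cstring>

    // Assumed building block: repeat `value` eight times with one multiply,
    // then store the word (memcpy keeps the store alignment-safe).
    inline void memset8_sketch(char *dst, uint8_t value) {
      const uint64_t splat = uint64_t{0x0101010101010101} * value;
      std::memcpy(dst, &splat, sizeof(splat));
    }

The `value == 0 && hasZva()` special case exists because aarch64 can zero an entire cache line with a single `dc zva` instruction, which is what `BzeroCacheLine` wraps.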
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index a4b59a1..8e64322 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -42,7 +42,7 @@ static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
 ///////////////////////////////////////////////////////////////////////////////
 // Memcpy repmovsb implementation
 struct Memcpy {
-  static void repmovsb(char *dst, const char *src, size_t count) {
+  static void repmovsb(void *dst, const void *src, size_t count) {
     asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
   }
 };
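The constraints pin the operands to the registers the instruction implicitly uses: `+D` is rdi (destination), `+S` is rsi (source), and `+c` is rcx (count), all read and written, with a memory clobber because the compiler cannot see the bytes the asm touches. The hardware loop, written out in C++ (this assumes the direction flag is clear, which the SysV ABI guarantees at function entry):

    #include <cstddef>

    // Semantics of `rep movsb`: while rcx is non-zero, copy one byte from
    // [rsi] to [rdi], advance both pointers, and decrement rcx.
    inline void rep_movsb_semantics(unsigned char *&rdi,
                                    const unsigned char *&rsi, size_t &rcx) {
      for (; rcx != 0; --rcx)
        *rdi++ = *rsi++;
    }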
diff --git a/libc/src/string/mempcpy.cpp b/libc/src/string/mempcpy.cpp
index f26bd64..dd539eb 100644
--- a/libc/src/string/mempcpy.cpp
+++ b/libc/src/string/mempcpy.cpp
@@ -15,11 +15,10 @@
 namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(void *, mempcpy,
-                   (void *__restrict dest, const void *__restrict src,
+                   (void *__restrict dst, const void *__restrict src,
                     size_t count)) {
-  char *result = reinterpret_cast<char *>(dest);
-  inline_memcpy(result, reinterpret_cast<const char *>(src), count);
-  return result + count;
+  inline_memcpy(dst, src, count);
+  return reinterpret_cast<char *>(dst) + count;
 }
 
 } // namespace __llvm_libc
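`mempcpy` returns the first byte past the copy, so consecutive appends need no separate length bookkeeping. A usage sketch against a hosted libc (where the function is a GNU extension):

    #define _GNU_SOURCE // expose mempcpy in <string.h> on hosted libcs
    #include <string.h>

    // Build "/usr/lib" by chaining appends; each call resumes where the
    // previous one ended.
    void build_path(char *out) {
      char *p = out;
      p = static_cast<char *>(mempcpy(p, "/usr", 4));
      p = static_cast<char *>(mempcpy(p, "/lib", 5)); // includes the NUL
    }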
diff --git a/libc/src/string/memset.cpp b/libc/src/string/memset.cpp
index 549c074..b80cfce 100644
--- a/libc/src/string/memset.cpp
+++ b/libc/src/string/memset.cpp
@@ -13,8 +13,7 @@
 namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
-  inline_memset(reinterpret_cast<char *>(dst),
-                static_cast<unsigned char>(value), count);
+  inline_memset(dst, static_cast<uint8_t>(value), count);
   return dst;
 }
 
 } // namespace __llvm_libc
-- 
2.7.4