From d8415b02a519f222ecf71b069c96cc85ac635de3 Mon Sep 17 00:00:00 2001 From: Sterling Augustine Date: Fri, 14 Oct 2022 11:17:46 -0700 Subject: [PATCH] Revert "[libc] New version of the mem* framework" This reverts commit https://reviews.llvm.org/D135134 (b3f1d58a131eb546aaf1ac165c77ccb89c40d758) That revision appears to have broken Arm memcpy in some subtle ways. Am communicating with the original author to get a good reproduction. --- libc/src/stdio/printf_core/string_writer.cpp | 2 +- libc/src/string/bcmp.cpp | 4 +- libc/src/string/memcmp.cpp | 4 +- libc/src/string/memmove.cpp | 100 +-- libc/src/string/memory_utils/CMakeLists.txt | 8 +- libc/src/string/memory_utils/README.md | 97 --- .../src/string/memory_utils/bcmp_implementations.h | 171 +---- libc/src/string/memory_utils/elements.h | 774 +++++++++++++++++++++ libc/src/string/memory_utils/elements_aarch64.h | 130 ++++ libc/src/string/memory_utils/elements_x86.h | 189 +++++ .../string/memory_utils/memcmp_implementations.h | 175 ++--- .../string/memory_utils/memcpy_implementations.h | 193 ++--- .../string/memory_utils/memset_implementations.h | 156 +++-- libc/src/string/memory_utils/op_aarch64.h | 175 ----- libc/src/string/memory_utils/op_builtin.h | 148 ---- libc/src/string/memory_utils/op_generic.h | 466 ------------- libc/src/string/memory_utils/op_x86.h | 219 ------ libc/src/string/memory_utils/utils.h | 139 ++-- libc/src/string/memset.cpp | 4 +- libc/test/src/string/bcmp_test.cpp | 12 +- libc/test/src/string/memmove_test.cpp | 14 +- libc/test/src/string/memory_utils/CMakeLists.txt | 2 + .../test/src/string/memory_utils/elements_test.cpp | 137 ++++ .../src/string/memory_utils/memory_access_test.cpp | 228 ++++++ libc/test/src/string/memory_utils/utils_test.cpp | 79 ++- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 9 +- 26 files changed, 1895 insertions(+), 1740 deletions(-) delete mode 100644 libc/src/string/memory_utils/README.md create mode 100644 libc/src/string/memory_utils/elements.h create mode 100644 libc/src/string/memory_utils/elements_aarch64.h create mode 100644 libc/src/string/memory_utils/elements_x86.h delete mode 100644 libc/src/string/memory_utils/op_aarch64.h delete mode 100644 libc/src/string/memory_utils/op_builtin.h delete mode 100644 libc/src/string/memory_utils/op_generic.h delete mode 100644 libc/src/string/memory_utils/op_x86.h create mode 100644 libc/test/src/string/memory_utils/elements_test.cpp create mode 100644 libc/test/src/string/memory_utils/memory_access_test.cpp diff --git a/libc/src/stdio/printf_core/string_writer.cpp b/libc/src/stdio/printf_core/string_writer.cpp index 472573d..a80df32 100644 --- a/libc/src/stdio/printf_core/string_writer.cpp +++ b/libc/src/stdio/printf_core/string_writer.cpp @@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) { len = available_capacity; if (len > 0) { - inline_memset(cur_buffer, static_cast(new_char), len); + inline_memset(cur_buffer, new_char, len); cur_buffer += len; available_capacity -= len; } diff --git a/libc/src/string/bcmp.cpp b/libc/src/string/bcmp.cpp index fb00780..963a7f5 100644 --- a/libc/src/string/bcmp.cpp +++ b/libc/src/string/bcmp.cpp @@ -14,8 +14,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(int, bcmp, (const void *lhs, const void *rhs, size_t count)) { - return static_cast(inline_bcmp(static_cast(lhs), - static_cast(rhs), count)); + return inline_bcmp(static_cast(lhs), + static_cast(rhs), count); } } // namespace __llvm_libc diff --git a/libc/src/string/memcmp.cpp b/libc/src/string/memcmp.cpp index 357b57d1..292525e 
100644 --- a/libc/src/string/memcmp.cpp +++ b/libc/src/string/memcmp.cpp @@ -15,8 +15,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(int, memcmp, (const void *lhs, const void *rhs, size_t count)) { - return static_cast(inline_memcmp(static_cast(lhs), - static_cast(rhs), count)); + return inline_memcmp(static_cast(lhs), + static_cast(rhs), count); } } // namespace __llvm_libc diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp index f40ab62..f242578 100644 --- a/libc/src/string/memmove.cpp +++ b/libc/src/string/memmove.cpp @@ -9,104 +9,36 @@ #include "src/string/memmove.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/__support/integer_operations.h" +#include "src/string/memory_utils/elements.h" #include // size_t, ptrdiff_t -#include - namespace __llvm_libc { -[[maybe_unused]] static inline void -inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) { - if ((count == 0) || (dst == src)) - return; - if (dst < src) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - builtin::Memcpy<1>::block(dst + offset, src + offset); - } else { -#pragma nounroll - for (ptrdiff_t offset = count - 1; offset >= 0; --offset) - builtin::Memcpy<1>::block(dst + offset, src + offset); - } -} - -template -[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src, - size_t count) { +static inline void inline_memmove(char *dst, const char *src, size_t count) { + using namespace __llvm_libc::scalar; if (count == 0) return; if (count == 1) - return generic::Memmove<1, MaxSize>::block(dst, src); + return move<_1>(dst, src); if (count <= 4) - return generic::Memmove<2, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 8) - return generic::Memmove<4, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 16) - return generic::Memmove<8, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 32) - return generic::Memmove<16, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 64) - return generic::Memmove<32, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 128) - return generic::Memmove<64, MaxSize>::head_tail(dst, src, count); - if (dst < src) { - generic::Memmove<32, MaxSize>::template align_forward(dst, src, - count); - return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src, - count); - } else { - generic::Memmove<32, MaxSize>::template align_backward(dst, src, - count); - return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src, - count); - } -} + return move>(dst, src, count); -static inline void inline_memmove(Ptr dst, CPtr src, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) -#if defined(LLVM_LIBC_ARCH_X86) - static constexpr size_t kMaxSize = x86::kAvx512F ? 64 - : x86::kAvx ? 32 - : x86::kSse2 ? 16 - : 8; -#elif defined(LLVM_LIBC_ARCH_AARCH64) - static constexpr size_t kMaxSize = aarch64::kNeon ? 
16 : 8; -#endif - // return inline_memmove_generic(dst, src, count); - if (count == 0) - return; - if (count == 1) - return generic::Memmove<1, kMaxSize>::block(dst, src); - if (count <= 4) - return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count); - if (count <= 8) - return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count); - if (count <= 16) - return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count); - if (count <= 32) - return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count); - if (count <= 64) - return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count); - if (count <= 128) - return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count); - if (dst < src) { - generic::Memmove<32, kMaxSize>::align_forward(dst, src, count); - return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src, - count); - } else { - generic::Memmove<32, kMaxSize>::align_backward(dst, src, count); - return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src, - count); - } -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memmove_embedded_tiny(dst, src, count); -#else -#error "Unsupported platform" -#endif + using AlignedMoveLoop = Align<_16, Arg::Src>::Then>; + if (dst < src) + return move(dst, src, count); + else if (dst > src) + return move_backward(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memmove, diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt index 630b0d4..d735fcf 100644 --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -2,17 +2,13 @@ add_header_library( memory_utils HDRS + utils.h + elements.h bcmp_implementations.h bzero_implementations.h memcmp_implementations.h memcpy_implementations.h memset_implementations.h - op_aarch64.h - op_higher_order.h - op_builtin.h - op_generic.h - op_x86.h - utils.h DEPS libc.src.__support.CPP.bit ) diff --git a/libc/src/string/memory_utils/README.md b/libc/src/string/memory_utils/README.md deleted file mode 100644 index 83a29066..0000000 --- a/libc/src/string/memory_utils/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# The mem* framework - -The framework handles the following mem* functions: - - `memcpy` - - `memmove` - - `memset` - - `bzero` - - `bcmp` - - `memcmp` - -## Building blocks - -These functions can be built out of a set of lower-level operations: - - **`block`** : operates on a block of `SIZE` bytes. - - **`tail`** : operates on the last `SIZE` bytes of the buffer (e.g., `[dst + count - SIZE, dst + count]`) - - **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`. - - **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes and handle the remaining bytes with a `tail` operation. - -As an illustration, let's take the example of a trivial `memset` implementation: - - ```C++ - extern "C" void memset(const char* dst, int value, size_t count) { - if (count == 0) return; - if (count == 1) return Memset<1>::block(dst, value); - if (count == 2) return Memset<2>::block(dst, value); - if (count == 3) return Memset<3>::block(dst, value); - if (count <= 8) return Memset<4>::head_tail(dst, value, count); // Note that 0 to 4 bytes are written twice. - if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here. 
- return Memset<16>::loop_and_tail(dst, value, count); -} - ``` - -Now let's have a look into the `Memset` structure: - -```C++ -template -struct Memset { - static constexpr size_t SIZE = Size; - - static inline void block(Ptr dst, uint8_t value) { - // Implement me - } - - static inline void tail(Ptr dst, uint8_t value, size_t count) { - block(dst + count - SIZE, value); - } - - static inline void head_tail(Ptr dst, uint8_t value, size_t count) { - block(dst, value); - tail(dst, value, count); - } - - static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { - size_t offset = 0; - do { - block(dst + offset, value); - offset += SIZE; - } while (offset < count - SIZE); - tail(dst, value, count); - } -}; -``` - -As you can see, the `tail`, `head_tail` and `loop_and_tail` are higher order functions that build on each others. Only `block` really needs to be implemented. -In earlier designs we were implementing these higher order functions with templated functions but it appears that it is more readable to have the implementation explicitly stated. -**This design is useful because it provides customization points**. For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics. - -## Scoped specializations - -We can have several specializations of the `Memset` structure. Depending on the target requirements we can use one or several scopes for the same implementation. - -In the following example we use the `generic` implementation for the small sizes but use the `x86` implementation for the loop. -```C++ - extern "C" void memset(const char* dst, int value, size_t count) { - if (count == 0) return; - if (count == 1) return generic::Memset<1>::block(dst, value); - if (count == 2) return generic::Memset<2>::block(dst, value); - if (count == 3) return generic::Memset<3>::block(dst, value); - if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count); - if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count); - return x86::Memset<16>::loop_and_tail(dst, value, count); -} -``` - -### The `builtin` scope - -Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset)) - -### The `generic` scope - -In this scope we define pure C++ implementations using native integral types and clang vector extensions. - -### The arch specific scopes - -Then comes implementations that are using specific architectures or microarchitectures features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`). - -The purpose here is to rely on builtins as much as possible and fallback to `asm volatile` as a last resort. 
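Illustrative note (not part of the patch): the README text above describes a `generic` scope written in plain C++ and arch-specific scopes that fall back to `asm volatile`. The sketch below follows the README's `Memset<Size>` example to show what one `block` per scope could look like; the namespaces and bodies are assumptions for illustration, not the library's actual implementation.

```cpp
#include <stddef.h> // size_t
#include <stdint.h> // uint8_t

namespace generic {
// "generic" scope: pure C++, no intrinsics. The real code relies on compiler
// builtins or vector extensions; a byte loop stands in for them here.
template <size_t Size> struct Memset {
  static constexpr size_t SIZE = Size;
  static inline void block(char *dst, uint8_t value) {
    for (size_t i = 0; i < SIZE; ++i)
      dst[i] = static_cast<char>(value);
  }
  static inline void tail(char *dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }
  static inline void head_tail(char *dst, uint8_t value, size_t count) {
    block(dst, value);
    tail(dst, value, count);
  }
};
} // namespace generic

namespace x86 {
// Arch-specific scope: as a last resort, inline assembly. `rep stosb` writes
// AL to [RDI] RCX times, so the whole count is handled without a tail.
template <size_t Size> struct Memset {
  static constexpr size_t SIZE = Size;
  static inline void loop_and_tail(char *dst, uint8_t value, size_t count) {
    asm volatile("rep stosb"
                 : "+D"(dst), "+c"(count)
                 : "a"(value)
                 : "memory");
  }
};
} // namespace x86
```

A dispatcher can then mix scopes exactly as the README shows, e.g. `generic::Memset<8>::head_tail` for small sizes and `x86::Memset<16>::loop_and_tail` for the loop.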
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h index 9d9ad2b..c26e38e 100644 --- a/libc/src/string/memory_utils/bcmp_implementations.h +++ b/libc/src/string/memory_utils/bcmp_implementations.h @@ -11,164 +11,49 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/elements.h" #include // size_t namespace __llvm_libc { -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset)) - return value; - return BcmpReturnType::ZERO(); +// Fixed-size difference between 'lhs' and 'rhs'. +template bool differs(const char *lhs, const char *rhs) { + return !Element::equals(lhs, rhs); } - -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { - if (count < 256) - return generic::Bcmp<16>::loop_and_tail(p1, p2, count); - if (auto value = generic::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - return generic::Bcmp<64>::loop_and_tail(p1, p2, count); +// Runtime-size difference between 'lhs' and 'rhs'. +template +bool differs(const char *lhs, const char *rhs, size_t size) { + return !Element::equals(lhs, rhs, size); } -#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) +static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) { #if defined(LLVM_LIBC_ARCH_X86) -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count < 256) - return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count); - if (auto value = x86::sse2::Bcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count <= 64) - return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx2::Bcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 256)) { - if (auto value = x86::avx2::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count <= 64) - return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 256)) { - if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2, - 
size_t count) { + using namespace ::__llvm_libc::x86; +#elif defined(LLVM_LIBC_ARCH_AARCH64) + using namespace ::__llvm_libc::aarch64; +#else + using namespace ::__llvm_libc::scalar; +#endif if (count == 0) - return BcmpReturnType::ZERO(); + return 0; if (count == 1) - return generic::Bcmp<1>::block(p1, p2); + return differs<_1>(lhs, rhs); if (count == 2) - return generic::Bcmp<2>::block(p1, p2); - if (count <= 4) - return generic::Bcmp<2>::head_tail(p1, p2, count); + return differs<_2>(lhs, rhs); + if (count == 3) + return differs<_3>(lhs, rhs); if (count <= 8) - return generic::Bcmp<4>::head_tail(p1, p2, count); + return differs>(lhs, rhs, count); if (count <= 16) - return generic::Bcmp<8>::head_tail(p1, p2, count); - if constexpr (x86::kAvx512BW) - return inline_bcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_bcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_bcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_bcmp_generic_gt16(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_aarch64(CPtr p1, CPtr p2, size_t count) { - if (likely(count <= 32)) { - if (unlikely(count >= 16)) { - return generic::Bcmp<16>::head_tail(p1, p2, count); - } - switch (count) { - case 0: - return BcmpReturnType::ZERO(); - case 1: - return generic::Bcmp<1>::block(p1, p2); - case 2: - return generic::Bcmp<2>::block(p1, p2); - case 3: - return generic::Bcmp<2>::head_tail(p1, p2, count); - case 4: - return generic::Bcmp<4>::block(p1, p2); - case 5: - case 6: - case 7: - return generic::Bcmp<4>::head_tail(p1, p2, count); - case 8: - return generic::Bcmp<8>::block(p1, p2); - case 9: - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - return generic::Bcmp<8>::head_tail(p1, p2, count); - } - } - + return differs>(lhs, rhs, count); + if (count <= 32) + return differs>(lhs, rhs, count); if (count <= 64) - return generic::Bcmp<32>::head_tail(p1, p2, count); - - // Aligned loop if > 256, otherwise normal loop - if (count > 256) { - if (auto value = generic::Bcmp<32>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return generic::Bcmp<32>::loop_and_tail(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) - return inline_bcmp_x86(p1, p2, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - return inline_bcmp_aarch64(p1, p2, count); -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_bcmp_embedded_tiny(p1, p2, count); -#else -#error "Unsupported platform" -#endif + return differs>(lhs, rhs, count); + if (count <= 128) + return differs>(lhs, rhs, count); + return differs::Then>>(lhs, rhs, count); } } // namespace __llvm_libc diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h new file mode 100644 index 0000000..f5a3830 --- /dev/null +++ b/libc/src/string/memory_utils/elements.h @@ -0,0 +1,774 @@ +//===-- Elementary operations to compose memory primitives ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H + +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t + +#include "src/__support/endian.h" +#include "src/string/memory_utils/utils.h" + +namespace __llvm_libc { + +// Elementary Operations +// -------------------------------- +// We define abstract elementary operations acting on fixed chunks of memory. +// These are low level building blocks that are meant to be assembled to compose +// higher order abstractions. Each function is defined twice: once with +// fixed-size operations, and once with runtime-size operations. + +// Fixed-size copy from 'src' to 'dst'. +template +void copy(char *__restrict dst, const char *__restrict src) { + Element::copy(dst, src); +} +// Runtime-size copy from 'src' to 'dst'. +template +void copy(char *__restrict dst, const char *__restrict src, size_t size) { + Element::copy(dst, src, size); +} + +// Fixed-size move from 'src' to 'dst'. +template void move(char *dst, const char *src) { + Element::move(dst, src); +} +// Runtime-size move from 'src' to 'dst'. +template void move(char *dst, const char *src, size_t size) { + Element::move(dst, src, size); +} +// Runtime-size move from 'src' to 'dst'. +template +void move_backward(char *dst, const char *src, size_t size) { + Element::move_backward(dst, src, size); +} + +// Fixed-size equality between 'lhs' and 'rhs'. +template bool equals(const char *lhs, const char *rhs) { + return Element::equals(lhs, rhs); +} +// Runtime-size equality between 'lhs' and 'rhs'. +template +bool equals(const char *lhs, const char *rhs, size_t size) { + return Element::equals(lhs, rhs, size); +} + +// Fixed-size three-way comparison between 'lhs' and 'rhs'. +template +int three_way_compare(const char *lhs, const char *rhs) { + return Element::three_way_compare(lhs, rhs); +} +// Runtime-size three-way comparison between 'lhs' and 'rhs'. +template +int three_way_compare(const char *lhs, const char *rhs, size_t size) { + return Element::three_way_compare(lhs, rhs, size); +} + +// Fixed-size initialization. +template +void splat_set(char *dst, const unsigned char value) { + Element::splat_set(dst, value); +} +// Runtime-size initialization. +template +void splat_set(char *dst, const unsigned char value, size_t size) { + Element::splat_set(dst, value, size); +} + +// Stack placeholder for Move operations. +template struct Storage { char bytes[Element::SIZE]; }; + +// Fixed-size Higher-Order Operations +// ---------------------------------- +// - Repeated: Repeat the operation several times in a row. +// - Chained: Chain the operation of several types. + +// Repeat the operation several times in a row. 
+template struct Repeated { + static constexpr size_t SIZE = ElementCount * Element::SIZE; + + static void copy(char *__restrict dst, const char *__restrict src) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::SIZE; + Element::copy(dst + offset, src + offset); + } + } + + static void move(char *dst, const char *src) { + const auto value = load(src); + store(dst, value); + } + + static bool equals(const char *lhs, const char *rhs) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::SIZE; + if (!Element::equals(lhs + offset, rhs + offset)) + return false; + } + return true; + } + + static int three_way_compare(const char *lhs, const char *rhs) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::SIZE; + // We make the assumption that 'equals' is cheaper than + // 'three_way_compare'. + if (Element::equals(lhs + offset, rhs + offset)) + continue; + return Element::three_way_compare(lhs + offset, rhs + offset); + } + return 0; + } + + static void splat_set(char *dst, const unsigned char value) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::SIZE; + Element::splat_set(dst + offset, value); + } + } + + static Storage load(const char *ptr) { + Storage value; + copy(reinterpret_cast(&value), ptr); + return value; + } + + static void store(char *ptr, Storage value) { + copy(ptr, reinterpret_cast(&value)); + } +}; + +template struct Repeated { + static void move(char *, const char *) {} +}; + +// Chain the operation of several types. +// For instance, to handle a 3 bytes operation, one can use: +// Chained::Operation(); +template struct Chained; + +template struct Chained { + static constexpr size_t SIZE = Head::SIZE + Chained::SIZE; + + static void copy(char *__restrict dst, const char *__restrict src) { + Chained::copy(dst + Head::SIZE, src + Head::SIZE); + __llvm_libc::copy(dst, src); + } + + static void move(char *dst, const char *src) { + const auto value = Head::load(src); + Chained::move(dst + Head::SIZE, src + Head::SIZE); + Head::store(dst, value); + } + + static bool equals(const char *lhs, const char *rhs) { + if (!__llvm_libc::equals(lhs, rhs)) + return false; + return Chained::equals(lhs + Head::SIZE, rhs + Head::SIZE); + } + + static int three_way_compare(const char *lhs, const char *rhs) { + if (__llvm_libc::equals(lhs, rhs)) + return Chained::three_way_compare(lhs + Head::SIZE, + rhs + Head::SIZE); + return __llvm_libc::three_way_compare(lhs, rhs); + } + + static void splat_set(char *dst, const unsigned char value) { + Chained::splat_set(dst + Head::SIZE, value); + __llvm_libc::splat_set(dst, value); + } +}; + +template <> struct Chained<> { + static constexpr size_t SIZE = 0; + static void copy(char *__restrict, const char *__restrict) {} + static void move(char *, const char *) {} + static bool equals(const char *, const char *) { return true; } + static int three_way_compare(const char *, const char *) { return 0; } + static void splat_set(char *, const unsigned char) {} +}; + +// Overlap ElementA and ElementB so they span Size bytes. 
+template +struct Overlap { + static constexpr size_t SIZE = Size; + static_assert(ElementB::SIZE <= ElementA::SIZE, "ElementB too big"); + static_assert(ElementA::SIZE <= Size, "ElementA too big"); + static_assert((ElementA::SIZE + ElementB::SIZE) >= Size, + "Elements too small to overlap"); + static constexpr size_t OFFSET = SIZE - ElementB::SIZE; + + static void copy(char *__restrict dst, const char *__restrict src) { + ElementA::copy(dst, src); + ElementB::copy(dst + OFFSET, src + OFFSET); + } + + static void move(char *dst, const char *src) { + const auto value_a = ElementA::load(src); + const auto value_b = ElementB::load(src + OFFSET); + ElementB::store(dst + OFFSET, value_b); + ElementA::store(dst, value_a); + } + + static bool equals(const char *lhs, const char *rhs) { + if (!ElementA::equals(lhs, rhs)) + return false; + if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET)) + return false; + return true; + } + + static int three_way_compare(const char *lhs, const char *rhs) { + if (!ElementA::equals(lhs, rhs)) + return ElementA::three_way_compare(lhs, rhs); + if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET)) + return ElementB::three_way_compare(lhs + OFFSET, rhs + OFFSET); + return 0; + } + + static void splat_set(char *dst, const unsigned char value) { + ElementA::splat_set(dst, value); + ElementB::splat_set(dst + OFFSET, value); + } +}; + +// Runtime-size Higher-Order Operations +// ------------------------------------ +// - Tail: Perform the operation on the last 'T::SIZE' bytes of the buffer. +// - HeadTail: Perform the operation on the first and last 'T::SIZE' bytes +// of the buffer. +// - Loop: Perform a loop of fixed-sized operations. + +// Perform the operation on the last 'T::SIZE' bytes of the buffer. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [________XXXXXXXX___] +// +// Precondition: `size >= T::SIZE`. +template struct Tail { + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + return T::copy(dst + offset(size), src + offset(size)); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + return T::equals(lhs + offset(size), rhs + offset(size)); + } + + static int three_way_compare(const char *lhs, const char *rhs, size_t size) { + return T::three_way_compare(lhs + offset(size), rhs + offset(size)); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + return T::splat_set(dst + offset(size), value); + } + + static size_t offset(size_t size) { return size - T::SIZE; } +}; + +// Perform the operation on the first and last 'T::SIZE' bytes of the buffer. +// This is useful for overlapping operations. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [__XXXXXXXX_________] +// [________XXXXXXXX___] +// +// Precondition: `size >= T::SIZE && size <= 2 x T::SIZE`. 
+template struct HeadTail { + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + T::copy(dst, src); + Tail::copy(dst, src, size); + } + + static void move(char *dst, const char *src, size_t size) { + const size_t offset = Tail::offset(size); + const auto head_value = T::load(src); + const auto tail_value = T::load(src + offset); + T::store(dst + offset, tail_value); + T::store(dst, head_value); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + if (!T::equals(lhs, rhs)) + return false; + return Tail::equals(lhs, rhs, size); + } + + static int three_way_compare(const char *lhs, const char *rhs, size_t size) { + if (!T::equals(lhs, rhs)) + return T::three_way_compare(lhs, rhs); + return Tail::three_way_compare(lhs, rhs, size); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + T::splat_set(dst, value); + Tail::splat_set(dst, value, size); + } +}; + +// Simple loop ending with a Tail operation. +// +// e.g. with +// [12345678123456781234567812345678] +// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___] +// [__XXXXXXXX_______________________] +// [__________XXXXXXXX_______________] +// [__________________XXXXXXXX_______] +// [______________________XXXXXXXX___] +// +// Precondition: +// - size >= T::SIZE +template struct Loop { + static_assert(T::SIZE == TailT::SIZE, + "Tail type must have the same size as T"); + + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + size_t offset = 0; + do { + T::copy(dst + offset, src + offset); + offset += T::SIZE; + } while (offset < size - T::SIZE); + Tail::copy(dst, src, size); + } + + // Move forward suitable when dst < src. We load the tail bytes before + // handling the loop. + // + // e.g. Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [_________________________LLLLLLLL___] + // [___LLLLLLLL_________________________] + // [_SSSSSSSS___________________________] + // [___________LLLLLLLL_________________] + // [_________SSSSSSSS___________________] + // [___________________LLLLLLLL_________] + // [_________________SSSSSSSS___________] + // [_______________________SSSSSSSS_____] + static void move(char *dst, const char *src, size_t size) { + const size_t tail_offset = Tail::offset(size); + const auto tail_value = TailT::load(src + tail_offset); + size_t offset = 0; + do { + T::move(dst + offset, src + offset); + offset += T::SIZE; + } while (offset < size - T::SIZE); + TailT::store(dst + tail_offset, tail_value); + } + + // Move forward suitable when dst > src. We load the head bytes before + // handling the loop. + // + // e.g. 
Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [___LLLLLLLL_________________________] + // [_________________________LLLLLLLL___] + // [___________________________SSSSSSSS_] + // [_________________LLLLLLLL___________] + // [___________________SSSSSSSS_________] + // [_________LLLLLLLL___________________] + // [___________SSSSSSSS_________________] + // [_____SSSSSSSS_______________________] + static void move_backward(char *dst, const char *src, size_t size) { + const auto head_value = TailT::load(src); + ptrdiff_t offset = size - T::SIZE; + do { + T::move(dst + offset, src + offset); + offset -= T::SIZE; + } while (offset >= 0); + TailT::store(dst, head_value); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + size_t offset = 0; + do { + if (!T::equals(lhs + offset, rhs + offset)) + return false; + offset += T::SIZE; + } while (offset < size - T::SIZE); + return Tail::equals(lhs, rhs, size); + } + + static int three_way_compare(const char *lhs, const char *rhs, size_t size) { + size_t offset = 0; + do { + if (!T::equals(lhs + offset, rhs + offset)) + return T::three_way_compare(lhs + offset, rhs + offset); + offset += T::SIZE; + } while (offset < size - T::SIZE); + return Tail::three_way_compare(lhs, rhs, size); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + size_t offset = 0; + do { + T::splat_set(dst + offset, value); + offset += T::SIZE; + } while (offset < size - T::SIZE); + Tail::splat_set(dst, value, size); + } +}; + +namespace internal { + +template struct ArgSelector {}; + +template <> struct ArgSelector { + template + static T1 *__restrict &Select(T1 *__restrict &p1ref, T2 *__restrict &) { + return p1ref; + } +}; + +template <> struct ArgSelector { + template + static T2 *__restrict &Select(T1 *__restrict &, T2 *__restrict &p2ref) { + return p2ref; + } +}; + +// Provides a specialized bump function that adjusts pointers and size so first +// argument (resp. second argument) gets aligned to Alignment. +// We make sure the compiler knows about the adjusted pointer alignment. +// The 'additional_bumps' parameter allows to reach previous / next aligned +// pointers. +template struct Align { + template + static void bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size, + int additional_bumps = 0) { + auto &aligned_ptr = ArgSelector::Select(p1ref, p2ref); + auto offset = offset_to_next_aligned(aligned_ptr); + adjust(offset + additional_bumps * Alignment, p1ref, p2ref, size); + aligned_ptr = assume_aligned(aligned_ptr); + } +}; + +} // namespace internal + +// An alignment operation that: +// - executes the 'AlignmentT' operation +// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected +// pointer gets aligned, size is decreased accordingly. +// - calls the 'NextT' operation. +// +// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as: +// copy::Then>>(dst, src, count); +template struct Align { +private: + static constexpr size_t ALIGNMENT = AlignmentT::SIZE; + static_assert(ALIGNMENT > 1, "Alignment must be more than 1"); + static_assert(is_power2(ALIGNMENT), "Alignment must be a power of 2"); + +public: + template struct Then { + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + AlignmentT::copy(dst, src); + internal::Align::bump(dst, src, size); + NextT::copy(dst, src, size); + } + + // Move forward suitable when dst < src. 
The alignment is performed with an + // HeadTail operation of size ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes and making sure src is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [___________LLLLLLLL______________] + // [_SSSSSSSS________________________] + // [________SSSSSSSS_________________] + // + // e.g. Moving two bytes and making sure dst is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [______LLLLLLLL___________________] + // [_SSSSSSSS________________________] + // [___SSSSSSSS______________________] + static void move(char *dst, const char *src, size_t size) { + char *next_dst = dst; + const char *next_src = src; + size_t next_size = size; + internal::Align::bump(next_dst, next_src, next_size, + 1); + HeadTail::move(dst, src, size - next_size); + NextT::move(next_dst, next_src, next_size); + } + + // Move backward suitable when dst > src. The alignment is performed with an + // HeadTail operation of size ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes backward and making sure src is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _________________LLLLLLLL_______] + // [ ___________________LLLLLLLL_____] + // [____________________SSSSSSSS_____] + // [______________________SSSSSSSS___] + // + // e.g. Moving two bytes and making sure dst is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _______________LLLLLLLL_________] + // [ ___________________LLLLLLLL_____] + // [__________________SSSSSSSS_______] + // [______________________SSSSSSSS___] + static void move_backward(char *dst, const char *src, size_t size) { + char *headtail_dst = dst + size; + const char *headtail_src = src + size; + size_t headtail_size = 0; + internal::Align::bump(headtail_dst, headtail_src, + headtail_size, -2); + HeadTail::move(headtail_dst, headtail_src, headtail_size); + NextT::move_backward(dst, src, size - headtail_size); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + if (!AlignmentT::equals(lhs, rhs)) + return false; + internal::Align::bump(lhs, rhs, size); + return NextT::equals(lhs, rhs, size); + } + + static int three_way_compare(const char *lhs, const char *rhs, + size_t size) { + if (!AlignmentT::equals(lhs, rhs)) + return AlignmentT::three_way_compare(lhs, rhs); + internal::Align::bump(lhs, rhs, size); + return NextT::three_way_compare(lhs, rhs, size); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + AlignmentT::splat_set(dst, value); + char *dummy = nullptr; + internal::Align::bump(dst, dummy, size); + NextT::splat_set(dst, value, size); + } + }; +}; + +// An operation that allows to skip the specified amount of bytes. 
+template struct Skip { + template struct Then { + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + NextT::copy(dst + Bytes, src + Bytes, size - Bytes); + } + + static void copy(char *__restrict dst, const char *__restrict src) { + NextT::copy(dst + Bytes, src + Bytes); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + return NextT::equals(lhs + Bytes, rhs + Bytes, size - Bytes); + } + + static bool equals(const char *lhs, const char *rhs) { + return NextT::equals(lhs + Bytes, rhs + Bytes); + } + + static int three_way_compare(const char *lhs, const char *rhs, + size_t size) { + return NextT::three_way_compare(lhs + Bytes, rhs + Bytes, size - Bytes); + } + + static int three_way_compare(const char *lhs, const char *rhs) { + return NextT::three_way_compare(lhs + Bytes, rhs + Bytes); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + NextT::splat_set(dst + Bytes, value, size - Bytes); + } + + static void splat_set(char *dst, const unsigned char value) { + NextT::splat_set(dst + Bytes, value); + } + }; +}; + +// Fixed-size Builtin Operations +// ----------------------------- +// Note: Do not use 'builtin' right now as it requires the implementation of the +// `_inline` versions of all the builtins. Theoretically, Clang can still turn +// them into calls to the C library leading to reentrancy problems. +namespace builtin { + +#ifndef __has_builtin +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif + +template struct Builtin { + static constexpr size_t SIZE = Size; + + static void copy(char *__restrict dst, const char *__restrict src) { +#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER + for_loop_copy(dst, src); +#elif __has_builtin(__builtin_memcpy_inline) + // __builtin_memcpy_inline guarantees to never call external functions. + // Unfortunately it is not widely available. + __builtin_memcpy_inline(dst, src, SIZE); +#else + for_loop_copy(dst, src); +#endif + } + + static void move(char *dst, const char *src) { +#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER + for_loop_move(dst, src); +#elif __has_builtin(__builtin_memmove) + __builtin_memmove(dst, src, SIZE); +#else + for_loop_move(dst, src); +#endif + } + +#if __has_builtin(__builtin_memcmp_inline) +#define LLVM_LIBC_MEMCMP __builtin_memcmp_inline +#else +#define LLVM_LIBC_MEMCMP __builtin_memcmp +#endif + + static bool equals(const char *lhs, const char *rhs) { + return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE) == 0; + } + + static int three_way_compare(const char *lhs, const char *rhs) { + return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE); + } + + static void splat_set(char *dst, const unsigned char value) { + __builtin_memset(dst, value, SIZE); + } + +private: + // Copies `SIZE` bytes from `src` to `dst` using a for loop. + // This code requires the use of `-fno-builtin-memcpy` to prevent the compiler + // from turning the for-loop back into `__builtin_memcpy`. 
+ static void for_loop_copy(char *__restrict dst, const char *__restrict src) { + for (size_t i = 0; i < SIZE; ++i) + dst[i] = src[i]; + } + + static void for_loop_move(char *dst, const char *src) { + for (size_t i = 0; i < SIZE; ++i) + dst[i] = src[i]; + } +}; + +using _1 = Builtin<1>; +using _2 = Builtin<2>; +using _3 = Builtin<3>; +using _4 = Builtin<4>; +using _8 = Builtin<8>; +using _16 = Builtin<16>; +using _32 = Builtin<32>; +using _64 = Builtin<64>; +using _128 = Builtin<128>; + +} // namespace builtin + +// Fixed-size Scalar Operations +// ---------------------------- +namespace scalar { + +// The Scalar type makes use of simple sized integers. +template struct Scalar { + static constexpr size_t SIZE = sizeof(T); + + static void copy(char *__restrict dst, const char *__restrict src) { + store(dst, load(src)); + } + + static void move(char *dst, const char *src) { store(dst, load(src)); } + + static bool equals(const char *lhs, const char *rhs) { + return load(lhs) == load(rhs); + } + + static int three_way_compare(const char *lhs, const char *rhs) { + return scalar_three_way_compare(load(lhs), load(rhs)); + } + + static void splat_set(char *dst, const unsigned char value) { + store(dst, get_splatted_value(value)); + } + + static int scalar_three_way_compare(T a, T b); + + static T load(const char *ptr) { + T value; + builtin::Builtin::copy(reinterpret_cast(&value), ptr); + return value; + } + static void store(char *ptr, T value) { + builtin::Builtin::copy(ptr, reinterpret_cast(&value)); + } + +private: + static T get_splatted_value(const unsigned char value) { + return T(~0) / T(0xFF) * T(value); + } +}; + +template <> +inline int Scalar::scalar_three_way_compare(uint8_t a, uint8_t b) { + const int16_t la = Endian::to_big_endian(a); + const int16_t lb = Endian::to_big_endian(b); + return la - lb; +} +template <> +inline int Scalar::scalar_three_way_compare(uint16_t a, uint16_t b) { + const int32_t la = Endian::to_big_endian(a); + const int32_t lb = Endian::to_big_endian(b); + return la - lb; +} +template <> +inline int Scalar::scalar_three_way_compare(uint32_t a, uint32_t b) { + const uint32_t la = Endian::to_big_endian(a); + const uint32_t lb = Endian::to_big_endian(b); + return la > lb ? 1 : la < lb ? -1 : 0; +} +template <> +inline int Scalar::scalar_three_way_compare(uint64_t a, uint64_t b) { + const uint64_t la = Endian::to_big_endian(a); + const uint64_t lb = Endian::to_big_endian(b); + return la > lb ? 1 : la < lb ? -1 : 0; +} + +using UINT8 = Scalar; // 1 Byte +using UINT16 = Scalar; // 2 Bytes +using UINT32 = Scalar; // 4 Bytes +using UINT64 = Scalar; // 8 Bytes + +using _1 = UINT8; +using _2 = UINT16; +using _3 = Chained; +using _4 = UINT32; +using _8 = UINT64; +using _16 = Repeated<_8, 2>; +using _32 = Repeated<_8, 4>; +using _64 = Repeated<_8, 8>; +using _128 = Repeated<_8, 16>; + +} // namespace scalar +} // namespace __llvm_libc + +#include +#include + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H diff --git a/libc/src/string/memory_utils/elements_aarch64.h b/libc/src/string/memory_utils/elements_aarch64.h new file mode 100644 index 0000000..0529df7 --- /dev/null +++ b/libc/src/string/memory_utils/elements_aarch64.h @@ -0,0 +1,130 @@ +//===-- Elementary operations for aarch64 --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H + +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_AARCH64) + +#include +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t + +#ifdef __ARM_NEON +#include +#endif + +namespace __llvm_libc { +namespace aarch64_memset { +#ifdef __ARM_NEON +struct Splat8 { + static constexpr size_t SIZE = 8; + static void splat_set(char *dst, const unsigned char value) { + vst1_u8((uint8_t *)dst, vdup_n_u8(value)); + } +}; + +struct Splat16 { + static constexpr size_t SIZE = 16; + static void splat_set(char *dst, const unsigned char value) { + vst1q_u8((uint8_t *)dst, vdupq_n_u8(value)); + } +}; + +using _8 = Splat8; +using _16 = Splat16; +#else +using _8 = __llvm_libc::scalar::_8; +using _16 = Repeated<_8, 2>; +#endif // __ARM_NEON + +using _1 = __llvm_libc::scalar::_1; +using _2 = __llvm_libc::scalar::_2; +using _3 = __llvm_libc::scalar::_3; +using _4 = __llvm_libc::scalar::_4; +using _32 = Chained<_16, _16>; +using _64 = Chained<_32, _32>; + +struct Zva64 { + static constexpr size_t SIZE = 64; + + static void splat_set(char *dst, const unsigned char) { +#if __SIZEOF_POINTER__ == 4 + asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); +#else + asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); +#endif + } +}; + +inline static bool hasZva() { + uint64_t zva_val; + asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); + // DC ZVA is permitted if DZP, bit [4] is zero. + // BS, bits [3:0] is log2 of the block size in words. + // So the next line checks whether the instruction is permitted and block size + // is 16 words (i.e. 64 bytes). 
+ return (zva_val & 0b11111) == 0b00100; +} + +} // namespace aarch64_memset + +namespace aarch64 { + +using _1 = __llvm_libc::scalar::_1; +using _2 = __llvm_libc::scalar::_2; +using _3 = __llvm_libc::scalar::_3; +using _4 = __llvm_libc::scalar::_4; +using _8 = __llvm_libc::scalar::_8; +using _16 = __llvm_libc::scalar::_16; + +#ifdef __ARM_NEON +struct N32 { + static constexpr size_t SIZE = 32; + static bool equals(const char *lhs, const char *rhs) { + uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs); + uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs); + uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16)); + uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16)); + uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1)); + uint64_t res = + vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0); + return res == 0; + } + static int three_way_compare(const char *lhs, const char *rhs) { + uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs); + uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs); + uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16)); + uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16)); + uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1)); + uint64_t res = + vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0); + if (res == 0) + return 0; + size_t index = (__builtin_ctzl(res) >> 3) << 2; + uint32_t l = *((const uint32_t *)(lhs + index)); + uint32_t r = *((const uint32_t *)(rhs + index)); + return __llvm_libc::scalar::_4::scalar_three_way_compare(l, r); + } +}; + +using _32 = N32; +using _64 = Repeated<_32, 2>; +#else +using _32 = __llvm_libc::scalar::_32; +using _64 = __llvm_libc::scalar::_64; +#endif // __ARM_NEON + +} // namespace aarch64 +} // namespace __llvm_libc + +#endif // defined(LLVM_LIBC_ARCH_AARCH64) + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H diff --git a/libc/src/string/memory_utils/elements_x86.h b/libc/src/string/memory_utils/elements_x86.h new file mode 100644 index 0000000..7a2a8cc --- /dev/null +++ b/libc/src/string/memory_utils/elements_x86.h @@ -0,0 +1,189 @@ +//===-- Elementary operations for x86 -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H + +#include "src/__support/CPP/bit.h" +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_X86) + +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t + +#ifdef __SSE2__ +#include +#endif // __SSE2__ + +#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar + +// Fixed-size Vector Operations +// ---------------------------- + +namespace __llvm_libc { +namespace x86 { + +#ifdef __SSE2__ +template struct Vector : public Base { + static void copy(char *__restrict dst, const char *__restrict src) { + Base::store(dst, Base::load(src)); + } + + static void move(char *dst, const char *src) { + Base::store(dst, Base::load(src)); + } + + static bool equals(const char *a, const char *b) { + return Base::not_equal_mask(Base::load(a), Base::load(b)) == 0; + } + + static int three_way_compare(const char *a, const char *b) { + const auto mask = Base::not_equal_mask(Base::load(a), Base::load(b)); + if (!mask) + return 0; + return char_diff(a, b, mask); + } + + static void splat_set(char *dst, const unsigned char value) { + Base::store(dst, Base::get_splatted_value(value)); + } + + static int char_diff(const char *a, const char *b, uint64_t mask) { + const size_t diff_index = __builtin_ctzll(mask); + const int ca = (unsigned char)a[diff_index]; + const int cb = (unsigned char)b[diff_index]; + return ca - cb; + } +}; + +struct M128 { + static constexpr size_t SIZE = 16; + using T = char __attribute__((__vector_size__(SIZE))); + static uint16_t mask(T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return static_cast( + _mm_movemask_epi8(cpp::bit_cast<__m128i>(value))); + } + static uint16_t not_equal_mask(T a, T b) { return mask(a != b); } + static T load(const char *ptr) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return cpp::bit_cast( + _mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr))); + } + static void store(char *ptr, T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr), + cpp::bit_cast<__m128i>(value)); + } + static T get_splatted_value(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; + +using Vector128 = Vector; // 16 Bytes + +#ifdef __AVX2__ +struct M256 { + static constexpr size_t SIZE = 32; + using T = char __attribute__((__vector_size__(SIZE))); + static uint32_t mask(T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm256_movemask_epi8(cpp::bit_cast<__m256i>(value)); + } + static uint32_t not_equal_mask(T a, T b) { return mask(a != b); } + static T load(const char *ptr) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return cpp::bit_cast( + _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr))); + } + static void store(char *ptr, T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), + cpp::bit_cast<__m256i>(value)); + } + static T get_splatted_value(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; + +using Vector256 = Vector; // 32 Bytes + +#if defined(__AVX512F__) and defined(__AVX512BW__) +struct M512 { + static constexpr 
size_t SIZE = 64; + using T = char __attribute__((__vector_size__(SIZE))); + static uint64_t not_equal_mask(T a, T b) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(a), + cpp::bit_cast<__m512i>(b)); + } + static T load(const char *ptr) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return cpp::bit_cast(_mm512_loadu_epi8(ptr)); + } + static void store(char *ptr, T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm512_storeu_epi8(ptr, cpp::bit_cast<__m512i>(value)); + } + static T get_splatted_value(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; +using Vector512 = Vector; + +#endif // defined(__AVX512F__) and defined(__AVX512BW__) +#endif // __AVX2__ +#endif // __SSE2__ + +using _1 = __llvm_libc::scalar::_1; +using _2 = __llvm_libc::scalar::_2; +using _3 = __llvm_libc::scalar::_3; +using _4 = __llvm_libc::scalar::_4; +using _8 = __llvm_libc::scalar::_8; +#if defined(__AVX512F__) && defined(__AVX512BW__) +using _16 = __llvm_libc::x86::Vector128; +using _32 = __llvm_libc::x86::Vector256; +using _64 = __llvm_libc::x86::Vector512; +using _128 = __llvm_libc::Repeated<_64, 2>; +#elif defined(__AVX2__) +using _16 = __llvm_libc::x86::Vector128; +using _32 = __llvm_libc::x86::Vector256; +using _64 = __llvm_libc::Repeated<_32, 2>; +using _128 = __llvm_libc::Repeated<_32, 4>; +#elif defined(__SSE2__) +using _16 = __llvm_libc::x86::Vector128; +using _32 = __llvm_libc::Repeated<_16, 2>; +using _64 = __llvm_libc::Repeated<_16, 4>; +using _128 = __llvm_libc::Repeated<_16, 8>; +#else +using _16 = __llvm_libc::Repeated<_8, 2>; +using _32 = __llvm_libc::Repeated<_8, 4>; +using _64 = __llvm_libc::Repeated<_8, 8>; +using _128 = __llvm_libc::Repeated<_8, 16>; +#endif + +struct Accelerator { + static void copy(char *dst, const char *src, size_t count) { + asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); + } +}; + +} // namespace x86 +} // namespace __llvm_libc + +#endif // defined(LLVM_LIBC_ARCH_X86) + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h index af85afa..f207946 100644 --- a/libc/src/string/memory_utils/memcmp_implementations.h +++ b/libc/src/string/memory_utils/memcmp_implementations.h @@ -11,133 +11,92 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" -#include "src/string/memory_utils/utils.h" +#include "src/string/memory_utils/elements.h" #include // size_t namespace __llvm_libc { -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset)) - return value; - return MemcmpReturnType::ZERO(); -} - -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { - if (unlikely(count >= 384)) { - if (auto value = 
generic::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return generic::Memcmp<16>::loop_and_tail(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) +static inline int inline_memcmp(const char *lhs, const char *rhs, + size_t count) { #if defined(LLVM_LIBC_ARCH_X86) -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { - if (unlikely(count >= 384)) { - if (auto value = x86::sse2::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); - if (count <= 64) - return x86::avx2::Memcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx2::Memcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 384)) { - if (auto value = x86::avx2::Memcmp<32>::block(p1, p2)) - return value; - align_to_next_boundary<32, Arg::P1>(p1, p2, count); - } - return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_X86 + ///////////////////////////////////////////////////////////////////////////// + using namespace __llvm_libc::x86; + if (count == 0) + return 0; + if (count == 1) + return three_way_compare<_1>(lhs, rhs); + if (count == 2) + return three_way_compare<_2>(lhs, rhs); + if (count == 3) + return three_way_compare<_3>(lhs, rhs); + if (count <= 8) + return three_way_compare>(lhs, rhs, count); + if (count <= 16) + return three_way_compare>(lhs, rhs, count); if (count <= 32) - return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); if (count <= 64) - return x86::avx2::Memcmp<32>::head_tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); if (count <= 128) - return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 384)) { - if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) { - if (unlikely(count >= 128)) { // [128, ∞] - if (auto value = generic::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - return generic::Memcmp<32>::loop_and_tail(p1, p2, count); - } + return three_way_compare>(lhs, rhs, count); + return three_way_compare::Then>>(lhs, rhs, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// + using namespace ::__llvm_libc::aarch64; + if (count == 0) // [0, 0] + return 0; + if (count == 1) // [1, 1] + return three_way_compare<_1>(lhs, rhs); + if (count == 2) // [2, 2] + return three_way_compare<_2>(lhs, rhs); + if (count == 3) // [3, 3] + return three_way_compare<_3>(lhs, 
rhs); + if (count < 8) // [4, 7] + return three_way_compare>(lhs, rhs, count); + if (count < 16) // [8, 15] + return three_way_compare>(lhs, rhs, count); + if (unlikely(count >= 128)) // [128, ∞] + return three_way_compare::Then>>(lhs, rhs, count); + if (!equals<_16>(lhs, rhs)) // [16, 16] + return three_way_compare<_16>(lhs, rhs); if (count < 32) // [17, 31] - return generic::Memcmp<16>::tail(p1, p2, count); - if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32] - return generic::Memcmp<16>::block(p1 + 16, p2 + 16); + return three_way_compare>(lhs, rhs, count); + if (!equals::Then<_16>>(lhs, rhs)) // [32, 32] + return three_way_compare::Then<_16>>(lhs, rhs); if (count < 64) // [33, 63] - return generic::Memcmp<32>::tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); // [64, 127] - return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32); -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) + return three_way_compare::Then>>(lhs, rhs, count); +#else + ///////////////////////////////////////////////////////////////////////////// + // Default + ///////////////////////////////////////////////////////////////////////////// + using namespace ::__llvm_libc::scalar; -static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) if (count == 0) - return MemcmpReturnType::ZERO(); + return 0; if (count == 1) - return generic::Memcmp<1>::block(p1, p2); + return three_way_compare<_1>(lhs, rhs); if (count == 2) - return generic::Memcmp<2>::block(p1, p2); + return three_way_compare<_2>(lhs, rhs); if (count == 3) - return generic::Memcmp<3>::block(p1, p2); + return three_way_compare<_3>(lhs, rhs); if (count <= 8) - return generic::Memcmp<4>::head_tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); if (count <= 16) - return generic::Memcmp<8>::head_tail(p1, p2, count); -#if defined(LLVM_LIBC_ARCH_X86) - if constexpr (x86::kAvx512BW) - return inline_memcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_memcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_memcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_memcmp_generic_gt16(p1, p2, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - if constexpr (aarch64::kNeon) - return inline_memcmp_aarch64_neon_gt16(p1, p2, count); - else - return inline_memcmp_generic_gt16(p1, p2, count); -#endif -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memcmp_embedded_tiny(p1, p2, count); -#else -#error "Unsupported platform" + return three_way_compare>(lhs, rhs, count); + if (count <= 32) + return three_way_compare>(lhs, rhs, count); + if (count <= 64) + return three_way_compare>(lhs, rhs, count); + if (count <= 128) + return three_way_compare>(lhs, rhs, count); + return three_way_compare::Then>>(lhs, rhs, count); #endif } diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h index ddfe65d..3385d40 100644 --- a/libc/src/string/memory_utils/memcpy_implementations.h +++ b/libc/src/string/memory_utils/memcpy_implementations.h @@ -11,123 +11,142 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/elements.h" #include "src/string/memory_utils/utils.h" #include 
// size_t -namespace __llvm_libc { +// Design rationale +// ================ +// +// Using a profiler to observe size distributions for calls into libc +// functions, it was found most operations act on a small number of bytes. +// This makes it important to favor small sizes. +// +// The tests for `count` are in ascending order so the cost of branching is +// proportional to the cost of copying. +// +// The function is written in C++ for several reasons: +// - The compiler can __see__ the code, this is useful when performing Profile +// Guided Optimization as the optimized code can take advantage of branching +// probabilities. +// - It also allows for easier customization and favors testing multiple +// implementation parameters. +// - As compilers and processors get better, the generated code is improved +// with little change on the code side. -[[maybe_unused]] static inline void -inline_memcpy_embedded_tiny(char *__restrict dst, const char *__restrict src, - size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - builtin::Memcpy<1>::block(dst + offset, src + offset); -} +namespace __llvm_libc { +static inline void inline_memcpy(char *__restrict dst, + const char *__restrict src, size_t count) { + using namespace __llvm_libc::builtin; #if defined(LLVM_LIBC_ARCH_X86) -[[maybe_unused]] static inline void -inline_memcpy_x86(char *__restrict dst, const char *__restrict src, - size_t count) { + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_X86 + ///////////////////////////////////////////////////////////////////////////// + + // Whether to use only rep;movsb. + constexpr bool USE_ONLY_REP_MOVSB = + LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB); + + // kRepMovsBSize == -1 : Only CopyAligned is used. + // kRepMovsBSize == 0 : Only RepMovsb is used. + // else CopyAligned is used up to kRepMovsBSize and then RepMovsb. + constexpr size_t REP_MOVS_B_SIZE = +#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE) + LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE; +#else + -1; +#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE + + // Whether target supports AVX instructions. + constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__); + +#if defined(__AVX__) + using LoopBlockSize = _64; +#else + using LoopBlockSize = _32; +#endif + + if (USE_ONLY_REP_MOVSB) + return copy(dst, src, count); + if (count == 0) return; if (count == 1) - return builtin::Memcpy<1>::block(dst, src); + return copy<_1>(dst, src); if (count == 2) - return builtin::Memcpy<2>::block(dst, src); + return copy<_2>(dst, src); if (count == 3) - return builtin::Memcpy<3>::block(dst, src); + return copy<_3>(dst, src); if (count == 4) - return builtin::Memcpy<4>::block(dst, src); + return copy<_4>(dst, src); if (count < 8) - return builtin::Memcpy<4>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 16) - return builtin::Memcpy<8>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 32) - return builtin::Memcpy<16>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 64) - return builtin::Memcpy<32>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 128) - return builtin::Memcpy<64>::head_tail(dst, src, count); - if (x86::kAvx && count < 256) - return builtin::Memcpy<128>::head_tail(dst, src, count); - builtin::Memcpy<32>::block(dst, src); - align_to_next_boundary<32, Arg::Dst>(dst, src, count); - static constexpr size_t kBlockSize = x86::kAvx ? 
64 : 32; - return builtin::Memcpy::loop_and_tail(dst, src, count); -} - -[[maybe_unused]] static inline void inline_memcpy_x86_maybe_interpose_repmovsb( - char *__restrict dst, const char *__restrict src, size_t count) { - // Whether to use rep;movsb exclusively, not at all, or only above a certain - // threshold. - // TODO: Use only a single preprocessor definition to simplify the code. -#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1 -#endif - - static constexpr bool kUseOnlyRepMovsb = - LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB); - static constexpr size_t kRepMovsbThreshold = - LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE; - if constexpr (kUseOnlyRepMovsb) - return x86::Memcpy::repmovsb(dst, src, count); - else if constexpr (kRepMovsbThreshold >= 0) { - if (unlikely(count >= kRepMovsbThreshold)) - return x86::Memcpy::repmovsb(dst, src, count); - else - return inline_memcpy_x86(dst, src, count); - } else { - return inline_memcpy_x86(dst, src, count); - } -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline void -inline_memcpy_aarch64(char *__restrict dst, const char *__restrict src, - size_t count) { + return copy>(dst, src, count); + if (HAS_AVX && count < 256) + return copy>(dst, src, count); + if (count <= REP_MOVS_B_SIZE) + return copy::Then>>(dst, src, + count); + return copy(dst, src, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// if (count == 0) return; if (count == 1) - return builtin::Memcpy<1>::block(dst, src); + return copy<_1>(dst, src); if (count == 2) - return builtin::Memcpy<2>::block(dst, src); + return copy<_2>(dst, src); if (count == 3) - return builtin::Memcpy<3>::block(dst, src); + return copy<_3>(dst, src); if (count == 4) - return builtin::Memcpy<4>::block(dst, src); + return copy<_4>(dst, src); if (count < 8) - return builtin::Memcpy<4>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 16) - return builtin::Memcpy<8>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 32) - return builtin::Memcpy<16>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 64) - return builtin::Memcpy<32>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 128) - return builtin::Memcpy<64>::head_tail(dst, src, count); - builtin::Memcpy<16>::block(dst, src); - align_to_next_boundary<16, Arg::Src>(dst, src, count); - return builtin::Memcpy<64>::loop_and_tail(dst, src, count); -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -static inline void inline_memcpy(char *__restrict dst, - const char *__restrict src, size_t count) { - using namespace __llvm_libc::builtin; -#if defined(LLVM_LIBC_ARCH_X86) - return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - return inline_memcpy_aarch64(dst, src, count); -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memcpy_embedded_tiny(dst, src, count); + return copy>(dst, src, count); + return copy::Then>>(dst, src, count); #else -#error "Unsupported platform" + ///////////////////////////////////////////////////////////////////////////// + // Default + ///////////////////////////////////////////////////////////////////////////// + if (count == 0) + return; + if (count == 1) + return 
copy<_1>(dst, src); + if (count == 2) + return copy<_2>(dst, src); + if (count == 3) + return copy<_3>(dst, src); + if (count == 4) + return copy<_4>(dst, src); + if (count < 8) + return copy>(dst, src, count); + if (count < 16) + return copy>(dst, src, count); + if (count < 32) + return copy>(dst, src, count); + if (count < 64) + return copy>(dst, src, count); + if (count < 128) + return copy>(dst, src, count); + return copy::Then>>(dst, src, count); #endif } diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h index 66d4822..f1611a3 100644 --- a/libc/src/string/memory_utils/memset_implementations.h +++ b/libc/src/string/memory_utils/memset_implementations.h @@ -10,104 +10,126 @@ #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H #include "src/__support/architectures.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/elements.h" #include "src/string/memory_utils/utils.h" #include // size_t namespace __llvm_libc { -[[maybe_unused]] inline static void -inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - generic::Memset<1, 1>::block(dst + offset, value); -} - +// A general purpose implementation assuming cheap unaligned writes for sizes: +// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32 +// or 64 Bytes at a time, the compiler will expand them as needed. +// +// This implementation is subject to change as we benchmark more processors. We +// may also want to customize it for processors with specialized instructions +// that performs better (e.g. `rep stosb`). +// +// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes. +// We want to balance two things here: +// - The number of redundant writes (when using `SetBlockOverlap`), +// - The number of conditionals for sizes <=128 (~90% of memset calls are for +// such sizes). +// +// For the range 64-128: +// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this +// is wasteful near 65 but efficient toward 128. +// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write +// 96 or 128 Bytes. +// - Another approach could be to use an hybrid approach copy<64>+Overlap<32> +// for 65-96 and copy<96>+Overlap<32> for 97-128 +// +// Benchmarks showed that redundant writes were cheap (for Intel X86) but +// conditional were expensive, even on processor that do not support writing 64B +// at a time (pre-AVX512F). We also want to favor short functions that allow +// more hot code to fit in the iL1 cache. +// +// Above 128 we have to use conditionals since we don't know the upper bound in +// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32> +// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not +// superior for sizes that mattered. 
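The overlapping head/tail store described in the rationale above can be illustrated with a minimal, standalone sketch; it is not the library code, and memset/memcpy here merely stand in for the fixed-size block primitives (compilers expand them inline for constant sizes):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Sets [dst, dst + count) to `value` for any count in [N, 2 * N] using exactly
// two fixed-size stores and no conditionals. The second store ends exactly at
// dst + count and may overlap the first; the redundant bytes are the price
// paid for the missing branch.
template <size_t N>
void splat_head_tail(char *dst, uint8_t value, size_t count) {
  char block[N];
  memset(block, value, N);            // splat the byte into one fixed-size block
  memcpy(dst, block, N);              // head store: bytes [0, N)
  memcpy(dst + count - N, block, N);  // tail store: bytes [count - N, count)
}

For instance, splat_head_tail<64>(p, 0, 100) issues two 64-byte stores in which bytes [36, 64) are written twice, yet it never branches on count anywhere in the [64, 128] range.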
+inline static void inline_memset(char *dst, unsigned char value, size_t count) { #if defined(LLVM_LIBC_ARCH_X86) -template -[[maybe_unused]] inline static void inline_memset_x86(Ptr dst, uint8_t value, - size_t count) { + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_X86 + ///////////////////////////////////////////////////////////////////////////// + using namespace __llvm_libc::x86; if (count == 0) return; if (count == 1) - return generic::Memset<1, MaxSize>::block(dst, value); + return splat_set<_1>(dst, value); if (count == 2) - return generic::Memset<2, MaxSize>::block(dst, value); + return splat_set<_2>(dst, value); if (count == 3) - return generic::Memset<3, MaxSize>::block(dst, value); + return splat_set<_3>(dst, value); if (count <= 8) - return generic::Memset<4, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 16) - return generic::Memset<8, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 32) - return generic::Memset<16, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 64) - return generic::Memset<32, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 128) - return generic::Memset<64, MaxSize>::head_tail(dst, value, count); - // Aligned loop - generic::Memset<32, MaxSize>::block(dst, value); - align_to_next_boundary<32>(dst, count); - return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -template -[[maybe_unused]] inline static void -inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) { + return splat_set>(dst, value, count); + return splat_set::Then>>(dst, value, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// + using namespace __llvm_libc::aarch64_memset; if (count == 0) return; if (count <= 3) { - generic::Memset<1, MaxSize>::block(dst, value); + splat_set<_1>(dst, value); if (count > 1) - generic::Memset<2, MaxSize>::tail(dst, value, count); + splat_set>(dst, value, count); return; } if (count <= 8) - return generic::Memset<4, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 16) - return generic::Memset<8, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 32) - return generic::Memset<16, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= (32 + 64)) { - generic::Memset<32, MaxSize>::block(dst, value); + splat_set<_32>(dst, value); if (count <= 64) - return generic::Memset<32, MaxSize>::tail(dst, value, count); - generic::Memset<32, MaxSize>::block(dst + 32, value); - generic::Memset<32, MaxSize>::tail(dst, value, count); + return splat_set>(dst, value, count); + splat_set::Then<_32>>(dst, value); + splat_set>(dst, value, count); return; } - if (count >= 448 && value == 0 && aarch64::neon::hasZva()) { - generic::Memset<64, MaxSize>::block(dst, 0); - align_to_next_boundary<64>(dst, count); - return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count); - } else { - generic::Memset<16, MaxSize>::block(dst, value); - align_to_next_boundary<16>(dst, count); - return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count); - } 
-} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -inline static void inline_memset(Ptr dst, uint8_t value, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) - static constexpr size_t kMaxSize = x86::kAvx512F ? 64 - : x86::kAvx ? 32 - : x86::kSse2 ? 16 - : 8; - return inline_memset_x86(dst, value, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8; - return inline_memset_aarch64(dst, value, count); -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memset_embedded_tiny(dst, value, count); + if (count >= 448 && value == 0 && hasZva()) + return splat_set::Then>>(dst, 0, + count); + else + return splat_set::Then>>(dst, value, count); #else -#error "Unsupported platform" + ///////////////////////////////////////////////////////////////////////////// + // Default + ///////////////////////////////////////////////////////////////////////////// + using namespace ::__llvm_libc::scalar; + + if (count == 0) + return; + if (count == 1) + return splat_set<_1>(dst, value); + if (count == 2) + return splat_set<_2>(dst, value); + if (count == 3) + return splat_set<_3>(dst, value); + if (count <= 8) + return splat_set>(dst, value, count); + if (count <= 16) + return splat_set>(dst, value, count); + if (count <= 32) + return splat_set>(dst, value, count); + if (count <= 64) + return splat_set>(dst, value, count); + if (count <= 128) + return splat_set>(dst, value, count); + return splat_set::Then>>(dst, value, count); #endif } diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h deleted file mode 100644 index f8ccd83..0000000 --- a/libc/src/string/memory_utils/op_aarch64.h +++ /dev/null @@ -1,175 +0,0 @@ -//===-- aarch64 implementation of memory function building blocks ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file provides aarch64 specific building blocks to compose memory -// functions. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H - -#include "src/__support/architectures.h" - -#if defined(LLVM_LIBC_ARCH_AARCH64) - -#include "src/__support/common.h" -#include "src/string/memory_utils/op_generic.h" - -#ifdef __ARM_NEON -#include -#endif //__ARM_NEON - -namespace __llvm_libc::aarch64 { - -static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON); - -namespace neon { - -template struct BzeroCacheLine { - static constexpr size_t SIZE = Size; - - static inline void block(Ptr dst, uint8_t) { - static_assert(Size == 64); -#if __SIZEOF_POINTER__ == 4 - asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); -#else - asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); -#endif - } - - static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - block(dst + offset, value); - offset += SIZE; - } while (offset < count - SIZE); - // Unaligned store, we can't use 'dc zva' here. - static constexpr size_t kMaxSize = kNeon ? 
16 : 8; - generic::Memset::tail(dst, value, count); - } -}; - -inline static bool hasZva() { - uint64_t zva_val; - asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); - // DC ZVA is permitted if DZP, bit [4] is zero. - // BS, bits [3:0] is log2 of the block count in words. - // So the next line checks whether the instruction is permitted and block - // count is 16 words (i.e. 64 bytes). - return (zva_val & 0b11111) == 0b00100; -} - -} // namespace neon - -/////////////////////////////////////////////////////////////////////////////// -// Memset - -/////////////////////////////////////////////////////////////////////////////// -// Bcmp -template struct Bcmp { - static constexpr size_t SIZE = Size; - static constexpr size_t BlockSize = 32; - - static const unsigned char *as_u8(CPtr ptr) { - return reinterpret_cast(ptr); - } - - static inline BcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == BlockSize) { - auto _p1 = as_u8(p1); - auto _p2 = as_u8(p2); - uint8x16_t a = vld1q_u8(_p1); - uint8x16_t b = vld1q_u8(_p1 + 16); - uint8x16_t n = vld1q_u8(_p2); - uint8x16_t o = vld1q_u8(_p2 + 16); - uint8x16_t an = veorq_u8(a, n); - uint8x16_t bo = veorq_u8(b, o); - // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is - // a difference between the two buffers. We reduce this value down to 4 - // bytes in two steps. First, calculate the saturated move value when - // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get - // a single 32 bit nonzero value if a mismatch occurred. - uint8x16_t anbo = vorrq_u8(an, bo); - uint32x2_t anbo_reduced = vqmovn_u64(anbo); - return vmaxv_u32(anbo_reduced); - } else if constexpr ((Size % BlockSize) == 0) { - for (size_t offset = 0; offset < Size; offset += BlockSize) - if (auto value = Bcmp::block(p1 + offset, p2 + offset)) - return value; - } else { - deferred_static_assert("SIZE not implemented"); - } - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - SIZE, p2 + count - SIZE); - } - - static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - if constexpr (Size <= 8) { - return generic::Bcmp::head_tail(p1, p2, count); - } else if constexpr (Size == 16) { - auto _p1 = as_u8(p1); - auto _p2 = as_u8(p2); - uint8x16_t a = vld1q_u8(_p1); - uint8x16_t b = vld1q_u8(_p1 + count - 16); - uint8x16_t n = vld1q_u8(_p2); - uint8x16_t o = vld1q_u8(_p2 + count - 16); - uint8x16_t an = veorq_s8(a, n); - uint8x16_t bo = veorq_s8(b, o); - // anbo = (a ^ n) | (b ^ o) - uint8x16_t anbo = vorrq_s8(an, bo); - uint32x2_t anbo_reduced = vqmovn_u64(anbo); - return vmaxv_u32(anbo_reduced); - } else if constexpr (Size == 32) { - auto _p1 = as_u8(p1); - auto _p2 = as_u8(p2); - uint8x16_t a = vld1q_u8(_p1); - uint8x16_t b = vld1q_u8(_p1 + 16); - uint8x16_t c = vld1q_u8(_p1 + count - 16); - uint8x16_t d = vld1q_u8(_p1 + count - 32); - uint8x16_t n = vld1q_u8(_p2); - uint8x16_t o = vld1q_u8(_p2 + 16); - uint8x16_t p = vld1q_u8(_p2 + count - 16); - uint8x16_t q = vld1q_u8(_p2 + count - 32); - uint8x16_t an = veorq_s8(a, n); - uint8x16_t bo = veorq_s8(b, o); - uint8x16_t cp = veorq_s8(c, p); - uint8x16_t dq = veorq_s8(d, q); - uint8x16_t anbo = vorrq_s8(an, bo); - uint8x16_t cpdq = vorrq_s8(cp, dq); - // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to - // a nonzero 32 bit value if a mismatch occurred. 
- uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq); - uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq); - return vmaxv_u32(abnocpdq_reduced); - } else { - deferred_static_assert("SIZE not implemented"); - } - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += SIZE; - } while (offset < count - SIZE); - return tail(p1, p2, count); - } -}; - -} // namespace __llvm_libc::aarch64 - -#endif // LLVM_LIBC_ARCH_AARCH64 - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h deleted file mode 100644 index 6b3e92e..0000000 --- a/libc/src/string/memory_utils/op_builtin.h +++ /dev/null @@ -1,148 +0,0 @@ -//===-- Implementation using the __builtin_XXX_inline ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file provides generic C++ building blocks to compose memory functions. -// They rely on the compiler to generate the best possible code through the use -// of the `__builtin_XXX_inline` builtins. These builtins are currently only -// available in Clang. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H - -#include "src/string/memory_utils/utils.h" - -namespace __llvm_libc::builtin { - -/////////////////////////////////////////////////////////////////////////////// -// Memcpy -template struct Memcpy { - static constexpr size_t SIZE = Size; - static inline void block(Ptr __restrict dst, CPtr __restrict src) { -#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE - return __builtin_memcpy_inline(dst, src, SIZE); -#else - deferred_static_assert("Missing __builtin_memcpy_inline"); - (void)dst; - (void)src; -#endif - } - - static inline void tail(Ptr __restrict dst, CPtr __restrict src, - size_t count) { - block(dst + count - SIZE, src + count - SIZE); - } - - static inline void head_tail(Ptr __restrict dst, CPtr __restrict src, - size_t count) { - block(dst, src); - tail(dst, src, count); - } - - static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src, - size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - block(dst + offset, src + offset); - offset += SIZE; - } while (offset < count - SIZE); - tail(dst, src, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Memset -template struct Memset { - using ME = Memset; - static constexpr size_t SIZE = Size; - static inline void block(Ptr dst, uint8_t value) { -#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE - __builtin_memset_inline(dst, value, Size); -#else - deferred_static_assert("Missing __builtin_memset_inline"); - (void)dst; - (void)value; -#endif - } - - static inline void tail(Ptr dst, uint8_t value, size_t count) { - block(dst + count - SIZE, value); - } - - static inline void head_tail(Ptr dst, uint8_t value, size_t count) { - block(dst, value); - tail(dst, value, count); - } - - static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) 
{ - static_assert(Size > 1); - size_t offset = 0; - do { - block(dst + offset, value); - offset += SIZE; - } while (offset < count - SIZE); - tail(dst, value, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Bcmp -template struct Bcmp { - using ME = Bcmp; - static constexpr size_t SIZE = Size; - static inline BcmpReturnType block(CPtr, CPtr) { - deferred_static_assert("Missing __builtin_memcmp_inline"); - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return BcmpReturnType::ZERO(); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Memcmp -template struct Memcmp { - using ME = Memcmp; - static constexpr size_t SIZE = Size; - static inline MemcmpReturnType block(CPtr, CPtr) { - deferred_static_assert("Missing __builtin_memcmp_inline"); - return MemcmpReturnType::ZERO(); - } - - static inline MemcmpReturnType tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return MemcmpReturnType::ZERO(); - } - - static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return MemcmpReturnType::ZERO(); - } - - static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return MemcmpReturnType::ZERO(); - } -}; - -} // namespace __llvm_libc::builtin - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h deleted file mode 100644 index 226d775..0000000 --- a/libc/src/string/memory_utils/op_generic.h +++ /dev/null @@ -1,466 +0,0 @@ -//===-- Generic implementation of memory function building blocks ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file provides generic C++ building blocks. -// Depending on the requested size, the block operation uses unsigned integral -// types, vector types or an array of the type with the maximum size. -// -// The maximum size is passed as a template argument. For instance, on x86 -// platforms that only supports integral types the maximum size would be 8 -// (corresponding to uint64_t). On this platform if we request the size 32, this -// would be treated as a cpp::array. -// -// On the other hand, if the platform is x86 with support for AVX the maximum -// size is 32 and the operation can be handled with a single native operation. 
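As a concrete picture of the MaxSize decomposition described above, the sketch below (illustrative only, assuming the widest cheap scalar type is 8 bytes) copies a Size-byte block as Size / 8 unaligned 8-byte loads and stores, the way an array-of-native-type wrapper would when Size exceeds MaxSize:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Copies a Size-byte block as a sequence of 8-byte operations. On a target
// with 32-byte vectors the same block would be a single native load/store.
template <size_t Size>
void copy_as_u64_array(char *dst, const char *src) {
  static_assert(Size % sizeof(uint64_t) == 0, "Size must be a multiple of 8");
  for (size_t offset = 0; offset < Size; offset += sizeof(uint64_t)) {
    uint64_t chunk;
    memcpy(&chunk, src + offset, sizeof(chunk));  // unaligned 8-byte load
    memcpy(dst + offset, &chunk, sizeof(chunk));  // unaligned 8-byte store
  }
}

copy_as_u64_array<32>(dst, src) therefore issues four scalar load/store pairs where an AVX build handles the same 32 bytes in one operation.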
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H - -#include "src/__support/CPP/array.h" -#include "src/__support/CPP/type_traits.h" -#include "src/__support/endian.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/utils.h" - -#include - -namespace __llvm_libc::generic { - -// CTPair and CTMap below implement a compile time map. -// This is useful to map from a Size to a type handling this size. -// -// Example usage: -// using MyMap = CTMap, -// CTPair<2, uint16_t>, -// >; -// ... -// using UInt8T = MyMap::find_type<1>; -template struct CTPair { - using type = T; - static CTPair get_pair(cpp::integral_constant) { return {}; } -}; -template struct CTMap : public Pairs... { - using Pairs::get_pair...; - template - using find_type = - typename decltype(get_pair(cpp::integral_constant{}))::type; -}; - -// Helper to test if a type is void. -template inline constexpr bool is_void_v = cpp::is_same_v; - -// Implements load, store and splat for unsigned integral types. -template struct ScalarType { - using Type = T; - static_assert(cpp::is_integral_v && !cpp::is_signed_v); - - static inline Type load(CPtr src) { return ::__llvm_libc::load(src); } - static inline void store(Ptr dst, Type value) { - ::__llvm_libc::store(dst, value); - } - static inline Type splat(uint8_t value) { - return Type(~0) / Type(0xFF) * Type(value); - } -}; - -// Implements load, store and splat for vector types. -template struct VectorType { - using Type = uint8_t __attribute__((__vector_size__(Size))); - static inline Type load(CPtr src) { return ::__llvm_libc::load(src); } - static inline void store(Ptr dst, Type value) { - ::__llvm_libc::store(dst, value); - } - static inline Type splat(uint8_t value) { - Type Out; - // This for loop is optimized out for vector types. - for (size_t i = 0; i < Size; ++i) - Out[i] = static_cast(value); - return Out; - } -}; - -// We currently don't support 8- or 16-bit platforms, it must be 32- or 64-bit. -static_assert((UINTPTR_MAX == 4294967295U) || - (UINTPTR_MAX == 18446744073709551615UL)); - -// Map from sizes to structures offering static load, store and splat methods. -// Note: On platforms lacking vector support, we use the ArrayType below and -// decompose the operation in smaller pieces. -using NativeTypeMap = - CTMap>, // - CTPair<2, ScalarType>, // - CTPair<4, ScalarType>, // -#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64) - CTPair<8, ScalarType>, // Not available on 32bit -#endif // - CTPair<16, VectorType<16>>, // - CTPair<32, VectorType<32>>, // - CTPair<64, VectorType<64>>>; - -// Implements load, store and splat for sizes not natively supported by the -// platform. SubType is either ScalarType or VectorType. -template struct ArrayType { - using Type = cpp::array; - static constexpr size_t SizeOfElement = sizeof(typename SubType::Type); - static inline Type load(CPtr src) { - Type Value; - for (size_t I = 0; I < ArraySize; ++I) - Value[I] = SubType::load(src + (I * SizeOfElement)); - return Value; - } - static inline void store(Ptr dst, Type Value) { - for (size_t I = 0; I < ArraySize; ++I) - SubType::store(dst + (I * SizeOfElement), Value[I]); - } - static inline Type splat(uint8_t value) { - Type Out; - for (size_t I = 0; I < ArraySize; ++I) - Out[I] = SubType::splat(value); - return Out; - } -}; - -// Checks whether we should use an ArrayType. 
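The scalar splat shown above relies on the identity that, for an unsigned type T, ~T(0) / T(0xFF) is the 0x0101...01 pattern, so multiplying by a byte value replicates that byte across the whole word. A small self-checking sketch of the same arithmetic (illustrative, outside the library):

#include <stdint.h>

// For an unsigned integral type T, ~T(0) / T(0xFF) == 0x0101...01, and
// multiplying by a byte value replicates that byte into every byte of T.
template <typename T>
constexpr T splat_byte(uint8_t value) {
  return ~T(0) / T(0xFF) * T(value);
}

static_assert(splat_byte<uint32_t>(0xAB) == 0xABABABABu, "byte replicated 4 times");
static_assert(splat_byte<uint64_t>(0x5A) == 0x5A5A5A5A5A5A5A5Aull, "byte replicated 8 times");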
-template static constexpr bool useArrayType() { - return (Size > MaxSize) && ((Size % MaxSize) == 0) && - !is_void_v>; -} - -// Compute the type to handle an operation of Size bytes knowing that the -// underlying platform only support native types up to MaxSize bytes. -template -using getTypeFor = cpp::conditional_t< - useArrayType(), - ArrayType, Size / MaxSize>, - NativeTypeMap::find_type>; - -/////////////////////////////////////////////////////////////////////////////// -// Memcpy -// When building with clang we can delegate to the builtin implementation. -/////////////////////////////////////////////////////////////////////////////// - -template using Memcpy = builtin::Memcpy; - -/////////////////////////////////////////////////////////////////////////////// -// Memset -// The MaxSize template argument gives the maximum size handled natively by the -// platform. For instance on x86 with AVX support this would be 32. If a size -// greater than MaxSize is requested we break the operation down in smaller -// pieces of size MaxSize. -/////////////////////////////////////////////////////////////////////////////// -template struct Memset { - static_assert(is_power2(MaxSize)); - static constexpr size_t SIZE = Size; - - static inline void block(Ptr dst, uint8_t value) { - if constexpr (Size == 3) { - Memset<1, MaxSize>::block(dst + 2, value); - Memset<2, MaxSize>::block(dst, value); - } else { - using T = getTypeFor; - if constexpr (is_void_v) { - deferred_static_assert("Unimplemented Size"); - } else { - T::store(dst, T::splat(value)); - } - } - } - - static inline void tail(Ptr dst, uint8_t value, size_t count) { - block(dst + count - SIZE, value); - } - - static inline void head_tail(Ptr dst, uint8_t value, size_t count) { - block(dst, value); - tail(dst, value, count); - } - - static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { - static_assert(SIZE > 1); - size_t offset = 0; - do { - block(dst + offset, value); - offset += SIZE; - } while (offset < count - SIZE); - tail(dst, value, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Bcmp -/////////////////////////////////////////////////////////////////////////////// -template struct Bcmp { - static constexpr size_t SIZE = Size; - static constexpr size_t MaxSize = 8; - - template static inline uint32_t load_xor(CPtr p1, CPtr p2) { - return load(p1) ^ load(p2); - } - - template - static inline uint32_t load_not_equal(CPtr p1, CPtr p2) { - return load(p1) != load(p2); - } - - static inline BcmpReturnType block(CPtr p1, CPtr p2) { - static constexpr size_t MaxSize = 8; - if constexpr (Size == 1) { - return load_xor(p1, p2); - } else if constexpr (Size == 2) { - return load_xor(p1, p2); - } else if constexpr (Size == 4) { - return load_xor(p1, p2); - } else if constexpr (Size == 8) { - return load_not_equal(p1, p2); - } else if constexpr (useArrayType()) { - for (size_t offset = 0; offset < Size; offset += MaxSize) - if (auto value = Bcmp::block(p1 + offset, p2 + offset)) - return value; - } else { - deferred_static_assert("Unimplemented Size"); - } - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - SIZE, p2 + count - SIZE); - } - - static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - return block(p1, p2) | tail(p1, p2, count); - } - - static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do 
{ - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += SIZE; - } while (offset < count - SIZE); - return tail(p1, p2, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Memcmp -/////////////////////////////////////////////////////////////////////////////// -template struct Memcmp { - static constexpr size_t SIZE = Size; - static constexpr size_t MaxSize = 8; - - template static inline T load_be(CPtr ptr) { - return Endian::to_big_endian(load(ptr)); - } - - template - static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) { - return load_be(p1) - load_be(p2); - } - - template - static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) { - const auto la = load_be(p1); - const auto lb = load_be(p2); - return la > lb ? 1 : la < lb ? -1 : 0; - } - - static inline MemcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == 1) { - return load_be_diff(p1, p2); - } else if constexpr (Size == 2) { - return load_be_diff(p1, p2); - } else if constexpr (Size == 4) { - return load_be_cmp(p1, p2); - } else if constexpr (Size == 8) { - return load_be_cmp(p1, p2); - } else if constexpr (useArrayType()) { - for (size_t offset = 0; offset < Size; offset += MaxSize) - if (Bcmp::block(p1 + offset, p2 + offset)) - return Memcmp::block(p1 + offset, p2 + offset); - return MemcmpReturnType::ZERO(); - } else if constexpr (Size == 3) { - if (auto value = Memcmp<2>::block(p1, p2)) - return value; - return Memcmp<1>::block(p1 + 2, p2 + 2); - } else { - deferred_static_assert("Unimplemented Size"); - } - } - - static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - SIZE, p2 + count - SIZE); - } - - static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - if (auto value = block(p1, p2)) - return value; - return tail(p1, p2, count); - } - - static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += SIZE; - } while (offset < count - SIZE); - return tail(p1, p2, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Memmove -/////////////////////////////////////////////////////////////////////////////// - -template struct Memmove { - static_assert(is_power2(MaxSize)); - using T = getTypeFor; - static constexpr size_t SIZE = Size; - - static inline void block(Ptr dst, CPtr src) { - if constexpr (is_void_v) { - deferred_static_assert("Unimplemented Size"); - } else { - T::store(dst, T::load(src)); - } - } - - static inline void head_tail(Ptr dst, CPtr src, size_t count) { - const size_t offset = count - Size; - if constexpr (is_void_v) { - deferred_static_assert("Unimplemented Size"); - } else { - // The load and store operations can be performed in any order as long as - // they are not interleaved. More investigations are needed to determine - // the best order. - const auto head = T::load(src); - const auto tail = T::load(src + offset); - T::store(dst, head); - T::store(dst + offset, tail); - } - } - - // Align forward suitable when dst < src. The alignment is performed with - // an HeadTail operation of count ∈ [Alignment, 2 x Alignment]. - // - // e.g. Moving two bytes forward, we make sure src is aligned. 
- // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] - // [____LLLLLLLL_____________________] - // [___________LLLLLLLA______________] - // [_SSSSSSSS________________________] - // [________SSSSSSSS_________________] - // - // e.g. Moving two bytes forward, we make sure dst is aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] - // [____LLLLLLLL_____________________] - // [______LLLLLLLL___________________] - // [_SSSSSSSS________________________] - // [___SSSSSSSA______________________] - template - static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) { - Ptr prev_dst = dst; - CPtr prev_src = src; - size_t prev_count = count; - align_to_next_boundary(dst, src, count); - adjust(Size, dst, src, count); - head_tail(prev_dst, prev_src, prev_count - count); - } - - // Align backward suitable when dst > src. The alignment is performed with - // an HeadTail operation of count ∈ [Alignment, 2 x Alignment]. - // - // e.g. Moving two bytes backward, we make sure src is aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] - // [ _________________ALLLLLLL_______] - // [ ___________________LLLLLLLL_____] - // [____________________SSSSSSSS_____] - // [______________________SSSSSSSS___] - // - // e.g. Moving two bytes backward, we make sure dst is aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] - // [ _______________LLLLLLLL_________] - // [ ___________________LLLLLLLL_____] - // [__________________ASSSSSSS_______] - // [______________________SSSSSSSS___] - template - static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) { - Ptr headtail_dst = dst + count; - CPtr headtail_src = src + count; - size_t headtail_size = 0; - align_to_next_boundary(headtail_dst, headtail_src, - headtail_size); - adjust(-2 * Size, headtail_dst, headtail_src, headtail_size); - head_tail(headtail_dst, headtail_src, headtail_size); - count -= headtail_size; - } - - // Move forward suitable when dst < src. We load the tail bytes before - // handling the loop. - // - // e.g. Moving two bytes - // [ | | | | |] - // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] - // [_________________________LLLLLLLL___] - // [___LLLLLLLL_________________________] - // [_SSSSSSSS___________________________] - // [___________LLLLLLLL_________________] - // [_________SSSSSSSS___________________] - // [___________________LLLLLLLL_________] - // [_________________SSSSSSSS___________] - // [_______________________SSSSSSSS_____] - static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) { - static_assert(Size > 1); - const size_t tail_offset = count - Size; - const auto tail_value = T::load(src + tail_offset); - size_t offset = 0; -#pragma nounroll - do { - block(dst + offset, src + offset); - offset += Size; - } while (offset < count - Size); - T::store(dst + tail_offset, tail_value); - } - - // Move backward suitable when dst > src. We load the head bytes before - // handling the loop. - // - // e.g. 
Moving two bytes - // [ | | | | |] - // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] - // [___LLLLLLLL_________________________] - // [_________________________LLLLLLLL___] - // [___________________________SSSSSSSS_] - // [_________________LLLLLLLL___________] - // [___________________SSSSSSSS_________] - // [_________LLLLLLLL___________________] - // [___________SSSSSSSS_________________] - // [_____SSSSSSSS_______________________] - static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) { - static_assert(Size > 1); - const auto head_value = T::load(src); - ptrdiff_t offset = count - Size; -#pragma nounroll - do { - block(dst + offset, src + offset); - offset -= Size; - } while (offset >= 0); - T::store(dst, head_value); - } -}; - -} // namespace __llvm_libc::generic - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h deleted file mode 100644 index 96847b2..0000000 --- a/libc/src/string/memory_utils/op_x86.h +++ /dev/null @@ -1,219 +0,0 @@ -//===-- x86 implementation of memory function building blocks -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file provides x86 specific building blocks to compose memory functions. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H - -#include "src/__support/architectures.h" - -#if defined(LLVM_LIBC_ARCH_X86_64) - -#include "src/__support/common.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" - -#ifdef __SSE2__ -#include -#else -// Define fake functions to prevent the compiler from failing on undefined -// functions in case SSE2 is not present. -#define _mm512_cmpneq_epi8_mask(A, B) 0 -#define _mm_movemask_epi8(A) 0 -#define _mm256_movemask_epi8(A) 0 -#endif // __SSE2__ - -namespace __llvm_libc::x86 { - -// A set of constants to check compile time features. -static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__); -static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__); -static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__); -static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__); -static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__); - -/////////////////////////////////////////////////////////////////////////////// -// Memcpy repmovsb implementation -struct Memcpy { - static void repmovsb(char *dst, const char *src, size_t count) { - asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Bcmp - -// Base implementation for the Bcmp specializations. -// - BlockSize is either 16, 32 or 64 depending on the available compile time -// features, it is used to switch between "single native operation" or a -// "sequence of native operations". -// - BlockBcmp is the function that implements the bcmp logic. 
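For the "single native operation" case mentioned above, a 16-byte bcmp block can be spelled with plain SSE2 intrinsics; this is a hedged, equivalent-in-spirit sketch rather than the exact code in this file:

#include <stdint.h>
#include <emmintrin.h> // SSE2

// Returns nonzero iff the two 16-byte blocks differ. _mm_cmpeq_epi8 produces
// 0xFF in every byte position where the inputs match; _mm_movemask_epi8
// gathers those byte sign bits, so the mask is 0xFFFF only when all 16 bytes
// are equal.
static inline uint32_t bcmp16_sse2(const char *p1, const char *p2) {
  const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1));
  const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2));
  const int equal_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
  return static_cast<uint32_t>(equal_mask ^ 0xFFFF); // nonzero iff any byte differs
}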
-template struct BcmpImpl { - static inline BcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == BlockSize) { - return BlockBcmp(p1, p2); - } else if constexpr (Size % BlockSize == 0) { - for (size_t offset = 0; offset < Size; offset += BlockSize) - if (auto value = BlockBcmp(p1 + offset, p2 + offset)) - return value; - } else { - deferred_static_assert("SIZE not implemented"); - } - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - Size, p2 + count - Size); - } - - static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - return block(p1, p2) | tail(p1, p2, count); - } - - static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += Size; - } while (offset < count - Size); - return tail(p1, p2, count); - } -}; - -namespace sse2 { -static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(16))); - // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. - const int mask = _mm_movemask_epi8(load(p1) != load(p2)); - return static_cast(mask); -} -template using Bcmp = BcmpImpl; -} // namespace sse2 - -namespace avx2 { -static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(32))); - // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. - const int mask = _mm256_movemask_epi8(load(p1) != load(p2)); - // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit - // mask. - return static_cast(mask); -} -template using Bcmp = BcmpImpl; -} // namespace avx2 - -namespace avx512bw { -static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(64))); - // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. - const uint64_t mask = _mm512_cmpneq_epi8_mask(load(p1), load(p2)); - const bool mask_is_set = mask != 0; - return static_cast(mask_is_set); -} -template using Bcmp = BcmpImpl; -} // namespace avx512bw - -// Assuming that the mask is non zero, the index of the first mismatching byte -// is the number of trailing zeros in the mask. Trailing zeros and not leading -// zeros because the x86 architecture is little endian. -static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2, - uint64_t mask) { - const size_t diff_index = __builtin_ctzll(mask); - const int16_t ca = p1[diff_index]; - const int16_t cb = p2[diff_index]; - return ca - cb; -} - -/////////////////////////////////////////////////////////////////////////////// -// Memcmp - -// Base implementation for the Memcmp specializations. -// - BlockSize is either 16, 32 or 64 depending on the available compile time -// features, it is used to switch between "single native operation" or a -// "sequence of native operations". -// - BlockMemcmp is the function that implements the memcmp logic. -// - BlockBcmp is the function that implements the bcmp logic. 
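The trailing-zero logic above, which turns a nonzero difference mask into a memcmp-style result, can be sketched standalone as follows (a simplified model, not the exact helper in this file; memcmp compares bytes as unsigned char, hence the unsigned pointers):

#include <stdint.h>

// Given a nonzero mask in which bit i is set iff byte i of the two buffers
// differs, the number of trailing zeros is the index of the first mismatch on
// a little-endian target; comparing that byte pair yields the sign memcmp
// must return.
static inline int first_diff_result(const unsigned char *p1,
                                    const unsigned char *p2, uint64_t mask) {
  const unsigned index = static_cast<unsigned>(__builtin_ctzll(mask));
  return static_cast<int>(p1[index]) - static_cast<int>(p2[index]);
}

If only byte 3 differs, the mask is 0b1000, __builtin_ctzll returns 3, and the result is p1[3] - p2[3].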
-template -struct MemcmpImpl { - static inline MemcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == BlockSize) { - return BlockMemcmp(p1, p2); - } else if constexpr (Size % BlockSize == 0) { - for (size_t offset = 0; offset < Size; offset += BlockSize) - if (auto value = BlockBcmp(p1 + offset, p2 + offset)) - return BlockMemcmp(p1 + offset, p2 + offset); - } else { - deferred_static_assert("SIZE not implemented"); - } - return MemcmpReturnType::ZERO(); - } - - static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - Size, p2 + count - Size); - } - - static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - if (auto value = block(p1, p2)) - return value; - return tail(p1, p2, count); - } - - static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += Size; - } while (offset < count - Size); - return tail(p1, p2, count); - } -}; - -namespace sse2 { -static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(16))); - // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. - if (int mask = _mm_movemask_epi8(load(p1) != load(p2))) - return char_diff_no_zero(p1, p2, mask); - return MemcmpReturnType::ZERO(); -} -template using Memcmp = MemcmpImpl; -} // namespace sse2 - -namespace avx2 { -static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(32))); - // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. - if (int mask = _mm256_movemask_epi8(load(p1) != load(p2))) - return char_diff_no_zero(p1, p2, mask); - return MemcmpReturnType::ZERO(); -} -template using Memcmp = MemcmpImpl; -} // namespace avx2 - -namespace avx512bw { -static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(64))); - // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. - if (uint64_t mask = _mm512_cmpneq_epi8_mask(load(p1), load(p2))) - return char_diff_no_zero(p1, p2, mask); - return MemcmpReturnType::ZERO(); -} -template using Memcmp = MemcmpImpl; -} // namespace avx512bw - -} // namespace __llvm_libc::x86 - -#endif // LLVM_LIBC_ARCH_X86_64 - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 9d1321f..d915835 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -9,8 +9,19 @@ #ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H #define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H -#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/type_traits.h" +#include "src/__support/architectures.h" + +// Cache line sizes for ARM: These values are not strictly correct since +// cache line sizes depend on implementations, not architectures. There +// are even implementations with cache line sizes configurable at boot +// time. +#if defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_X86) +#define LLVM_LIBC_CACHELINE_SIZE 64 +#elif defined(LLVM_LIBC_ARCH_ARM) +#define LLVM_LIBC_CACHELINE_SIZE 32 +#else +#error "Unsupported platform for memory functions." +#endif #include // size_t #include // intptr_t / uintptr_t @@ -51,46 +62,32 @@ static constexpr size_t ge_power2(size_t value) { return is_power2_or_zero(value) ? 
value : 1ULL << (log2(value) + 1); } -// Returns the number of bytes to substract from ptr to get to the previous -// multiple of alignment. If ptr is already aligned returns 0. -template uintptr_t distance_to_align_down(const void *ptr) { +template intptr_t offset_from_last_aligned(const void *ptr) { static_assert(is_power2(alignment), "alignment must be a power of 2"); return reinterpret_cast(ptr) & (alignment - 1U); } -// Returns the number of bytes to add to ptr to get to the next multiple of -// alignment. If ptr is already aligned returns 0. -template uintptr_t distance_to_align_up(const void *ptr) { +template intptr_t offset_to_next_aligned(const void *ptr) { static_assert(is_power2(alignment), "alignment must be a power of 2"); // The logic is not straightforward and involves unsigned modulo arithmetic // but the generated code is as fast as it can be. return -reinterpret_cast(ptr) & (alignment - 1U); } -// Returns the number of bytes to add to ptr to get to the next multiple of -// alignment. If ptr is already aligned returns alignment. -template -uintptr_t distance_to_next_aligned(const void *ptr) { - return alignment - distance_to_align_down(ptr); +// Returns the offset from `ptr` to the next cache line. +static inline intptr_t offset_to_next_cache_line(const void *ptr) { + return offset_to_next_aligned(ptr); } -// Returns the same pointer but notifies the compiler that it is aligned. template static T *assume_aligned(T *ptr) { return reinterpret_cast(__builtin_assume_aligned(ptr, alignment)); } - #if defined __has_builtin #if __has_builtin(__builtin_memcpy_inline) #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE #endif #endif -#if defined __has_builtin -#if __has_builtin(__builtin_memset_inline) -#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE -#endif -#endif - // Performs a constant count copy. template static inline void memcpy_inline(void *__restrict dst, @@ -106,56 +103,28 @@ static inline void memcpy_inline(void *__restrict dst, using Ptr = char *; // Pointer to raw data. using CPtr = const char *; // Const pointer to raw data. -// This type makes sure that we don't accidentally promote an integral type to -// another one. It is only constructible from the exact T type. -template struct StrictIntegralType { - static_assert(cpp::is_integral_v); - - // Can only be constructed from a T. - template , bool> = 0> - StrictIntegralType(U value) : value(value) {} - - // Allows using the type in an if statement. - explicit operator bool() const { return value; } - - // If type is unsigned (bcmp) we allow bitwise OR operations. - StrictIntegralType operator|(const StrictIntegralType &Rhs) const { - static_assert(!cpp::is_signed_v); - return value | Rhs.value; - } - - // For interation with the C API we allow explicit conversion back to the - // `int` type. - explicit operator int() const { - // bit_cast makes sure that T and int have the same size. - return cpp::bit_cast(value); - } - - // Helper to get the zero value. - static inline constexpr StrictIntegralType ZERO() { return {T(0)}; } - -private: - T value; -}; - -using MemcmpReturnType = StrictIntegralType; -using BcmpReturnType = StrictIntegralType; - -// Loads bytes from memory (possibly unaligned) and materializes them as -// type. +// Loads bytes from memory (possibly unaligned) and materializes them as type. template static inline T load(CPtr ptr) { T Out; memcpy_inline(&Out, ptr); return Out; } -// Stores a value of type T in memory (possibly unaligned). 
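The negate-and-mask expression above computes, in a single AND, the distance from a pointer up to the next multiple of a power-of-two alignment; a small self-checking sketch of the same unsigned modulo arithmetic on raw addresses (illustrative only):

#include <stdint.h>

// Distance from addr up to the next multiple of a power-of-two alignment;
// zero when addr is already aligned. Two's-complement wraparound makes
// -addr & (alignment - 1) equal to (alignment - addr % alignment) % alignment.
constexpr uintptr_t distance_up(uintptr_t addr, uintptr_t alignment) {
  return -addr & (alignment - 1);
}

static_assert(distance_up(0x1009, 16) == 7, "0x1009 + 7 == 0x1010");
static_assert(distance_up(0x1010, 16) == 0, "already a multiple of 16");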
+// Stores a value of type T in memory (possibly unaligned)
 template <typename T> static inline void store(Ptr ptr, T value) {
   memcpy_inline<sizeof(T)>(ptr, &value);
 }

-// Advances the pointers p1 and p2 by offset bytes and decrease count by the
-// same amount.
+// For an operation like memset that operates on a pointer and a count, advances
+// the pointer by offset bytes and decreases count by the same amount.
+static inline void adjust(ptrdiff_t offset, Ptr &ptr, size_t &count) {
+  ptr += offset;
+  count -= offset;
+}
+
+// For an operation like memcpy or memcmp that operates on two pointers and a
+// count, advances the pointers by offset bytes and decreases count by the same
+// amount.
 template <typename T1, typename T2>
 static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
                           T2 *__restrict &p2, size_t &count) {
@@ -164,37 +133,31 @@ static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
   count -= offset;
 }

-// Advances p1 and p2 so p1 gets aligned to the next SIZE bytes boundary
-// and decrease count by the same amount.
+// For an operation like memset that operates on a pointer and a count, advances
+// the pointer so it is aligned to SIZE bytes and decreases count by the same
+// amount.
 // We make sure the compiler knows about the adjusted pointer alignment.
-template <size_t SIZE, typename T1, typename T2>
-void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
-                               size_t &count) {
-  adjust(distance_to_next_aligned<SIZE>(p1), p1, p2, count);
-  p1 = assume_aligned<SIZE>(p1);
-}
-
-// Same as align_p1_to_next_boundary above but with a single pointer instead.
-template <size_t SIZE, typename T1>
-void align_to_next_boundary(T1 *&p1, size_t &count) {
-  CPtr dummy;
-  align_p1_to_next_boundary<SIZE>(p1, dummy, count);
+template <size_t SIZE> void align(Ptr &ptr, size_t &count) {
+  adjust(offset_to_next_aligned<SIZE>(ptr), ptr, count);
+  ptr = assume_aligned<SIZE>(ptr);
 }

-// An enum class that discriminates between the first and second pointer.
-enum class Arg { P1, P2, Dst = P1, Src = P2 };
-
-// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1.
-// Precondition: &p1 != &p2
+// For an operation like memcpy or memcmp that operates on two pointers and a
+// count, advances the pointers so one of them gets aligned to SIZE bytes and
+// decreases count by the same amount.
+// We make sure the compiler knows about the adjusted pointer alignment.
+enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
 template <size_t SIZE, Arg AlignOn, typename T1, typename T2>
-void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
-                            size_t &count) {
-  if constexpr (AlignOn == Arg::P1)
-    align_p1_to_next_boundary<SIZE>(p1, p2, count);
-  else if constexpr (AlignOn == Arg::P2)
-    align_p1_to_next_boundary<SIZE>(p2, p1, count); // swapping p1 and p2.
- else - deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2"); +void align(T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) { + if constexpr (AlignOn == Arg::_1) { + adjust(offset_to_next_aligned(p1), p1, p2, count); + p1 = assume_aligned(p1); + } else if constexpr (AlignOn == Arg::_2) { + adjust(offset_to_next_aligned(p2), p1, p2, count); + p2 = assume_aligned(p2); + } else { + deferred_static_assert("AlignOn must be either Arg::_1 or Arg::_2"); + } } } // namespace __llvm_libc diff --git a/libc/src/string/memset.cpp b/libc/src/string/memset.cpp index 1b492b5..549c074 100644 --- a/libc/src/string/memset.cpp +++ b/libc/src/string/memset.cpp @@ -13,8 +13,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) { - inline_memset(reinterpret_cast(dst), static_cast(value), - count); + inline_memset(reinterpret_cast(dst), + static_cast(value), count); return dst; } diff --git a/libc/test/src/string/bcmp_test.cpp b/libc/test/src/string/bcmp_test.cpp index 8f0fe52..19df7ad26 100644 --- a/libc/test/src/string/bcmp_test.cpp +++ b/libc/test/src/string/bcmp_test.cpp @@ -12,25 +12,25 @@ TEST(LlvmLibcBcmpTest, CmpZeroByte) { const char *lhs = "ab"; const char *rhs = "bc"; - ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0); + EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0); } TEST(LlvmLibcBcmpTest, LhsRhsAreTheSame) { const char *lhs = "ab"; const char *rhs = "ab"; - ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0); + EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, LhsBeforeRhsLexically) { const char *lhs = "ab"; const char *rhs = "ac"; - ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); + EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, LhsAfterRhsLexically) { const char *lhs = "ac"; const char *rhs = "ab"; - ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); + EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, Sweep) { @@ -46,13 +46,13 @@ TEST(LlvmLibcBcmpTest, Sweep) { reset(lhs); reset(rhs); for (size_t i = 0; i < K_MAX_SIZE; ++i) - ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0); + EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0); reset(lhs); reset(rhs); for (size_t i = 0; i < K_MAX_SIZE; ++i) { rhs[i] = 'b'; - ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0); + EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0); rhs[i] = 'a'; } } diff --git a/libc/test/src/string/memmove_test.cpp b/libc/test/src/string/memmove_test.cpp index 451ccdb..26b4d9e 100644 --- a/libc/test/src/string/memmove_test.cpp +++ b/libc/test/src/string/memmove_test.cpp @@ -20,7 +20,7 @@ TEST(LlvmLibcMemmoveTest, MoveZeroByte) { void *const Dst = Buffer; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 0); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) { @@ -29,7 +29,7 @@ TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) { void *const Dst = Buffer; void *const Ret = __llvm_libc::memmove(Dst, Buffer, 1); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) { @@ -40,7 +40,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) { void *const Dst = Buffer + 1; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 2); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) { @@ -49,7 +49,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) { void *const Dst 
= Buffer + 2; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 2); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } // e.g. `Dst` follow `src`. @@ -62,7 +62,7 @@ TEST(LlvmLibcMemmoveTest, SrcFollowDst) { void *const Dst = Buffer + 1; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 1); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstFollowSrc) { @@ -71,7 +71,7 @@ TEST(LlvmLibcMemmoveTest, DstFollowSrc) { void *const Dst = Buffer + 2; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 1); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } static constexpr int kMaxSize = 512; @@ -106,7 +106,7 @@ TEST(LlvmLibcMemmoveTest, Thorough) { void *const Ret = __llvm_libc::memmove(Dst, Buffer.data() + SrcOffset, Size); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } } } diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt index d54f845..8f92627 100644 --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -3,6 +3,8 @@ add_libc_unittest( SUITE libc_string_unittests SRCS + elements_test.cpp + memory_access_test.cpp utils_test.cpp COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} diff --git a/libc/test/src/string/memory_utils/elements_test.cpp b/libc/test/src/string/memory_utils/elements_test.cpp new file mode 100644 index 0000000..2187001 --- /dev/null +++ b/libc/test/src/string/memory_utils/elements_test.cpp @@ -0,0 +1,137 @@ +//===-- Unittests for memory_utils ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/span.h" +#include "src/string/memory_utils/elements.h" +#include "utils/UnitTest/Test.h" + +namespace __llvm_libc { + +// Registering Types +using FixedSizeTypes = testing::TypeList< +#if defined(__SSE2__) + x86::Vector128, // +#endif // __SSE2__ +#if defined(__AVX2__) + x86::Vector256, // +#endif // __AVX2__ +#if defined(__AVX512F__) and defined(__AVX512BW__) + x86::Vector512, // +#endif // defined(__AVX512F__) and defined(__AVX512BW__) + scalar::UINT8, // + scalar::UINT16, // + scalar::UINT32, // + scalar::UINT64, // + Repeated, // + Repeated, // + Repeated, // + Repeated, // + Repeated, // + Chained, // + Chained, // + builtin::_1, // + builtin::_2, // + builtin::_3, // + builtin::_4, // + builtin::_8 // + >; + +char GetRandomChar() { + static constexpr const uint64_t a = 1103515245; + static constexpr const uint64_t c = 12345; + static constexpr const uint64_t m = 1ULL << 31; + static uint64_t seed = 123456789; + seed = (a * seed + c) % m; + return seed; +} + +void Randomize(cpp::span buffer) { + for (auto ¤t : buffer) + current = GetRandomChar(); +} + +template using Buffer = cpp::array; + +template Buffer GetRandomBuffer() { + Buffer buffer; + Randomize(buffer); + return buffer; +} + +TYPED_TEST(LlvmLibcMemoryElements, copy, FixedSizeTypes) { + Buffer Dst; + const auto buffer = GetRandomBuffer(); + copy(Dst.data(), buffer.data()); + for (size_t i = 0; i < ParamType::SIZE; ++i) + EXPECT_EQ(Dst[i], buffer[i]); +} + +template T copy(const T &Input) { + T Output; + for (size_t I = 0; I < Input.size(); ++I) + Output[I] = Input[I]; + return Output; +} + +TYPED_TEST(LlvmLibcMemoryElements, Move, FixedSizeTypes) { + constexpr size_t SIZE = ParamType::SIZE; + using LargeBuffer = cpp::array; + LargeBuffer GroundTruth; + Randomize(GroundTruth); + // Forward, we move the SIZE first bytes from offset 0 to SIZE. + for (size_t Offset = 0; Offset < SIZE; ++Offset) { + LargeBuffer Buffer = copy(GroundTruth); + move(&Buffer[Offset], &Buffer[0]); + for (size_t I = 0; I < SIZE; ++I) + EXPECT_EQ(Buffer[I + Offset], GroundTruth[I]); + } + // Backward, we move the SIZE last bytes from offset 0 to SIZE. 
+  for (size_t Offset = 0; Offset < SIZE; ++Offset) {
+    LargeBuffer Buffer = copy(GroundTruth);
+    move(&Buffer[Offset], &Buffer[SIZE]);
+    for (size_t I = 0; I < SIZE; ++I)
+      EXPECT_EQ(Buffer[I + Offset], GroundTruth[SIZE + I]);
+  }
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) {
+  const auto buffer = GetRandomBuffer();
+  EXPECT_TRUE(equals(buffer.data(), buffer.data()));
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, three_way_compare, FixedSizeTypes) {
+  Buffer initial;
+  for (auto &c : initial)
+    c = 5;
+
+  // Testing equality
+  EXPECT_EQ(three_way_compare(initial.data(), initial.data()), 0);
+
+  // Testing all mismatching positions
+  for (size_t i = 0; i < ParamType::SIZE; ++i) {
+    auto copy = initial;
+    ++copy[i]; // copy is now lexicographically greater than initial
+    const auto *less = initial.data();
+    const auto *greater = copy.data();
+    EXPECT_LT(three_way_compare(less, greater), 0);
+    EXPECT_GT(three_way_compare(greater, less), 0);
+  }
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) {
+  Buffer Dst;
+  const cpp::array values = {char(0x00), char(0x7F), char(0xFF)};
+  for (char value : values) {
+    splat_set(Dst.data(), value);
+    for (size_t i = 0; i < ParamType::SIZE; ++i)
+      EXPECT_EQ(Dst[i], value);
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp
new file mode 100644
index 0000000..b81700f
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memory_access_test.cpp
@@ -0,0 +1,228 @@
+//===-- Unittests for memory_utils ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define LLVM_LIBC_UNITTEST_OBSERVE 1
+
+#include "src/__support/CPP/array.h"
+#include "src/string/memory_utils/elements.h"
+#include "utils/UnitTest/Test.h"
+
+#include
+#include
+
+namespace __llvm_libc {
+
+static constexpr const size_t kMaxBuffer = 32;
+
+struct BufferAccess : cpp::array {
+  BufferAccess() { Reset(); }
+  void Reset() {
+    for (auto &value : *this)
+      value = '0';
+    this->operator[](kMaxBuffer) = '\0';
+  }
+  void Touch(ptrdiff_t offset, size_t size) {
+    if (offset < 0)
+      return;
+    for (size_t i = 0; i < size; ++i)
+      ++(*this)[offset + i];
+  }
+  operator const char *() const { return this->data(); }
+};
+
+struct Buffer {
+  ptrdiff_t Offset(const char *ptr) const {
+    const bool contained = ptr >= data.begin() && ptr < data.end();
+    return contained ?
ptr - data.begin() : -1; + } + void Reset() { + reads.Reset(); + writes.Reset(); + } + cpp::array data; + BufferAccess __attribute__((aligned(64))) reads; + BufferAccess __attribute__((aligned(64))) writes; +}; + +struct MemoryAccessObserver { + void ObserveRead(const char *ptr, size_t size) { + Buffer1.reads.Touch(Buffer1.Offset(ptr), size); + Buffer2.reads.Touch(Buffer2.Offset(ptr), size); + } + + void ObserveWrite(const char *ptr, size_t size) { + Buffer1.writes.Touch(Buffer1.Offset(ptr), size); + Buffer2.writes.Touch(Buffer2.Offset(ptr), size); + } + + void Reset() { + Buffer1.Reset(); + Buffer2.Reset(); + } + + Buffer Buffer1; + Buffer Buffer2; +}; + +MemoryAccessObserver Observer; + +template struct TestingElement { + static constexpr size_t SIZE = Size; + + static void copy(char *__restrict dst, const char *__restrict src) { + Observer.ObserveRead(src, SIZE); + Observer.ObserveWrite(dst, SIZE); + } + + static bool equals(const char *lhs, const char *rhs) { + Observer.ObserveRead(lhs, SIZE); + Observer.ObserveRead(rhs, SIZE); + return true; + } + + static int three_way_compare(const char *lhs, const char *rhs) { + Observer.ObserveRead(lhs, SIZE); + Observer.ObserveRead(rhs, SIZE); + return 0; + } + + static void splat_set(char *dst, const unsigned char value) { + Observer.ObserveWrite(dst, SIZE); + } +}; + +using Types = testing::TypeList< + TestingElement<1>, // 1 Byte + TestingElement<2>, // 2 Bytes + TestingElement<4>, // 4 Bytes + Repeated, 3>, // 6 Bytes + Chained, TestingElement<2>, TestingElement<1>> // 7 Bytes + >; + +struct LlvmLibcTestAccessBase : public testing::Test { + + template + void checkOperations(const BufferAccess &expected) { + static const BufferAccess untouched; + + Observer.Reset(); + HigherOrder::copy(dst_ptr() + Offset, src_ptr() + Offset, Size); + ASSERT_STREQ(src().writes, untouched); + ASSERT_STREQ(dst().reads, untouched); + ASSERT_STREQ(src().reads, expected); + ASSERT_STREQ(dst().writes, expected); + Observer.Reset(); + HigherOrder::equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size); + ASSERT_STREQ(lhs().writes, untouched); + ASSERT_STREQ(rhs().writes, untouched); + ASSERT_STREQ(lhs().reads, expected); + ASSERT_STREQ(rhs().reads, expected); + Observer.Reset(); + HigherOrder::three_way_compare(lhs_ptr() + Offset, rhs_ptr() + Offset, + Size); + ASSERT_STREQ(lhs().writes, untouched); + ASSERT_STREQ(rhs().writes, untouched); + ASSERT_STREQ(lhs().reads, expected); + ASSERT_STREQ(rhs().reads, expected); + Observer.Reset(); + HigherOrder::splat_set(dst_ptr() + Offset, 5, Size); + ASSERT_STREQ(src().reads, untouched); + ASSERT_STREQ(src().writes, untouched); + ASSERT_STREQ(dst().reads, untouched); + ASSERT_STREQ(dst().writes, expected); + } + + void checkMaxAccess(const BufferAccess &expected, int max) { + for (size_t i = 0; i < kMaxBuffer; ++i) { + int value = (int)expected[i] - '0'; + ASSERT_GE(value, 0); + ASSERT_LE(value, max); + } + } + +private: + const Buffer &lhs() const { return Observer.Buffer1; } + const Buffer &rhs() const { return Observer.Buffer2; } + const Buffer &src() const { return Observer.Buffer2; } + const Buffer &dst() const { return Observer.Buffer1; } + Buffer &dst() { return Observer.Buffer1; } + + char *dst_ptr() { return dst().data.begin(); } + const char *src_ptr() { return src().data.begin(); } + const char *lhs_ptr() { return lhs().data.begin(); } + const char *rhs_ptr() { return rhs().data.begin(); } +}; + +template +struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase { + + void TearDown() override { + static constexpr 
size_t Size = 10; + + BufferAccess expected; + expected.Touch(Size - ParamType::SIZE, ParamType::SIZE); + + checkMaxAccess(expected, 1); + checkOperations, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {} + +template +struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 10; + + BufferAccess expected; + expected.Touch(0, ParamType::SIZE); + expected.Touch(Size - ParamType::SIZE, ParamType::SIZE); + + checkMaxAccess(expected, 2); + checkOperations, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {} + +template +struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 20; + + BufferAccess expected; + for (size_t i = 0; i < Size - ParamType::SIZE; i += ParamType::SIZE) + expected.Touch(i, ParamType::SIZE); + expected.Touch(Size - ParamType::SIZE, ParamType::SIZE); + + checkMaxAccess(expected, 2); + checkOperations, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {} + +template +struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 10; + static constexpr size_t Offset = 2; + using AlignmentT = TestingElement<4>; + + BufferAccess expected; + expected.Touch(Offset, AlignmentT::SIZE); + expected.Touch(AlignmentT::SIZE, ParamType::SIZE); + expected.Touch(Offset + Size - ParamType::SIZE, ParamType::SIZE); + + checkMaxAccess(expected, 3); + checkOperations::Then>, Size, + Offset>(expected); + checkOperations::Then>, Size, + Offset>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {} + +} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp index 5c7920c..a20c090 100644 --- a/libc/test/src/string/memory_utils/utils_test.cpp +++ b/libc/test/src/string/memory_utils/utils_test.cpp @@ -72,41 +72,55 @@ TEST(LlvmLibcUtilsTest, GEPowerOf2) { EXPECT_EQ(ge_power2(i), kExpectedValues[i]); } -using UINT = uintptr_t; +using I = intptr_t; // Converts an offset into a pointer. 
const void *forge(size_t offset) { return reinterpret_cast(offset); } -TEST(LlvmLibcUtilsTest, DistanceToNextAligned) { - EXPECT_EQ(distance_to_next_aligned<16>(forge(0)), UINT(16)); - EXPECT_EQ(distance_to_next_aligned<16>(forge(1)), UINT(15)); - EXPECT_EQ(distance_to_next_aligned<16>(forge(16)), UINT(16)); - EXPECT_EQ(distance_to_next_aligned<16>(forge(15)), UINT(1)); - EXPECT_EQ(distance_to_next_aligned<32>(forge(16)), UINT(16)); +TEST(LlvmLibcUtilsTest, OffsetToNextAligned) { + EXPECT_EQ(offset_to_next_aligned<16>(forge(0)), I(0)); + EXPECT_EQ(offset_to_next_aligned<16>(forge(1)), I(15)); + EXPECT_EQ(offset_to_next_aligned<16>(forge(16)), I(0)); + EXPECT_EQ(offset_to_next_aligned<16>(forge(15)), I(1)); + EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16)); } -TEST(LlvmLibcUtilsTest, DistanceToAlignUp) { - EXPECT_EQ(distance_to_align_up<16>(forge(0)), UINT(0)); - EXPECT_EQ(distance_to_align_up<16>(forge(1)), UINT(15)); - EXPECT_EQ(distance_to_align_up<16>(forge(16)), UINT(0)); - EXPECT_EQ(distance_to_align_up<16>(forge(15)), UINT(1)); - EXPECT_EQ(distance_to_align_up<32>(forge(16)), UINT(16)); +TEST(LlvmLibcUtilsTest, OffsetFromLastAligned) { + EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15)); + EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16)); } -TEST(LlvmLibcUtilsTest, DistanceToAlignDown) { - EXPECT_EQ(distance_to_align_down<16>(forge(0)), UINT(0)); - EXPECT_EQ(distance_to_align_down<16>(forge(1)), UINT(1)); - EXPECT_EQ(distance_to_align_down<16>(forge(16)), UINT(0)); - EXPECT_EQ(distance_to_align_down<16>(forge(15)), UINT(15)); - EXPECT_EQ(distance_to_align_down<32>(forge(16)), UINT(16)); +TEST(LlvmLibcUtilsTest, OffsetToNextCacheLine) { + EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0); + EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0)); + EXPECT_EQ(offset_to_next_cache_line(forge(1)), + I(LLVM_LIBC_CACHELINE_SIZE - 1)); + EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE)), I(0)); + EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE - 1)), + I(1)); +} + +TEST(LlvmLibcUtilsTest, Adjust1) { + char a; + const size_t base_size = 10; + for (size_t I = -2; I < 2; ++I) { + auto *ptr = &a; + size_t size = base_size; + adjust(I, ptr, size); + EXPECT_EQ(intptr_t(ptr), intptr_t(&a + I)); + EXPECT_EQ(size, base_size - I); + } } TEST(LlvmLibcUtilsTest, Adjust2) { char a, b; const size_t base_size = 10; - for (ptrdiff_t I = -2; I < 2; ++I) { + for (size_t I = -2; I < 2; ++I) { auto *p1 = &a; auto *p2 = &b; size_t size = base_size; @@ -117,6 +131,19 @@ TEST(LlvmLibcUtilsTest, Adjust2) { } } +TEST(LlvmLibcUtilsTest, Align1) { + char a; + const size_t base_size = 10; + { + auto *ptr = &a; + size_t size = base_size; + align<128>(ptr, size); + EXPECT_TRUE(uintptr_t(ptr) % 128 == 0); + EXPECT_GE(ptr, &a); + EXPECT_EQ(size_t(ptr - &a), base_size - size); + } +} + TEST(LlvmLibcUtilsTest, Align2) { char a, b; const size_t base_size = 10; @@ -124,10 +151,10 @@ TEST(LlvmLibcUtilsTest, Align2) { auto *p1 = &a; auto *p2 = &b; size_t size = base_size; - align_to_next_boundary<128, Arg::P1>(p1, p2, size); + align<128, Arg::_1>(p1, p2, size); EXPECT_TRUE(uintptr_t(p1) % 128 == 0); - EXPECT_GT(p1, &a); - EXPECT_GT(p2, &b); + EXPECT_GE(p1, &a); + EXPECT_GE(p2, &b); EXPECT_EQ(size_t(p1 - &a), base_size - size); EXPECT_EQ(size_t(p2 - &b), base_size - size); } @@ -135,10 +162,10 @@ 
TEST(LlvmLibcUtilsTest, Align2) {
     auto *p1 = &a;
     auto *p2 = &b;
     size_t size = base_size;
-    align_to_next_boundary<128, Arg::P2>(p1, p2, size);
+    align<128, Arg::_2>(p1, p2, size);
     EXPECT_TRUE(uintptr_t(p2) % 128 == 0);
-    EXPECT_GT(p1, &a);
-    EXPECT_GT(p2, &b);
+    EXPECT_GE(p1, &a);
+    EXPECT_GE(p2, &b);
     EXPECT_EQ(size_t(p1 - &a), base_size - size);
     EXPECT_EQ(size_t(p2 - &b), base_size - size);
   }
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index f94b502..90aea2c 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -973,10 +973,9 @@ no_sanitize_features = [
 cc_library(
     name = "string_memory_utils",
     hdrs = [
-        "src/string/memory_utils/op_aarch64.h",
-        "src/string/memory_utils/op_builtin.h",
-        "src/string/memory_utils/op_generic.h",
-        "src/string/memory_utils/op_x86.h",
+        "src/string/memory_utils/elements.h",
+        "src/string/memory_utils/elements_aarch64.h",
+        "src/string/memory_utils/elements_x86.h",
         "src/string/memory_utils/utils.h",
     ],
     textual_hdrs = [
@@ -989,8 +988,6 @@ cc_library(
     deps = [
         ":__support_common",
         ":__support_cpp_bit",
-        ":__support_cpp_type_traits",
-        ":__support_cpp_array",
         ":libc_root",
     ],
 )
-- 
2.7.4
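The utils.h hunks above restore a small set of building blocks that the rest of the framework leans on. The first is the load<T>/store<T> idiom: possibly unaligned bytes are materialized as a value (and written back) through an inlined memcpy rather than a pointer cast, which avoids alignment and strict-aliasing problems while still compiling to a single move on most targets. The sketch below is illustrative only; it uses std::memcpy instead of the patch's memcpy_inline<Size> helper and is not code from the patch.

#include <cstring>

// Illustrative stand-ins for the load<T>/store<T> helpers in utils.h.
template <typename T> static T load(const char *ptr) {
  T out;
  std::memcpy(&out, ptr, sizeof(T)); // unaligned-safe, no aliasing UB
  return out;
}

template <typename T> static void store(char *ptr, T value) {
  std::memcpy(ptr, &value, sizeof(T));
}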
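The offset_from_last_aligned / offset_to_next_aligned helpers reduce to two bit tricks on the pointer value, both valid only for power-of-two alignments: p & (align - 1) is the distance past the previous boundary, and -p & (align - 1) is the distance to the next one (0 when already aligned). A minimal, self-contained check of that arithmetic follows; the main() and the sample addresses are illustrative, not part of the patch.

#include <cassert>
#include <cstddef>
#include <cstdint>

template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
  // Unsigned negation implements "-p mod alignment" without branches.
  return -reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

int main() {
  const void *p = reinterpret_cast<const void *>(uintptr_t{17});
  assert(offset_from_last_aligned<16>(p) == 1); // 17 is one byte past 16
  assert(offset_to_next_aligned<16>(p) == 15);  // 15 bytes up to 32
  assert(offset_to_next_aligned<16>(reinterpret_cast<const void *>(uintptr_t{32})) == 0);
  return 0;
}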
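Both the removed MemcmpImpl and the restored element framework cover an arbitrary count with a fixed-size block primitive plus two tricks: head_tail handles any count up to twice the block size with two possibly overlapping accesses, and loop_and_tail runs whole blocks and finishes with one overlapping tail block instead of a variable-size remainder. A scalar, bcmp-style sketch of that shape on 8-byte blocks; the names and the uint64_t block choice are mine, not the patch's.

#include <cstddef>
#include <cstdint>
#include <cstring>

static uint64_t load64(const char *p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v)); // possibly unaligned load
  return v;
}
// Returns non-zero if the two 8-byte blocks differ.
static int block8(const char *p1, const char *p2) {
  return load64(p1) != load64(p2);
}
// Compares the last 8 bytes of the range.
static int tail8(const char *p1, const char *p2, size_t count) {
  return block8(p1 + count - 8, p2 + count - 8);
}
// Two possibly overlapping blocks cover any 8 <= count <= 16.
static int head_tail8(const char *p1, const char *p2, size_t count) {
  return block8(p1, p2) | tail8(p1, p2, count);
}
// Whole blocks first, then one final (possibly overlapping) tail block.
// Precondition: count >= 8.
static int loop_and_tail8(const char *p1, const char *p2, size_t count) {
  size_t offset = 0;
  do {
    if (int diff = block8(p1 + offset, p2 + offset))
      return diff;
    offset += 8;
  } while (offset < count - 8);
  return tail8(p1, p2, count);
}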
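The x86 specializations in the removed op_x86.h locate the first mismatching byte by comparing whole vectors and inspecting the resulting byte mask. The same idea in standalone form, written with plain SSE2 intrinsics rather than the file's vector-extension types and its char_diff_no_zero helper; the function name and structure are illustrative, and both pointers are assumed to have 16 readable bytes.

#include <immintrin.h> // SSE2 intrinsics

static int memcmp16_sse2(const char *p1, const char *p2) {
  const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1));
  const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2));
  // One bit per byte: set where the bytes are equal.
  const int eq_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
  if (eq_mask == 0xFFFF)
    return 0; // all 16 bytes equal
  const int i = __builtin_ctz(~eq_mask & 0xFFFF); // index of first mismatch
  return static_cast<unsigned char>(p1[i]) - static_cast<unsigned char>(p2[i]);
}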