From 2cfae7cdf4514652156bfbc0c240df2112ab925c Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Tue, 13 Jun 2023 14:41:17 +0000 Subject: [PATCH] [libc] Dispatch memmove to memcpy when buffers are disjoint Most of the time `memmove` is called on buffers that are disjoint, in that case we can use `memcpy` which is faster. The additional test is branchless on x86, aarch64 and RISCV with the zbb extension (bitmanip). On x86 this patch adds a latency of 2 to 3 cycles. Before ``` -------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... -------------------------------------------------------------------------------- BM_Memmove/0/0_median 5.00 ns 5.00 ns 10 bytes_per_cycle=1.25477/s bytes_per_second=2.62933G/s items_per_second=199.87M/s __llvm_libc::memmove,memmove Google A BM_Memmove/1/0_median 6.21 ns 6.21 ns 10 bytes_per_cycle=3.22173/s bytes_per_second=6.75106G/s items_per_second=160.955M/s __llvm_libc::memmove,memmove Google B BM_Memmove/2/0_median 8.09 ns 8.09 ns 10 bytes_per_cycle=5.31462/s bytes_per_second=11.1366G/s items_per_second=123.603M/s __llvm_libc::memmove,memmove Google D BM_Memmove/3/0_median 5.95 ns 5.95 ns 10 bytes_per_cycle=2.71865/s bytes_per_second=5.69687G/s items_per_second=167.967M/s __llvm_libc::memmove,memmove Google L BM_Memmove/4/0_median 5.63 ns 5.63 ns 10 bytes_per_cycle=2.28294/s bytes_per_second=4.78383G/s items_per_second=177.615M/s __llvm_libc::memmove,memmove Google M BM_Memmove/5/0_median 5.68 ns 5.68 ns 10 bytes_per_cycle=2.16798/s bytes_per_second=4.54295G/s items_per_second=176.015M/s __llvm_libc::memmove,memmove Google Q BM_Memmove/6/0_median 7.46 ns 7.46 ns 10 bytes_per_cycle=3.97619/s bytes_per_second=8.332G/s items_per_second=134.044M/s __llvm_libc::memmove,memmove Google S BM_Memmove/7/0_median 5.40 ns 5.40 ns 10 bytes_per_cycle=1.79695/s bytes_per_second=3.76546G/s items_per_second=185.211M/s __llvm_libc::memmove,memmove Google U BM_Memmove/8/0_median 5.62 ns 5.62 ns 10 bytes_per_cycle=3.18747/s bytes_per_second=6.67927G/s items_per_second=177.983M/s __llvm_libc::memmove,memmove Google W BM_Memmove/9/0_median 101 ns 101 ns 10 bytes_per_cycle=9.77359/s bytes_per_second=20.4803G/s items_per_second=9.9333M/s __llvm_libc::memmove,uniform 384 to 4096 ``` After ``` BM_Memmove/0/0_median 3.57 ns 3.57 ns 10 bytes_per_cycle=1.71375/s bytes_per_second=3.59112G/s items_per_second=280.411M/s __llvm_libc::memmove,memmove Google A BM_Memmove/1/0_median 4.52 ns 4.52 ns 10 bytes_per_cycle=4.47557/s bytes_per_second=9.37843G/s items_per_second=221.427M/s __llvm_libc::memmove,memmove Google B BM_Memmove/2/0_median 5.70 ns 5.70 ns 10 bytes_per_cycle=7.37396/s bytes_per_second=15.4519G/s items_per_second=175.399M/s __llvm_libc::memmove,memmove Google D BM_Memmove/3/0_median 4.47 ns 4.47 ns 10 bytes_per_cycle=3.4148/s bytes_per_second=7.15563G/s items_per_second=223.743M/s __llvm_libc::memmove,memmove Google L BM_Memmove/4/0_median 4.53 ns 4.53 ns 10 bytes_per_cycle=2.86071/s bytes_per_second=5.99454G/s items_per_second=220.69M/s __llvm_libc::memmove,memmove Google M BM_Memmove/5/0_median 4.19 ns 4.19 ns 10 bytes_per_cycle=2.5484/s bytes_per_second=5.3401G/s items_per_second=238.924M/s __llvm_libc::memmove,memmove Google Q BM_Memmove/6/0_median 5.02 ns 5.02 ns 10 bytes_per_cycle=5.94164/s bytes_per_second=12.4505G/s items_per_second=199.14M/s __llvm_libc::memmove,memmove Google S BM_Memmove/7/0_median 4.03 ns 4.03 ns 10 bytes_per_cycle=2.47028/s bytes_per_second=5.17641G/s items_per_second=247.906M/s __llvm_libc::memmove,memmove Google U BM_Memmove/8/0_median 4.70 ns 4.70 ns 10 bytes_per_cycle=3.84975/s bytes_per_second=8.06706G/s items_per_second=212.72M/s __llvm_libc::memmove,memmove Google W BM_Memmove/9/0_median 90.7 ns 90.7 ns 10 bytes_per_cycle=10.8681/s bytes_per_second=22.7739G/s items_per_second=11.02M/s __llvm_libc::memmove,uniform 384 to 4096 ``` Reviewed By: courbet Differential Revision: https://reviews.llvm.org/D152811 --- libc/src/string/memmove.cpp | 6 +++++- libc/src/string/memory_utils/utils.h | 15 +++++++++++++++ libc/test/src/string/memory_utils/utils_test.cpp | 13 +++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp index ba1b29b..3b6866f 100644 --- a/libc/src/string/memmove.cpp +++ b/libc/src/string/memmove.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/string/memmove.h" +#include "src/string/memory_utils/memcpy_implementations.h" #include "src/string/memory_utils/memmove_implementations.h" #include // size_t @@ -14,7 +15,10 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(void *, memmove, (void *dst, const void *src, size_t count)) { - inline_memmove(dst, src, count); + if (is_disjoint(dst, src, count)) + inline_memcpy(dst, src, count); + else + inline_memmove(dst, src, count); return dst; } diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 32be4c4..5a3dcc6 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -84,6 +84,21 @@ template static T *assume_aligned(T *ptr) { return reinterpret_cast(__builtin_assume_aligned(ptr, alignment)); } +// Returns true iff memory regions [p1, p1 + size] and [p2, p2 + size] are +// disjoint. +LIBC_INLINE bool is_disjoint(const void *p1, const void *p2, size_t size) { + const char *a = static_cast(p1); + const char *b = static_cast(p2); + if (a > b) { + // Swap a and b, this compiles down to conditionnal move for aarch64, x86 + // and RISCV with zbb extension. + const char *tmp = a; + a = b; + b = tmp; + } + return a + size <= b; +} + #if LIBC_HAS_BUILTIN(__builtin_memcpy_inline) #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE #endif diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp index 4fd6def..8968e0d 100644 --- a/libc/test/src/string/memory_utils/utils_test.cpp +++ b/libc/test/src/string/memory_utils/utils_test.cpp @@ -144,6 +144,19 @@ TEST(LlvmLibcUtilsTest, Align2) { } } +TEST(LlvmLibcUtilsTest, DisjointBuffers) { + char buf[3]; + const char *const a = buf + 0; + const char *const b = buf + 1; + EXPECT_TRUE(is_disjoint(a, b, 0)); + EXPECT_TRUE(is_disjoint(a, b, 1)); + EXPECT_FALSE(is_disjoint(a, b, 2)); + + EXPECT_TRUE(is_disjoint(b, a, 0)); + EXPECT_TRUE(is_disjoint(b, a, 1)); + EXPECT_FALSE(is_disjoint(b, a, 2)); +} + TEST(LlvmLibcUtilsTest, LoadStoreAligned) { const uint64_t init = 0xDEAD'C0DE'BEEF'F00D; CPtr const src = reinterpret_cast(&init); -- 2.7.4