From 893f02c2aff98364b902c08c46d602944d20c1e1 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet
Date: Tue, 16 May 2023 12:45:22 +0000
Subject: [PATCH] [libc] Add optimized memcmp for RISCV

This patch adds two versions of `memcmp` optimized for architectures where
unaligned accesses are either illegal or extremely slow. It is currently
enabled for RISCV 64 and RISCV 32, but it could be used for ARM 32
architectures as well.
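For illustration, here is a minimal standalone sketch of the strategy used by
the new code (loosely based on the 32-bit version added below): compare byte
per byte until `p1` is word aligned, then compare one word at a time,
converting to big-endian so that unsigned integer order matches byte order.
The helper name `memcmp_u32_sketch` is made up for this sketch, and `memcpy`
stands in for the libc internals (`load32_aligned<...>`,
`distance_to_align_up<...>`, `Endian`) used by the actual implementation; it
assumes a little-endian host, as on RISCV.

```
#include <cstddef>
#include <cstdint>
#include <cstring>

// Byte-swap so that lexicographic byte order matches unsigned integer order
// (little-endian host assumed).
static inline uint32_t to_big_endian(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) |
         (v << 24);
}

static inline int memcmp_u32_sketch(const unsigned char *p1,
                                    const unsigned char *p2, size_t count) {
  size_t offset = 0;
  // 1. Compare byte per byte until p1 is 4-byte aligned.
  while (offset < count &&
         (reinterpret_cast<uintptr_t>(p1 + offset) % sizeof(uint32_t)) != 0) {
    if (int diff = p1[offset] - p2[offset])
      return diff;
    ++offset;
  }
  // 2. Compare one 32-bit word at a time. The patch assembles the p2 word
  //    from naturally aligned 1/2/4-byte loads chosen from p2's residual
  //    alignment; memcpy is only a stand-in for that machinery here.
  for (; offset + sizeof(uint32_t) <= count; offset += sizeof(uint32_t)) {
    uint32_t a, b;
    std::memcpy(&a, p1 + offset, sizeof(a)); // p1 is aligned at this point
    std::memcpy(&b, p2 + offset, sizeof(b));
    if (a != b)
      return to_big_endian(a) < to_big_endian(b) ? -1 : 1;
  }
  // 3. Compare the remaining tail byte per byte.
  for (; offset < count; ++offset)
    if (int diff = p1[offset] - p2[offset])
      return diff;
  return 0;
}
```

The real functions below additionally dispatch on `p2`'s alignment so that the
8- or 4-byte word is built from naturally aligned 1/2/4-byte loads, which is
what keeps every memory access aligned on both pointers.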
Here is the before / after output of
`libc.benchmarks.memory_functions.opt_host --benchmark_filter=BM_memcmp` on a
quad-core StarFive RISCV 64 Linux board running at 1.5GHz.

Before
```
Run on (4 X 1500 MHz CPU s)
CPU Caches:
  L1 Instruction 32 KiB (x4)
  L1 Data 32 KiB (x4)
  L2 Unified 2048 KiB (x1)
----------------------------------------------------------------------
Benchmark        Time        CPU   Iterations  UserCounters...
----------------------------------------------------------------------
BM_Memcmp/0/0    110 ns    66.4 ns   10404864  bytes_per_cycle=0.107646/s bytes_per_second=153.989M/s items_per_second=15.071M/s __llvm_libc::memcmp,memcmp Google A
BM_Memcmp/1/0    318 ns     211 ns    3026944  bytes_per_cycle=0.131539/s bytes_per_second=188.167M/s items_per_second=4.73691M/s __llvm_libc::memcmp,memcmp Google B
BM_Memcmp/2/0    204 ns     115 ns    6118400  bytes_per_cycle=0.121675/s bytes_per_second=174.058M/s items_per_second=8.70241M/s __llvm_libc::memcmp,memcmp Google D
BM_Memcmp/3/0    143 ns    99.6 ns    7013376  bytes_per_cycle=0.117974/s bytes_per_second=168.763M/s items_per_second=10.0437M/s __llvm_libc::memcmp,memcmp Google L
BM_Memcmp/4/0   81.3 ns    58.2 ns   11426816  bytes_per_cycle=0.101125/s bytes_per_second=144.661M/s items_per_second=17.1805M/s __llvm_libc::memcmp,memcmp Google M
BM_Memcmp/5/0    177 ns     118 ns    5952512  bytes_per_cycle=0.120612/s bytes_per_second=172.537M/s items_per_second=8.45549M/s __llvm_libc::memcmp,memcmp Google Q
BM_Memcmp/6/0    342 ns     220 ns    3483648  bytes_per_cycle=0.132004/s bytes_per_second=188.834M/s items_per_second=4.54739M/s __llvm_libc::memcmp,memcmp Google S
BM_Memcmp/7/0    208 ns     130 ns    5681152  bytes_per_cycle=0.12468/s bytes_per_second=178.356M/s items_per_second=7.6674M/s __llvm_libc::memcmp,memcmp Google U
BM_Memcmp/8/0    123 ns    79.1 ns    8387584  bytes_per_cycle=0.110593/s bytes_per_second=158.204M/s items_per_second=12.6439M/s __llvm_libc::memcmp,memcmp Google W
BM_Memcmp/9/0  20707 ns   10643 ns      67584  bytes_per_cycle=0.142401/s bytes_per_second=203.707M/s items_per_second=93.9559k/s __llvm_libc::memcmp,uniform 384 to 4096
```

After
```
BM_Memcmp/0/0   80.4 ns    55.8 ns   12648448  bytes_per_cycle=0.132703/s bytes_per_second=189.834M/s items_per_second=17.9256M/s __llvm_libc::memcmp,memcmp Google A
BM_Memcmp/1/0    140 ns    80.5 ns    8230912  bytes_per_cycle=0.337273/s bytes_per_second=482.474M/s items_per_second=12.4165M/s __llvm_libc::memcmp,memcmp Google B
BM_Memcmp/2/0    101 ns    66.4 ns   10571776  bytes_per_cycle=0.208539/s bytes_per_second=298.317M/s items_per_second=15.0687M/s __llvm_libc::memcmp,memcmp Google D
BM_Memcmp/3/0    118 ns    67.6 ns   10533888  bytes_per_cycle=0.176822/s bytes_per_second=252.946M/s items_per_second=14.7946M/s __llvm_libc::memcmp,memcmp Google L
BM_Memcmp/4/0    106 ns    53.0 ns   12722176  bytes_per_cycle=0.111141/s bytes_per_second=158.988M/s items_per_second=18.8591M/s __llvm_libc::memcmp,memcmp Google M
BM_Memcmp/5/0    141 ns    70.2 ns   10436608  bytes_per_cycle=0.26032/s bytes_per_second=372.39M/s items_per_second=14.2458M/s __llvm_libc::memcmp,memcmp Google Q
BM_Memcmp/6/0    144 ns    79.3 ns    8932352  bytes_per_cycle=0.353168/s bytes_per_second=505.211M/s items_per_second=12.612M/s __llvm_libc::memcmp,memcmp Google S
BM_Memcmp/7/0    123 ns    71.7 ns    9945088  bytes_per_cycle=0.22143/s bytes_per_second=316.758M/s items_per_second=13.9421M/s __llvm_libc::memcmp,memcmp Google U
BM_Memcmp/8/0   97.0 ns    56.2 ns   12509184  bytes_per_cycle=0.160526/s bytes_per_second=229.635M/s items_per_second=17.7784M/s __llvm_libc::memcmp,memcmp Google W
BM_Memcmp/9/0   1840 ns     989 ns     676864  bytes_per_cycle=1.4894/s bytes_per_second=2.08067G/s items_per_second=1010.92k/s __llvm_libc::memcmp,uniform 384 to 4096
```

glibc
```
BM_Memcmp/0/0   72.6 ns    51.7 ns   12963840  bytes_per_cycle=0.141261/s bytes_per_second=202.075M/s items_per_second=19.3246M/s glibc::memcmp,memcmp Google A
BM_Memcmp/1/0    118 ns    75.2 ns    9280512  bytes_per_cycle=0.354054/s bytes_per_second=506.478M/s items_per_second=13.3046M/s glibc::memcmp,memcmp Google B
BM_Memcmp/2/0    114 ns    62.9 ns   11152384  bytes_per_cycle=0.222675/s bytes_per_second=318.539M/s items_per_second=15.8943M/s glibc::memcmp,memcmp Google D
BM_Memcmp/3/0   84.0 ns    63.5 ns   11030528  bytes_per_cycle=0.186353/s bytes_per_second=266.581M/s items_per_second=15.7378M/s glibc::memcmp,memcmp Google L
BM_Memcmp/4/0   93.5 ns    51.2 ns   13462528  bytes_per_cycle=0.119215/s bytes_per_second=170.539M/s items_per_second=19.5384M/s glibc::memcmp,memcmp Google M
BM_Memcmp/5/0    123 ns    61.7 ns   11376640  bytes_per_cycle=0.225262/s bytes_per_second=322.239M/s items_per_second=16.1993M/s glibc::memcmp,memcmp Google Q
BM_Memcmp/6/0    122 ns    71.6 ns    9967616  bytes_per_cycle=0.380844/s bytes_per_second=544.802M/s items_per_second=13.9579M/s glibc::memcmp,memcmp Google S
BM_Memcmp/7/0    118 ns    65.6 ns   10555392  bytes_per_cycle=0.238677/s bytes_per_second=341.43M/s items_per_second=15.2334M/s glibc::memcmp,memcmp Google U
BM_Memcmp/8/0   90.4 ns    54.0 ns   12920832  bytes_per_cycle=0.161987/s bytes_per_second=231.724M/s items_per_second=18.5169M/s glibc::memcmp,memcmp Google W
BM_Memcmp/9/0   1045 ns     601 ns    1195008  bytes_per_cycle=2.53677/s bytes_per_second=3.54383G/s items_per_second=1.66423M/s glibc::memcmp,uniform 384 to 4096
```

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D150663
---
 .../string/memory_utils/memcmp_implementations.h  | 71 +++++++++++++++++++++-
 1 file changed, 68 insertions(+), 3 deletions(-)

diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h
index 9403f306..d870ec4 100644
--- a/libc/src/string/memory_utils/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/memcmp_implementations.h
@@ -26,21 +26,86 @@ namespace __llvm_libc {
 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
-inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
   LIBC_LOOP_NOUNROLL
-  for (size_t offset = 0; offset < count; ++offset)
+  for (; offset < count; ++offset)
     if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
       return value;
   return MemcmpReturnType::ZERO();
 }
 
+[[maybe_unused]] LIBC_INLINE MemcmpReturnType
+inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
+  constexpr size_t kAlign = sizeof(uint64_t);
+  if (count <= 2 * kAlign)
+    return inline_memcmp_byte_per_byte(p1, p2, 0, count);
+  size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
+  if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+    return value;
+  size_t offset = bytes_to_p1_align;
+  size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
+  for (; offset < count - kAlign; offset += kAlign) {
+    uint64_t b;
+    if (p2_alignment == 0)
+      b = load64_aligned<uint64_t>(p2, offset);
+    else if (p2_alignment == 4)
+      b = load64_aligned<uint32_t, uint32_t>(p2, offset);
+    else if (p2_alignment == 2)
+      b = load64_aligned<uint16_t, uint16_t, uint16_t, uint16_t>(p2, offset);
+    else
+      b = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
+          p2, offset);
+    uint64_t a = load64_aligned<uint64_t>(p1, offset);
+    if (a != b) {
+      // TODO use cmp_neq_uint64_t from D148717 once it's submitted.
+      return Endian::to_big_endian(a) < Endian::to_big_endian(b) ? -1 : 1;
+    }
+  }
+  return inline_memcmp_byte_per_byte(p1, p2, offset, count);
+}
+
+[[maybe_unused]] LIBC_INLINE MemcmpReturnType
+inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
+  constexpr size_t kAlign = sizeof(uint32_t);
+  if (count <= 2 * kAlign)
+    return inline_memcmp_byte_per_byte(p1, p2, 0, count);
+  size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
+  if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+    return value;
+  size_t offset = bytes_to_p1_align;
+  size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
+  for (; offset < count - kAlign; offset += kAlign) {
+    uint32_t b;
+    if (p2_alignment == 0)
+      b = load32_aligned<uint32_t>(p2, offset);
+    else if (p2_alignment == 2)
+      b = load32_aligned<uint16_t, uint16_t>(p2, offset);
+    else
+      b = load32_aligned<uint8_t, uint16_t, uint8_t>(p2, offset);
+    uint32_t a = load32_aligned<uint32_t>(p1, offset);
+    if (a != b) {
+      // TODO use cmp_uint32_t from D148717 once it's submitted.
+      // We perform the difference as an uint64_t.
+      const int64_t diff = static_cast<int64_t>(Endian::to_big_endian(a)) -
+                           static_cast<int64_t>(Endian::to_big_endian(b));
+      // And reduce the uint64_t into an uint32_t.
+      return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
+    }
+  }
+  return inline_memcmp_byte_per_byte(p1, p2, offset, count);
+}
+
 LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
 #if defined(LIBC_TARGET_ARCH_IS_X86)
   return inline_memcmp_x86(p1, p2, count);
 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
   return inline_memcmp_aarch64(p1, p2, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV64)
+  return inline_memcmp_aligned_access_64bit(p1, p2, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
+  return inline_memcmp_aligned_access_32bit(p1, p2, count);
 #else
-  return inline_memcmp_embedded_tiny(p1, p2, count);
+  return inline_memcmp_byte_per_byte(p1, p2, 0, count);
 #endif
 }
-- 
2.7.4