From 7c1f279328451e7c17cc16e9142adf11293f32c7 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 15 May 2023 13:11:16 +0000 Subject: [PATCH] [libc] Add optimized bcmp for RISCV [libc] Add optimized bcmp for RISCV This patch adds two versions of bcmp optimized for architectures where unaligned accesses are either illegal or extremely slow. It is currently enabled for RISCV 64 and RISCV 32 but it could be used for ARM 32 architectures as well. Here is the before / after output of libc.benchmarks.memory_functions.opt_host --benchmark_filter=BM_Bcmp on a quad core Linux starfive RISCV 64 board running at 1.5GHz. Before ``` Run on (4 X 1500 MHz CPU s) CPU Caches: L1 Instruction 32 KiB (x4) L1 Data 32 KiB (x4) L2 Unified 2048 KiB (x1) Load Average: 7.03, 5.98, 3.71 ---------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ---------------------------------------------------------------------- BM_Bcmp/0/0 102 ns 60.5 ns 11662336 bytes_per_cycle=0.122696/s bytes_per_second=175.518M/s items_per_second=16.5258M/s __llvm_libc::bcmp,memcmp Google A BM_Bcmp/1/0 328 ns 172 ns 3737600 bytes_per_cycle=0.15256/s bytes_per_second=218.238M/s items_per_second=5.80575M/s __llvm_libc::bcmp,memcmp Google B BM_Bcmp/2/0 199 ns 99.7 ns 7019520 bytes_per_cycle=0.141897/s bytes_per_second=202.986M/s items_per_second=10.032M/s __llvm_libc::bcmp,memcmp Google D BM_Bcmp/3/0 173 ns 86.5 ns 8361984 bytes_per_cycle=0.13863/s bytes_per_second=198.312M/s items_per_second=11.5669M/s __llvm_libc::bcmp,memcmp Google L BM_Bcmp/4/0 105 ns 51.8 ns 13213696 bytes_per_cycle=0.116399/s bytes_per_second=166.51M/s items_per_second=19.2931M/s __llvm_libc::bcmp,memcmp Google M BM_Bcmp/5/0 167 ns 93.9 ns 7853056 bytes_per_cycle=0.139432/s bytes_per_second=199.459M/s items_per_second=10.6503M/s __llvm_libc::bcmp,memcmp Google Q BM_Bcmp/6/0 262 ns 165 ns 3931136 bytes_per_cycle=0.151516/s bytes_per_second=216.745M/s items_per_second=6.07091M/s __llvm_libc::bcmp,memcmp Google S BM_Bcmp/7/0 168 ns 105 ns 6665216 bytes_per_cycle=0.143159/s bytes_per_second=204.791M/s items_per_second=9.52163M/s __llvm_libc::bcmp,memcmp Google U BM_Bcmp/8/0 108 ns 68.0 ns 10175488 bytes_per_cycle=0.125504/s bytes_per_second=179.535M/s items_per_second=14.701M/s __llvm_libc::bcmp,memcmp Google W BM_Bcmp/9/0 15371 ns 9007 ns 78848 bytes_per_cycle=0.166128/s bytes_per_second=237.648M/s items_per_second=111.031k/s __llvm_libc::bcmp,uniform 384 to 4096 ``` After ``` BM_Bcmp/0/0 74.2 ns 49.7 ns 14306304 bytes_per_cycle=0.148927/s bytes_per_second=213.042M/s items_per_second=20.1101M/s __llvm_libc::bcmp,memcmp Google A BM_Bcmp/1/0 108 ns 68.1 ns 10350592 bytes_per_cycle=0.411197/s bytes_per_second=588.222M/s items_per_second=14.6849M/s __llvm_libc::bcmp,memcmp Google B BM_Bcmp/2/0 80.2 ns 56.0 ns 12386304 bytes_per_cycle=0.258588/s bytes_per_second=369.912M/s items_per_second=17.8585M/s __llvm_libc::bcmp,memcmp Google D BM_Bcmp/3/0 92.4 ns 55.7 ns 12555264 bytes_per_cycle=0.206835/s bytes_per_second=295.88M/s items_per_second=17.943M/s __llvm_libc::bcmp,memcmp Google L BM_Bcmp/4/0 79.3 ns 46.8 ns 14288896 bytes_per_cycle=0.125872/s bytes_per_second=180.061M/s items_per_second=21.3611M/s __llvm_libc::bcmp,memcmp Google M BM_Bcmp/5/0 98.0 ns 57.9 ns 12232704 bytes_per_cycle=0.268815/s bytes_per_second=384.543M/s items_per_second=17.2711M/s __llvm_libc::bcmp,memcmp Google Q BM_Bcmp/6/0 132 ns 65.5 ns 10474496 bytes_per_cycle=0.417246/s bytes_per_second=596.875M/s items_per_second=15.2673M/s __llvm_libc::bcmp,memcmp Google S BM_Bcmp/7/0 101 ns 60.9 ns 11505664 bytes_per_cycle=0.253733/s bytes_per_second=362.968M/s items_per_second=16.4202M/s __llvm_libc::bcmp,memcmp Google U BM_Bcmp/8/0 72.5 ns 50.2 ns 14082048 bytes_per_cycle=0.183262/s bytes_per_second=262.158M/s items_per_second=19.9271M/s __llvm_libc::bcmp,memcmp Google W BM_Bcmp/9/0 852 ns 803 ns 854016 bytes_per_cycle=1.85028/s bytes_per_second=2.58481G/s items_per_second=1.24597M/s __llvm_libc::bcmp,uniform 384 to 4096 ``` For comparison with glibc ``` BM_Bcmp/0/0 106 ns 52.6 ns 12906496 bytes_per_cycle=0.142072/s bytes_per_second=203.235M/s items_per_second=19.0271M/s glibc::bcmp,memcmp Google A BM_Bcmp/1/0 132 ns 77.1 ns 8905728 bytes_per_cycle=0.365072/s bytes_per_second=522.239M/s items_per_second=12.9782M/s glibc::bcmp,memcmp Google B BM_Bcmp/2/0 122 ns 62.3 ns 10909696 bytes_per_cycle=0.222667/s bytes_per_second=318.527M/s items_per_second=16.0563M/s glibc::bcmp,memcmp Google D BM_Bcmp/3/0 99.5 ns 64.2 ns 11074560 bytes_per_cycle=0.185126/s bytes_per_second=264.825M/s items_per_second=15.5674M/s glibc::bcmp,memcmp Google L BM_Bcmp/4/0 86.6 ns 50.2 ns 13488128 bytes_per_cycle=0.117941/s bytes_per_second=168.717M/s items_per_second=19.9053M/s glibc::bcmp,memcmp Google M BM_Bcmp/5/0 106 ns 61.4 ns 11344896 bytes_per_cycle=0.248968/s bytes_per_second=356.151M/s items_per_second=16.284M/s glibc::bcmp,memcmp Google Q BM_Bcmp/6/0 145 ns 71.9 ns 10046464 bytes_per_cycle=0.389814/s bytes_per_second=557.633M/s items_per_second=13.9019M/s glibc::bcmp,memcmp Google S BM_Bcmp/7/0 119 ns 65.6 ns 10718208 bytes_per_cycle=0.243756/s bytes_per_second=348.696M/s items_per_second=15.2329M/s glibc::bcmp,memcmp Google U BM_Bcmp/8/0 86.4 ns 54.5 ns 13250560 bytes_per_cycle=0.154831/s bytes_per_second=221.488M/s items_per_second=18.3532M/s glibc::bcmp,memcmp Google W BM_Bcmp/9/0 1090 ns 604 ns 1186816 bytes_per_cycle=2.53848/s bytes_per_second=3.54622G/s items_per_second=1.65598M/s glibc::bcmp,uniform 384 to 4096 ``` Reviewed By: sivachandra Differential Revision: https://reviews.llvm.org/D150567 --- .../src/string/memory_utils/bcmp_implementations.h | 67 ++++++++++++++++++++-- libc/src/string/memory_utils/utils.h | 1 + 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h index 1c238cb..070e779 100644 --- a/libc/src/string/memory_utils/bcmp_implementations.h +++ b/libc/src/string/memory_utils/bcmp_implementations.h @@ -22,14 +22,67 @@ namespace __llvm_libc { [[maybe_unused]] LIBC_INLINE BcmpReturnType -inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) { +inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) { LIBC_LOOP_NOUNROLL - for (size_t offset = 0; offset < count; ++offset) - if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset)) - return value; + for (; offset < count; ++offset) + if (p1[offset] != p2[offset]) + return BcmpReturnType::NONZERO(); return BcmpReturnType::ZERO(); } +[[maybe_unused]] LIBC_INLINE BcmpReturnType +inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) { + constexpr size_t kAlign = sizeof(uint64_t); + if (count <= 2 * kAlign) + return inline_bcmp_byte_per_byte(p1, p2, 0, count); + size_t bytes_to_p1_align = distance_to_align_up(p1); + if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align)) + return value; + size_t offset = bytes_to_p1_align; + size_t p2_alignment = distance_to_align_down(p2 + offset); + for (; offset < count - kAlign; offset += kAlign) { + uint64_t a; + if (p2_alignment == 0) + a = load64_aligned(p2, offset); + else if (p2_alignment == 4) + a = load64_aligned(p2, offset); + else if (p2_alignment == 2) + a = load64_aligned(p2, offset); + else + a = load64_aligned( + p2, offset); + uint64_t b = load64_aligned(p1, offset); + if (a != b) + return BcmpReturnType::NONZERO(); + } + return inline_bcmp_byte_per_byte(p1, p2, offset, count); +} + +[[maybe_unused]] LIBC_INLINE BcmpReturnType +inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) { + constexpr size_t kAlign = sizeof(uint32_t); + if (count <= 2 * kAlign) + return inline_bcmp_byte_per_byte(p1, p2, 0, count); + size_t bytes_to_p1_align = distance_to_align_up(p1); + if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align)) + return value; + size_t offset = bytes_to_p1_align; + size_t p2_alignment = distance_to_align_down(p2 + offset); + for (; offset < count - kAlign; offset += kAlign) { + uint32_t a; + if (p2_alignment == 0) + a = load32_aligned(p2, offset); + else if (p2_alignment == 2) + a = load32_aligned(p2, offset); + else + a = load32_aligned(p2, offset); + uint32_t b = load32_aligned(p1, offset); + if (a != b) + return BcmpReturnType::NONZERO(); + } + return inline_bcmp_byte_per_byte(p1, p2, offset, count); +} + #if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64) [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { @@ -167,8 +220,12 @@ LIBC_INLINE BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) { return inline_bcmp_x86(p1, p2, count); #elif defined(LIBC_TARGET_ARCH_IS_AARCH64) return inline_bcmp_aarch64(p1, p2, count); +#elif defined(LIBC_TARGET_ARCH_IS_RISCV64) + return inline_bcmp_aligned_access_64bit(p1, p2, count); +#elif defined(LIBC_TARGET_ARCH_IS_RISCV32) + return inline_bcmp_aligned_access_32bit(p1, p2, count); #else - return inline_bcmp_embedded_tiny(p1, p2, count); + return inline_bcmp_byte_per_byte(p1, p2, 0, count); #endif } diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index ab33331..d5be52b 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -140,6 +140,7 @@ template struct StrictIntegralType { // Helper to get the zero value. LIBC_INLINE static constexpr StrictIntegralType ZERO() { return {T(0)}; } + LIBC_INLINE static constexpr StrictIntegralType NONZERO() { return {T(1)}; } private: T value; -- 2.7.4