[libc] add unsafe mode to strlen

author Michael Jones <michaelrj@google.com>

Thu, 14 Jul 2022 22:17:18 +0000 (15:17 -0700)

committer Michael Jones <michaelrj@google.com>

Thu, 1 Dec 2022 00:48:35 +0000 (16:48 -0800)
author Michael Jones <michaelrj@google.com>
Thu, 14 Jul 2022 22:17:18 +0000 (15:17 -0700)
committer Michael Jones <michaelrj@google.com>
Thu, 1 Dec 2022 00:48:35 +0000 (16:48 -0800)
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt

index 2d2cc42..e4ccf0f 100644 (file)
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -26,6 +26,10 @@ set(LIBC_BUILD_SCRIPTS_DIR "${LIBC_SOURCE_DIR}/utils/build_scripts")
  # Flags to pass down to the compiler while building the libc functions.
  set(LIBC_COMPILE_OPTIONS_DEFAULT "" CACHE STRING "Architecture to tell clang to optimize for (e.g. -march=... or -mcpu=...)")
  
+include(common_libc_tuners.cmake)
+# Add the tuning flags gathered in LIBC_COMMON_TUNE_OPTIONS to the defaults.
+list(APPEND LIBC_COMPILE_OPTIONS_DEFAULT ${LIBC_COMMON_TUNE_OPTIONS})
+
  # Check --print-resource-dir to find the compiler resource dir if this flag
  # is supported by the compiler.
  execute_process(
diff --git a/libc/common_libc_tuners.cmake b/libc/common_libc_tuners.cmake

new file mode 100644 (file)

index 0000000..cde28fa
--- /dev/null
+++ b/libc/common_libc_tuners.cmake
@@ -0,0 +1,14 @@
+# ------------------------------------------------------------------------------
+# Common tuning option definitions.
+# ------------------------------------------------------------------------------
+
+set(LIBC_COMMON_TUNE_OPTIONS "")
+
+# Each enabled tuning option appends its -D flag to LIBC_COMMON_TUNE_OPTIONS.
+option(LIBC_UNSAFE_STRING_WIDE_READ "Functions searching for the first character in a string such as strlen will read the string as int sized blocks instead of bytes. This relies on undefined behavior and may fail on some systems, but improves performance on long strings." OFF)
+if(LIBC_UNSAFE_STRING_WIDE_READ AND LLVM_USE_SANITIZER)
+  # Wide reads and sanitizers cannot be combined, so stop configuring here.
+  message(FATAL_ERROR "LIBC_UNSAFE_STRING_WIDE_READ is set at the same time as a sanitizer. LIBC_UNSAFE_STRING_WIDE_READ causes strlen and memchr to read beyond the end of their target strings, which is undefined behavior caught by sanitizers.")
+elseif(LIBC_UNSAFE_STRING_WIDE_READ)
+  list(APPEND LIBC_COMMON_TUNE_OPTIONS "-DLIBC_UNSAFE_STRING_WIDE_READ")
+endif()
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt

index 2c960af..7719178 100644 (file)
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -195,6 +195,7 @@ add_entrypoint_object(
    HDRS
      strlen.h
    DEPENDS
+    .string_utils
      libc.include.string
  )
  
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h

index f8de696..24be2ab 100644 (file)
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -23,24 +23,142 @@
  namespace __llvm_libc {
  namespace internal {
  
-// Returns the length of a string, denoted by the first occurrence
-// of a null terminator.
-static inline size_t string_length(const char *src) {
+// Builds a Word whose every byte equals the low eight bits of 'byte'.
+template <typename Word> constexpr Word repeat_byte(Word byte) {
+  constexpr size_t BYTE_WIDTH = 8;
+  const Word pattern = byte & static_cast<size_t>(0xff);
+  Word filled = 0;
+  for (size_t remaining = sizeof(Word); remaining > 0; --remaining)
+    filled = (filled << BYTE_WIDTH) | pattern;
+  return filled;
+}
+
+// Reports whether any byte of 'block' is zero without testing the bytes one at
+// a time. The block is transformed so that a zero byte leaves a set bit behind
+// and, when no byte is zero, every bit ends up clear.
+// The first transformation relies on borrowing in arithmetic subtraction:
+// LOW_BITS has 0x01 in every byte, so subtracting it turns a 0x00 byte into
+// 0xff (or 0xfe if the byte below it also had to borrow). Either way the high
+// bit of that byte is now set.
+// The next transformation is the mask with HIGH_BITS, which has 0x80 in every
+// byte. Only high bits survive, so the bytes still contributing are the zero
+// bytes and the bytes whose value keeps its high bit set after subtraction.
+// The final transformation masks with the inverse of the original block. A
+// byte that originally had its high bit set has that bit cleared in ~block,
+// so it drops out.
+// NOTE: a borrow out of a zero byte can also flag a 0x01 byte directly above
+// it. That needs a real zero byte to start the borrow, so the boolean result
+// is still exact.
+template <typename Word> constexpr bool has_zeroes(Word block) {
+  constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
+  constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
+  Word subtracted = block - LOW_BITS;
+  Word inverted = ~block;
+  return (subtracted & inverted & HIGH_BITS) != 0;
+}
+
+// Returns the length of 'src', testing sizeof(Word) bytes per step once the
+// pointer is aligned. Blocks are loaded whole, so bytes after the terminator
+// may be read.
+template <typename Word>
+static inline size_t string_length_wide_read(const char *src) {
+  const char *char_ptr = src;
+  // Step 1: read 1 byte at a time to align to block size
+  for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0; ++char_ptr)
+    if (*char_ptr == '\0')
+      return char_ptr - src;
+  // Step 2: skip every block that holds no zero byte
+  const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
+  while (!has_zeroes<Word>(*block_ptr))
+    ++block_ptr;
+  // Step 3: find the zero in the block that ended step 2 (not the one before)
+  char_ptr = reinterpret_cast<const char *>(block_ptr);
+  while (*char_ptr != '\0')
+    ++char_ptr;
+  return char_ptr - src;
+}
+
+static inline size_t string_length_byte_read(const char *src) {
    size_t length;
    for (length = 0; *src; ++src, ++length)
      ;
    return length;
  }
  
-// Returns the first occurrence of 'ch' within the first 'n' characters of
-// 'src'. If 'ch' is not found, returns nullptr.
-static inline void *find_first_character(const unsigned char *src,
-                                         unsigned char ch, size_t n) {
+// Computes the number of characters in 'src' that come before its first
+// null terminator.
+static inline size_t string_length(const char *src) {
+#ifndef LIBC_UNSAFE_STRING_WIDE_READ
+  return string_length_byte_read(src);
+#else
+  // Blocks are unsigned int: it is the default size for most processors, and
+  // on x86-64 it beats larger block sizes when 'src' can't be assumed to be
+  // aligned to a word boundary.
+  using BlockType = unsigned int;
+  return string_length_wide_read<BlockType>(src);
+#endif
+}
+
+// Returns a pointer to the first byte equal to 'ch' within the first 'n' bytes
+// of 'src', or nullptr if there is none. Once aligned, sizeof(Word) bytes are
+// tested per step; whole blocks are loaded, so bytes past 'n' may be read.
+template <typename Word>
+static inline void *find_first_character_wide_read(const unsigned char *src,
+                                                   unsigned char ch, size_t n) {
+  const unsigned char *char_ptr = src;
+  size_t cur = 0;
+
+  // Step 1: read 1 byte at a time to align to block size
+  for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
+       ++char_ptr, ++cur) {
+    if (*char_ptr == ch)
+      return const_cast<unsigned char *>(char_ptr);
+  }
+
+  // Step 2: skip whole blocks with no match. The bound is tested before each
+  // block is loaded, and cur counts exactly the bytes that were skipped.
+  const Word ch_mask = repeat_byte<Word>(ch);
+  const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
+  for (; cur < n && !has_zeroes<Word>((*block_ptr) ^ ch_mask); ++block_ptr)
+    cur += sizeof(Word);
+  // Resume at the block that stopped the loop so char_ptr == src + cur.
+  char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
+
+  // Step 3: find the match in the block, never looking at or past src + n
+  for (; cur < n; ++char_ptr, ++cur) {
+    if (*char_ptr == ch)
+      return const_cast<unsigned char *>(char_ptr);
+  }
+  return nullptr;
+}
+
+static inline void *find_first_character_byte_read(const unsigned char *src,
+                                                   unsigned char ch, size_t n) {
    for (; n && *src != ch; --n, ++src)
      ;
    return n ? const_cast<unsigned char *>(src) : nullptr;
  }
  
+// Locates the first byte equal to 'ch' among the first 'max_strlen' bytes of
+// 'src'. Returns a pointer to that byte, or nullptr when there is none.
+static inline void *find_first_character(const unsigned char *src,
+                                         unsigned char ch, size_t max_strlen) {
+#ifdef LIBC_UNSAFE_STRING_WIDE_READ
+  // Short inputs stay on the byte-wise path. Aligning to a word boundary and
+  // building the comparison mask is overhead that block reads only pay back
+  // on longer inputs. Based on the author's testing the crossover lies
+  // somewhere between 16 and 32 bytes, so the cutoff of four blocks should
+  // fall in that range.
+  // Blocks are unsigned int for the same reason as in strlen.
+  using BlockType = unsigned int;
+  constexpr size_t WIDE_READ_CUTOFF = sizeof(BlockType) * 4;
+  const bool long_enough = max_strlen > WIDE_READ_CUTOFF;
+  if (long_enough)
+    return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
+#endif
+  return find_first_character_byte_read(src, ch, max_strlen);
+}
+
  // Returns the maximum length span that contains only characters not found in
  // 'segment'. If no characters are found, returns the length of 'src'.
  static inline size_t complementary_span(const char *src, const char *segment) {
author	Michael Jones <michaelrj@google.com>
	Thu, 14 Jul 2022 22:17:18 +0000 (15:17 -0700)
committer	Michael Jones <michaelrj@google.com>
	Thu, 1 Dec 2022 00:48:35 +0000 (16:48 -0800)
libc/CMakeLists.txt		patch \| blob \| history
libc/common_libc_tuners.cmake	[new file with mode: 0644]	patch \| blob
libc/src/string/CMakeLists.txt		patch \| blob \| history
libc/src/string/string_utils.h		patch \| blob \| history