From d8415b02a519f222ecf71b069c96cc85ac635de3 Mon Sep 17 00:00:00 2001 From: Sterling Augustine Date: Fri, 14 Oct 2022 11:17:46 -0700 Subject: [PATCH] Revert "[libc] New version of the mem* framework" This reverts commit https://reviews.llvm.org/D135134 (b3f1d58a131eb546aaf1ac165c77ccb89c40d758) That revision appears to have broken Arm memcpy in some subtle ways. Am communicating with the original author to get a good reproduction. --- libc/src/stdio/printf_core/string_writer.cpp | 2 +- libc/src/string/bcmp.cpp | 4 +- libc/src/string/memcmp.cpp | 4 +- libc/src/string/memmove.cpp | 100 +-- libc/src/string/memory_utils/CMakeLists.txt | 8 +- libc/src/string/memory_utils/README.md | 97 --- .../src/string/memory_utils/bcmp_implementations.h | 171 +---- libc/src/string/memory_utils/elements.h | 774 +++++++++++++++++++++ libc/src/string/memory_utils/elements_aarch64.h | 130 ++++ libc/src/string/memory_utils/elements_x86.h | 189 +++++ .../string/memory_utils/memcmp_implementations.h | 175 ++--- .../string/memory_utils/memcpy_implementations.h | 193 ++--- .../string/memory_utils/memset_implementations.h | 156 +++-- libc/src/string/memory_utils/op_aarch64.h | 175 ----- libc/src/string/memory_utils/op_builtin.h | 148 ---- libc/src/string/memory_utils/op_generic.h | 466 ------------- libc/src/string/memory_utils/op_x86.h | 219 ------ libc/src/string/memory_utils/utils.h | 139 ++-- libc/src/string/memset.cpp | 4 +- libc/test/src/string/bcmp_test.cpp | 12 +- libc/test/src/string/memmove_test.cpp | 14 +- libc/test/src/string/memory_utils/CMakeLists.txt | 2 + .../test/src/string/memory_utils/elements_test.cpp | 137 ++++ .../src/string/memory_utils/memory_access_test.cpp | 228 ++++++ libc/test/src/string/memory_utils/utils_test.cpp | 79 ++- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 9 +- 26 files changed, 1895 insertions(+), 1740 deletions(-) delete mode 100644 libc/src/string/memory_utils/README.md create mode 100644 libc/src/string/memory_utils/elements.h create mode 100644 libc/src/string/memory_utils/elements_aarch64.h create mode 100644 libc/src/string/memory_utils/elements_x86.h delete mode 100644 libc/src/string/memory_utils/op_aarch64.h delete mode 100644 libc/src/string/memory_utils/op_builtin.h delete mode 100644 libc/src/string/memory_utils/op_generic.h delete mode 100644 libc/src/string/memory_utils/op_x86.h create mode 100644 libc/test/src/string/memory_utils/elements_test.cpp create mode 100644 libc/test/src/string/memory_utils/memory_access_test.cpp diff --git a/libc/src/stdio/printf_core/string_writer.cpp b/libc/src/stdio/printf_core/string_writer.cpp index 472573d..a80df32 100644 --- a/libc/src/stdio/printf_core/string_writer.cpp +++ b/libc/src/stdio/printf_core/string_writer.cpp @@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) { len = available_capacity; if (len > 0) { - inline_memset(cur_buffer, static_cast(new_char), len); + inline_memset(cur_buffer, new_char, len); cur_buffer += len; available_capacity -= len; } diff --git a/libc/src/string/bcmp.cpp b/libc/src/string/bcmp.cpp index fb00780..963a7f5 100644 --- a/libc/src/string/bcmp.cpp +++ b/libc/src/string/bcmp.cpp @@ -14,8 +14,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(int, bcmp, (const void *lhs, const void *rhs, size_t count)) { - return static_cast(inline_bcmp(static_cast(lhs), - static_cast(rhs), count)); + return inline_bcmp(static_cast(lhs), + static_cast(rhs), count); } } // namespace __llvm_libc diff --git a/libc/src/string/memcmp.cpp b/libc/src/string/memcmp.cpp index 357b57d1..292525e 
100644 --- a/libc/src/string/memcmp.cpp +++ b/libc/src/string/memcmp.cpp @@ -15,8 +15,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(int, memcmp, (const void *lhs, const void *rhs, size_t count)) { - return static_cast(inline_memcmp(static_cast(lhs), - static_cast(rhs), count)); + return inline_memcmp(static_cast(lhs), + static_cast(rhs), count); } } // namespace __llvm_libc diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp index f40ab62..f242578 100644 --- a/libc/src/string/memmove.cpp +++ b/libc/src/string/memmove.cpp @@ -9,104 +9,36 @@ #include "src/string/memmove.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/__support/integer_operations.h" +#include "src/string/memory_utils/elements.h" #include // size_t, ptrdiff_t -#include - namespace __llvm_libc { -[[maybe_unused]] static inline void -inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) { - if ((count == 0) || (dst == src)) - return; - if (dst < src) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - builtin::Memcpy<1>::block(dst + offset, src + offset); - } else { -#pragma nounroll - for (ptrdiff_t offset = count - 1; offset >= 0; --offset) - builtin::Memcpy<1>::block(dst + offset, src + offset); - } -} - -template -[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src, - size_t count) { +static inline void inline_memmove(char *dst, const char *src, size_t count) { + using namespace __llvm_libc::scalar; if (count == 0) return; if (count == 1) - return generic::Memmove<1, MaxSize>::block(dst, src); + return move<_1>(dst, src); if (count <= 4) - return generic::Memmove<2, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 8) - return generic::Memmove<4, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 16) - return generic::Memmove<8, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 32) - return generic::Memmove<16, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 64) - return generic::Memmove<32, MaxSize>::head_tail(dst, src, count); + return move>(dst, src, count); if (count <= 128) - return generic::Memmove<64, MaxSize>::head_tail(dst, src, count); - if (dst < src) { - generic::Memmove<32, MaxSize>::template align_forward(dst, src, - count); - return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src, - count); - } else { - generic::Memmove<32, MaxSize>::template align_backward(dst, src, - count); - return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src, - count); - } -} + return move>(dst, src, count); -static inline void inline_memmove(Ptr dst, CPtr src, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) -#if defined(LLVM_LIBC_ARCH_X86) - static constexpr size_t kMaxSize = x86::kAvx512F ? 64 - : x86::kAvx ? 32 - : x86::kSse2 ? 16 - : 8; -#elif defined(LLVM_LIBC_ARCH_AARCH64) - static constexpr size_t kMaxSize = aarch64::kNeon ? 
16 : 8; -#endif - // return inline_memmove_generic(dst, src, count); - if (count == 0) - return; - if (count == 1) - return generic::Memmove<1, kMaxSize>::block(dst, src); - if (count <= 4) - return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count); - if (count <= 8) - return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count); - if (count <= 16) - return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count); - if (count <= 32) - return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count); - if (count <= 64) - return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count); - if (count <= 128) - return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count); - if (dst < src) { - generic::Memmove<32, kMaxSize>::align_forward(dst, src, count); - return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src, - count); - } else { - generic::Memmove<32, kMaxSize>::align_backward(dst, src, count); - return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src, - count); - } -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memmove_embedded_tiny(dst, src, count); -#else -#error "Unsupported platform" -#endif + using AlignedMoveLoop = Align<_16, Arg::Src>::Then>; + if (dst < src) + return move(dst, src, count); + else if (dst > src) + return move_backward(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memmove, diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt index 630b0d4..d735fcf 100644 --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -2,17 +2,13 @@ add_header_library( memory_utils HDRS + utils.h + elements.h bcmp_implementations.h bzero_implementations.h memcmp_implementations.h memcpy_implementations.h memset_implementations.h - op_aarch64.h - op_higher_order.h - op_builtin.h - op_generic.h - op_x86.h - utils.h DEPS libc.src.__support.CPP.bit ) diff --git a/libc/src/string/memory_utils/README.md b/libc/src/string/memory_utils/README.md deleted file mode 100644 index 83a29066..0000000 --- a/libc/src/string/memory_utils/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# The mem* framework - -The framework handles the following mem* functions: - - `memcpy` - - `memmove` - - `memset` - - `bzero` - - `bcmp` - - `memcmp` - -## Building blocks - -These functions can be built out of a set of lower-level operations: - - **`block`** : operates on a block of `SIZE` bytes. - - **`tail`** : operates on the last `SIZE` bytes of the buffer (e.g., `[dst + count - SIZE, dst + count]`) - - **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`. - - **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes and handle the remaining bytes with a `tail` operation. - -As an illustration, let's take the example of a trivial `memset` implementation: - - ```C++ - extern "C" void memset(const char* dst, int value, size_t count) { - if (count == 0) return; - if (count == 1) return Memset<1>::block(dst, value); - if (count == 2) return Memset<2>::block(dst, value); - if (count == 3) return Memset<3>::block(dst, value); - if (count <= 8) return Memset<4>::head_tail(dst, value, count); // Note that 0 to 4 bytes are written twice. - if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here. 
- return Memset<16>::loop_and_tail(dst, value, count); -} - ``` - -Now let's have a look into the `Memset` structure: - -```C++ -template -struct Memset { - static constexpr size_t SIZE = Size; - - static inline void block(Ptr dst, uint8_t value) { - // Implement me - } - - static inline void tail(Ptr dst, uint8_t value, size_t count) { - block(dst + count - SIZE, value); - } - - static inline void head_tail(Ptr dst, uint8_t value, size_t count) { - block(dst, value); - tail(dst, value, count); - } - - static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { - size_t offset = 0; - do { - block(dst + offset, value); - offset += SIZE; - } while (offset < count - SIZE); - tail(dst, value, count); - } -}; -``` - -As you can see, the `tail`, `head_tail` and `loop_and_tail` are higher order functions that build on each others. Only `block` really needs to be implemented. -In earlier designs we were implementing these higher order functions with templated functions but it appears that it is more readable to have the implementation explicitly stated. -**This design is useful because it provides customization points**. For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics. - -## Scoped specializations - -We can have several specializations of the `Memset` structure. Depending on the target requirements we can use one or several scopes for the same implementation. - -In the following example we use the `generic` implementation for the small sizes but use the `x86` implementation for the loop. -```C++ - extern "C" void memset(const char* dst, int value, size_t count) { - if (count == 0) return; - if (count == 1) return generic::Memset<1>::block(dst, value); - if (count == 2) return generic::Memset<2>::block(dst, value); - if (count == 3) return generic::Memset<3>::block(dst, value); - if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count); - if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count); - return x86::Memset<16>::loop_and_tail(dst, value, count); -} -``` - -### The `builtin` scope - -Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset)) - -### The `generic` scope - -In this scope we define pure C++ implementations using native integral types and clang vector extensions. - -### The arch specific scopes - -Then comes implementations that are using specific architectures or microarchitectures features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`). - -The purpose here is to rely on builtins as much as possible and fallback to `asm volatile` as a last resort. 
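Illustrative note (not part of the patch): the README text above describes a `generic` scope written in plain C++ and arch-specific scopes that fall back to `asm volatile`. The sketch below follows the README's `Memset<Size>` example to show what one `block` per scope could look like; the namespaces and bodies are assumptions for illustration, not the library's actual implementation.

```cpp
#include <stddef.h> // size_t
#include <stdint.h> // uint8_t

namespace generic {
// "generic" scope: pure C++, no intrinsics. The real code relies on compiler
// builtins or vector extensions; a byte loop stands in for them here.
template <size_t Size> struct Memset {
  static constexpr size_t SIZE = Size;
  static inline void block(char *dst, uint8_t value) {
    for (size_t i = 0; i < SIZE; ++i)
      dst[i] = static_cast<char>(value);
  }
  static inline void tail(char *dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }
  static inline void head_tail(char *dst, uint8_t value, size_t count) {
    block(dst, value);
    tail(dst, value, count);
  }
};
} // namespace generic

namespace x86 {
// Arch-specific scope: as a last resort, inline assembly. `rep stosb` writes
// AL to [RDI] RCX times, so the whole count is handled without a tail.
template <size_t Size> struct Memset {
  static constexpr size_t SIZE = Size;
  static inline void loop_and_tail(char *dst, uint8_t value, size_t count) {
    asm volatile("rep stosb"
                 : "+D"(dst), "+c"(count)
                 : "a"(value)
                 : "memory");
  }
};
} // namespace x86
```

A dispatcher can then mix scopes exactly as the README shows, e.g. `generic::Memset<8>::head_tail` for small sizes and `x86::Memset<16>::loop_and_tail` for the loop.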
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h index 9d9ad2b..c26e38e 100644 --- a/libc/src/string/memory_utils/bcmp_implementations.h +++ b/libc/src/string/memory_utils/bcmp_implementations.h @@ -11,164 +11,49 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/elements.h" #include // size_t namespace __llvm_libc { -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset)) - return value; - return BcmpReturnType::ZERO(); +// Fixed-size difference between 'lhs' and 'rhs'. +template bool differs(const char *lhs, const char *rhs) { + return !Element::equals(lhs, rhs); } - -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { - if (count < 256) - return generic::Bcmp<16>::loop_and_tail(p1, p2, count); - if (auto value = generic::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - return generic::Bcmp<64>::loop_and_tail(p1, p2, count); +// Runtime-size difference between 'lhs' and 'rhs'. +template +bool differs(const char *lhs, const char *rhs, size_t size) { + return !Element::equals(lhs, rhs, size); } -#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) +static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) { #if defined(LLVM_LIBC_ARCH_X86) -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count < 256) - return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count); - if (auto value = x86::sse2::Bcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count <= 64) - return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx2::Bcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 256)) { - if (auto value = x86::avx2::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count <= 64) - return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 256)) { - if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2, - 
size_t count) { + using namespace ::__llvm_libc::x86; +#elif defined(LLVM_LIBC_ARCH_AARCH64) + using namespace ::__llvm_libc::aarch64; +#else + using namespace ::__llvm_libc::scalar; +#endif if (count == 0) - return BcmpReturnType::ZERO(); + return 0; if (count == 1) - return generic::Bcmp<1>::block(p1, p2); + return differs<_1>(lhs, rhs); if (count == 2) - return generic::Bcmp<2>::block(p1, p2); - if (count <= 4) - return generic::Bcmp<2>::head_tail(p1, p2, count); + return differs<_2>(lhs, rhs); + if (count == 3) + return differs<_3>(lhs, rhs); if (count <= 8) - return generic::Bcmp<4>::head_tail(p1, p2, count); + return differs>(lhs, rhs, count); if (count <= 16) - return generic::Bcmp<8>::head_tail(p1, p2, count); - if constexpr (x86::kAvx512BW) - return inline_bcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_bcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_bcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_bcmp_generic_gt16(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline BcmpReturnType -inline_bcmp_aarch64(CPtr p1, CPtr p2, size_t count) { - if (likely(count <= 32)) { - if (unlikely(count >= 16)) { - return generic::Bcmp<16>::head_tail(p1, p2, count); - } - switch (count) { - case 0: - return BcmpReturnType::ZERO(); - case 1: - return generic::Bcmp<1>::block(p1, p2); - case 2: - return generic::Bcmp<2>::block(p1, p2); - case 3: - return generic::Bcmp<2>::head_tail(p1, p2, count); - case 4: - return generic::Bcmp<4>::block(p1, p2); - case 5: - case 6: - case 7: - return generic::Bcmp<4>::head_tail(p1, p2, count); - case 8: - return generic::Bcmp<8>::block(p1, p2); - case 9: - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - return generic::Bcmp<8>::head_tail(p1, p2, count); - } - } - + return differs>(lhs, rhs, count); + if (count <= 32) + return differs>(lhs, rhs, count); if (count <= 64) - return generic::Bcmp<32>::head_tail(p1, p2, count); - - // Aligned loop if > 256, otherwise normal loop - if (count > 256) { - if (auto value = generic::Bcmp<32>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return generic::Bcmp<32>::loop_and_tail(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) - return inline_bcmp_x86(p1, p2, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - return inline_bcmp_aarch64(p1, p2, count); -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_bcmp_embedded_tiny(p1, p2, count); -#else -#error "Unsupported platform" -#endif + return differs>(lhs, rhs, count); + if (count <= 128) + return differs>(lhs, rhs, count); + return differs::Then>>(lhs, rhs, count); } } // namespace __llvm_libc diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h new file mode 100644 index 0000000..f5a3830 --- /dev/null +++ b/libc/src/string/memory_utils/elements.h @@ -0,0 +1,774 @@ +//===-- Elementary operations to compose memory primitives ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H + +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t + +#include "src/__support/endian.h" +#include "src/string/memory_utils/utils.h" + +namespace __llvm_libc { + +// Elementary Operations +// -------------------------------- +// We define abstract elementary operations acting on fixed chunks of memory. +// These are low level building blocks that are meant to be assembled to compose +// higher order abstractions. Each function is defined twice: once with +// fixed-size operations, and once with runtime-size operations. + +// Fixed-size copy from 'src' to 'dst'. +template +void copy(char *__restrict dst, const char *__restrict src) { + Element::copy(dst, src); +} +// Runtime-size copy from 'src' to 'dst'. +template +void copy(char *__restrict dst, const char *__restrict src, size_t size) { + Element::copy(dst, src, size); +} + +// Fixed-size move from 'src' to 'dst'. +template void move(char *dst, const char *src) { + Element::move(dst, src); +} +// Runtime-size move from 'src' to 'dst'. +template void move(char *dst, const char *src, size_t size) { + Element::move(dst, src, size); +} +// Runtime-size move from 'src' to 'dst'. +template +void move_backward(char *dst, const char *src, size_t size) { + Element::move_backward(dst, src, size); +} + +// Fixed-size equality between 'lhs' and 'rhs'. +template bool equals(const char *lhs, const char *rhs) { + return Element::equals(lhs, rhs); +} +// Runtime-size equality between 'lhs' and 'rhs'. +template +bool equals(const char *lhs, const char *rhs, size_t size) { + return Element::equals(lhs, rhs, size); +} + +// Fixed-size three-way comparison between 'lhs' and 'rhs'. +template +int three_way_compare(const char *lhs, const char *rhs) { + return Element::three_way_compare(lhs, rhs); +} +// Runtime-size three-way comparison between 'lhs' and 'rhs'. +template +int three_way_compare(const char *lhs, const char *rhs, size_t size) { + return Element::three_way_compare(lhs, rhs, size); +} + +// Fixed-size initialization. +template +void splat_set(char *dst, const unsigned char value) { + Element::splat_set(dst, value); +} +// Runtime-size initialization. +template +void splat_set(char *dst, const unsigned char value, size_t size) { + Element::splat_set(dst, value, size); +} + +// Stack placeholder for Move operations. +template struct Storage { char bytes[Element::SIZE]; }; + +// Fixed-size Higher-Order Operations +// ---------------------------------- +// - Repeated: Repeat the operation several times in a row. +// - Chained: Chain the operation of several types. + +// Repeat the operation several times in a row. 
+template struct Repeated { + static constexpr size_t SIZE = ElementCount * Element::SIZE; + + static void copy(char *__restrict dst, const char *__restrict src) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::SIZE; + Element::copy(dst + offset, src + offset); + } + } + + static void move(char *dst, const char *src) { + const auto value = load(src); + store(dst, value); + } + + static bool equals(const char *lhs, const char *rhs) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::SIZE; + if (!Element::equals(lhs + offset, rhs + offset)) + return false; + } + return true; + } + + static int three_way_compare(const char *lhs, const char *rhs) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::SIZE; + // We make the assumption that 'equals' is cheaper than + // 'three_way_compare'. + if (Element::equals(lhs + offset, rhs + offset)) + continue; + return Element::three_way_compare(lhs + offset, rhs + offset); + } + return 0; + } + + static void splat_set(char *dst, const unsigned char value) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::SIZE; + Element::splat_set(dst + offset, value); + } + } + + static Storage load(const char *ptr) { + Storage value; + copy(reinterpret_cast(&value), ptr); + return value; + } + + static void store(char *ptr, Storage value) { + copy(ptr, reinterpret_cast(&value)); + } +}; + +template struct Repeated { + static void move(char *, const char *) {} +}; + +// Chain the operation of several types. +// For instance, to handle a 3 bytes operation, one can use: +// Chained::Operation(); +template struct Chained; + +template struct Chained { + static constexpr size_t SIZE = Head::SIZE + Chained::SIZE; + + static void copy(char *__restrict dst, const char *__restrict src) { + Chained::copy(dst + Head::SIZE, src + Head::SIZE); + __llvm_libc::copy(dst, src); + } + + static void move(char *dst, const char *src) { + const auto value = Head::load(src); + Chained::move(dst + Head::SIZE, src + Head::SIZE); + Head::store(dst, value); + } + + static bool equals(const char *lhs, const char *rhs) { + if (!__llvm_libc::equals(lhs, rhs)) + return false; + return Chained::equals(lhs + Head::SIZE, rhs + Head::SIZE); + } + + static int three_way_compare(const char *lhs, const char *rhs) { + if (__llvm_libc::equals(lhs, rhs)) + return Chained::three_way_compare(lhs + Head::SIZE, + rhs + Head::SIZE); + return __llvm_libc::three_way_compare(lhs, rhs); + } + + static void splat_set(char *dst, const unsigned char value) { + Chained::splat_set(dst + Head::SIZE, value); + __llvm_libc::splat_set(dst, value); + } +}; + +template <> struct Chained<> { + static constexpr size_t SIZE = 0; + static void copy(char *__restrict, const char *__restrict) {} + static void move(char *, const char *) {} + static bool equals(const char *, const char *) { return true; } + static int three_way_compare(const char *, const char *) { return 0; } + static void splat_set(char *, const unsigned char) {} +}; + +// Overlap ElementA and ElementB so they span Size bytes. 
+template +struct Overlap { + static constexpr size_t SIZE = Size; + static_assert(ElementB::SIZE <= ElementA::SIZE, "ElementB too big"); + static_assert(ElementA::SIZE <= Size, "ElementA too big"); + static_assert((ElementA::SIZE + ElementB::SIZE) >= Size, + "Elements too small to overlap"); + static constexpr size_t OFFSET = SIZE - ElementB::SIZE; + + static void copy(char *__restrict dst, const char *__restrict src) { + ElementA::copy(dst, src); + ElementB::copy(dst + OFFSET, src + OFFSET); + } + + static void move(char *dst, const char *src) { + const auto value_a = ElementA::load(src); + const auto value_b = ElementB::load(src + OFFSET); + ElementB::store(dst + OFFSET, value_b); + ElementA::store(dst, value_a); + } + + static bool equals(const char *lhs, const char *rhs) { + if (!ElementA::equals(lhs, rhs)) + return false; + if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET)) + return false; + return true; + } + + static int three_way_compare(const char *lhs, const char *rhs) { + if (!ElementA::equals(lhs, rhs)) + return ElementA::three_way_compare(lhs, rhs); + if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET)) + return ElementB::three_way_compare(lhs + OFFSET, rhs + OFFSET); + return 0; + } + + static void splat_set(char *dst, const unsigned char value) { + ElementA::splat_set(dst, value); + ElementB::splat_set(dst + OFFSET, value); + } +}; + +// Runtime-size Higher-Order Operations +// ------------------------------------ +// - Tail: Perform the operation on the last 'T::SIZE' bytes of the buffer. +// - HeadTail: Perform the operation on the first and last 'T::SIZE' bytes +// of the buffer. +// - Loop: Perform a loop of fixed-sized operations. + +// Perform the operation on the last 'T::SIZE' bytes of the buffer. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [________XXXXXXXX___] +// +// Precondition: `size >= T::SIZE`. +template struct Tail { + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + return T::copy(dst + offset(size), src + offset(size)); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + return T::equals(lhs + offset(size), rhs + offset(size)); + } + + static int three_way_compare(const char *lhs, const char *rhs, size_t size) { + return T::three_way_compare(lhs + offset(size), rhs + offset(size)); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + return T::splat_set(dst + offset(size), value); + } + + static size_t offset(size_t size) { return size - T::SIZE; } +}; + +// Perform the operation on the first and last 'T::SIZE' bytes of the buffer. +// This is useful for overlapping operations. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [__XXXXXXXX_________] +// [________XXXXXXXX___] +// +// Precondition: `size >= T::SIZE && size <= 2 x T::SIZE`. 
+template struct HeadTail { + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + T::copy(dst, src); + Tail::copy(dst, src, size); + } + + static void move(char *dst, const char *src, size_t size) { + const size_t offset = Tail::offset(size); + const auto head_value = T::load(src); + const auto tail_value = T::load(src + offset); + T::store(dst + offset, tail_value); + T::store(dst, head_value); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + if (!T::equals(lhs, rhs)) + return false; + return Tail::equals(lhs, rhs, size); + } + + static int three_way_compare(const char *lhs, const char *rhs, size_t size) { + if (!T::equals(lhs, rhs)) + return T::three_way_compare(lhs, rhs); + return Tail::three_way_compare(lhs, rhs, size); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + T::splat_set(dst, value); + Tail::splat_set(dst, value, size); + } +}; + +// Simple loop ending with a Tail operation. +// +// e.g. with +// [12345678123456781234567812345678] +// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___] +// [__XXXXXXXX_______________________] +// [__________XXXXXXXX_______________] +// [__________________XXXXXXXX_______] +// [______________________XXXXXXXX___] +// +// Precondition: +// - size >= T::SIZE +template struct Loop { + static_assert(T::SIZE == TailT::SIZE, + "Tail type must have the same size as T"); + + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + size_t offset = 0; + do { + T::copy(dst + offset, src + offset); + offset += T::SIZE; + } while (offset < size - T::SIZE); + Tail::copy(dst, src, size); + } + + // Move forward suitable when dst < src. We load the tail bytes before + // handling the loop. + // + // e.g. Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [_________________________LLLLLLLL___] + // [___LLLLLLLL_________________________] + // [_SSSSSSSS___________________________] + // [___________LLLLLLLL_________________] + // [_________SSSSSSSS___________________] + // [___________________LLLLLLLL_________] + // [_________________SSSSSSSS___________] + // [_______________________SSSSSSSS_____] + static void move(char *dst, const char *src, size_t size) { + const size_t tail_offset = Tail::offset(size); + const auto tail_value = TailT::load(src + tail_offset); + size_t offset = 0; + do { + T::move(dst + offset, src + offset); + offset += T::SIZE; + } while (offset < size - T::SIZE); + TailT::store(dst + tail_offset, tail_value); + } + + // Move forward suitable when dst > src. We load the head bytes before + // handling the loop. + // + // e.g. 
Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [___LLLLLLLL_________________________] + // [_________________________LLLLLLLL___] + // [___________________________SSSSSSSS_] + // [_________________LLLLLLLL___________] + // [___________________SSSSSSSS_________] + // [_________LLLLLLLL___________________] + // [___________SSSSSSSS_________________] + // [_____SSSSSSSS_______________________] + static void move_backward(char *dst, const char *src, size_t size) { + const auto head_value = TailT::load(src); + ptrdiff_t offset = size - T::SIZE; + do { + T::move(dst + offset, src + offset); + offset -= T::SIZE; + } while (offset >= 0); + TailT::store(dst, head_value); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + size_t offset = 0; + do { + if (!T::equals(lhs + offset, rhs + offset)) + return false; + offset += T::SIZE; + } while (offset < size - T::SIZE); + return Tail::equals(lhs, rhs, size); + } + + static int three_way_compare(const char *lhs, const char *rhs, size_t size) { + size_t offset = 0; + do { + if (!T::equals(lhs + offset, rhs + offset)) + return T::three_way_compare(lhs + offset, rhs + offset); + offset += T::SIZE; + } while (offset < size - T::SIZE); + return Tail::three_way_compare(lhs, rhs, size); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + size_t offset = 0; + do { + T::splat_set(dst + offset, value); + offset += T::SIZE; + } while (offset < size - T::SIZE); + Tail::splat_set(dst, value, size); + } +}; + +namespace internal { + +template struct ArgSelector {}; + +template <> struct ArgSelector { + template + static T1 *__restrict &Select(T1 *__restrict &p1ref, T2 *__restrict &) { + return p1ref; + } +}; + +template <> struct ArgSelector { + template + static T2 *__restrict &Select(T1 *__restrict &, T2 *__restrict &p2ref) { + return p2ref; + } +}; + +// Provides a specialized bump function that adjusts pointers and size so first +// argument (resp. second argument) gets aligned to Alignment. +// We make sure the compiler knows about the adjusted pointer alignment. +// The 'additional_bumps' parameter allows to reach previous / next aligned +// pointers. +template struct Align { + template + static void bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size, + int additional_bumps = 0) { + auto &aligned_ptr = ArgSelector::Select(p1ref, p2ref); + auto offset = offset_to_next_aligned(aligned_ptr); + adjust(offset + additional_bumps * Alignment, p1ref, p2ref, size); + aligned_ptr = assume_aligned(aligned_ptr); + } +}; + +} // namespace internal + +// An alignment operation that: +// - executes the 'AlignmentT' operation +// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected +// pointer gets aligned, size is decreased accordingly. +// - calls the 'NextT' operation. +// +// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as: +// copy::Then>>(dst, src, count); +template struct Align { +private: + static constexpr size_t ALIGNMENT = AlignmentT::SIZE; + static_assert(ALIGNMENT > 1, "Alignment must be more than 1"); + static_assert(is_power2(ALIGNMENT), "Alignment must be a power of 2"); + +public: + template struct Then { + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + AlignmentT::copy(dst, src); + internal::Align::bump(dst, src, size); + NextT::copy(dst, src, size); + } + + // Move forward suitable when dst < src. 
The alignment is performed with an + // HeadTail operation of size ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes and making sure src is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [___________LLLLLLLL______________] + // [_SSSSSSSS________________________] + // [________SSSSSSSS_________________] + // + // e.g. Moving two bytes and making sure dst is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [______LLLLLLLL___________________] + // [_SSSSSSSS________________________] + // [___SSSSSSSS______________________] + static void move(char *dst, const char *src, size_t size) { + char *next_dst = dst; + const char *next_src = src; + size_t next_size = size; + internal::Align::bump(next_dst, next_src, next_size, + 1); + HeadTail::move(dst, src, size - next_size); + NextT::move(next_dst, next_src, next_size); + } + + // Move backward suitable when dst > src. The alignment is performed with an + // HeadTail operation of size ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes backward and making sure src is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _________________LLLLLLLL_______] + // [ ___________________LLLLLLLL_____] + // [____________________SSSSSSSS_____] + // [______________________SSSSSSSS___] + // + // e.g. Moving two bytes and making sure dst is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _______________LLLLLLLL_________] + // [ ___________________LLLLLLLL_____] + // [__________________SSSSSSSS_______] + // [______________________SSSSSSSS___] + static void move_backward(char *dst, const char *src, size_t size) { + char *headtail_dst = dst + size; + const char *headtail_src = src + size; + size_t headtail_size = 0; + internal::Align::bump(headtail_dst, headtail_src, + headtail_size, -2); + HeadTail::move(headtail_dst, headtail_src, headtail_size); + NextT::move_backward(dst, src, size - headtail_size); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + if (!AlignmentT::equals(lhs, rhs)) + return false; + internal::Align::bump(lhs, rhs, size); + return NextT::equals(lhs, rhs, size); + } + + static int three_way_compare(const char *lhs, const char *rhs, + size_t size) { + if (!AlignmentT::equals(lhs, rhs)) + return AlignmentT::three_way_compare(lhs, rhs); + internal::Align::bump(lhs, rhs, size); + return NextT::three_way_compare(lhs, rhs, size); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + AlignmentT::splat_set(dst, value); + char *dummy = nullptr; + internal::Align::bump(dst, dummy, size); + NextT::splat_set(dst, value, size); + } + }; +}; + +// An operation that allows to skip the specified amount of bytes. 
+template struct Skip { + template struct Then { + static void copy(char *__restrict dst, const char *__restrict src, + size_t size) { + NextT::copy(dst + Bytes, src + Bytes, size - Bytes); + } + + static void copy(char *__restrict dst, const char *__restrict src) { + NextT::copy(dst + Bytes, src + Bytes); + } + + static bool equals(const char *lhs, const char *rhs, size_t size) { + return NextT::equals(lhs + Bytes, rhs + Bytes, size - Bytes); + } + + static bool equals(const char *lhs, const char *rhs) { + return NextT::equals(lhs + Bytes, rhs + Bytes); + } + + static int three_way_compare(const char *lhs, const char *rhs, + size_t size) { + return NextT::three_way_compare(lhs + Bytes, rhs + Bytes, size - Bytes); + } + + static int three_way_compare(const char *lhs, const char *rhs) { + return NextT::three_way_compare(lhs + Bytes, rhs + Bytes); + } + + static void splat_set(char *dst, const unsigned char value, size_t size) { + NextT::splat_set(dst + Bytes, value, size - Bytes); + } + + static void splat_set(char *dst, const unsigned char value) { + NextT::splat_set(dst + Bytes, value); + } + }; +}; + +// Fixed-size Builtin Operations +// ----------------------------- +// Note: Do not use 'builtin' right now as it requires the implementation of the +// `_inline` versions of all the builtins. Theoretically, Clang can still turn +// them into calls to the C library leading to reentrancy problems. +namespace builtin { + +#ifndef __has_builtin +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif + +template struct Builtin { + static constexpr size_t SIZE = Size; + + static void copy(char *__restrict dst, const char *__restrict src) { +#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER + for_loop_copy(dst, src); +#elif __has_builtin(__builtin_memcpy_inline) + // __builtin_memcpy_inline guarantees to never call external functions. + // Unfortunately it is not widely available. + __builtin_memcpy_inline(dst, src, SIZE); +#else + for_loop_copy(dst, src); +#endif + } + + static void move(char *dst, const char *src) { +#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER + for_loop_move(dst, src); +#elif __has_builtin(__builtin_memmove) + __builtin_memmove(dst, src, SIZE); +#else + for_loop_move(dst, src); +#endif + } + +#if __has_builtin(__builtin_memcmp_inline) +#define LLVM_LIBC_MEMCMP __builtin_memcmp_inline +#else +#define LLVM_LIBC_MEMCMP __builtin_memcmp +#endif + + static bool equals(const char *lhs, const char *rhs) { + return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE) == 0; + } + + static int three_way_compare(const char *lhs, const char *rhs) { + return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE); + } + + static void splat_set(char *dst, const unsigned char value) { + __builtin_memset(dst, value, SIZE); + } + +private: + // Copies `SIZE` bytes from `src` to `dst` using a for loop. + // This code requires the use of `-fno-builtin-memcpy` to prevent the compiler + // from turning the for-loop back into `__builtin_memcpy`. 
+ static void for_loop_copy(char *__restrict dst, const char *__restrict src) { + for (size_t i = 0; i < SIZE; ++i) + dst[i] = src[i]; + } + + static void for_loop_move(char *dst, const char *src) { + for (size_t i = 0; i < SIZE; ++i) + dst[i] = src[i]; + } +}; + +using _1 = Builtin<1>; +using _2 = Builtin<2>; +using _3 = Builtin<3>; +using _4 = Builtin<4>; +using _8 = Builtin<8>; +using _16 = Builtin<16>; +using _32 = Builtin<32>; +using _64 = Builtin<64>; +using _128 = Builtin<128>; + +} // namespace builtin + +// Fixed-size Scalar Operations +// ---------------------------- +namespace scalar { + +// The Scalar type makes use of simple sized integers. +template struct Scalar { + static constexpr size_t SIZE = sizeof(T); + + static void copy(char *__restrict dst, const char *__restrict src) { + store(dst, load(src)); + } + + static void move(char *dst, const char *src) { store(dst, load(src)); } + + static bool equals(const char *lhs, const char *rhs) { + return load(lhs) == load(rhs); + } + + static int three_way_compare(const char *lhs, const char *rhs) { + return scalar_three_way_compare(load(lhs), load(rhs)); + } + + static void splat_set(char *dst, const unsigned char value) { + store(dst, get_splatted_value(value)); + } + + static int scalar_three_way_compare(T a, T b); + + static T load(const char *ptr) { + T value; + builtin::Builtin::copy(reinterpret_cast(&value), ptr); + return value; + } + static void store(char *ptr, T value) { + builtin::Builtin::copy(ptr, reinterpret_cast(&value)); + } + +private: + static T get_splatted_value(const unsigned char value) { + return T(~0) / T(0xFF) * T(value); + } +}; + +template <> +inline int Scalar::scalar_three_way_compare(uint8_t a, uint8_t b) { + const int16_t la = Endian::to_big_endian(a); + const int16_t lb = Endian::to_big_endian(b); + return la - lb; +} +template <> +inline int Scalar::scalar_three_way_compare(uint16_t a, uint16_t b) { + const int32_t la = Endian::to_big_endian(a); + const int32_t lb = Endian::to_big_endian(b); + return la - lb; +} +template <> +inline int Scalar::scalar_three_way_compare(uint32_t a, uint32_t b) { + const uint32_t la = Endian::to_big_endian(a); + const uint32_t lb = Endian::to_big_endian(b); + return la > lb ? 1 : la < lb ? -1 : 0; +} +template <> +inline int Scalar::scalar_three_way_compare(uint64_t a, uint64_t b) { + const uint64_t la = Endian::to_big_endian(a); + const uint64_t lb = Endian::to_big_endian(b); + return la > lb ? 1 : la < lb ? -1 : 0; +} + +using UINT8 = Scalar; // 1 Byte +using UINT16 = Scalar; // 2 Bytes +using UINT32 = Scalar; // 4 Bytes +using UINT64 = Scalar; // 8 Bytes + +using _1 = UINT8; +using _2 = UINT16; +using _3 = Chained; +using _4 = UINT32; +using _8 = UINT64; +using _16 = Repeated<_8, 2>; +using _32 = Repeated<_8, 4>; +using _64 = Repeated<_8, 8>; +using _128 = Repeated<_8, 16>; + +} // namespace scalar +} // namespace __llvm_libc + +#include +#include + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H diff --git a/libc/src/string/memory_utils/elements_aarch64.h b/libc/src/string/memory_utils/elements_aarch64.h new file mode 100644 index 0000000..0529df7 --- /dev/null +++ b/libc/src/string/memory_utils/elements_aarch64.h @@ -0,0 +1,130 @@ +//===-- Elementary operations for aarch64 --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H + +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_AARCH64) + +#include +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t + +#ifdef __ARM_NEON +#include +#endif + +namespace __llvm_libc { +namespace aarch64_memset { +#ifdef __ARM_NEON +struct Splat8 { + static constexpr size_t SIZE = 8; + static void splat_set(char *dst, const unsigned char value) { + vst1_u8((uint8_t *)dst, vdup_n_u8(value)); + } +}; + +struct Splat16 { + static constexpr size_t SIZE = 16; + static void splat_set(char *dst, const unsigned char value) { + vst1q_u8((uint8_t *)dst, vdupq_n_u8(value)); + } +}; + +using _8 = Splat8; +using _16 = Splat16; +#else +using _8 = __llvm_libc::scalar::_8; +using _16 = Repeated<_8, 2>; +#endif // __ARM_NEON + +using _1 = __llvm_libc::scalar::_1; +using _2 = __llvm_libc::scalar::_2; +using _3 = __llvm_libc::scalar::_3; +using _4 = __llvm_libc::scalar::_4; +using _32 = Chained<_16, _16>; +using _64 = Chained<_32, _32>; + +struct Zva64 { + static constexpr size_t SIZE = 64; + + static void splat_set(char *dst, const unsigned char) { +#if __SIZEOF_POINTER__ == 4 + asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); +#else + asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); +#endif + } +}; + +inline static bool hasZva() { + uint64_t zva_val; + asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); + // DC ZVA is permitted if DZP, bit [4] is zero. + // BS, bits [3:0] is log2 of the block size in words. + // So the next line checks whether the instruction is permitted and block size + // is 16 words (i.e. 64 bytes). 
+ return (zva_val & 0b11111) == 0b00100; +} + +} // namespace aarch64_memset + +namespace aarch64 { + +using _1 = __llvm_libc::scalar::_1; +using _2 = __llvm_libc::scalar::_2; +using _3 = __llvm_libc::scalar::_3; +using _4 = __llvm_libc::scalar::_4; +using _8 = __llvm_libc::scalar::_8; +using _16 = __llvm_libc::scalar::_16; + +#ifdef __ARM_NEON +struct N32 { + static constexpr size_t SIZE = 32; + static bool equals(const char *lhs, const char *rhs) { + uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs); + uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs); + uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16)); + uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16)); + uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1)); + uint64_t res = + vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0); + return res == 0; + } + static int three_way_compare(const char *lhs, const char *rhs) { + uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs); + uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs); + uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16)); + uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16)); + uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1)); + uint64_t res = + vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0); + if (res == 0) + return 0; + size_t index = (__builtin_ctzl(res) >> 3) << 2; + uint32_t l = *((const uint32_t *)(lhs + index)); + uint32_t r = *((const uint32_t *)(rhs + index)); + return __llvm_libc::scalar::_4::scalar_three_way_compare(l, r); + } +}; + +using _32 = N32; +using _64 = Repeated<_32, 2>; +#else +using _32 = __llvm_libc::scalar::_32; +using _64 = __llvm_libc::scalar::_64; +#endif // __ARM_NEON + +} // namespace aarch64 +} // namespace __llvm_libc + +#endif // defined(LLVM_LIBC_ARCH_AARCH64) + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H diff --git a/libc/src/string/memory_utils/elements_x86.h b/libc/src/string/memory_utils/elements_x86.h new file mode 100644 index 0000000..7a2a8cc --- /dev/null +++ b/libc/src/string/memory_utils/elements_x86.h @@ -0,0 +1,189 @@ +//===-- Elementary operations for x86 -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H + +#include "src/__support/CPP/bit.h" +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_X86) + +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t + +#ifdef __SSE2__ +#include +#endif // __SSE2__ + +#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar + +// Fixed-size Vector Operations +// ---------------------------- + +namespace __llvm_libc { +namespace x86 { + +#ifdef __SSE2__ +template struct Vector : public Base { + static void copy(char *__restrict dst, const char *__restrict src) { + Base::store(dst, Base::load(src)); + } + + static void move(char *dst, const char *src) { + Base::store(dst, Base::load(src)); + } + + static bool equals(const char *a, const char *b) { + return Base::not_equal_mask(Base::load(a), Base::load(b)) == 0; + } + + static int three_way_compare(const char *a, const char *b) { + const auto mask = Base::not_equal_mask(Base::load(a), Base::load(b)); + if (!mask) + return 0; + return char_diff(a, b, mask); + } + + static void splat_set(char *dst, const unsigned char value) { + Base::store(dst, Base::get_splatted_value(value)); + } + + static int char_diff(const char *a, const char *b, uint64_t mask) { + const size_t diff_index = __builtin_ctzll(mask); + const int ca = (unsigned char)a[diff_index]; + const int cb = (unsigned char)b[diff_index]; + return ca - cb; + } +}; + +struct M128 { + static constexpr size_t SIZE = 16; + using T = char __attribute__((__vector_size__(SIZE))); + static uint16_t mask(T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return static_cast( + _mm_movemask_epi8(cpp::bit_cast<__m128i>(value))); + } + static uint16_t not_equal_mask(T a, T b) { return mask(a != b); } + static T load(const char *ptr) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return cpp::bit_cast( + _mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr))); + } + static void store(char *ptr, T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr), + cpp::bit_cast<__m128i>(value)); + } + static T get_splatted_value(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; + +using Vector128 = Vector; // 16 Bytes + +#ifdef __AVX2__ +struct M256 { + static constexpr size_t SIZE = 32; + using T = char __attribute__((__vector_size__(SIZE))); + static uint32_t mask(T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm256_movemask_epi8(cpp::bit_cast<__m256i>(value)); + } + static uint32_t not_equal_mask(T a, T b) { return mask(a != b); } + static T load(const char *ptr) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return cpp::bit_cast( + _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr))); + } + static void store(char *ptr, T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), + cpp::bit_cast<__m256i>(value)); + } + static T get_splatted_value(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; + +using Vector256 = Vector; // 32 Bytes + +#if defined(__AVX512F__) and defined(__AVX512BW__) +struct M512 { + static constexpr 
size_t SIZE = 64; + using T = char __attribute__((__vector_size__(SIZE))); + static uint64_t not_equal_mask(T a, T b) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(a), + cpp::bit_cast<__m512i>(b)); + } + static T load(const char *ptr) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return cpp::bit_cast(_mm512_loadu_epi8(ptr)); + } + static void store(char *ptr, T value) { + // NOLINTNEXTLINE(llvmlibc-callee-namespace) + return _mm512_storeu_epi8(ptr, cpp::bit_cast<__m512i>(value)); + } + static T get_splatted_value(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; +using Vector512 = Vector; + +#endif // defined(__AVX512F__) and defined(__AVX512BW__) +#endif // __AVX2__ +#endif // __SSE2__ + +using _1 = __llvm_libc::scalar::_1; +using _2 = __llvm_libc::scalar::_2; +using _3 = __llvm_libc::scalar::_3; +using _4 = __llvm_libc::scalar::_4; +using _8 = __llvm_libc::scalar::_8; +#if defined(__AVX512F__) && defined(__AVX512BW__) +using _16 = __llvm_libc::x86::Vector128; +using _32 = __llvm_libc::x86::Vector256; +using _64 = __llvm_libc::x86::Vector512; +using _128 = __llvm_libc::Repeated<_64, 2>; +#elif defined(__AVX2__) +using _16 = __llvm_libc::x86::Vector128; +using _32 = __llvm_libc::x86::Vector256; +using _64 = __llvm_libc::Repeated<_32, 2>; +using _128 = __llvm_libc::Repeated<_32, 4>; +#elif defined(__SSE2__) +using _16 = __llvm_libc::x86::Vector128; +using _32 = __llvm_libc::Repeated<_16, 2>; +using _64 = __llvm_libc::Repeated<_16, 4>; +using _128 = __llvm_libc::Repeated<_16, 8>; +#else +using _16 = __llvm_libc::Repeated<_8, 2>; +using _32 = __llvm_libc::Repeated<_8, 4>; +using _64 = __llvm_libc::Repeated<_8, 8>; +using _128 = __llvm_libc::Repeated<_8, 16>; +#endif + +struct Accelerator { + static void copy(char *dst, const char *src, size_t count) { + asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); + } +}; + +} // namespace x86 +} // namespace __llvm_libc + +#endif // defined(LLVM_LIBC_ARCH_X86) + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h index af85afa..f207946 100644 --- a/libc/src/string/memory_utils/memcmp_implementations.h +++ b/libc/src/string/memory_utils/memcmp_implementations.h @@ -11,133 +11,92 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" -#include "src/string/memory_utils/utils.h" +#include "src/string/memory_utils/elements.h" #include // size_t namespace __llvm_libc { -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset)) - return value; - return MemcmpReturnType::ZERO(); -} - -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { - if (unlikely(count >= 384)) { - if (auto value = 
generic::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return generic::Memcmp<16>::loop_and_tail(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) +static inline int inline_memcmp(const char *lhs, const char *rhs, + size_t count) { #if defined(LLVM_LIBC_ARCH_X86) -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { - if (unlikely(count >= 384)) { - if (auto value = x86::sse2::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) { - if (count <= 32) - return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); - if (count <= 64) - return x86::avx2::Memcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx2::Memcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 384)) { - if (auto value = x86::avx2::Memcmp<32>::block(p1, p2)) - return value; - align_to_next_boundary<32, Arg::P1>(p1, p2, count); - } - return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count); -} - -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_X86 + ///////////////////////////////////////////////////////////////////////////// + using namespace __llvm_libc::x86; + if (count == 0) + return 0; + if (count == 1) + return three_way_compare<_1>(lhs, rhs); + if (count == 2) + return three_way_compare<_2>(lhs, rhs); + if (count == 3) + return three_way_compare<_3>(lhs, rhs); + if (count <= 8) + return three_way_compare>(lhs, rhs, count); + if (count <= 16) + return three_way_compare>(lhs, rhs, count); if (count <= 32) - return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); if (count <= 64) - return x86::avx2::Memcmp<32>::head_tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); if (count <= 128) - return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count); - if (unlikely(count >= 384)) { - if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline MemcmpReturnType -inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) { - if (unlikely(count >= 128)) { // [128, ∞] - if (auto value = generic::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - return generic::Memcmp<32>::loop_and_tail(p1, p2, count); - } + return three_way_compare>(lhs, rhs, count); + return three_way_compare::Then>>(lhs, rhs, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// + using namespace ::__llvm_libc::aarch64; + if (count == 0) // [0, 0] + return 0; + if (count == 1) // [1, 1] + return three_way_compare<_1>(lhs, rhs); + if (count == 2) // [2, 2] + return three_way_compare<_2>(lhs, rhs); + if (count == 3) // [3, 3] + return three_way_compare<_3>(lhs, 
rhs); + if (count < 8) // [4, 7] + return three_way_compare>(lhs, rhs, count); + if (count < 16) // [8, 15] + return three_way_compare>(lhs, rhs, count); + if (unlikely(count >= 128)) // [128, ∞] + return three_way_compare::Then>>(lhs, rhs, count); + if (!equals<_16>(lhs, rhs)) // [16, 16] + return three_way_compare<_16>(lhs, rhs); if (count < 32) // [17, 31] - return generic::Memcmp<16>::tail(p1, p2, count); - if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32] - return generic::Memcmp<16>::block(p1 + 16, p2 + 16); + return three_way_compare>(lhs, rhs, count); + if (!equals::Then<_16>>(lhs, rhs)) // [32, 32] + return three_way_compare::Then<_16>>(lhs, rhs); if (count < 64) // [33, 63] - return generic::Memcmp<32>::tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); // [64, 127] - return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32); -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) + return three_way_compare::Then>>(lhs, rhs, count); +#else + ///////////////////////////////////////////////////////////////////////////// + // Default + ///////////////////////////////////////////////////////////////////////////// + using namespace ::__llvm_libc::scalar; -static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) if (count == 0) - return MemcmpReturnType::ZERO(); + return 0; if (count == 1) - return generic::Memcmp<1>::block(p1, p2); + return three_way_compare<_1>(lhs, rhs); if (count == 2) - return generic::Memcmp<2>::block(p1, p2); + return three_way_compare<_2>(lhs, rhs); if (count == 3) - return generic::Memcmp<3>::block(p1, p2); + return three_way_compare<_3>(lhs, rhs); if (count <= 8) - return generic::Memcmp<4>::head_tail(p1, p2, count); + return three_way_compare>(lhs, rhs, count); if (count <= 16) - return generic::Memcmp<8>::head_tail(p1, p2, count); -#if defined(LLVM_LIBC_ARCH_X86) - if constexpr (x86::kAvx512BW) - return inline_memcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_memcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_memcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_memcmp_generic_gt16(p1, p2, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - if constexpr (aarch64::kNeon) - return inline_memcmp_aarch64_neon_gt16(p1, p2, count); - else - return inline_memcmp_generic_gt16(p1, p2, count); -#endif -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memcmp_embedded_tiny(p1, p2, count); -#else -#error "Unsupported platform" + return three_way_compare>(lhs, rhs, count); + if (count <= 32) + return three_way_compare>(lhs, rhs, count); + if (count <= 64) + return three_way_compare>(lhs, rhs, count); + if (count <= 128) + return three_way_compare>(lhs, rhs, count); + return three_way_compare::Then>>(lhs, rhs, count); #endif } diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h index ddfe65d..3385d40 100644 --- a/libc/src/string/memory_utils/memcpy_implementations.h +++ b/libc/src/string/memory_utils/memcpy_implementations.h @@ -11,123 +11,142 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/elements.h" #include "src/string/memory_utils/utils.h" #include 
// size_t -namespace __llvm_libc { +// Design rationale +// ================ +// +// Using a profiler to observe size distributions for calls into libc +// functions, it was found most operations act on a small number of bytes. +// This makes it important to favor small sizes. +// +// The tests for `count` are in ascending order so the cost of branching is +// proportional to the cost of copying. +// +// The function is written in C++ for several reasons: +// - The compiler can __see__ the code, this is useful when performing Profile +// Guided Optimization as the optimized code can take advantage of branching +// probabilities. +// - It also allows for easier customization and favors testing multiple +// implementation parameters. +// - As compilers and processors get better, the generated code is improved +// with little change on the code side. -[[maybe_unused]] static inline void -inline_memcpy_embedded_tiny(char *__restrict dst, const char *__restrict src, - size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - builtin::Memcpy<1>::block(dst + offset, src + offset); -} +namespace __llvm_libc { +static inline void inline_memcpy(char *__restrict dst, + const char *__restrict src, size_t count) { + using namespace __llvm_libc::builtin; #if defined(LLVM_LIBC_ARCH_X86) -[[maybe_unused]] static inline void -inline_memcpy_x86(char *__restrict dst, const char *__restrict src, - size_t count) { + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_X86 + ///////////////////////////////////////////////////////////////////////////// + + // Whether to use only rep;movsb. + constexpr bool USE_ONLY_REP_MOVSB = + LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB); + + // kRepMovsBSize == -1 : Only CopyAligned is used. + // kRepMovsBSize == 0 : Only RepMovsb is used. + // else CopyAligned is used up to kRepMovsBSize and then RepMovsb. + constexpr size_t REP_MOVS_B_SIZE = +#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE) + LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE; +#else + -1; +#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE + + // Whether target supports AVX instructions. + constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__); + +#if defined(__AVX__) + using LoopBlockSize = _64; +#else + using LoopBlockSize = _32; +#endif + + if (USE_ONLY_REP_MOVSB) + return copy(dst, src, count); + if (count == 0) return; if (count == 1) - return builtin::Memcpy<1>::block(dst, src); + return copy<_1>(dst, src); if (count == 2) - return builtin::Memcpy<2>::block(dst, src); + return copy<_2>(dst, src); if (count == 3) - return builtin::Memcpy<3>::block(dst, src); + return copy<_3>(dst, src); if (count == 4) - return builtin::Memcpy<4>::block(dst, src); + return copy<_4>(dst, src); if (count < 8) - return builtin::Memcpy<4>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 16) - return builtin::Memcpy<8>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 32) - return builtin::Memcpy<16>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 64) - return builtin::Memcpy<32>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 128) - return builtin::Memcpy<64>::head_tail(dst, src, count); - if (x86::kAvx && count < 256) - return builtin::Memcpy<128>::head_tail(dst, src, count); - builtin::Memcpy<32>::block(dst, src); - align_to_next_boundary<32, Arg::Dst>(dst, src, count); - static constexpr size_t kBlockSize = x86::kAvx ? 
64 : 32; - return builtin::Memcpy::loop_and_tail(dst, src, count); -} - -[[maybe_unused]] static inline void inline_memcpy_x86_maybe_interpose_repmovsb( - char *__restrict dst, const char *__restrict src, size_t count) { - // Whether to use rep;movsb exclusively, not at all, or only above a certain - // threshold. - // TODO: Use only a single preprocessor definition to simplify the code. -#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1 -#endif - - static constexpr bool kUseOnlyRepMovsb = - LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB); - static constexpr size_t kRepMovsbThreshold = - LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE; - if constexpr (kUseOnlyRepMovsb) - return x86::Memcpy::repmovsb(dst, src, count); - else if constexpr (kRepMovsbThreshold >= 0) { - if (unlikely(count >= kRepMovsbThreshold)) - return x86::Memcpy::repmovsb(dst, src, count); - else - return inline_memcpy_x86(dst, src, count); - } else { - return inline_memcpy_x86(dst, src, count); - } -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -[[maybe_unused]] static inline void -inline_memcpy_aarch64(char *__restrict dst, const char *__restrict src, - size_t count) { + return copy>(dst, src, count); + if (HAS_AVX && count < 256) + return copy>(dst, src, count); + if (count <= REP_MOVS_B_SIZE) + return copy::Then>>(dst, src, + count); + return copy(dst, src, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// if (count == 0) return; if (count == 1) - return builtin::Memcpy<1>::block(dst, src); + return copy<_1>(dst, src); if (count == 2) - return builtin::Memcpy<2>::block(dst, src); + return copy<_2>(dst, src); if (count == 3) - return builtin::Memcpy<3>::block(dst, src); + return copy<_3>(dst, src); if (count == 4) - return builtin::Memcpy<4>::block(dst, src); + return copy<_4>(dst, src); if (count < 8) - return builtin::Memcpy<4>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 16) - return builtin::Memcpy<8>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 32) - return builtin::Memcpy<16>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 64) - return builtin::Memcpy<32>::head_tail(dst, src, count); + return copy>(dst, src, count); if (count < 128) - return builtin::Memcpy<64>::head_tail(dst, src, count); - builtin::Memcpy<16>::block(dst, src); - align_to_next_boundary<16, Arg::Src>(dst, src, count); - return builtin::Memcpy<64>::loop_and_tail(dst, src, count); -} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -static inline void inline_memcpy(char *__restrict dst, - const char *__restrict src, size_t count) { - using namespace __llvm_libc::builtin; -#if defined(LLVM_LIBC_ARCH_X86) - return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - return inline_memcpy_aarch64(dst, src, count); -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memcpy_embedded_tiny(dst, src, count); + return copy>(dst, src, count); + return copy::Then>>(dst, src, count); #else -#error "Unsupported platform" + ///////////////////////////////////////////////////////////////////////////// + // Default + ///////////////////////////////////////////////////////////////////////////// + if (count == 0) + return; + if (count == 1) + return 
copy<_1>(dst, src); + if (count == 2) + return copy<_2>(dst, src); + if (count == 3) + return copy<_3>(dst, src); + if (count == 4) + return copy<_4>(dst, src); + if (count < 8) + return copy>(dst, src, count); + if (count < 16) + return copy>(dst, src, count); + if (count < 32) + return copy>(dst, src, count); + if (count < 64) + return copy>(dst, src, count); + if (count < 128) + return copy>(dst, src, count); + return copy::Then>>(dst, src, count); #endif } diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h index 66d4822..f1611a3 100644 --- a/libc/src/string/memory_utils/memset_implementations.h +++ b/libc/src/string/memory_utils/memset_implementations.h @@ -10,104 +10,126 @@ #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H #include "src/__support/architectures.h" -#include "src/string/memory_utils/op_aarch64.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" -#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/elements.h" #include "src/string/memory_utils/utils.h" #include // size_t namespace __llvm_libc { -[[maybe_unused]] inline static void -inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) { -#pragma nounroll - for (size_t offset = 0; offset < count; ++offset) - generic::Memset<1, 1>::block(dst + offset, value); -} - +// A general purpose implementation assuming cheap unaligned writes for sizes: +// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32 +// or 64 Bytes at a time, the compiler will expand them as needed. +// +// This implementation is subject to change as we benchmark more processors. We +// may also want to customize it for processors with specialized instructions +// that performs better (e.g. `rep stosb`). +// +// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes. +// We want to balance two things here: +// - The number of redundant writes (when using `SetBlockOverlap`), +// - The number of conditionals for sizes <=128 (~90% of memset calls are for +// such sizes). +// +// For the range 64-128: +// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this +// is wasteful near 65 but efficient toward 128. +// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write +// 96 or 128 Bytes. +// - Another approach could be to use an hybrid approach copy<64>+Overlap<32> +// for 65-96 and copy<96>+Overlap<32> for 97-128 +// +// Benchmarks showed that redundant writes were cheap (for Intel X86) but +// conditional were expensive, even on processor that do not support writing 64B +// at a time (pre-AVX512F). We also want to favor short functions that allow +// more hot code to fit in the iL1 cache. +// +// Above 128 we have to use conditionals since we don't know the upper bound in +// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32> +// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not +// superior for sizes that mattered. 
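The overlapping head/tail store described in the rationale above can be illustrated with a minimal, standalone sketch; it is not the library code, and memset/memcpy here merely stand in for the fixed-size block primitives (compilers expand them inline for constant sizes):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Sets [dst, dst + count) to `value` for any count in [N, 2 * N] using exactly
// two fixed-size stores and no conditionals. The second store ends exactly at
// dst + count and may overlap the first; the redundant bytes are the price
// paid for the missing branch.
template <size_t N>
void splat_head_tail(char *dst, uint8_t value, size_t count) {
  char block[N];
  memset(block, value, N);            // splat the byte into one fixed-size block
  memcpy(dst, block, N);              // head store: bytes [0, N)
  memcpy(dst + count - N, block, N);  // tail store: bytes [count - N, count)
}

For instance, splat_head_tail<64>(p, 0, 100) issues two 64-byte stores in which bytes [36, 64) are written twice, yet it never branches on count anywhere in the [64, 128] range.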
+inline static void inline_memset(char *dst, unsigned char value, size_t count) { #if defined(LLVM_LIBC_ARCH_X86) -template -[[maybe_unused]] inline static void inline_memset_x86(Ptr dst, uint8_t value, - size_t count) { + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_X86 + ///////////////////////////////////////////////////////////////////////////// + using namespace __llvm_libc::x86; if (count == 0) return; if (count == 1) - return generic::Memset<1, MaxSize>::block(dst, value); + return splat_set<_1>(dst, value); if (count == 2) - return generic::Memset<2, MaxSize>::block(dst, value); + return splat_set<_2>(dst, value); if (count == 3) - return generic::Memset<3, MaxSize>::block(dst, value); + return splat_set<_3>(dst, value); if (count <= 8) - return generic::Memset<4, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 16) - return generic::Memset<8, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 32) - return generic::Memset<16, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 64) - return generic::Memset<32, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 128) - return generic::Memset<64, MaxSize>::head_tail(dst, value, count); - // Aligned loop - generic::Memset<32, MaxSize>::block(dst, value); - align_to_next_boundary<32>(dst, count); - return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count); -} -#endif // defined(LLVM_LIBC_ARCH_X86) - -#if defined(LLVM_LIBC_ARCH_AARCH64) -template -[[maybe_unused]] inline static void -inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) { + return splat_set>(dst, value, count); + return splat_set::Then>>(dst, value, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// + using namespace __llvm_libc::aarch64_memset; if (count == 0) return; if (count <= 3) { - generic::Memset<1, MaxSize>::block(dst, value); + splat_set<_1>(dst, value); if (count > 1) - generic::Memset<2, MaxSize>::tail(dst, value, count); + splat_set>(dst, value, count); return; } if (count <= 8) - return generic::Memset<4, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 16) - return generic::Memset<8, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= 32) - return generic::Memset<16, MaxSize>::head_tail(dst, value, count); + return splat_set>(dst, value, count); if (count <= (32 + 64)) { - generic::Memset<32, MaxSize>::block(dst, value); + splat_set<_32>(dst, value); if (count <= 64) - return generic::Memset<32, MaxSize>::tail(dst, value, count); - generic::Memset<32, MaxSize>::block(dst + 32, value); - generic::Memset<32, MaxSize>::tail(dst, value, count); + return splat_set>(dst, value, count); + splat_set::Then<_32>>(dst, value); + splat_set>(dst, value, count); return; } - if (count >= 448 && value == 0 && aarch64::neon::hasZva()) { - generic::Memset<64, MaxSize>::block(dst, 0); - align_to_next_boundary<64>(dst, count); - return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count); - } else { - generic::Memset<16, MaxSize>::block(dst, value); - align_to_next_boundary<16>(dst, count); - return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count); - } 
-} -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -inline static void inline_memset(Ptr dst, uint8_t value, size_t count) { -#if defined(LLVM_LIBC_ARCH_X86) - static constexpr size_t kMaxSize = x86::kAvx512F ? 64 - : x86::kAvx ? 32 - : x86::kSse2 ? 16 - : 8; - return inline_memset_x86(dst, value, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8; - return inline_memset_aarch64(dst, value, count); -#elif defined(LLVM_LIBC_ARCH_ARM) - return inline_memset_embedded_tiny(dst, value, count); + if (count >= 448 && value == 0 && hasZva()) + return splat_set::Then>>(dst, 0, + count); + else + return splat_set::Then>>(dst, value, count); #else -#error "Unsupported platform" + ///////////////////////////////////////////////////////////////////////////// + // Default + ///////////////////////////////////////////////////////////////////////////// + using namespace ::__llvm_libc::scalar; + + if (count == 0) + return; + if (count == 1) + return splat_set<_1>(dst, value); + if (count == 2) + return splat_set<_2>(dst, value); + if (count == 3) + return splat_set<_3>(dst, value); + if (count <= 8) + return splat_set>(dst, value, count); + if (count <= 16) + return splat_set>(dst, value, count); + if (count <= 32) + return splat_set>(dst, value, count); + if (count <= 64) + return splat_set>(dst, value, count); + if (count <= 128) + return splat_set>(dst, value, count); + return splat_set::Then>>(dst, value, count); #endif } diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h deleted file mode 100644 index f8ccd83..0000000 --- a/libc/src/string/memory_utils/op_aarch64.h +++ /dev/null @@ -1,175 +0,0 @@ -//===-- aarch64 implementation of memory function building blocks ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file provides aarch64 specific building blocks to compose memory -// functions. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H - -#include "src/__support/architectures.h" - -#if defined(LLVM_LIBC_ARCH_AARCH64) - -#include "src/__support/common.h" -#include "src/string/memory_utils/op_generic.h" - -#ifdef __ARM_NEON -#include -#endif //__ARM_NEON - -namespace __llvm_libc::aarch64 { - -static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON); - -namespace neon { - -template struct BzeroCacheLine { - static constexpr size_t SIZE = Size; - - static inline void block(Ptr dst, uint8_t) { - static_assert(Size == 64); -#if __SIZEOF_POINTER__ == 4 - asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); -#else - asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); -#endif - } - - static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - block(dst + offset, value); - offset += SIZE; - } while (offset < count - SIZE); - // Unaligned store, we can't use 'dc zva' here. - static constexpr size_t kMaxSize = kNeon ? 
16 : 8; - generic::Memset::tail(dst, value, count); - } -}; - -inline static bool hasZva() { - uint64_t zva_val; - asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); - // DC ZVA is permitted if DZP, bit [4] is zero. - // BS, bits [3:0] is log2 of the block count in words. - // So the next line checks whether the instruction is permitted and block - // count is 16 words (i.e. 64 bytes). - return (zva_val & 0b11111) == 0b00100; -} - -} // namespace neon - -/////////////////////////////////////////////////////////////////////////////// -// Memset - -/////////////////////////////////////////////////////////////////////////////// -// Bcmp -template struct Bcmp { - static constexpr size_t SIZE = Size; - static constexpr size_t BlockSize = 32; - - static const unsigned char *as_u8(CPtr ptr) { - return reinterpret_cast(ptr); - } - - static inline BcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == BlockSize) { - auto _p1 = as_u8(p1); - auto _p2 = as_u8(p2); - uint8x16_t a = vld1q_u8(_p1); - uint8x16_t b = vld1q_u8(_p1 + 16); - uint8x16_t n = vld1q_u8(_p2); - uint8x16_t o = vld1q_u8(_p2 + 16); - uint8x16_t an = veorq_u8(a, n); - uint8x16_t bo = veorq_u8(b, o); - // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is - // a difference between the two buffers. We reduce this value down to 4 - // bytes in two steps. First, calculate the saturated move value when - // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get - // a single 32 bit nonzero value if a mismatch occurred. - uint8x16_t anbo = vorrq_u8(an, bo); - uint32x2_t anbo_reduced = vqmovn_u64(anbo); - return vmaxv_u32(anbo_reduced); - } else if constexpr ((Size % BlockSize) == 0) { - for (size_t offset = 0; offset < Size; offset += BlockSize) - if (auto value = Bcmp::block(p1 + offset, p2 + offset)) - return value; - } else { - deferred_static_assert("SIZE not implemented"); - } - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - SIZE, p2 + count - SIZE); - } - - static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - if constexpr (Size <= 8) { - return generic::Bcmp::head_tail(p1, p2, count); - } else if constexpr (Size == 16) { - auto _p1 = as_u8(p1); - auto _p2 = as_u8(p2); - uint8x16_t a = vld1q_u8(_p1); - uint8x16_t b = vld1q_u8(_p1 + count - 16); - uint8x16_t n = vld1q_u8(_p2); - uint8x16_t o = vld1q_u8(_p2 + count - 16); - uint8x16_t an = veorq_s8(a, n); - uint8x16_t bo = veorq_s8(b, o); - // anbo = (a ^ n) | (b ^ o) - uint8x16_t anbo = vorrq_s8(an, bo); - uint32x2_t anbo_reduced = vqmovn_u64(anbo); - return vmaxv_u32(anbo_reduced); - } else if constexpr (Size == 32) { - auto _p1 = as_u8(p1); - auto _p2 = as_u8(p2); - uint8x16_t a = vld1q_u8(_p1); - uint8x16_t b = vld1q_u8(_p1 + 16); - uint8x16_t c = vld1q_u8(_p1 + count - 16); - uint8x16_t d = vld1q_u8(_p1 + count - 32); - uint8x16_t n = vld1q_u8(_p2); - uint8x16_t o = vld1q_u8(_p2 + 16); - uint8x16_t p = vld1q_u8(_p2 + count - 16); - uint8x16_t q = vld1q_u8(_p2 + count - 32); - uint8x16_t an = veorq_s8(a, n); - uint8x16_t bo = veorq_s8(b, o); - uint8x16_t cp = veorq_s8(c, p); - uint8x16_t dq = veorq_s8(d, q); - uint8x16_t anbo = vorrq_s8(an, bo); - uint8x16_t cpdq = vorrq_s8(cp, dq); - // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to - // a nonzero 32 bit value if a mismatch occurred. 
- uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq); - uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq); - return vmaxv_u32(abnocpdq_reduced); - } else { - deferred_static_assert("SIZE not implemented"); - } - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += SIZE; - } while (offset < count - SIZE); - return tail(p1, p2, count); - } -}; - -} // namespace __llvm_libc::aarch64 - -#endif // LLVM_LIBC_ARCH_AARCH64 - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h deleted file mode 100644 index 6b3e92e..0000000 --- a/libc/src/string/memory_utils/op_builtin.h +++ /dev/null @@ -1,148 +0,0 @@ -//===-- Implementation using the __builtin_XXX_inline ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file provides generic C++ building blocks to compose memory functions. -// They rely on the compiler to generate the best possible code through the use -// of the `__builtin_XXX_inline` builtins. These builtins are currently only -// available in Clang. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H - -#include "src/string/memory_utils/utils.h" - -namespace __llvm_libc::builtin { - -/////////////////////////////////////////////////////////////////////////////// -// Memcpy -template struct Memcpy { - static constexpr size_t SIZE = Size; - static inline void block(Ptr __restrict dst, CPtr __restrict src) { -#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE - return __builtin_memcpy_inline(dst, src, SIZE); -#else - deferred_static_assert("Missing __builtin_memcpy_inline"); - (void)dst; - (void)src; -#endif - } - - static inline void tail(Ptr __restrict dst, CPtr __restrict src, - size_t count) { - block(dst + count - SIZE, src + count - SIZE); - } - - static inline void head_tail(Ptr __restrict dst, CPtr __restrict src, - size_t count) { - block(dst, src); - tail(dst, src, count); - } - - static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src, - size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - block(dst + offset, src + offset); - offset += SIZE; - } while (offset < count - SIZE); - tail(dst, src, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Memset -template struct Memset { - using ME = Memset; - static constexpr size_t SIZE = Size; - static inline void block(Ptr dst, uint8_t value) { -#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE - __builtin_memset_inline(dst, value, Size); -#else - deferred_static_assert("Missing __builtin_memset_inline"); - (void)dst; - (void)value; -#endif - } - - static inline void tail(Ptr dst, uint8_t value, size_t count) { - block(dst + count - SIZE, value); - } - - static inline void head_tail(Ptr dst, uint8_t value, size_t count) { - block(dst, value); - tail(dst, value, count); - } - - static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) 
{ - static_assert(Size > 1); - size_t offset = 0; - do { - block(dst + offset, value); - offset += SIZE; - } while (offset < count - SIZE); - tail(dst, value, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Bcmp -template struct Bcmp { - using ME = Bcmp; - static constexpr size_t SIZE = Size; - static inline BcmpReturnType block(CPtr, CPtr) { - deferred_static_assert("Missing __builtin_memcmp_inline"); - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return BcmpReturnType::ZERO(); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Memcmp -template struct Memcmp { - using ME = Memcmp; - static constexpr size_t SIZE = Size; - static inline MemcmpReturnType block(CPtr, CPtr) { - deferred_static_assert("Missing __builtin_memcmp_inline"); - return MemcmpReturnType::ZERO(); - } - - static inline MemcmpReturnType tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return MemcmpReturnType::ZERO(); - } - - static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return MemcmpReturnType::ZERO(); - } - - static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) { - deferred_static_assert("Not implemented"); - return MemcmpReturnType::ZERO(); - } -}; - -} // namespace __llvm_libc::builtin - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h deleted file mode 100644 index 226d775..0000000 --- a/libc/src/string/memory_utils/op_generic.h +++ /dev/null @@ -1,466 +0,0 @@ -//===-- Generic implementation of memory function building blocks ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file provides generic C++ building blocks. -// Depending on the requested size, the block operation uses unsigned integral -// types, vector types or an array of the type with the maximum size. -// -// The maximum size is passed as a template argument. For instance, on x86 -// platforms that only supports integral types the maximum size would be 8 -// (corresponding to uint64_t). On this platform if we request the size 32, this -// would be treated as a cpp::array. -// -// On the other hand, if the platform is x86 with support for AVX the maximum -// size is 32 and the operation can be handled with a single native operation. 
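As a concrete picture of the MaxSize decomposition described above, the sketch below (illustrative only, assuming the widest cheap scalar type is 8 bytes) copies a Size-byte block as Size / 8 unaligned 8-byte loads and stores, the way an array-of-native-type wrapper would when Size exceeds MaxSize:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Copies a Size-byte block as a sequence of 8-byte operations. On a target
// with 32-byte vectors the same block would be a single native load/store.
template <size_t Size>
void copy_as_u64_array(char *dst, const char *src) {
  static_assert(Size % sizeof(uint64_t) == 0, "Size must be a multiple of 8");
  for (size_t offset = 0; offset < Size; offset += sizeof(uint64_t)) {
    uint64_t chunk;
    memcpy(&chunk, src + offset, sizeof(chunk));  // unaligned 8-byte load
    memcpy(dst + offset, &chunk, sizeof(chunk));  // unaligned 8-byte store
  }
}

copy_as_u64_array<32>(dst, src) therefore issues four scalar load/store pairs where an AVX build handles the same 32 bytes in one operation.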
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H - -#include "src/__support/CPP/array.h" -#include "src/__support/CPP/type_traits.h" -#include "src/__support/endian.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/utils.h" - -#include - -namespace __llvm_libc::generic { - -// CTPair and CTMap below implement a compile time map. -// This is useful to map from a Size to a type handling this size. -// -// Example usage: -// using MyMap = CTMap, -// CTPair<2, uint16_t>, -// >; -// ... -// using UInt8T = MyMap::find_type<1>; -template struct CTPair { - using type = T; - static CTPair get_pair(cpp::integral_constant) { return {}; } -}; -template struct CTMap : public Pairs... { - using Pairs::get_pair...; - template - using find_type = - typename decltype(get_pair(cpp::integral_constant{}))::type; -}; - -// Helper to test if a type is void. -template inline constexpr bool is_void_v = cpp::is_same_v; - -// Implements load, store and splat for unsigned integral types. -template struct ScalarType { - using Type = T; - static_assert(cpp::is_integral_v && !cpp::is_signed_v); - - static inline Type load(CPtr src) { return ::__llvm_libc::load(src); } - static inline void store(Ptr dst, Type value) { - ::__llvm_libc::store(dst, value); - } - static inline Type splat(uint8_t value) { - return Type(~0) / Type(0xFF) * Type(value); - } -}; - -// Implements load, store and splat for vector types. -template struct VectorType { - using Type = uint8_t __attribute__((__vector_size__(Size))); - static inline Type load(CPtr src) { return ::__llvm_libc::load(src); } - static inline void store(Ptr dst, Type value) { - ::__llvm_libc::store(dst, value); - } - static inline Type splat(uint8_t value) { - Type Out; - // This for loop is optimized out for vector types. - for (size_t i = 0; i < Size; ++i) - Out[i] = static_cast(value); - return Out; - } -}; - -// We currently don't support 8- or 16-bit platforms, it must be 32- or 64-bit. -static_assert((UINTPTR_MAX == 4294967295U) || - (UINTPTR_MAX == 18446744073709551615UL)); - -// Map from sizes to structures offering static load, store and splat methods. -// Note: On platforms lacking vector support, we use the ArrayType below and -// decompose the operation in smaller pieces. -using NativeTypeMap = - CTMap>, // - CTPair<2, ScalarType>, // - CTPair<4, ScalarType>, // -#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64) - CTPair<8, ScalarType>, // Not available on 32bit -#endif // - CTPair<16, VectorType<16>>, // - CTPair<32, VectorType<32>>, // - CTPair<64, VectorType<64>>>; - -// Implements load, store and splat for sizes not natively supported by the -// platform. SubType is either ScalarType or VectorType. -template struct ArrayType { - using Type = cpp::array; - static constexpr size_t SizeOfElement = sizeof(typename SubType::Type); - static inline Type load(CPtr src) { - Type Value; - for (size_t I = 0; I < ArraySize; ++I) - Value[I] = SubType::load(src + (I * SizeOfElement)); - return Value; - } - static inline void store(Ptr dst, Type Value) { - for (size_t I = 0; I < ArraySize; ++I) - SubType::store(dst + (I * SizeOfElement), Value[I]); - } - static inline Type splat(uint8_t value) { - Type Out; - for (size_t I = 0; I < ArraySize; ++I) - Out[I] = SubType::splat(value); - return Out; - } -}; - -// Checks whether we should use an ArrayType. 
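The scalar splat shown above relies on the identity that, for an unsigned type T, ~T(0) / T(0xFF) is the 0x0101...01 pattern, so multiplying by a byte value replicates that byte across the whole word. A small self-checking sketch of the same arithmetic (illustrative, outside the library):

#include <stdint.h>

// For an unsigned integral type T, ~T(0) / T(0xFF) == 0x0101...01, and
// multiplying by a byte value replicates that byte into every byte of T.
template <typename T>
constexpr T splat_byte(uint8_t value) {
  return ~T(0) / T(0xFF) * T(value);
}

static_assert(splat_byte<uint32_t>(0xAB) == 0xABABABABu, "byte replicated 4 times");
static_assert(splat_byte<uint64_t>(0x5A) == 0x5A5A5A5A5A5A5A5Aull, "byte replicated 8 times");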
-template static constexpr bool useArrayType() { - return (Size > MaxSize) && ((Size % MaxSize) == 0) && - !is_void_v>; -} - -// Compute the type to handle an operation of Size bytes knowing that the -// underlying platform only support native types up to MaxSize bytes. -template -using getTypeFor = cpp::conditional_t< - useArrayType(), - ArrayType, Size / MaxSize>, - NativeTypeMap::find_type>; - -/////////////////////////////////////////////////////////////////////////////// -// Memcpy -// When building with clang we can delegate to the builtin implementation. -/////////////////////////////////////////////////////////////////////////////// - -template using Memcpy = builtin::Memcpy; - -/////////////////////////////////////////////////////////////////////////////// -// Memset -// The MaxSize template argument gives the maximum size handled natively by the -// platform. For instance on x86 with AVX support this would be 32. If a size -// greater than MaxSize is requested we break the operation down in smaller -// pieces of size MaxSize. -/////////////////////////////////////////////////////////////////////////////// -template struct Memset { - static_assert(is_power2(MaxSize)); - static constexpr size_t SIZE = Size; - - static inline void block(Ptr dst, uint8_t value) { - if constexpr (Size == 3) { - Memset<1, MaxSize>::block(dst + 2, value); - Memset<2, MaxSize>::block(dst, value); - } else { - using T = getTypeFor; - if constexpr (is_void_v) { - deferred_static_assert("Unimplemented Size"); - } else { - T::store(dst, T::splat(value)); - } - } - } - - static inline void tail(Ptr dst, uint8_t value, size_t count) { - block(dst + count - SIZE, value); - } - - static inline void head_tail(Ptr dst, uint8_t value, size_t count) { - block(dst, value); - tail(dst, value, count); - } - - static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { - static_assert(SIZE > 1); - size_t offset = 0; - do { - block(dst + offset, value); - offset += SIZE; - } while (offset < count - SIZE); - tail(dst, value, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Bcmp -/////////////////////////////////////////////////////////////////////////////// -template struct Bcmp { - static constexpr size_t SIZE = Size; - static constexpr size_t MaxSize = 8; - - template static inline uint32_t load_xor(CPtr p1, CPtr p2) { - return load(p1) ^ load(p2); - } - - template - static inline uint32_t load_not_equal(CPtr p1, CPtr p2) { - return load(p1) != load(p2); - } - - static inline BcmpReturnType block(CPtr p1, CPtr p2) { - static constexpr size_t MaxSize = 8; - if constexpr (Size == 1) { - return load_xor(p1, p2); - } else if constexpr (Size == 2) { - return load_xor(p1, p2); - } else if constexpr (Size == 4) { - return load_xor(p1, p2); - } else if constexpr (Size == 8) { - return load_not_equal(p1, p2); - } else if constexpr (useArrayType()) { - for (size_t offset = 0; offset < Size; offset += MaxSize) - if (auto value = Bcmp::block(p1 + offset, p2 + offset)) - return value; - } else { - deferred_static_assert("Unimplemented Size"); - } - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - SIZE, p2 + count - SIZE); - } - - static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - return block(p1, p2) | tail(p1, p2, count); - } - - static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do 
{ - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += SIZE; - } while (offset < count - SIZE); - return tail(p1, p2, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Memcmp -/////////////////////////////////////////////////////////////////////////////// -template struct Memcmp { - static constexpr size_t SIZE = Size; - static constexpr size_t MaxSize = 8; - - template static inline T load_be(CPtr ptr) { - return Endian::to_big_endian(load(ptr)); - } - - template - static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) { - return load_be(p1) - load_be(p2); - } - - template - static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) { - const auto la = load_be(p1); - const auto lb = load_be(p2); - return la > lb ? 1 : la < lb ? -1 : 0; - } - - static inline MemcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == 1) { - return load_be_diff(p1, p2); - } else if constexpr (Size == 2) { - return load_be_diff(p1, p2); - } else if constexpr (Size == 4) { - return load_be_cmp(p1, p2); - } else if constexpr (Size == 8) { - return load_be_cmp(p1, p2); - } else if constexpr (useArrayType()) { - for (size_t offset = 0; offset < Size; offset += MaxSize) - if (Bcmp::block(p1 + offset, p2 + offset)) - return Memcmp::block(p1 + offset, p2 + offset); - return MemcmpReturnType::ZERO(); - } else if constexpr (Size == 3) { - if (auto value = Memcmp<2>::block(p1, p2)) - return value; - return Memcmp<1>::block(p1 + 2, p2 + 2); - } else { - deferred_static_assert("Unimplemented Size"); - } - } - - static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - SIZE, p2 + count - SIZE); - } - - static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - if (auto value = block(p1, p2)) - return value; - return tail(p1, p2, count); - } - - static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += SIZE; - } while (offset < count - SIZE); - return tail(p1, p2, count); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Memmove -/////////////////////////////////////////////////////////////////////////////// - -template struct Memmove { - static_assert(is_power2(MaxSize)); - using T = getTypeFor; - static constexpr size_t SIZE = Size; - - static inline void block(Ptr dst, CPtr src) { - if constexpr (is_void_v) { - deferred_static_assert("Unimplemented Size"); - } else { - T::store(dst, T::load(src)); - } - } - - static inline void head_tail(Ptr dst, CPtr src, size_t count) { - const size_t offset = count - Size; - if constexpr (is_void_v) { - deferred_static_assert("Unimplemented Size"); - } else { - // The load and store operations can be performed in any order as long as - // they are not interleaved. More investigations are needed to determine - // the best order. - const auto head = T::load(src); - const auto tail = T::load(src + offset); - T::store(dst, head); - T::store(dst + offset, tail); - } - } - - // Align forward suitable when dst < src. The alignment is performed with - // an HeadTail operation of count ∈ [Alignment, 2 x Alignment]. - // - // e.g. Moving two bytes forward, we make sure src is aligned. 
- // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] - // [____LLLLLLLL_____________________] - // [___________LLLLLLLA______________] - // [_SSSSSSSS________________________] - // [________SSSSSSSS_________________] - // - // e.g. Moving two bytes forward, we make sure dst is aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] - // [____LLLLLLLL_____________________] - // [______LLLLLLLL___________________] - // [_SSSSSSSS________________________] - // [___SSSSSSSA______________________] - template - static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) { - Ptr prev_dst = dst; - CPtr prev_src = src; - size_t prev_count = count; - align_to_next_boundary(dst, src, count); - adjust(Size, dst, src, count); - head_tail(prev_dst, prev_src, prev_count - count); - } - - // Align backward suitable when dst > src. The alignment is performed with - // an HeadTail operation of count ∈ [Alignment, 2 x Alignment]. - // - // e.g. Moving two bytes backward, we make sure src is aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] - // [ _________________ALLLLLLL_______] - // [ ___________________LLLLLLLL_____] - // [____________________SSSSSSSS_____] - // [______________________SSSSSSSS___] - // - // e.g. Moving two bytes backward, we make sure dst is aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] - // [ _______________LLLLLLLL_________] - // [ ___________________LLLLLLLL_____] - // [__________________ASSSSSSS_______] - // [______________________SSSSSSSS___] - template - static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) { - Ptr headtail_dst = dst + count; - CPtr headtail_src = src + count; - size_t headtail_size = 0; - align_to_next_boundary(headtail_dst, headtail_src, - headtail_size); - adjust(-2 * Size, headtail_dst, headtail_src, headtail_size); - head_tail(headtail_dst, headtail_src, headtail_size); - count -= headtail_size; - } - - // Move forward suitable when dst < src. We load the tail bytes before - // handling the loop. - // - // e.g. Moving two bytes - // [ | | | | |] - // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] - // [_________________________LLLLLLLL___] - // [___LLLLLLLL_________________________] - // [_SSSSSSSS___________________________] - // [___________LLLLLLLL_________________] - // [_________SSSSSSSS___________________] - // [___________________LLLLLLLL_________] - // [_________________SSSSSSSS___________] - // [_______________________SSSSSSSS_____] - static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) { - static_assert(Size > 1); - const size_t tail_offset = count - Size; - const auto tail_value = T::load(src + tail_offset); - size_t offset = 0; -#pragma nounroll - do { - block(dst + offset, src + offset); - offset += Size; - } while (offset < count - Size); - T::store(dst + tail_offset, tail_value); - } - - // Move backward suitable when dst > src. We load the head bytes before - // handling the loop. - // - // e.g. 
Moving two bytes - // [ | | | | |] - // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] - // [___LLLLLLLL_________________________] - // [_________________________LLLLLLLL___] - // [___________________________SSSSSSSS_] - // [_________________LLLLLLLL___________] - // [___________________SSSSSSSS_________] - // [_________LLLLLLLL___________________] - // [___________SSSSSSSS_________________] - // [_____SSSSSSSS_______________________] - static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) { - static_assert(Size > 1); - const auto head_value = T::load(src); - ptrdiff_t offset = count - Size; -#pragma nounroll - do { - block(dst + offset, src + offset); - offset -= Size; - } while (offset >= 0); - T::store(dst, head_value); - } -}; - -} // namespace __llvm_libc::generic - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h deleted file mode 100644 index 96847b2..0000000 --- a/libc/src/string/memory_utils/op_x86.h +++ /dev/null @@ -1,219 +0,0 @@ -//===-- x86 implementation of memory function building blocks -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file provides x86 specific building blocks to compose memory functions. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H - -#include "src/__support/architectures.h" - -#if defined(LLVM_LIBC_ARCH_X86_64) - -#include "src/__support/common.h" -#include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" - -#ifdef __SSE2__ -#include -#else -// Define fake functions to prevent the compiler from failing on undefined -// functions in case SSE2 is not present. -#define _mm512_cmpneq_epi8_mask(A, B) 0 -#define _mm_movemask_epi8(A) 0 -#define _mm256_movemask_epi8(A) 0 -#endif // __SSE2__ - -namespace __llvm_libc::x86 { - -// A set of constants to check compile time features. -static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__); -static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__); -static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__); -static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__); -static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__); - -/////////////////////////////////////////////////////////////////////////////// -// Memcpy repmovsb implementation -struct Memcpy { - static void repmovsb(char *dst, const char *src, size_t count) { - asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Bcmp - -// Base implementation for the Bcmp specializations. -// - BlockSize is either 16, 32 or 64 depending on the available compile time -// features, it is used to switch between "single native operation" or a -// "sequence of native operations". -// - BlockBcmp is the function that implements the bcmp logic. 
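For the "single native operation" case mentioned above, a 16-byte bcmp block can be spelled with plain SSE2 intrinsics; this is a hedged, equivalent-in-spirit sketch rather than the exact code in this file:

#include <stdint.h>
#include <emmintrin.h> // SSE2

// Returns nonzero iff the two 16-byte blocks differ. _mm_cmpeq_epi8 produces
// 0xFF in every byte position where the inputs match; _mm_movemask_epi8
// gathers those byte sign bits, so the mask is 0xFFFF only when all 16 bytes
// are equal.
static inline uint32_t bcmp16_sse2(const char *p1, const char *p2) {
  const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1));
  const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2));
  const int equal_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
  return static_cast<uint32_t>(equal_mask ^ 0xFFFF); // nonzero iff any byte differs
}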
-template struct BcmpImpl { - static inline BcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == BlockSize) { - return BlockBcmp(p1, p2); - } else if constexpr (Size % BlockSize == 0) { - for (size_t offset = 0; offset < Size; offset += BlockSize) - if (auto value = BlockBcmp(p1 + offset, p2 + offset)) - return value; - } else { - deferred_static_assert("SIZE not implemented"); - } - return BcmpReturnType::ZERO(); - } - - static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - Size, p2 + count - Size); - } - - static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - return block(p1, p2) | tail(p1, p2, count); - } - - static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += Size; - } while (offset < count - Size); - return tail(p1, p2, count); - } -}; - -namespace sse2 { -static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(16))); - // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. - const int mask = _mm_movemask_epi8(load(p1) != load(p2)); - return static_cast(mask); -} -template using Bcmp = BcmpImpl; -} // namespace sse2 - -namespace avx2 { -static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(32))); - // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. - const int mask = _mm256_movemask_epi8(load(p1) != load(p2)); - // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit - // mask. - return static_cast(mask); -} -template using Bcmp = BcmpImpl; -} // namespace avx2 - -namespace avx512bw { -static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(64))); - // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. - const uint64_t mask = _mm512_cmpneq_epi8_mask(load(p1), load(p2)); - const bool mask_is_set = mask != 0; - return static_cast(mask_is_set); -} -template using Bcmp = BcmpImpl; -} // namespace avx512bw - -// Assuming that the mask is non zero, the index of the first mismatching byte -// is the number of trailing zeros in the mask. Trailing zeros and not leading -// zeros because the x86 architecture is little endian. -static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2, - uint64_t mask) { - const size_t diff_index = __builtin_ctzll(mask); - const int16_t ca = p1[diff_index]; - const int16_t cb = p2[diff_index]; - return ca - cb; -} - -/////////////////////////////////////////////////////////////////////////////// -// Memcmp - -// Base implementation for the Memcmp specializations. -// - BlockSize is either 16, 32 or 64 depending on the available compile time -// features, it is used to switch between "single native operation" or a -// "sequence of native operations". -// - BlockMemcmp is the function that implements the memcmp logic. -// - BlockBcmp is the function that implements the bcmp logic. 
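The trailing-zero logic above, which turns a nonzero difference mask into a memcmp-style result, can be sketched standalone as follows (a simplified model, not the exact helper in this file; memcmp compares bytes as unsigned char, hence the unsigned pointers):

#include <stdint.h>

// Given a nonzero mask in which bit i is set iff byte i of the two buffers
// differs, the number of trailing zeros is the index of the first mismatch on
// a little-endian target; comparing that byte pair yields the sign memcmp
// must return.
static inline int first_diff_result(const unsigned char *p1,
                                    const unsigned char *p2, uint64_t mask) {
  const unsigned index = static_cast<unsigned>(__builtin_ctzll(mask));
  return static_cast<int>(p1[index]) - static_cast<int>(p2[index]);
}

If only byte 3 differs, the mask is 0b1000, __builtin_ctzll returns 3, and the result is p1[3] - p2[3].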
-template -struct MemcmpImpl { - static inline MemcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == BlockSize) { - return BlockMemcmp(p1, p2); - } else if constexpr (Size % BlockSize == 0) { - for (size_t offset = 0; offset < Size; offset += BlockSize) - if (auto value = BlockBcmp(p1 + offset, p2 + offset)) - return BlockMemcmp(p1 + offset, p2 + offset); - } else { - deferred_static_assert("SIZE not implemented"); - } - return MemcmpReturnType::ZERO(); - } - - static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - Size, p2 + count - Size); - } - - static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - if (auto value = block(p1, p2)) - return value; - return tail(p1, p2, count); - } - - static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { - static_assert(Size > 1); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += Size; - } while (offset < count - Size); - return tail(p1, p2, count); - } -}; - -namespace sse2 { -static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(16))); - // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. - if (int mask = _mm_movemask_epi8(load(p1) != load(p2))) - return char_diff_no_zero(p1, p2, mask); - return MemcmpReturnType::ZERO(); -} -template using Memcmp = MemcmpImpl; -} // namespace sse2 - -namespace avx2 { -static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(32))); - // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. - if (int mask = _mm256_movemask_epi8(load(p1) != load(p2))) - return char_diff_no_zero(p1, p2, mask); - return MemcmpReturnType::ZERO(); -} -template using Memcmp = MemcmpImpl; -} // namespace avx2 - -namespace avx512bw { -static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) { - using T = char __attribute__((__vector_size__(64))); - // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. - if (uint64_t mask = _mm512_cmpneq_epi8_mask(load(p1), load(p2))) - return char_diff_no_zero(p1, p2, mask); - return MemcmpReturnType::ZERO(); -} -template using Memcmp = MemcmpImpl; -} // namespace avx512bw - -} // namespace __llvm_libc::x86 - -#endif // LLVM_LIBC_ARCH_X86_64 - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 9d1321f..d915835 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -9,8 +9,19 @@ #ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H #define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H -#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/type_traits.h" +#include "src/__support/architectures.h" + +// Cache line sizes for ARM: These values are not strictly correct since +// cache line sizes depend on implementations, not architectures. There +// are even implementations with cache line sizes configurable at boot +// time. +#if defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_X86) +#define LLVM_LIBC_CACHELINE_SIZE 64 +#elif defined(LLVM_LIBC_ARCH_ARM) +#define LLVM_LIBC_CACHELINE_SIZE 32 +#else +#error "Unsupported platform for memory functions." +#endif #include // size_t #include // intptr_t / uintptr_t @@ -51,46 +62,32 @@ static constexpr size_t ge_power2(size_t value) { return is_power2_or_zero(value) ? 
value : 1ULL << (log2(value) + 1); } -// Returns the number of bytes to substract from ptr to get to the previous -// multiple of alignment. If ptr is already aligned returns 0. -template uintptr_t distance_to_align_down(const void *ptr) { +template intptr_t offset_from_last_aligned(const void *ptr) { static_assert(is_power2(alignment), "alignment must be a power of 2"); return reinterpret_cast(ptr) & (alignment - 1U); } -// Returns the number of bytes to add to ptr to get to the next multiple of -// alignment. If ptr is already aligned returns 0. -template uintptr_t distance_to_align_up(const void *ptr) { +template intptr_t offset_to_next_aligned(const void *ptr) { static_assert(is_power2(alignment), "alignment must be a power of 2"); // The logic is not straightforward and involves unsigned modulo arithmetic // but the generated code is as fast as it can be. return -reinterpret_cast(ptr) & (alignment - 1U); } -// Returns the number of bytes to add to ptr to get to the next multiple of -// alignment. If ptr is already aligned returns alignment. -template -uintptr_t distance_to_next_aligned(const void *ptr) { - return alignment - distance_to_align_down(ptr); +// Returns the offset from `ptr` to the next cache line. +static inline intptr_t offset_to_next_cache_line(const void *ptr) { + return offset_to_next_aligned(ptr); } -// Returns the same pointer but notifies the compiler that it is aligned. template static T *assume_aligned(T *ptr) { return reinterpret_cast(__builtin_assume_aligned(ptr, alignment)); } - #if defined __has_builtin #if __has_builtin(__builtin_memcpy_inline) #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE #endif #endif -#if defined __has_builtin -#if __has_builtin(__builtin_memset_inline) -#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE -#endif -#endif - // Performs a constant count copy. template static inline void memcpy_inline(void *__restrict dst, @@ -106,56 +103,28 @@ static inline void memcpy_inline(void *__restrict dst, using Ptr = char *; // Pointer to raw data. using CPtr = const char *; // Const pointer to raw data. -// This type makes sure that we don't accidentally promote an integral type to -// another one. It is only constructible from the exact T type. -template struct StrictIntegralType { - static_assert(cpp::is_integral_v); - - // Can only be constructed from a T. - template , bool> = 0> - StrictIntegralType(U value) : value(value) {} - - // Allows using the type in an if statement. - explicit operator bool() const { return value; } - - // If type is unsigned (bcmp) we allow bitwise OR operations. - StrictIntegralType operator|(const StrictIntegralType &Rhs) const { - static_assert(!cpp::is_signed_v); - return value | Rhs.value; - } - - // For interation with the C API we allow explicit conversion back to the - // `int` type. - explicit operator int() const { - // bit_cast makes sure that T and int have the same size. - return cpp::bit_cast(value); - } - - // Helper to get the zero value. - static inline constexpr StrictIntegralType ZERO() { return {T(0)}; } - -private: - T value; -}; - -using MemcmpReturnType = StrictIntegralType; -using BcmpReturnType = StrictIntegralType; - -// Loads bytes from memory (possibly unaligned) and materializes them as -// type. +// Loads bytes from memory (possibly unaligned) and materializes them as type. template static inline T load(CPtr ptr) { T Out; memcpy_inline(&Out, ptr); return Out; } -// Stores a value of type T in memory (possibly unaligned). 
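The negate-and-mask expression above computes, in a single AND, the distance from a pointer up to the next multiple of a power-of-two alignment; a small self-checking sketch of the same unsigned modulo arithmetic on raw addresses (illustrative only):

#include <stdint.h>

// Distance from addr up to the next multiple of a power-of-two alignment;
// zero when addr is already aligned. Two's-complement wraparound makes
// -addr & (alignment - 1) equal to (alignment - addr % alignment) % alignment.
constexpr uintptr_t distance_up(uintptr_t addr, uintptr_t alignment) {
  return -addr & (alignment - 1);
}

static_assert(distance_up(0x1009, 16) == 7, "0x1009 + 7 == 0x1010");
static_assert(distance_up(0x1010, 16) == 0, "already a multiple of 16");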
+// Stores a value of type T in memory (possibly unaligned)
 template <typename T> static inline void store(Ptr ptr, T value) {
   memcpy_inline<sizeof(T)>(ptr, &value);
 }

-// Advances the pointers p1 and p2 by offset bytes and decrease count by the
-// same amount.
+// For an operation like memset that operates on a pointer and a count, advances
+// the pointer by offset bytes and decreases count by the same amount.
+static inline void adjust(ptrdiff_t offset, Ptr &ptr, size_t &count) {
+  ptr += offset;
+  count -= offset;
+}
+
+// For an operation like memcpy or memcmp that operates on two pointers and a
+// count, advances the pointers by offset bytes and decreases count by the same
+// amount.
 template <typename T1, typename T2>
 static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
                           T2 *__restrict &p2, size_t &count) {
@@ -164,37 +133,31 @@ static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
   count -= offset;
 }

-// Advances p1 and p2 so p1 gets aligned to the next SIZE bytes boundary
-// and decrease count by the same amount.
+// For an operation like memset that operates on a pointer and a count, advances
+// the pointer so it is aligned to SIZE bytes and decreases count by the same
+// amount.
 // We make sure the compiler knows about the adjusted pointer alignment.
-template <size_t SIZE, typename T1, typename T2>
-void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
-                               size_t &count) {
-  adjust(distance_to_next_aligned<SIZE>(p1), p1, p2, count);
-  p1 = assume_aligned<SIZE>(p1);
-}
-
-// Same as align_p1_to_next_boundary above but with a single pointer instead.
-template <size_t SIZE, typename T1>
-void align_to_next_boundary(T1 *&p1, size_t &count) {
-  CPtr dummy;
-  align_p1_to_next_boundary<SIZE>(p1, dummy, count);
+template <size_t SIZE> void align(Ptr &ptr, size_t &count) {
+  adjust(offset_to_next_aligned<SIZE>(ptr), ptr, count);
+  ptr = assume_aligned<SIZE>(ptr);
 }

-// An enum class that discriminates between the first and second pointer.
-enum class Arg { P1, P2, Dst = P1, Src = P2 };
-
-// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1.
-// Precondition: &p1 != &p2
+// For an operation like memcpy or memcmp that operates on two pointers and a
+// count, advances the pointers so one of them gets aligned to SIZE bytes and
+// decreases count by the same amount.
+// We make sure the compiler knows about the adjusted pointer alignment.
+enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
 template <size_t SIZE, Arg AlignOn, typename T1, typename T2>
-void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
-                            size_t &count) {
-  if constexpr (AlignOn == Arg::P1)
-    align_p1_to_next_boundary<SIZE>(p1, p2, count);
-  else if constexpr (AlignOn == Arg::P2)
-    align_p1_to_next_boundary<SIZE>(p2, p1, count); // swapping p1 and p2.
- else - deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2"); +void align(T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) { + if constexpr (AlignOn == Arg::_1) { + adjust(offset_to_next_aligned(p1), p1, p2, count); + p1 = assume_aligned(p1); + } else if constexpr (AlignOn == Arg::_2) { + adjust(offset_to_next_aligned(p2), p1, p2, count); + p2 = assume_aligned(p2); + } else { + deferred_static_assert("AlignOn must be either Arg::_1 or Arg::_2"); + } } } // namespace __llvm_libc diff --git a/libc/src/string/memset.cpp b/libc/src/string/memset.cpp index 1b492b5..549c074 100644 --- a/libc/src/string/memset.cpp +++ b/libc/src/string/memset.cpp @@ -13,8 +13,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) { - inline_memset(reinterpret_cast(dst), static_cast(value), - count); + inline_memset(reinterpret_cast(dst), + static_cast(value), count); return dst; } diff --git a/libc/test/src/string/bcmp_test.cpp b/libc/test/src/string/bcmp_test.cpp index 8f0fe52..19df7ad26 100644 --- a/libc/test/src/string/bcmp_test.cpp +++ b/libc/test/src/string/bcmp_test.cpp @@ -12,25 +12,25 @@ TEST(LlvmLibcBcmpTest, CmpZeroByte) { const char *lhs = "ab"; const char *rhs = "bc"; - ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0); + EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0); } TEST(LlvmLibcBcmpTest, LhsRhsAreTheSame) { const char *lhs = "ab"; const char *rhs = "ab"; - ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0); + EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, LhsBeforeRhsLexically) { const char *lhs = "ab"; const char *rhs = "ac"; - ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); + EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, LhsAfterRhsLexically) { const char *lhs = "ac"; const char *rhs = "ab"; - ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); + EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, Sweep) { @@ -46,13 +46,13 @@ TEST(LlvmLibcBcmpTest, Sweep) { reset(lhs); reset(rhs); for (size_t i = 0; i < K_MAX_SIZE; ++i) - ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0); + EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0); reset(lhs); reset(rhs); for (size_t i = 0; i < K_MAX_SIZE; ++i) { rhs[i] = 'b'; - ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0); + EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0); rhs[i] = 'a'; } } diff --git a/libc/test/src/string/memmove_test.cpp b/libc/test/src/string/memmove_test.cpp index 451ccdb..26b4d9e 100644 --- a/libc/test/src/string/memmove_test.cpp +++ b/libc/test/src/string/memmove_test.cpp @@ -20,7 +20,7 @@ TEST(LlvmLibcMemmoveTest, MoveZeroByte) { void *const Dst = Buffer; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 0); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) { @@ -29,7 +29,7 @@ TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) { void *const Dst = Buffer; void *const Ret = __llvm_libc::memmove(Dst, Buffer, 1); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) { @@ -40,7 +40,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) { void *const Dst = Buffer + 1; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 2); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) { @@ -49,7 +49,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) { void *const Dst 
= Buffer + 2; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 2); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } // e.g. `Dst` follow `src`. @@ -62,7 +62,7 @@ TEST(LlvmLibcMemmoveTest, SrcFollowDst) { void *const Dst = Buffer + 1; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 1); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstFollowSrc) { @@ -71,7 +71,7 @@ TEST(LlvmLibcMemmoveTest, DstFollowSrc) { void *const Dst = Buffer + 2; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 1); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } static constexpr int kMaxSize = 512; @@ -106,7 +106,7 @@ TEST(LlvmLibcMemmoveTest, Thorough) { void *const Ret = __llvm_libc::memmove(Dst, Buffer.data() + SrcOffset, Size); EXPECT_EQ(Ret, Dst); - ASSERT_MEM_EQ(Buffer, Expected); + EXPECT_MEM_EQ(Buffer, Expected); } } } diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt index d54f845..8f92627 100644 --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -3,6 +3,8 @@ add_libc_unittest( SUITE libc_string_unittests SRCS + elements_test.cpp + memory_access_test.cpp utils_test.cpp COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} diff --git a/libc/test/src/string/memory_utils/elements_test.cpp b/libc/test/src/string/memory_utils/elements_test.cpp new file mode 100644 index 0000000..2187001 --- /dev/null +++ b/libc/test/src/string/memory_utils/elements_test.cpp @@ -0,0 +1,137 @@ +//===-- Unittests for memory_utils ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/span.h" +#include "src/string/memory_utils/elements.h" +#include "utils/UnitTest/Test.h" + +namespace __llvm_libc { + +// Registering Types +using FixedSizeTypes = testing::TypeList< +#if defined(__SSE2__) + x86::Vector128, // +#endif // __SSE2__ +#if defined(__AVX2__) + x86::Vector256, // +#endif // __AVX2__ +#if defined(__AVX512F__) and defined(__AVX512BW__) + x86::Vector512, // +#endif // defined(__AVX512F__) and defined(__AVX512BW__) + scalar::UINT8, // + scalar::UINT16, // + scalar::UINT32, // + scalar::UINT64, // + Repeated, // + Repeated, // + Repeated, // + Repeated, // + Repeated, // + Chained, // + Chained, // + builtin::_1, // + builtin::_2, // + builtin::_3, // + builtin::_4, // + builtin::_8 // + >; + +char GetRandomChar() { + static constexpr const uint64_t a = 1103515245; + static constexpr const uint64_t c = 12345; + static constexpr const uint64_t m = 1ULL << 31; + static uint64_t seed = 123456789; + seed = (a * seed + c) % m; + return seed; +} + +void Randomize(cpp::span buffer) { + for (auto ¤t : buffer) + current = GetRandomChar(); +} + +template using Buffer = cpp::array; + +template Buffer GetRandomBuffer() { + Buffer buffer; + Randomize(buffer); + return buffer; +} + +TYPED_TEST(LlvmLibcMemoryElements, copy, FixedSizeTypes) { + Buffer Dst; + const auto buffer = GetRandomBuffer(); + copy(Dst.data(), buffer.data()); + for (size_t i = 0; i < ParamType::SIZE; ++i) + EXPECT_EQ(Dst[i], buffer[i]); +} + +template T copy(const T &Input) { + T Output; + for (size_t I = 0; I < Input.size(); ++I) + Output[I] = Input[I]; + return Output; +} + +TYPED_TEST(LlvmLibcMemoryElements, Move, FixedSizeTypes) { + constexpr size_t SIZE = ParamType::SIZE; + using LargeBuffer = cpp::array; + LargeBuffer GroundTruth; + Randomize(GroundTruth); + // Forward, we move the SIZE first bytes from offset 0 to SIZE. + for (size_t Offset = 0; Offset < SIZE; ++Offset) { + LargeBuffer Buffer = copy(GroundTruth); + move(&Buffer[Offset], &Buffer[0]); + for (size_t I = 0; I < SIZE; ++I) + EXPECT_EQ(Buffer[I + Offset], GroundTruth[I]); + } + // Backward, we move the SIZE last bytes from offset 0 to SIZE. 
+  for (size_t Offset = 0; Offset < SIZE; ++Offset) {
+    LargeBuffer Buffer = copy(GroundTruth);
+    move(&Buffer[Offset], &Buffer[SIZE]);
+    for (size_t I = 0; I < SIZE; ++I)
+      EXPECT_EQ(Buffer[I + Offset], GroundTruth[SIZE + I]);
+  }
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) {
+  const auto buffer = GetRandomBuffer();
+  EXPECT_TRUE(equals(buffer.data(), buffer.data()));
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, three_way_compare, FixedSizeTypes) {
+  Buffer initial;
+  for (auto &c : initial)
+    c = 5;
+
+  // Testing equality
+  EXPECT_EQ(three_way_compare(initial.data(), initial.data()), 0);
+
+  // Testing all mismatching positions
+  for (size_t i = 0; i < ParamType::SIZE; ++i) {
+    auto copy = initial;
+    ++copy[i]; // copy is now lexicographically greater than initial
+    const auto *less = initial.data();
+    const auto *greater = copy.data();
+    EXPECT_LT(three_way_compare(less, greater), 0);
+    EXPECT_GT(three_way_compare(greater, less), 0);
+  }
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) {
+  Buffer Dst;
+  const cpp::array values = {char(0x00), char(0x7F), char(0xFF)};
+  for (char value : values) {
+    splat_set(Dst.data(), value);
+    for (size_t i = 0; i < ParamType::SIZE; ++i)
+      EXPECT_EQ(Dst[i], value);
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp
new file mode 100644
index 0000000..b81700f
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memory_access_test.cpp
@@ -0,0 +1,228 @@
+//===-- Unittests for memory_utils ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define LLVM_LIBC_UNITTEST_OBSERVE 1
+
+#include "src/__support/CPP/array.h"
+#include "src/string/memory_utils/elements.h"
+#include "utils/UnitTest/Test.h"
+
+#include
+#include
+
+namespace __llvm_libc {
+
+static constexpr const size_t kMaxBuffer = 32;
+
+struct BufferAccess : cpp::array {
+  BufferAccess() { Reset(); }
+  void Reset() {
+    for (auto &value : *this)
+      value = '0';
+    this->operator[](kMaxBuffer) = '\0';
+  }
+  void Touch(ptrdiff_t offset, size_t size) {
+    if (offset < 0)
+      return;
+    for (size_t i = 0; i < size; ++i)
+      ++(*this)[offset + i];
+  }
+  operator const char *() const { return this->data(); }
+};
+
+struct Buffer {
+  ptrdiff_t Offset(const char *ptr) const {
+    const bool contained = ptr >= data.begin() && ptr < data.end();
+    return contained ?
ptr - data.begin() : -1; + } + void Reset() { + reads.Reset(); + writes.Reset(); + } + cpp::array data; + BufferAccess __attribute__((aligned(64))) reads; + BufferAccess __attribute__((aligned(64))) writes; +}; + +struct MemoryAccessObserver { + void ObserveRead(const char *ptr, size_t size) { + Buffer1.reads.Touch(Buffer1.Offset(ptr), size); + Buffer2.reads.Touch(Buffer2.Offset(ptr), size); + } + + void ObserveWrite(const char *ptr, size_t size) { + Buffer1.writes.Touch(Buffer1.Offset(ptr), size); + Buffer2.writes.Touch(Buffer2.Offset(ptr), size); + } + + void Reset() { + Buffer1.Reset(); + Buffer2.Reset(); + } + + Buffer Buffer1; + Buffer Buffer2; +}; + +MemoryAccessObserver Observer; + +template struct TestingElement { + static constexpr size_t SIZE = Size; + + static void copy(char *__restrict dst, const char *__restrict src) { + Observer.ObserveRead(src, SIZE); + Observer.ObserveWrite(dst, SIZE); + } + + static bool equals(const char *lhs, const char *rhs) { + Observer.ObserveRead(lhs, SIZE); + Observer.ObserveRead(rhs, SIZE); + return true; + } + + static int three_way_compare(const char *lhs, const char *rhs) { + Observer.ObserveRead(lhs, SIZE); + Observer.ObserveRead(rhs, SIZE); + return 0; + } + + static void splat_set(char *dst, const unsigned char value) { + Observer.ObserveWrite(dst, SIZE); + } +}; + +using Types = testing::TypeList< + TestingElement<1>, // 1 Byte + TestingElement<2>, // 2 Bytes + TestingElement<4>, // 4 Bytes + Repeated, 3>, // 6 Bytes + Chained, TestingElement<2>, TestingElement<1>> // 7 Bytes + >; + +struct LlvmLibcTestAccessBase : public testing::Test { + + template + void checkOperations(const BufferAccess &expected) { + static const BufferAccess untouched; + + Observer.Reset(); + HigherOrder::copy(dst_ptr() + Offset, src_ptr() + Offset, Size); + ASSERT_STREQ(src().writes, untouched); + ASSERT_STREQ(dst().reads, untouched); + ASSERT_STREQ(src().reads, expected); + ASSERT_STREQ(dst().writes, expected); + Observer.Reset(); + HigherOrder::equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size); + ASSERT_STREQ(lhs().writes, untouched); + ASSERT_STREQ(rhs().writes, untouched); + ASSERT_STREQ(lhs().reads, expected); + ASSERT_STREQ(rhs().reads, expected); + Observer.Reset(); + HigherOrder::three_way_compare(lhs_ptr() + Offset, rhs_ptr() + Offset, + Size); + ASSERT_STREQ(lhs().writes, untouched); + ASSERT_STREQ(rhs().writes, untouched); + ASSERT_STREQ(lhs().reads, expected); + ASSERT_STREQ(rhs().reads, expected); + Observer.Reset(); + HigherOrder::splat_set(dst_ptr() + Offset, 5, Size); + ASSERT_STREQ(src().reads, untouched); + ASSERT_STREQ(src().writes, untouched); + ASSERT_STREQ(dst().reads, untouched); + ASSERT_STREQ(dst().writes, expected); + } + + void checkMaxAccess(const BufferAccess &expected, int max) { + for (size_t i = 0; i < kMaxBuffer; ++i) { + int value = (int)expected[i] - '0'; + ASSERT_GE(value, 0); + ASSERT_LE(value, max); + } + } + +private: + const Buffer &lhs() const { return Observer.Buffer1; } + const Buffer &rhs() const { return Observer.Buffer2; } + const Buffer &src() const { return Observer.Buffer2; } + const Buffer &dst() const { return Observer.Buffer1; } + Buffer &dst() { return Observer.Buffer1; } + + char *dst_ptr() { return dst().data.begin(); } + const char *src_ptr() { return src().data.begin(); } + const char *lhs_ptr() { return lhs().data.begin(); } + const char *rhs_ptr() { return rhs().data.begin(); } +}; + +template +struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase { + + void TearDown() override { + static constexpr 
size_t Size = 10; + + BufferAccess expected; + expected.Touch(Size - ParamType::SIZE, ParamType::SIZE); + + checkMaxAccess(expected, 1); + checkOperations, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {} + +template +struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 10; + + BufferAccess expected; + expected.Touch(0, ParamType::SIZE); + expected.Touch(Size - ParamType::SIZE, ParamType::SIZE); + + checkMaxAccess(expected, 2); + checkOperations, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {} + +template +struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 20; + + BufferAccess expected; + for (size_t i = 0; i < Size - ParamType::SIZE; i += ParamType::SIZE) + expected.Touch(i, ParamType::SIZE); + expected.Touch(Size - ParamType::SIZE, ParamType::SIZE); + + checkMaxAccess(expected, 2); + checkOperations, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {} + +template +struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 10; + static constexpr size_t Offset = 2; + using AlignmentT = TestingElement<4>; + + BufferAccess expected; + expected.Touch(Offset, AlignmentT::SIZE); + expected.Touch(AlignmentT::SIZE, ParamType::SIZE); + expected.Touch(Offset + Size - ParamType::SIZE, ParamType::SIZE); + + checkMaxAccess(expected, 3); + checkOperations::Then>, Size, + Offset>(expected); + checkOperations::Then>, Size, + Offset>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {} + +} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp index 5c7920c..a20c090 100644 --- a/libc/test/src/string/memory_utils/utils_test.cpp +++ b/libc/test/src/string/memory_utils/utils_test.cpp @@ -72,41 +72,55 @@ TEST(LlvmLibcUtilsTest, GEPowerOf2) { EXPECT_EQ(ge_power2(i), kExpectedValues[i]); } -using UINT = uintptr_t; +using I = intptr_t; // Converts an offset into a pointer. 
const void *forge(size_t offset) { return reinterpret_cast(offset); } -TEST(LlvmLibcUtilsTest, DistanceToNextAligned) { - EXPECT_EQ(distance_to_next_aligned<16>(forge(0)), UINT(16)); - EXPECT_EQ(distance_to_next_aligned<16>(forge(1)), UINT(15)); - EXPECT_EQ(distance_to_next_aligned<16>(forge(16)), UINT(16)); - EXPECT_EQ(distance_to_next_aligned<16>(forge(15)), UINT(1)); - EXPECT_EQ(distance_to_next_aligned<32>(forge(16)), UINT(16)); +TEST(LlvmLibcUtilsTest, OffsetToNextAligned) { + EXPECT_EQ(offset_to_next_aligned<16>(forge(0)), I(0)); + EXPECT_EQ(offset_to_next_aligned<16>(forge(1)), I(15)); + EXPECT_EQ(offset_to_next_aligned<16>(forge(16)), I(0)); + EXPECT_EQ(offset_to_next_aligned<16>(forge(15)), I(1)); + EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16)); } -TEST(LlvmLibcUtilsTest, DistanceToAlignUp) { - EXPECT_EQ(distance_to_align_up<16>(forge(0)), UINT(0)); - EXPECT_EQ(distance_to_align_up<16>(forge(1)), UINT(15)); - EXPECT_EQ(distance_to_align_up<16>(forge(16)), UINT(0)); - EXPECT_EQ(distance_to_align_up<16>(forge(15)), UINT(1)); - EXPECT_EQ(distance_to_align_up<32>(forge(16)), UINT(16)); +TEST(LlvmLibcUtilsTest, OffsetFromLastAligned) { + EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15)); + EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16)); } -TEST(LlvmLibcUtilsTest, DistanceToAlignDown) { - EXPECT_EQ(distance_to_align_down<16>(forge(0)), UINT(0)); - EXPECT_EQ(distance_to_align_down<16>(forge(1)), UINT(1)); - EXPECT_EQ(distance_to_align_down<16>(forge(16)), UINT(0)); - EXPECT_EQ(distance_to_align_down<16>(forge(15)), UINT(15)); - EXPECT_EQ(distance_to_align_down<32>(forge(16)), UINT(16)); +TEST(LlvmLibcUtilsTest, OffsetToNextCacheLine) { + EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0); + EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0)); + EXPECT_EQ(offset_to_next_cache_line(forge(1)), + I(LLVM_LIBC_CACHELINE_SIZE - 1)); + EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE)), I(0)); + EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE - 1)), + I(1)); +} + +TEST(LlvmLibcUtilsTest, Adjust1) { + char a; + const size_t base_size = 10; + for (size_t I = -2; I < 2; ++I) { + auto *ptr = &a; + size_t size = base_size; + adjust(I, ptr, size); + EXPECT_EQ(intptr_t(ptr), intptr_t(&a + I)); + EXPECT_EQ(size, base_size - I); + } } TEST(LlvmLibcUtilsTest, Adjust2) { char a, b; const size_t base_size = 10; - for (ptrdiff_t I = -2; I < 2; ++I) { + for (size_t I = -2; I < 2; ++I) { auto *p1 = &a; auto *p2 = &b; size_t size = base_size; @@ -117,6 +131,19 @@ TEST(LlvmLibcUtilsTest, Adjust2) { } } +TEST(LlvmLibcUtilsTest, Align1) { + char a; + const size_t base_size = 10; + { + auto *ptr = &a; + size_t size = base_size; + align<128>(ptr, size); + EXPECT_TRUE(uintptr_t(ptr) % 128 == 0); + EXPECT_GE(ptr, &a); + EXPECT_EQ(size_t(ptr - &a), base_size - size); + } +} + TEST(LlvmLibcUtilsTest, Align2) { char a, b; const size_t base_size = 10; @@ -124,10 +151,10 @@ TEST(LlvmLibcUtilsTest, Align2) { auto *p1 = &a; auto *p2 = &b; size_t size = base_size; - align_to_next_boundary<128, Arg::P1>(p1, p2, size); + align<128, Arg::_1>(p1, p2, size); EXPECT_TRUE(uintptr_t(p1) % 128 == 0); - EXPECT_GT(p1, &a); - EXPECT_GT(p2, &b); + EXPECT_GE(p1, &a); + EXPECT_GE(p2, &b); EXPECT_EQ(size_t(p1 - &a), base_size - size); EXPECT_EQ(size_t(p2 - &b), base_size - size); } @@ -135,10 +162,10 @@ 
TEST(LlvmLibcUtilsTest, Align2) {
     auto *p1 = &a;
     auto *p2 = &b;
     size_t size = base_size;
-    align_to_next_boundary<128, Arg::P2>(p1, p2, size);
+    align<128, Arg::_2>(p1, p2, size);
     EXPECT_TRUE(uintptr_t(p2) % 128 == 0);
-    EXPECT_GT(p1, &a);
-    EXPECT_GT(p2, &b);
+    EXPECT_GE(p1, &a);
+    EXPECT_GE(p2, &b);
     EXPECT_EQ(size_t(p1 - &a), base_size - size);
     EXPECT_EQ(size_t(p2 - &b), base_size - size);
   }
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index f94b502..90aea2c 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -973,10 +973,9 @@ no_sanitize_features = [
 cc_library(
     name = "string_memory_utils",
     hdrs = [
-        "src/string/memory_utils/op_aarch64.h",
-        "src/string/memory_utils/op_builtin.h",
-        "src/string/memory_utils/op_generic.h",
-        "src/string/memory_utils/op_x86.h",
+        "src/string/memory_utils/elements.h",
+        "src/string/memory_utils/elements_aarch64.h",
+        "src/string/memory_utils/elements_x86.h",
         "src/string/memory_utils/utils.h",
     ],
     textual_hdrs = [
@@ -989,8 +988,6 @@ cc_library(
     deps = [
         ":__support_common",
         ":__support_cpp_bit",
-        ":__support_cpp_type_traits",
-        ":__support_cpp_array",
         ":libc_root",
     ],
 )
-- 
2.7.4
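The utils.h hunks above restore a small set of building blocks that the rest of the framework leans on. The first is the load<T>/store<T> idiom: possibly unaligned bytes are materialized as a value (and written back) through an inlined memcpy rather than a pointer cast, which avoids alignment and strict-aliasing problems while still compiling to a single move on most targets. The sketch below is illustrative only; it uses std::memcpy instead of the patch's memcpy_inline<Size> helper and is not code from the patch.

#include <cstring>

// Illustrative stand-ins for the load<T>/store<T> helpers in utils.h.
template <typename T> static T load(const char *ptr) {
  T out;
  std::memcpy(&out, ptr, sizeof(T)); // unaligned-safe, no aliasing UB
  return out;
}

template <typename T> static void store(char *ptr, T value) {
  std::memcpy(ptr, &value, sizeof(T));
}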
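The offset_from_last_aligned / offset_to_next_aligned helpers reduce to two bit tricks on the pointer value, both valid only for power-of-two alignments: p & (align - 1) is the distance past the previous boundary, and -p & (align - 1) is the distance to the next one (0 when already aligned). A minimal, self-contained check of that arithmetic follows; the main() and the sample addresses are illustrative, not part of the patch.

#include <cassert>
#include <cstddef>
#include <cstdint>

template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
  // Unsigned negation implements "-p mod alignment" without branches.
  return -reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

int main() {
  const void *p = reinterpret_cast<const void *>(uintptr_t{17});
  assert(offset_from_last_aligned<16>(p) == 1); // 17 is one byte past 16
  assert(offset_to_next_aligned<16>(p) == 15);  // 15 bytes up to 32
  assert(offset_to_next_aligned<16>(reinterpret_cast<const void *>(uintptr_t{32})) == 0);
  return 0;
}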
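Both the removed MemcmpImpl and the restored element framework cover an arbitrary count with a fixed-size block primitive plus two tricks: head_tail handles any count up to twice the block size with two possibly overlapping accesses, and loop_and_tail runs whole blocks and finishes with one overlapping tail block instead of a variable-size remainder. A scalar, bcmp-style sketch of that shape on 8-byte blocks; the names and the uint64_t block choice are mine, not the patch's.

#include <cstddef>
#include <cstdint>
#include <cstring>

static uint64_t load64(const char *p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v)); // possibly unaligned load
  return v;
}
// Returns non-zero if the two 8-byte blocks differ.
static int block8(const char *p1, const char *p2) {
  return load64(p1) != load64(p2);
}
// Compares the last 8 bytes of the range.
static int tail8(const char *p1, const char *p2, size_t count) {
  return block8(p1 + count - 8, p2 + count - 8);
}
// Two possibly overlapping blocks cover any 8 <= count <= 16.
static int head_tail8(const char *p1, const char *p2, size_t count) {
  return block8(p1, p2) | tail8(p1, p2, count);
}
// Whole blocks first, then one final (possibly overlapping) tail block.
// Precondition: count >= 8.
static int loop_and_tail8(const char *p1, const char *p2, size_t count) {
  size_t offset = 0;
  do {
    if (int diff = block8(p1 + offset, p2 + offset))
      return diff;
    offset += 8;
  } while (offset < count - 8);
  return tail8(p1, p2, count);
}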
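The x86 specializations in the removed op_x86.h locate the first mismatching byte by comparing whole vectors and inspecting the resulting byte mask. The same idea in standalone form, written with plain SSE2 intrinsics rather than the file's vector-extension types and its char_diff_no_zero helper; the function name and structure are illustrative, and both pointers are assumed to have 16 readable bytes.

#include <immintrin.h> // SSE2 intrinsics

static int memcmp16_sse2(const char *p1, const char *p2) {
  const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1));
  const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2));
  // One bit per byte: set where the bytes are equal.
  const int eq_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
  if (eq_mask == 0xFFFF)
    return 0; // all 16 bytes equal
  const int i = __builtin_ctz(~eq_mask & 0xFFFF); // index of first mismatch
  return static_cast<unsigned char>(p1[i]) - static_cast<unsigned char>(p2[i]);
}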