From 903cc71a82431d79e5fb541946a9e7c93750e374 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 19 Oct 2022 20:52:45 +0000
Subject: [PATCH] [libc] mem* framework v3

This version is more composable and also simpler, at the expense of being
more explicit and more verbose. This patch provides the rationale for the
framework, its implementation and unit tests, but the functions themselves
still use the previous version. The change in implementation will come in a
follow-up patch.

Differential Revision: https://reviews.llvm.org/D136292
---
 libc/src/string/memory_utils/CMakeLists.txt       |   8 +-
 libc/src/string/memory_utils/README.md            |  97 +++++
 libc/src/string/memory_utils/op_aarch64.h         | 172 ++++
 libc/src/string/memory_utils/op_builtin.h         | 148 +++++
 libc/src/string/memory_utils/op_generic.h         | 466 ++++++++++++++++
 libc/src/string/memory_utils/op_x86.h             | 221 ++++++++++
 libc/src/string/memory_utils/utils.h              |  35 ++
 libc/test/src/string/memory_utils/CMakeLists.txt  |   1 +
 libc/test/src/string/memory_utils/op_tests.cpp    | 420 +++++++++++++++
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel |   7 +-
 10 files changed, 1572 insertions(+), 3 deletions(-)
 create mode 100644 libc/src/string/memory_utils/README.md
 create mode 100644 libc/src/string/memory_utils/op_aarch64.h
 create mode 100644 libc/src/string/memory_utils/op_builtin.h
 create mode 100644 libc/src/string/memory_utils/op_generic.h
 create mode 100644 libc/src/string/memory_utils/op_x86.h
 create mode 100644 libc/test/src/string/memory_utils/op_tests.cpp

diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index d735fcf..b72242e 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -2,13 +2,17 @@ add_header_library(
   memory_utils
   HDRS
-    utils.h
-    elements.h
     bcmp_implementations.h
     bzero_implementations.h
+    elements.h
     memcmp_implementations.h
     memcpy_implementations.h
     memset_implementations.h
+    op_aarch64.h
+    op_builtin.h
+    op_generic.h
+    op_x86.h
+    utils.h
   DEPS
     libc.src.__support.CPP.bit
 )
diff --git a/libc/src/string/memory_utils/README.md b/libc/src/string/memory_utils/README.md
new file mode 100644
index 0000000..83a29066
--- /dev/null
+++ b/libc/src/string/memory_utils/README.md
@@ -0,0 +1,97 @@
+# The mem* framework
+
+The framework handles the following mem* functions:
+ - `memcpy`
+ - `memmove`
+ - `memset`
+ - `bzero`
+ - `bcmp`
+ - `memcmp`
+
+## Building blocks
+
+These functions can be built out of a set of lower-level operations:
+ - **`block`** : operates on a block of `SIZE` bytes.
+ - **`tail`** : operates on the last `SIZE` bytes of the buffer (i.e., `[dst + count - SIZE, dst + count)`).
+ - **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`.
+ - **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes and handles the remaining bytes with a `tail` operation.
+
+As an illustration, let's take the example of a trivial `memset` implementation:
+
+ ```C++
+ extern "C" void memset(char* dst, int value, size_t count) {
+   if (count == 0) return;
+   if (count == 1) return Memset<1>::block(dst, value);
+   if (count == 2) return Memset<2>::block(dst, value);
+   if (count == 3) return Memset<3>::block(dst, value);
+   if (count <= 8) return Memset<4>::head_tail(dst, value, count);  // Note that 0 to 4 bytes are written twice.
+   if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here.
+   return Memset<16>::loop_and_tail(dst, value, count);
+ }
+ ```
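+
+To see why `head_tail` covers every size in `[SIZE, 2 x SIZE]`, consider the
+(hypothetical) call `Memset<4>::head_tail(dst, value, 6)`: the `block` store
+writes bytes `[0, 4)` and the `tail` store writes bytes `[2, 6)`, so the two
+stores overlap on `[2, 4)` and together cover the whole buffer. A sketch of
+the two stores:
+
+ ```C++
+ // head_tail(dst, value, 6) with SIZE == 4:
+ Memset<4>::block(dst, value);         // writes dst[0] .. dst[3]
+ Memset<4>::block(dst + 6 - 4, value); // writes dst[2] .. dst[5], overlap is harmless for memset
+ ```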
+
+Now let's have a look into the `Memset` structure:
+
+```C++
+template <size_t Size>
+struct Memset {
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, uint8_t value) {
+    // Implement me
+  }
+
+  static inline void tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst + count - SIZE, value);
+  }
+
+  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst, value);
+    tail(dst, value, count);
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, value, count);
+  }
+};
+```
+
+As you can see, `tail`, `head_tail` and `loop_and_tail` are higher-order functions that build on each other. Only `block` really needs to be implemented.
+In earlier designs we implemented these higher-order functions as templated functions, but it turns out to be more readable to have the implementation explicitly stated.
+**This design is useful because it provides customization points**. For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics.
+
+## Scoped specializations
+
+We can have several specializations of the `Memset` structure. Depending on the target requirements we can use one or several scopes for the same implementation.
+
+In the following example we use the `generic` implementation for the small sizes but use the `x86` implementation for the loop.
+```C++
+extern "C" void memset(char* dst, int value, size_t count) {
+  if (count == 0) return;
+  if (count == 1) return generic::Memset<1>::block(dst, value);
+  if (count == 2) return generic::Memset<2>::block(dst, value);
+  if (count == 3) return generic::Memset<3>::block(dst, value);
+  if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count);
+  if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count);
+  return x86::Memset<16>::loop_and_tail(dst, value, count);
+}
+```
+
+### The `builtin` scope
+
+Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset)).
+
+### The `generic` scope
+
+In this scope we define pure C++ implementations using native integral types and Clang vector extensions.
+
+### The arch specific scopes
+
+Then come implementations that use specific architecture or microarchitecture features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`).
+
+The purpose here is to rely on builtins as much as possible and fall back to `asm volatile` as a last resort.
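+
+As an illustration, the `x86` scope in this patch wraps the `rep movsb`
+string instruction in inline assembly (see `op_x86.h` below):
+
+```C++
+// Copies `count` bytes from `src` to `dst` using the x86 string instruction.
+// The "+D", "+S" and "+c" constraints place dst/src/count in rdi/rsi/rcx.
+static void repmovsb(char *dst, const char *src, size_t count) {
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+```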
diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h
new file mode 100644
index 0000000..d25b0f4
--- /dev/null
+++ b/libc/src/string/memory_utils/op_aarch64.h
@@ -0,0 +1,172 @@
+//===-- aarch64 implementation of memory function building blocks ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides aarch64 specific building blocks to compose memory
+// functions.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
+
+#include "src/__support/architectures.h"
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+
+#include "src/__support/common.h"
+#include "src/string/memory_utils/op_generic.h"
+
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif //__ARM_NEON
+
+namespace __llvm_libc::aarch64 {
+
+static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);
+
+namespace neon {
+
+template <size_t Size> struct BzeroCacheLine {
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, uint8_t) {
+    static_assert(Size == 64);
+#if __SIZEOF_POINTER__ == 4
+    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
+#else
+    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
+#endif
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    // Unaligned store, we can't use 'dc zva' here.
+    static constexpr size_t kMaxSize = kNeon ? 16 : 8;
+    generic::Memset<kMaxSize, kMaxSize>::tail(dst, value, count);
+  }
+};
+
+inline static bool hasZva() {
+  uint64_t zva_val;
+  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
+  // DC ZVA is permitted if DZP, bit [4], is zero.
+  // BS, bits [3:0], is log2 of the block size in words.
+  // So the next line checks whether the instruction is permitted and the
+  // block size is 16 words (i.e. 64 bytes).
+  return (zva_val & 0b11111) == 0b00100;
+}
+
+} // namespace neon
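+
+// Usage sketch (hypothetical dispatch): a bzero/memset implementation can
+// select the cache-line zeroing loop at run time, e.g.:
+//   if (value == 0 && count >= kZvaThreshold && neon::hasZva())
+//     return neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
+// where kZvaThreshold is a tuning constant, not something defined in this
+// patch.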
+
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
+template <size_t Size> struct Bcmp {
+  static constexpr size_t SIZE = Size;
+  static constexpr size_t BlockSize = 32;
+
+  static const unsigned char *as_u8(CPtr ptr) {
+    return reinterpret_cast<const unsigned char *>(ptr);
+  }
+
+  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == BlockSize) {
+      auto _p1 = as_u8(p1);
+      auto _p2 = as_u8(p2);
+      uint8x16_t a = vld1q_u8(_p1);
+      uint8x16_t b = vld1q_u8(_p1 + 16);
+      uint8x16_t n = vld1q_u8(_p2);
+      uint8x16_t o = vld1q_u8(_p2 + 16);
+      uint8x16_t an = veorq_u8(a, n);
+      uint8x16_t bo = veorq_u8(b, o);
+      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
+      // a difference between the two buffers. We reduce this value down to 4
+      // bytes in two steps. First, calculate the saturated move value when
+      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
+      // a single 32 bit nonzero value if a mismatch occurred.
+      uint8x16_t anbo = vorrq_u8(an, bo);
+      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
+      return vmaxv_u32(anbo_reduced);
+    } else if constexpr ((Size % BlockSize) == 0) {
+      for (size_t offset = 0; offset < Size; offset += BlockSize)
+        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
+          return value;
+    } else {
+      deferred_static_assert("SIZE not implemented");
+    }
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - SIZE, p2 + count - SIZE);
+  }
+
+  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    if constexpr (Size <= 8) {
+      return generic::Bcmp<Size>::head_tail(p1, p2, count);
+    } else if constexpr (Size == 16) {
+      auto _p1 = as_u8(p1);
+      auto _p2 = as_u8(p2);
+      uint8x16_t a = vld1q_u8(_p1);
+      uint8x16_t b = vld1q_u8(_p1 + count - 16);
+      uint8x16_t n = vld1q_u8(_p2);
+      uint8x16_t o = vld1q_u8(_p2 + count - 16);
+      uint8x16_t an = veorq_u8(a, n);
+      uint8x16_t bo = veorq_u8(b, o);
+      // anbo = (a ^ n) | (b ^ o)
+      uint8x16_t anbo = vorrq_u8(an, bo);
+      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
+      return vmaxv_u32(anbo_reduced);
+    } else if constexpr (Size == 32) {
+      auto _p1 = as_u8(p1);
+      auto _p2 = as_u8(p2);
+      uint8x16_t a = vld1q_u8(_p1);
+      uint8x16_t b = vld1q_u8(_p1 + 16);
+      uint8x16_t c = vld1q_u8(_p1 + count - 16);
+      uint8x16_t d = vld1q_u8(_p1 + count - 32);
+      uint8x16_t n = vld1q_u8(_p2);
+      uint8x16_t o = vld1q_u8(_p2 + 16);
+      uint8x16_t p = vld1q_u8(_p2 + count - 16);
+      uint8x16_t q = vld1q_u8(_p2 + count - 32);
+      uint8x16_t an = veorq_u8(a, n);
+      uint8x16_t bo = veorq_u8(b, o);
+      uint8x16_t cp = veorq_u8(c, p);
+      uint8x16_t dq = veorq_u8(d, q);
+      uint8x16_t anbo = vorrq_u8(an, bo);
+      uint8x16_t cpdq = vorrq_u8(cp, dq);
+      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
+      // a nonzero 32 bit value if a mismatch occurred.
+      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
+      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
+      return vmaxv_u32(abnocpdq_reduced);
+    } else {
+      deferred_static_assert("SIZE not implemented");
+    }
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    return tail(p1, p2, count);
+  }
+};
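+
+// Usage sketch (hypothetical dispatch): since head_tail covers any count in
+// [SIZE, 2 x SIZE], a bcmp implementation could handle 16 < count <= 32 with
+//   return aarch64::Bcmp<16>::head_tail(p1, p2, count);
+// falling back to the generic scope for smaller sizes, as head_tail above
+// already does for Size <= 8.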
+
+} // namespace __llvm_libc::aarch64
+
+#endif // LLVM_LIBC_ARCH_AARCH64
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
new file mode 100644
index 0000000..68ae862
--- /dev/null
+++ b/libc/src/string/memory_utils/op_builtin.h
@@ -0,0 +1,148 @@
+//===-- Implementation using the __builtin_XXX_inline ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides generic C++ building blocks to compose memory functions.
+// They rely on the compiler to generate the best possible code through the use
+// of the `__builtin_XXX_inline` builtins. These builtins are currently only
+// available in Clang.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
+
+#include "src/string/memory_utils/utils.h"
+
+namespace __llvm_libc::builtin {
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcpy
+template <size_t Size> struct Memcpy {
+  static constexpr size_t SIZE = Size;
+  static inline void block(Ptr __restrict dst, CPtr __restrict src) {
+#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
+    return __builtin_memcpy_inline(dst, src, SIZE);
+#else
+    deferred_static_assert("Missing __builtin_memcpy_inline");
+    (void)dst;
+    (void)src;
+#endif
+  }
+
+  static inline void tail(Ptr __restrict dst, CPtr __restrict src,
+                          size_t count) {
+    block(dst + count - SIZE, src + count - SIZE);
+  }
+
+  static inline void head_tail(Ptr __restrict dst, CPtr __restrict src,
+                               size_t count) {
+    block(dst, src);
+    tail(dst, src, count);
+  }
+
+  static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
+                                   size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      block(dst + offset, src + offset);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, src, count);
+  }
+};
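+
+// Note: unlike a call to memcpy, __builtin_memcpy_inline guarantees that the
+// compiler expands the copy inline, so block() above typically lowers to a
+// handful of loads and stores (e.g. one 8-byte load/store pair for Size == 8)
+// and never degenerates into a libc call.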
+
+///////////////////////////////////////////////////////////////////////////////
+// Memset
+template <size_t Size> struct Memset {
+  using ME = Memset;
+  static constexpr size_t SIZE = Size;
+  static inline void block(Ptr dst, uint8_t value) {
+#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
+    __builtin_memset_inline(dst, value, Size);
+#else
+    deferred_static_assert("Missing __builtin_memset_inline");
+    (void)dst;
+    (void)value;
+#endif
+  }
+
+  static inline void tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst + count - SIZE, value);
+  }
+
+  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst, value);
+    tail(dst, value, count);
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, value, count);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
+template <size_t Size> struct Bcmp {
+  using ME = Bcmp;
+  static constexpr size_t SIZE = Size;
+  static inline BcmpReturnType block(CPtr, CPtr) {
+    deferred_static_assert("Missing __builtin_memcmp_inline");
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return BcmpReturnType::ZERO();
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcmp
+template <size_t Size> struct Memcmp {
+  using ME = Memcmp;
+  static constexpr size_t SIZE = Size;
+  static inline MemcmpReturnType block(CPtr, CPtr) {
+    deferred_static_assert("Missing __builtin_memcmp_inline");
+    return MemcmpReturnType::ZERO();
+  }
+
+  static inline MemcmpReturnType tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return MemcmpReturnType::ZERO();
+  }
+
+  static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return MemcmpReturnType::ZERO();
+  }
+
+  static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return MemcmpReturnType::ZERO();
+  }
+};
+
+} // namespace __llvm_libc::builtin
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
new file mode 100644
index 0000000..7eb3c23
--- /dev/null
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -0,0 +1,466 @@
+//===-- Generic implementation of memory function building blocks ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides generic C++ building blocks.
+// Depending on the requested size, the block operation uses unsigned integral
+// types, vector types or an array of the type with the maximum size.
+//
+// The maximum size is passed as a template argument. For instance, on x86
+// platforms that only support integral types the maximum size would be 8
+// (corresponding to uint64_t). On this platform if we request the size 32, this
+// would be treated as a cpp::array<uint64_t, 4>.
+//
+// On the other hand, if the platform is x86 with support for AVX the maximum
+// size is 32 and the operation can be handled with a single native operation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/endian.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/utils.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc::generic {
+
+// CTPair and CTMap below implement a compile time map.
+// This is useful to map from a Size to a type handling this size.
+//
+// Example usage:
+// using MyMap = CTMap<CTPair<1, uint8_t>,
+//                     CTPair<2, uint16_t>,
+//                     >;
+// ...
+// using UInt8T = MyMap::find_type<1>;
+template <size_t I, typename T> struct CTPair {
+  using type = T;
+  static CTPair get_pair(cpp::integral_constant<size_t, I>) { return {}; }
+};
+template <typename... Pairs> struct CTMap : public Pairs... {
+  using Pairs::get_pair...;
+  template <size_t I>
+  using find_type =
+      typename decltype(get_pair(cpp::integral_constant<size_t, I>{}))::type;
+};
+
+// Helper to test if a type is void.
+template <typename T> inline constexpr bool is_void_v = cpp::is_same_v<T, void>;
+
+// Implements load, store and splat for unsigned integral types.
+template <typename T> struct ScalarType {
+  using Type = T;
+  static_assert(cpp::is_integral_v<T> && !cpp::is_signed_v<T>);
+
+  static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
+  static inline void store(Ptr dst, Type value) {
+    ::__llvm_libc::store<Type>(dst, value);
+  }
+  static inline Type splat(uint8_t value) {
+    return Type(~0) / Type(0xFF) * Type(value);
+  }
+};
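+
+// Note on splat: `Type(~0) / Type(0xFF)` is the constant with the lowest bit
+// of every byte set (e.g. 0x01010101 for uint32_t), so multiplying by `value`
+// broadcasts that byte to every byte position:
+//   splat(0xAB) == 0x01010101 * 0xAB == 0xABABABAB.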
+
+// Implements load, store and splat for vector types.
+template <size_t Size> struct VectorType {
+  using Type = uint8_t __attribute__((__vector_size__(Size)));
+  static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
+  static inline void store(Ptr dst, Type value) {
+    ::__llvm_libc::store<Type>(dst, value);
+  }
+  static inline Type splat(uint8_t value) {
+    Type Out;
+    // This for loop is optimized out for vector types.
+    for (size_t i = 0; i < Size; ++i)
+      Out[i] = static_cast<uint8_t>(value);
+    return Out;
+  }
+};
+
+static_assert((UINTPTR_MAX == 4294967295U) ||
+                  (UINTPTR_MAX == 18446744073709551615UL),
+              "We currently only support 32- or 64-bit platforms");
+
+// Map from sizes to structures offering static load, store and splat methods.
+// Note: On platforms lacking vector support, we use the ArrayType below and
+// decompose the operation in smaller pieces.
+using NativeTypeMap =
+    CTMap<CTPair<1, ScalarType<uint8_t>>,  //
+          CTPair<2, ScalarType<uint16_t>>, //
+          CTPair<4, ScalarType<uint32_t>>, //
+#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
+          CTPair<8, ScalarType<uint64_t>>, // Not available on 32bit
+#endif                                     //
+          CTPair<16, VectorType<16>>,      //
+          CTPair<32, VectorType<32>>,      //
+          CTPair<64, VectorType<64>>>;
+
+// Implements load, store and splat for sizes not natively supported by the
+// platform. SubType is either ScalarType or VectorType.
+template <typename SubType, size_t ArraySize> struct ArrayType {
+  using Type = cpp::array<typename SubType::Type, ArraySize>;
+  static constexpr size_t SizeOfElement = sizeof(typename SubType::Type);
+  static inline Type load(CPtr src) {
+    Type Value;
+    for (size_t I = 0; I < ArraySize; ++I)
+      Value[I] = SubType::load(src + (I * SizeOfElement));
+    return Value;
+  }
+  static inline void store(Ptr dst, Type Value) {
+    for (size_t I = 0; I < ArraySize; ++I)
+      SubType::store(dst + (I * SizeOfElement), Value[I]);
+  }
+  static inline Type splat(uint8_t value) {
+    Type Out;
+    for (size_t I = 0; I < ArraySize; ++I)
+      Out[I] = SubType::splat(value);
+    return Out;
+  }
+};
+
+// Checks whether we should use an ArrayType.
+template <size_t Size, size_t MaxSize> static constexpr bool useArrayType() {
+  return (Size > MaxSize) && ((Size % MaxSize) == 0) &&
+         !is_void_v<NativeTypeMap::find_type<MaxSize>>;
+}
+
+// Compute the type to handle an operation of Size bytes knowing that the
+// underlying platform only supports native types up to MaxSize bytes.
+template <size_t Size, size_t MaxSize>
+using getTypeFor = cpp::conditional_t<
+    useArrayType<Size, MaxSize>(),
+    ArrayType<NativeTypeMap::find_type<MaxSize>, Size / MaxSize>,
+    NativeTypeMap::find_type<Size>>;
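+
+// For example, on a platform whose widest native access is 8 bytes
+// (MaxSize == 8):
+//   getTypeFor<4, 8>  is ScalarType<uint32_t> (a native size), while
+//   getTypeFor<32, 8> is ArrayType<ScalarType<uint64_t>, 4>, i.e. the 32-byte
+//   block decomposes into four 8-byte operations.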
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcpy
+// When building with clang we can delegate to the builtin implementation.
+///////////////////////////////////////////////////////////////////////////////
+
+template <size_t Size> using Memcpy = builtin::Memcpy<Size>;
+
+///////////////////////////////////////////////////////////////////////////////
+// Memset
+// The MaxSize template argument gives the maximum size handled natively by the
+// platform. For instance on x86 with AVX support this would be 32. If a size
+// greater than MaxSize is requested we break the operation down in smaller
+// pieces of size MaxSize.
+///////////////////////////////////////////////////////////////////////////////
+template <size_t Size, size_t MaxSize> struct Memset {
+  static_assert(is_power2(MaxSize));
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, uint8_t value) {
+    if constexpr (Size == 3) {
+      Memset<1, MaxSize>::block(dst + 2, value);
+      Memset<2, MaxSize>::block(dst, value);
+    } else {
+      using T = getTypeFor<Size, MaxSize>;
+      if constexpr (is_void_v<T>) {
+        deferred_static_assert("Unimplemented Size");
+      } else {
+        T::store(dst, T::splat(value));
+      }
+    }
+  }
+
+  static inline void tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst + count - SIZE, value);
+  }
+
+  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst, value);
+    tail(dst, value, count);
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    static_assert(SIZE > 1);
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, value, count);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
+///////////////////////////////////////////////////////////////////////////////
+template <size_t Size> struct Bcmp {
+  static constexpr size_t SIZE = Size;
+  static constexpr size_t MaxSize = 8;
+
+  template <typename T> static inline uint32_t load_xor(CPtr p1, CPtr p2) {
+    return load<T>(p1) ^ load<T>(p2);
+  }
+
+  template <typename T>
+  static inline uint32_t load_not_equal(CPtr p1, CPtr p2) {
+    return load<T>(p1) != load<T>(p2);
+  }
+
+  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == 1) {
+      return load_xor<uint8_t>(p1, p2);
+    } else if constexpr (Size == 2) {
+      return load_xor<uint16_t>(p1, p2);
+    } else if constexpr (Size == 4) {
+      return load_xor<uint32_t>(p1, p2);
+    } else if constexpr (Size == 8) {
+      return load_not_equal<uint64_t>(p1, p2);
+    } else if constexpr (useArrayType<Size, MaxSize>()) {
+      for (size_t offset = 0; offset < Size; offset += MaxSize)
+        if (auto value = Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
+          return value;
+    } else {
+      deferred_static_assert("Unimplemented Size");
+    }
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - SIZE, p2 + count - SIZE);
+  }
+
+  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1, p2) | tail(p1, p2, count);
+  }
+
+  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    return tail(p1, p2, count);
+  }
+};
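+
+// Note: head_tail combines the two halves with `|` rather than `||` on
+// purpose: BcmpReturnType wraps an unsigned value, and the bitwise OR is
+// nonzero iff at least one of the two (possibly overlapping) blocks differs,
+// without introducing a branch.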
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcmp
+///////////////////////////////////////////////////////////////////////////////
+template <size_t Size> struct Memcmp {
+  static constexpr size_t SIZE = Size;
+  static constexpr size_t MaxSize = 8;
+
+  template <typename T> static inline T load_be(CPtr ptr) {
+    return Endian::to_big_endian(load<T>(ptr));
+  }
+
+  template <typename T>
+  static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) {
+    return load_be<T>(p1) - load_be<T>(p2);
+  }
+
+  template <typename T>
+  static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) {
+    const auto la = load_be<T>(p1);
+    const auto lb = load_be<T>(p2);
+    return la > lb ? 1 : la < lb ? -1 : 0;
+  }
+
+  static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == 1) {
+      return load_be_diff<uint8_t>(p1, p2);
+    } else if constexpr (Size == 2) {
+      return load_be_diff<uint16_t>(p1, p2);
+    } else if constexpr (Size == 4) {
+      return load_be_cmp<uint32_t>(p1, p2);
+    } else if constexpr (Size == 8) {
+      return load_be_cmp<uint64_t>(p1, p2);
+    } else if constexpr (useArrayType<Size, MaxSize>()) {
+      for (size_t offset = 0; offset < Size; offset += MaxSize)
+        if (Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
+          return Memcmp<MaxSize>::block(p1 + offset, p2 + offset);
+      return MemcmpReturnType::ZERO();
+    } else if constexpr (Size == 3) {
+      if (auto value = Memcmp<2>::block(p1, p2))
+        return value;
+      return Memcmp<1>::block(p1 + 2, p2 + 2);
+    } else {
+      deferred_static_assert("Unimplemented Size");
+    }
+  }
+
+  static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - SIZE, p2 + count - SIZE);
+  }
+
+  static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    if (auto value = block(p1, p2))
+      return value;
+    return tail(p1, p2, count);
+  }
+
+  static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    return tail(p1, p2, count);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Memmove
+///////////////////////////////////////////////////////////////////////////////
+
+template <size_t Size, size_t MaxSize> struct Memmove {
+  static_assert(is_power2(MaxSize));
+  using T = getTypeFor<Size, MaxSize>;
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, CPtr src) {
+    if constexpr (is_void_v<T>) {
+      deferred_static_assert("Unimplemented Size");
+    } else {
+      T::store(dst, T::load(src));
+    }
+  }
+
+  static inline void head_tail(Ptr dst, CPtr src, size_t count) {
+    const size_t offset = count - Size;
+    if constexpr (is_void_v<T>) {
+      deferred_static_assert("Unimplemented Size");
+    } else {
+      // The load and store operations can be performed in any order as long as
+      // they are not interleaved. More investigations are needed to determine
+      // the best order.
+      const auto head = T::load(src);
+      const auto tail = T::load(src + offset);
+      T::store(dst, head);
+      T::store(dst + offset, tail);
+    }
+  }
+
+  // Align forward suitable when dst < src. The alignment is performed with
+  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
+  //
+  // e.g. Moving two bytes forward, we make sure src is aligned.
+  // [  |       |       |       |      ]
+  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
+  // [____LLLLLLLL_____________________]
+  // [___________LLLLLLLA______________]
+  // [_SSSSSSSS________________________]
+  // [________SSSSSSSS_________________]
+  //
+  // e.g. Moving two bytes forward, we make sure dst is aligned.
+  // [  |       |       |       |      ]
+  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
+  // [____LLLLLLLL_____________________]
+  // [______LLLLLLLL___________________]
+  // [_SSSSSSSS________________________]
+  // [___SSSSSSSA______________________]
+  template <Arg AlignOn>
+  static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) {
+    Ptr prev_dst = dst;
+    CPtr prev_src = src;
+    size_t prev_count = count;
+    align_to_next_boundary<Size, AlignOn>(dst, src, count);
+    adjust(Size, dst, src, count);
+    head_tail(prev_dst, prev_src, prev_count - count);
+  }
+
+  // Align backward suitable when dst > src. The alignment is performed with
+  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
+  //
+  // e.g. Moving two bytes backward, we make sure src is aligned.
+  // [  |       |       |       |      ]
+  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
+  // [_________________ALLLLLLL_______]
+  // [___________________LLLLLLLL_____]
+  // [____________________SSSSSSSS_____]
+  // [______________________SSSSSSSS___]
+  //
+  // e.g. Moving two bytes backward, we make sure dst is aligned.
+  // [  |       |       |       |      ]
+  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
+  // [_______________LLLLLLLL_________]
+  // [___________________LLLLLLLL_____]
+  // [__________________ASSSSSSS_______]
+  // [______________________SSSSSSSS___]
+  template <Arg AlignOn>
+  static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) {
+    Ptr headtail_dst = dst + count;
+    CPtr headtail_src = src + count;
+    size_t headtail_size = 0;
+    align_to_next_boundary<Size, AlignOn>(headtail_dst, headtail_src,
+                                          headtail_size);
+    adjust(-2 * Size, headtail_dst, headtail_src, headtail_size);
+    head_tail(headtail_dst, headtail_src, headtail_size);
+    count -= headtail_size;
+  }
+
+  // Move forward suitable when dst < src. We load the tail bytes before
+  // handling the loop.
+  //
+  // e.g. Moving two bytes
+  // [   |       |       |       |       |]
+  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+  // [_________________________LLLLLLLL___]
+  // [___LLLLLLLL_________________________]
+  // [_SSSSSSSS___________________________]
+  // [___________LLLLLLLL_________________]
+  // [_________SSSSSSSS___________________]
+  // [___________________LLLLLLLL_________]
+  // [_________________SSSSSSSS___________]
+  // [_______________________SSSSSSSS_____]
+  static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    const size_t tail_offset = count - Size;
+    const auto tail_value = T::load(src + tail_offset);
+    size_t offset = 0;
+#pragma nounroll
+    do {
+      block(dst + offset, src + offset);
+      offset += Size;
+    } while (offset < count - Size);
+    T::store(dst + tail_offset, tail_value);
+  }
+
+  // Move backward suitable when dst > src. We load the head bytes before
+  // handling the loop.
+  //
+  // e.g. Moving two bytes
+  // [   |       |       |       |       |]
+  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+  // [___LLLLLLLL_________________________]
+  // [_________________________LLLLLLLL___]
+  // [___________________________SSSSSSSS_]
+  // [_________________LLLLLLLL___________]
+  // [___________________SSSSSSSS_________]
+  // [_________LLLLLLLL___________________]
+  // [___________SSSSSSSS_________________]
+  // [_____SSSSSSSS_______________________]
+  static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    const auto head_value = T::load(src);
+    ptrdiff_t offset = count - Size;
+#pragma nounroll
+    do {
+      block(dst + offset, src + offset);
+      offset -= Size;
+    } while (offset >= 0);
+    T::store(dst, head_value);
+  }
+};
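+
+// Usage sketch (hypothetical dispatch): a memmove implementation picks the
+// direction that never overwrites bytes before they are read, e.g.:
+//   if (dst < src)
+//     generic::Memmove<32, 16>::loop_and_tail_forward(dst, src, count);
+//   else
+//     generic::Memmove<32, 16>::loop_and_tail_backward(dst, src, count);
+// possibly after an align_forward / align_backward step to keep the loop
+// accesses aligned.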
+
+} // namespace __llvm_libc::generic
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
new file mode 100644
index 0000000..004e500
--- /dev/null
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -0,0 +1,221 @@
+//===-- x86 implementation of memory function building blocks -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides x86 specific building blocks to compose memory functions.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
+
+#include "src/__support/architectures.h"
+
+#if defined(LLVM_LIBC_ARCH_X86_64)
+
+#include "src/__support/common.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+
+#ifdef __SSE2__
+#include <immintrin.h>
+#else
+// Define fake functions to prevent the compiler from failing on undefined
+// functions in case SSE2 is not present.
+#define _mm512_cmpneq_epi8_mask(A, B) 0
+#define _mm_movemask_epi8(A) 0
+#define _mm256_movemask_epi8(A) 0
+#endif // __SSE2__
+
+namespace __llvm_libc::x86 {
+
+// A set of constants to check compile time features.
+static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
+static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
+static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
+static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
+static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcpy repmovsb implementation
+struct Memcpy {
+  static void repmovsb(char *dst, const char *src, size_t count) {
+    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
+
+// Base implementation for the Bcmp specializations.
+//  - BlockSize is either 16, 32 or 64 depending on the available compile time
+//    features, it is used to switch between "single native operation" or a
+//    "sequence of native operations".
+//  - BlockBcmp is the function that implements the bcmp logic.
+template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
+  static constexpr size_t SIZE = Size;
+  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == BlockSize) {
+      return BlockBcmp(p1, p2);
+    } else if constexpr (Size % BlockSize == 0) {
+      for (size_t offset = 0; offset < Size; offset += BlockSize)
+        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
+          return value;
+    } else {
+      deferred_static_assert("SIZE not implemented");
+    }
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - Size, p2 + count - Size);
+  }
+
+  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1, p2) | tail(p1, p2, count);
+  }
+
+  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += Size;
+    } while (offset < count - Size);
+    return tail(p1, p2, count);
+  }
+};
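+
+// For example (hypothetical instantiation): BcmpImpl<32, 16, sse2::bcmp16>
+// compares 32 bytes as two 16-byte bcmp16 blocks. Passing BlockBcmp as a
+// non-type template parameter lets the compiler inline each block call.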
+
+namespace sse2 {
+static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(16)));
+  // A mask indicating which bytes differ after loading 16 bytes from p1 and
+  // p2.
+  const int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2));
+  return static_cast<uint32_t>(mask);
+}
+template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
+} // namespace sse2
+
+namespace avx2 {
+static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(32)));
+  // A mask indicating which bytes differ after loading 32 bytes from p1 and
+  // p2.
+  const int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2));
+  // _mm256_movemask_epi8 returns an int but it is to be interpreted as a
+  // 32-bit mask.
+  return static_cast<uint32_t>(mask);
+}
+template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
+} // namespace avx2
+
+namespace avx512bw {
+static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(64)));
+  // A mask indicating which bytes differ after loading 64 bytes from p1 and
+  // p2.
+  const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
+  const bool mask_is_set = mask != 0;
+  return static_cast<uint32_t>(mask_is_set);
+}
+template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
+} // namespace avx512bw
+
+// Assuming that the mask is non zero, the index of the first mismatching byte
+// is the number of trailing zeros in the mask. Trailing zeros and not leading
+// zeros because the x86 architecture is little endian.
+static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
+                                                 uint64_t mask) {
+  const size_t diff_index = __builtin_ctzll(mask);
+  const int16_t ca = cpp::bit_cast<uint8_t>(p1[diff_index]);
+  const int16_t cb = cpp::bit_cast<uint8_t>(p2[diff_index]);
+  return ca - cb;
+}
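+
+// Example: if the buffers first differ at byte index 3, bit 3 is the lowest
+// set bit of `mask`, so __builtin_ctzll(mask) == 3 and the function returns
+// p1[3] - p2[3] computed on zero-extended bytes (a value in [-255, 255]).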
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcmp
+
+// Base implementation for the Memcmp specializations.
+//  - BlockSize is either 16, 32 or 64 depending on the available compile time
+//    features, it is used to switch between "single native operation" or a
+//    "sequence of native operations".
+//  - BlockMemcmp is the function that implements the memcmp logic.
+//  - BlockBcmp is the function that implements the bcmp logic.
+template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
+struct MemcmpImpl {
+  static constexpr size_t SIZE = Size;
+  static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == BlockSize) {
+      return BlockMemcmp(p1, p2);
+    } else if constexpr (Size % BlockSize == 0) {
+      for (size_t offset = 0; offset < Size; offset += BlockSize)
+        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
+          return BlockMemcmp(p1 + offset, p2 + offset);
+    } else {
+      deferred_static_assert("SIZE not implemented");
+    }
+    return MemcmpReturnType::ZERO();
+  }
+
+  static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - Size, p2 + count - Size);
+  }
+
+  static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    if (auto value = block(p1, p2))
+      return value;
+    return tail(p1, p2, count);
+  }
+
+  static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += Size;
+    } while (offset < count - Size);
+    return tail(p1, p2, count);
+  }
+};
+
+namespace sse2 {
+static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(16)));
+  // A mask indicating which bytes differ after loading 16 bytes from p1 and
+  // p2.
+  if (int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2)))
+    return char_diff_no_zero(p1, p2, mask);
+  return MemcmpReturnType::ZERO();
+}
+template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
+} // namespace sse2
+
+namespace avx2 {
+static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(32)));
+  // A mask indicating which bytes differ after loading 32 bytes from p1 and
+  // p2.
+  if (int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2)))
+    return char_diff_no_zero(p1, p2, mask);
+  return MemcmpReturnType::ZERO();
+}
+template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
+} // namespace avx2
+
+namespace avx512bw {
+static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(64)));
+  // A mask indicating which bytes differ after loading 64 bytes from p1 and
+  // p2.
+  if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)))
+    return char_diff_no_zero(p1, p2, mask);
+  return MemcmpReturnType::ZERO();
+}
+template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
+} // namespace avx512bw
+
+} // namespace __llvm_libc::x86
+
+#endif // LLVM_LIBC_ARCH_X86_64
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 60e5a19..9d1321f 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -106,6 +106,41 @@ static inline void memcpy_inline(void *__restrict dst,
 using Ptr = char *;        // Pointer to raw data.
 using CPtr = const char *; // Const pointer to raw data.
 
+// This type makes sure that we don't accidentally promote an integral type to
+// another one. It is only constructible from the exact T type.
+template <typename T> struct StrictIntegralType {
+  static_assert(cpp::is_integral_v<T>);
+
+  // Can only be constructed from a T.
+  template <typename U, cpp::enable_if_t<cpp::is_same_v<T, U>, bool> = 0>
+  StrictIntegralType(U value) : value(value) {}
+
+  // Allows using the type in an if statement.
+  explicit operator bool() const { return value; }
+
+  // If type is unsigned (bcmp) we allow bitwise OR operations.
+  StrictIntegralType operator|(const StrictIntegralType &Rhs) const {
+    static_assert(!cpp::is_signed_v<T>);
+    return value | Rhs.value;
+  }
+
+  // For interaction with the C API we allow explicit conversion back to the
+  // `int` type.
+  explicit operator int() const {
+    // bit_cast makes sure that T and int have the same size.
+    return cpp::bit_cast<int>(value);
+  }
+
+  // Helper to get the zero value.
+  static inline constexpr StrictIntegralType ZERO() { return {T(0)}; }
+
+private:
+  T value;
+};
+
+using MemcmpReturnType = StrictIntegralType<int32_t>;
+using BcmpReturnType = StrictIntegralType<uint32_t>;
+
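+// For example, a MemcmpReturnType can be constructed from the plain `int`
+// result of a byte difference, but not from a `size_t` or from a
+// BcmpReturnType: the constructor accepts exactly `T` and nothing else.
+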
 // Loads bytes from memory (possibly unaligned) and materializes them as
 // type.
 template <typename T> static inline T load(CPtr ptr) {
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index 9dc2a6d..75647f1 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -5,6 +5,7 @@ add_libc_unittest(
   SRCS
     elements_test.cpp
     memory_access_test.cpp
+    op_tests.cpp
     utils_test.cpp
   COMPILE_OPTIONS
     ${LIBC_COMPILE_OPTIONS_NATIVE}
diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp
new file mode 100644
index 0000000..cae2fc0
--- /dev/null
+++ b/libc/test/src/string/memory_utils/op_tests.cpp
@@ -0,0 +1,420 @@
+//===-- Unittests for op_ files -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/__support/CPP/span.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/utils.h"
+#include "utils/UnitTest/Test.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
+#define LLVM_LIBC_HAS_UINT64
+#endif
+
+namespace __llvm_libc {
+
+static char GetRandomChar() {
+  static constexpr const uint64_t a = 1103515245;
+  static constexpr const uint64_t c = 12345;
+  static constexpr const uint64_t m = 1ULL << 31;
+  static uint64_t seed = 123456789;
+  seed = (a * seed + c) % m;
+  return seed;
+}
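+
+// Note: GetRandomChar is a small linear congruential generator (the classic
+// ANSI C constants) with a fixed seed, so the "random" buffers are fully
+// deterministic and test failures reproduce exactly.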
+
+// Randomize the content of the buffer.
+static void Randomize(cpp::span<char> buffer) {
+  for (auto &current : buffer)
+    current = GetRandomChar();
+}
+
+// Copy one span to another.
+static void Copy(cpp::span<char> dst, const cpp::span<char> src) {
+  assert(dst.size() == src.size());
+  for (size_t i = 0; i < dst.size(); ++i)
+    dst[i] = src[i];
+}
+
+// Simple structure to allocate an aligned buffer of a particular size.
+// By allocating exactly the right size, we can leverage asan to detect whether
+// we perform out of bounds accesses.
+struct RawAlignedBuffer {
+  static constexpr size_t kAlign = 64;
+  RawAlignedBuffer(size_t size)
+      : ptr((char *)aligned_alloc(kAlign, size)), size(size) {
+    assert(ptr);
+    assert((uintptr_t)(ptr) % kAlign == 0);
+  }
+  ~RawAlignedBuffer() { free(ptr); }
+  cpp::span<char> span() { return cpp::span<char>(ptr, size); }
+
+private:
+  char *ptr = nullptr;
+  size_t size = 0;
+};
+
+// Allocates two RawAlignedBuffers and extracts two spans out of them, one
+// aligned and one misaligned. Tests are run on both spans.
+struct Buffers {
+  Buffers(size_t size)
+      : size(size), aligned_buffer(size), misaligned_buffer(size + 1) {}
+
+  // Returns two spans of 'size' bytes. The first is aligned on
+  // RawAlignedBuffer::kAlign and the second one is unaligned.
+  cpp::array<cpp::span<char>, 2> spans() {
+    return {aligned_buffer.span(), misaligned_buffer.span().subspan(1)};
+  }
+
+  size_t size;
+  RawAlignedBuffer aligned_buffer;
+  RawAlignedBuffer misaligned_buffer;
+};
+
+using MemcpyImplementations = testing::TypeList<
+#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
+    builtin::Memcpy<1>,  //
+    builtin::Memcpy<2>,  //
+    builtin::Memcpy<3>,  //
+    builtin::Memcpy<4>,  //
+    builtin::Memcpy<8>,  //
+    builtin::Memcpy<16>, //
+    builtin::Memcpy<32>, //
+    builtin::Memcpy<64>
+#endif // LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
+    >;
+
+template <auto FnImpl>
+bool CheckMemcpy(cpp::span<char> dst, cpp::span<char> src, size_t size) {
+  assert(dst.size() == src.size());
+  assert(dst.size() == size);
+  Randomize(dst);
+  FnImpl(dst.data(), src.data(), size);
+  for (size_t i = 0; i < size; ++i)
+    if (dst[i] != src[i])
+      return false;
+  return true;
+}
+
+template <typename T>
+static void MemcpyAdaptor(Ptr dst, CPtr src, size_t size) {
+  assert(size == T::SIZE);
+  return T::block(dst, src);
+}
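+
+// MemcpyAdaptor wraps T::block (which has no size parameter) behind the same
+// signature as head_tail and loop_and_tail, so CheckMemcpy can exercise all
+// three entry points through the same `auto FnImpl` template parameter.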
+
+TYPED_TEST(LlvmLibcOpTest, Memcpy, MemcpyImplementations) {
+  using Impl = ParamType;
+  constexpr size_t kSize = Impl::SIZE;
+  { // Test block operation
+    Buffers SrcBuffer(kSize);
+    Buffers DstBuffer(kSize);
+    for (auto src : SrcBuffer.spans()) {
+      Randomize(src);
+      for (auto dst : DstBuffer.spans()) {
+        ASSERT_TRUE(CheckMemcpy<MemcpyAdaptor<Impl>>(dst, src, kSize));
+      }
+    }
+  }
+  { // Test head tail operations from kSize to 2 * kSize.
+    RawAlignedBuffer SrcBuffer(2 * kSize);
+    RawAlignedBuffer DstBuffer(2 * kSize);
+    Randomize(SrcBuffer.span());
+    for (size_t size = kSize; size < 2 * kSize; ++size) {
+      auto src = SrcBuffer.span().subspan(0, size);
+      auto dst = DstBuffer.span().subspan(0, size);
+      ASSERT_TRUE(CheckMemcpy<Impl::head_tail>(dst, src, size));
+    }
+  }
+  { // Test loop operations from kSize to 3 * kSize.
+    if constexpr (kSize > 1) {
+      RawAlignedBuffer SrcBuffer(3 * kSize);
+      RawAlignedBuffer DstBuffer(3 * kSize);
+      Randomize(SrcBuffer.span());
+      for (size_t size = kSize; size < 3 * kSize; ++size) {
+        auto src = SrcBuffer.span().subspan(0, size);
+        auto dst = DstBuffer.span().subspan(0, size);
+        ASSERT_TRUE(CheckMemcpy<Impl::loop_and_tail>(dst, src, size));
+      }
+    }
+  }
+}
+
+using MemsetImplementations = testing::TypeList<
+#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
+    builtin::Memset<1>,  //
+    builtin::Memset<2>,  //
+    builtin::Memset<3>,  //
+    builtin::Memset<4>,  //
+    builtin::Memset<8>,  //
+    builtin::Memset<16>, //
+    builtin::Memset<32>, //
+    builtin::Memset<64>,
+#endif
+#ifdef LLVM_LIBC_HAS_UINT64
+    generic::Memset<8, 8>,   //
+    generic::Memset<16, 8>,  //
+    generic::Memset<32, 8>,  //
+    generic::Memset<64, 8>,  //
+#endif
+#ifdef __AVX512F__
+    generic::Memset<64, 64>, // prevents warning about avx512f
+#endif
+    generic::Memset<1, 1>,   //
+    generic::Memset<2, 1>,   //
+    generic::Memset<2, 2>,   //
+    generic::Memset<4, 2>,   //
+    generic::Memset<4, 4>,   //
+    generic::Memset<16, 16>, //
+    generic::Memset<32, 16>, //
+    generic::Memset<64, 16>, //
+    generic::Memset<32, 32>, //
+    generic::Memset<64, 32>  //
+    >;
+
+template <auto FnImpl>
+bool CheckMemset(cpp::span<char> dst, uint8_t value, size_t size) {
+  Randomize(dst);
+  FnImpl(dst.data(), value, size);
+  for (char c : dst)
+    if (c != (char)value)
+      return false;
+  return true;
+}
+
+template <typename T>
+static void MemsetAdaptor(Ptr dst, uint8_t value, size_t size) {
+  assert(size == T::SIZE);
+  return T::block(dst, value);
+}
+
+TYPED_TEST(LlvmLibcOpTest, Memset, MemsetImplementations) {
+  using Impl = ParamType;
+  constexpr size_t kSize = Impl::SIZE;
+  { // Test block operation
+    Buffers DstBuffer(kSize);
+    for (uint8_t value : cpp::array<uint8_t, 3>{0, 1, 255}) {
+      for (auto dst : DstBuffer.spans()) {
+        ASSERT_TRUE(CheckMemset<MemsetAdaptor<Impl>>(dst, value, kSize));
+      }
+    }
+  }
+  { // Test head tail operations from kSize to 2 * kSize.
+    RawAlignedBuffer DstBuffer(2 * kSize);
+    for (size_t size = kSize; size < 2 * kSize; ++size) {
+      const char value = size % 10;
+      auto dst = DstBuffer.span().subspan(0, size);
+      ASSERT_TRUE(CheckMemset<Impl::head_tail>(dst, value, size));
+    }
+  }
+  { // Test loop operations from kSize to 3 * kSize.
+    if constexpr (kSize > 1) {
+      RawAlignedBuffer DstBuffer(3 * kSize);
+      for (size_t size = kSize; size < 3 * kSize; ++size) {
+        const char value = size % 10;
+        auto dst = DstBuffer.span().subspan(0, size);
+        ASSERT_TRUE((CheckMemset<Impl::loop_and_tail>(dst, value, size)));
+      }
+    }
+  }
+}
+
+using BcmpImplementations = testing::TypeList<
+#ifdef __SSE2__
+    x86::sse2::Bcmp<16>,  //
+    x86::sse2::Bcmp<32>,  //
+    x86::sse2::Bcmp<64>,  //
+    x86::sse2::Bcmp<128>, //
+#endif
+#ifdef __AVX2__
+    x86::avx2::Bcmp<32>,  //
+    x86::avx2::Bcmp<64>,  //
+    x86::avx2::Bcmp<128>, //
+#endif
+#ifdef __AVX512BW__
+    x86::avx512bw::Bcmp<64>,  //
+    x86::avx512bw::Bcmp<128>, //
+#endif
+#ifdef __ARM_NEON
+    aarch64::neon::Bcmp<32>, //
+    aarch64::neon::Bcmp<64>, //
+#endif
+#ifdef LLVM_LIBC_HAS_UINT64
+    generic::Bcmp<8>, //
+#endif
+    generic::Bcmp<1>,  //
+    generic::Bcmp<2>,  //
+    generic::Bcmp<4>,  //
+    generic::Bcmp<16>, //
+    generic::Bcmp<32>, //
+    generic::Bcmp<64>  //
+    >;
+
+template <auto FnImpl>
+bool CheckBcmp(cpp::span<char> span1, cpp::span<char> span2, size_t size) {
+  assert(span1.size() == span2.size());
+  Copy(span2, span1);
+  // Compare equal
+  if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp != 0)
+    return false;
+  // Compare not equal if any byte differs
+  for (size_t i = 0; i < size; ++i) {
+    ++span2[i];
+    if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp == 0)
+      return false;
+    if (int cmp = (int)FnImpl(span2.data(), span1.data(), size); cmp == 0)
+      return false;
+    --span2[i];
+  }
+  return true;
+}
+
+template <typename T>
+static BcmpReturnType BcmpAdaptor(CPtr p1, CPtr p2, size_t size) {
+  assert(size == T::SIZE);
+  return T::block(p1, p2);
+}
+
+TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
+  using Impl = ParamType;
+  constexpr size_t kSize = Impl::SIZE;
+  { // Test block operation
+    Buffers Buffer1(kSize);
+    Buffers Buffer2(kSize);
+    for (auto span1 : Buffer1.spans()) {
+      Randomize(span1);
+      for (auto span2 : Buffer2.spans())
+        ASSERT_TRUE((CheckBcmp<BcmpAdaptor<Impl>>(span1, span2, kSize)));
+    }
+  }
+  { // Test head tail operations from kSize to 2 * kSize.
+    RawAlignedBuffer Buffer1(2 * kSize);
+    RawAlignedBuffer Buffer2(2 * kSize);
+    Randomize(Buffer1.span());
+    for (size_t size = kSize; size < 2 * kSize; ++size) {
+      auto span1 = Buffer1.span().subspan(0, size);
+      auto span2 = Buffer2.span().subspan(0, size);
+      ASSERT_TRUE((CheckBcmp<Impl::head_tail>(span1, span2, size)));
+    }
+  }
+  { // Test loop operations from kSize to 3 * kSize.
+    if constexpr (kSize > 1) {
+      RawAlignedBuffer Buffer1(3 * kSize);
+      RawAlignedBuffer Buffer2(3 * kSize);
+      Randomize(Buffer1.span());
+      for (size_t size = kSize; size < 3 * kSize; ++size) {
+        auto span1 = Buffer1.span().subspan(0, size);
+        auto span2 = Buffer2.span().subspan(0, size);
+        ASSERT_TRUE((CheckBcmp<Impl::loop_and_tail>(span1, span2, size)));
+      }
+    }
+  }
+}
+
+using MemcmpImplementations = testing::TypeList<
+#ifdef __SSE2__
+    x86::sse2::Memcmp<16>,  //
+    x86::sse2::Memcmp<32>,  //
+    x86::sse2::Memcmp<64>,  //
+    x86::sse2::Memcmp<128>, //
+#endif
+#ifdef __AVX2__
+    x86::avx2::Memcmp<32>,  //
+    x86::avx2::Memcmp<64>,  //
+    x86::avx2::Memcmp<128>, //
+#endif
+#ifdef __AVX512BW__
+    x86::avx512bw::Memcmp<64>,  //
+    x86::avx512bw::Memcmp<128>, //
+#endif
+#ifdef LLVM_LIBC_HAS_UINT64
+    generic::Memcmp<8>, //
+#endif
+    generic::Memcmp<1>,  //
+    generic::Memcmp<2>,  //
+    generic::Memcmp<3>,  //
+    generic::Memcmp<4>,  //
+    generic::Memcmp<16>, //
+    generic::Memcmp<32>, //
+    generic::Memcmp<64>  //
+    >;
+
+template <auto FnImpl>
+bool CheckMemcmp(cpp::span<char> span1, cpp::span<char> span2, size_t size) {
+  assert(span1.size() == span2.size());
+  Copy(span2, span1);
+  // Compare equal
+  if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp != 0)
+    return false;
+  // Compare not equal if any byte differs
+  for (size_t i = 0; i < size; ++i) {
+    ++span2[i];
+    int ground_truth = __builtin_memcmp(span1.data(), span2.data(), size);
+    if (ground_truth > 0) {
+      if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp <= 0)
+        return false;
+      if (int cmp = (int)FnImpl(span2.data(), span1.data(), size); cmp >= 0)
+        return false;
+    } else {
+      if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp >= 0)
+        return false;
+      if (int cmp = (int)FnImpl(span2.data(), span1.data(), size); cmp <= 0)
+        return false;
+    }
+    --span2[i];
+  }
+  return true;
+}
+
+template <typename T>
+static MemcmpReturnType MemcmpAdaptor(CPtr p1, CPtr p2, size_t size) {
+  assert(size == T::SIZE);
+  return T::block(p1, p2);
+}
+
+TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
+  using Impl = ParamType;
+  constexpr size_t kSize = Impl::SIZE;
+  { // Test block operation
+    Buffers Buffer1(kSize);
+    Buffers Buffer2(kSize);
+    for (auto span1 : Buffer1.spans()) {
+      Randomize(span1);
+      for (auto span2 : Buffer2.spans())
+        ASSERT_TRUE((CheckMemcmp<MemcmpAdaptor<Impl>>(span1, span2, kSize)));
+    }
+  }
+  { // Test head tail operations from kSize to 2 * kSize.
+    RawAlignedBuffer Buffer1(2 * kSize);
+    RawAlignedBuffer Buffer2(2 * kSize);
+    Randomize(Buffer1.span());
+    for (size_t size = kSize; size < 2 * kSize; ++size) {
+      auto span1 = Buffer1.span().subspan(0, size);
+      auto span2 = Buffer2.span().subspan(0, size);
+      ASSERT_TRUE((CheckMemcmp<Impl::head_tail>(span1, span2, size)));
+    }
+  }
+  { // Test loop operations from kSize to 3 * kSize.
+    if constexpr (kSize > 1) {
+      RawAlignedBuffer Buffer1(3 * kSize);
+      RawAlignedBuffer Buffer2(3 * kSize);
+      Randomize(Buffer1.span());
+      for (size_t size = kSize; size < 3 * kSize; ++size) {
+        auto span1 = Buffer1.span().subspan(0, size);
+        auto span2 = Buffer2.span().subspan(0, size);
+        ASSERT_TRUE((CheckMemcmp<Impl::loop_and_tail>(span1, span2, size)));
+      }
+    }
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3c94295..cc23175 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -973,9 +973,13 @@ no_sanitize_features = [
 cc_library(
     name = "string_memory_utils",
     hdrs = [
-        "src/string/memory_utils/elements.h",
         "src/string/memory_utils/elements_aarch64.h",
         "src/string/memory_utils/elements_x86.h",
+        "src/string/memory_utils/elements.h",
+        "src/string/memory_utils/op_aarch64.h",
+        "src/string/memory_utils/op_builtin.h",
+        "src/string/memory_utils/op_generic.h",
+        "src/string/memory_utils/op_x86.h",
         "src/string/memory_utils/utils.h",
     ],
     textual_hdrs = [
@@ -989,6 +993,7 @@ cc_library(
         ":__support_common",
         ":__support_cpp_bit",
        ":__support_cpp_type_traits",
+        ":__support_cpp_array",
         ":libc_root",
     ],
 )
-- 
2.7.4