From 7f5d7bc827a5fda1c803342af536f81986e2635a Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet
Date: Wed, 22 Jun 2022 11:34:51 +0000
Subject: [PATCH] [libc][mem*] Introduce Algorithms for new mem framework

This patch is a subpart of D125768 intended to make the review easier.
This patch introduces the same algorithms as in
`libc/src/string/memory_utils/elements.h` but using the new API.

Differential Revision: https://reviews.llvm.org/D128335
---
 libc/src/string/memory_utils/algorithm.h           | 463 ++++++++++++++++
 libc/src/string/memory_utils/sized_op.h            |   3 +
 libc/test/src/string/memory_utils/CMakeLists.txt   |   1 +
 .../src/string/memory_utils/algorithm_test.cpp     | 565 +++++++++++++++++++
 4 files changed, 1032 insertions(+)
 create mode 100644 libc/src/string/memory_utils/algorithm.h
 create mode 100644 libc/test/src/string/memory_utils/algorithm_test.cpp

diff --git a/libc/src/string/memory_utils/algorithm.h b/libc/src/string/memory_utils/algorithm.h
new file mode 100644
index 0000000..82c09fc7
--- /dev/null
+++ b/libc/src/string/memory_utils/algorithm.h
@@ -0,0 +1,463 @@
+//===-- Algorithms to compose sized memory operations ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Higher order primitives that build upon the SizedOpT facility.
+// They constitute the basic blocks for composing memory functions.
+// This file defines the following operations:
+// - Skip
+// - Tail
+// - HeadTail
+// - Loop
+// - Align
+//
+// See each class for documentation.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H
+
+#include "src/string/memory_utils/address.h" // Address
+#include "src/string/memory_utils/utils.h"   // offset_to_next_aligned
+
+#include <stddef.h> // ptrdiff_t
+
+namespace __llvm_libc {
+
+// We are not yet allowed to use asserts in low level memory operations as
+// assert itself could depend on them.
+// We define this empty macro so we can enable asserts as soon as possible and
+// keep track of invariants.
+#define LIBC_ASSERT(COND)
+
+// An operation that skips the specified number of bytes before invoking the
+// next operation.
+template <ptrdiff_t Bytes> struct Skip {
+  template <typename NextT> struct Then {
+    template <typename DstAddrT>
+    static inline void set(DstAddrT dst, ubyte value) {
+      static_assert(NextT::IS_FIXED_SIZE);
+      NextT::set(offsetAddr<Bytes>(dst), value);
+    }
+
+    template <typename SrcAddrT1, typename SrcAddrT2>
+    static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2) {
+      static_assert(NextT::IS_FIXED_SIZE);
+      return NextT::isDifferent(offsetAddr<Bytes>(src1),
+                                offsetAddr<Bytes>(src2));
+    }
+
+    template <typename SrcAddrT1, typename SrcAddrT2>
+    static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2) {
+      static_assert(NextT::IS_FIXED_SIZE);
+      return NextT::threeWayCmp(offsetAddr<Bytes>(src1),
+                                offsetAddr<Bytes>(src2));
+    }
+
+    template <typename SrcAddrT1, typename SrcAddrT2>
+    static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
+                                      size_t runtime_size) {
+      static_assert(NextT::IS_RUNTIME_SIZE);
+      return NextT::threeWayCmp(offsetAddr<Bytes>(src1),
+                                offsetAddr<Bytes>(src2), runtime_size - Bytes);
+    }
+  };
+};
+
+// Compute the address of a tail operation.
+// Because of the runtime size, we lose the alignment information.
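// For illustration: with Size = 8 and runtime_size = 20, tailAddr<8>(addr, 20)
// returns addr + 12, so a following 8-byte operation covers bytes [12, 20),
// i.e. the last 8 bytes of the buffer, regardless of how the head was handled.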
+template +static auto tailAddr(AddrT addr, size_t runtime_size) { + static_assert(IsAddressType::Value); + return offsetAddrAssumeAligned<1>(addr, runtime_size - Size); +} + +// Perform the operation on the last 'Size' bytes of the buffer. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [________XXXXXXXX___] +// +// Precondition: `runtime_size >= Size`. +template struct Tail { + static_assert(SizedOpT::IS_FIXED_SIZE); + static constexpr bool IS_RUNTIME_SIZE = true; + static constexpr size_t SIZE = SizedOpT::SIZE; + + template + static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) { + SizedOpT::copy(tailAddr(dst, runtime_size), + tailAddr(src, runtime_size)); + } + + template + static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) { + SizedOpT::move(tailAddr(dst, runtime_size), + tailAddr(src, runtime_size)); + } + + template + static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) { + SizedOpT::set(tailAddr(dst, runtime_size), value); + } + + template + static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2, + size_t runtime_size) { + return SizedOpT::isDifferent(tailAddr(src1, runtime_size), + tailAddr(src2, runtime_size)); + } + + template + static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2, + size_t runtime_size) { + return SizedOpT::threeWayCmp(tailAddr(src1, runtime_size), + tailAddr(src2, runtime_size)); + } +}; + +// Perform the operation on the first and the last `SizedOpT::Size` bytes of the +// buffer. This is useful for overlapping operations. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [__XXXXXXXX_________] +// [________XXXXXXXX___] +// +// Precondition: `runtime_size >= Size && runtime_size <= 2 x Size`. +template struct HeadTail { + static_assert(SizedOpT::IS_FIXED_SIZE); + static constexpr bool IS_RUNTIME_SIZE = true; + + template + static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) { + LIBC_ASSERT(runtime_size >= SizedOpT::SIZE); + SizedOpT::copy(dst, src); + Tail::copy(dst, src, runtime_size); + } + + template + static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) { + LIBC_ASSERT(runtime_size >= SizedOpT::SIZE); + static constexpr size_t BLOCK_SIZE = SizedOpT::SIZE; + // The load and store operations can be performed in any order as long as + // they are not interleaved. More investigations are needed to determine the + // best order. + auto head = SizedOpT::load(src); + auto tail = SizedOpT::load(tailAddr(src, runtime_size)); + SizedOpT::store(tailAddr(dst, runtime_size), tail); + SizedOpT::store(dst, head); + } + + template + static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) { + LIBC_ASSERT(runtime_size >= SizedOpT::SIZE); + SizedOpT::set(dst, value); + Tail::set(dst, value, runtime_size); + } + + template + static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2, + size_t runtime_size) { + LIBC_ASSERT(runtime_size >= SizedOpT::SIZE); + // Two strategies can be applied here: + // 1. Compute head and tail and compose them with a bitwise or operation. + // 2. Stop early if head is different. + // We chose the later because HeadTail operations are typically performed + // with sizes ranging from 4 to 256 bytes. The cost of the loads is then + // significantly larger than the cost of the branch. 
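    // As a sketch, strategy 1 would compute both halves unconditionally:
    //   return SizedOpT::isDifferent(src1, src2) |
    //          Tail<SizedOpT>::isDifferent(src1, src2, runtime_size);
    // whereas the early return below implements strategy 2.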
+ if (const uint64_t res = SizedOpT::isDifferent(src1, src2)) + return res; + return Tail::isDifferent(src1, src2, runtime_size); + } + + template + static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2, + size_t runtime_size) { + LIBC_ASSERT(runtime_size >= SizedOpT::SIZE && + runtime_size <= 2 * SizedOpT::SIZE); + if (const int32_t res = SizedOpT::threeWayCmp(src1, src2)) + return res; + return Tail::threeWayCmp(src1, src2, runtime_size); + } +}; + +// Simple loop ending with a Tail operation. +// +// e.g. with +// [12345678123456781234567812345678] +// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___] +// [__XXXXXXXX_______________________] +// [__________XXXXXXXX_______________] +// [__________________XXXXXXXX_______] +// [______________________XXXXXXXX___] +// +// Precondition: +// - runtime_size >= Size +template struct Loop { + static_assert(SizedOpT::IS_FIXED_SIZE); + static constexpr bool IS_RUNTIME_SIZE = true; + static constexpr size_t BLOCK_SIZE = SizedOpT::SIZE; + + template + static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) { + size_t offset = 0; + do { + SizedOpT::copy(offsetAddrMultiplesOf(dst, offset), + offsetAddrMultiplesOf(src, offset)); + offset += BLOCK_SIZE; + } while (offset < runtime_size - BLOCK_SIZE); + Tail::copy(dst, src, runtime_size); + } + + // Move forward suitable when dst < src. We load the tail bytes before + // handling the loop. + // + // e.g. Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [_________________________LLLLLLLL___] + // [___LLLLLLLL_________________________] + // [_SSSSSSSS___________________________] + // [___________LLLLLLLL_________________] + // [_________SSSSSSSS___________________] + // [___________________LLLLLLLL_________] + // [_________________SSSSSSSS___________] + // [_______________________SSSSSSSS_____] + template + static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) { + const auto tail_value = + SizedOpT::load(tailAddr(src, runtime_size)); + size_t offset = 0; + do { + SizedOpT::move(offsetAddrMultiplesOf(dst, offset), + offsetAddrMultiplesOf(src, offset)); + offset += BLOCK_SIZE; + } while (offset < runtime_size - BLOCK_SIZE); + SizedOpT::store(tailAddr(dst, runtime_size), tail_value); + } + + // Move backward suitable when dst > src. We load the head bytes before + // handling the loop. + // + // e.g. 
Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [___LLLLLLLL_________________________] + // [_________________________LLLLLLLL___] + // [___________________________SSSSSSSS_] + // [_________________LLLLLLLL___________] + // [___________________SSSSSSSS_________] + // [_________LLLLLLLL___________________] + // [___________SSSSSSSS_________________] + // [_____SSSSSSSS_______________________] + template + static inline void move_backward(DstAddrT dst, SrcAddrT src, + size_t runtime_size) { + const auto head_value = SizedOpT::load(src); + ptrdiff_t offset = runtime_size - BLOCK_SIZE; + do { + SizedOpT::move(offsetAddrMultiplesOf(dst, offset), + offsetAddrMultiplesOf(src, offset)); + offset -= BLOCK_SIZE; + } while (offset >= 0); + SizedOpT::store(dst, head_value); + } + + template + static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) { + size_t offset = 0; + do { + SizedOpT::set(offsetAddrMultiplesOf(dst, offset), value); + offset += BLOCK_SIZE; + } while (offset < runtime_size - BLOCK_SIZE); + Tail::set(dst, value, runtime_size); + } + + template + static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2, + size_t runtime_size) { + size_t offset = 0; + do { + if (uint64_t res = SizedOpT::isDifferent( + offsetAddrMultiplesOf(src1, offset), + offsetAddrMultiplesOf(src2, offset))) + return res; + offset += BLOCK_SIZE; + } while (offset < runtime_size - BLOCK_SIZE); + return Tail::isDifferent(src1, src2, runtime_size); + } + + template + static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2, + size_t runtime_size) { + size_t offset = 0; + do { + if (int32_t res = SizedOpT::threeWayCmp( + offsetAddrMultiplesOf(src1, offset), + offsetAddrMultiplesOf(src2, offset))) + return res; + offset += BLOCK_SIZE; + } while (offset < runtime_size - BLOCK_SIZE); + return Tail::threeWayCmp(src1, src2, runtime_size); + } +}; + +// Aligns using a statically-sized operation, then calls the subsequent NextT +// operation. +// +// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as: +// Align<_16, Arg::Dst>::Then>::copy(dst, src, runtime_size); +enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 }; +template struct Align { + static_assert(SizedOpT::IS_FIXED_SIZE); + + template struct Then { + static_assert(NextT::IS_RUNTIME_SIZE); + + template + static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) { + SizedOpT::copy(dst, src); + auto aligned = align(dst, src, runtime_size); + NextT::copy(aligned.arg1, aligned.arg2, aligned.size); + } + + // Move forward suitable when dst < src. The alignment is performed with + // an HeadTail operation of size ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes and making sure src is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [___________LLLLLLLL______________] + // [_SSSSSSSS________________________] + // [________SSSSSSSS_________________] + // + // e.g. Moving two bytes and making sure dst is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [______LLLLLLLL___________________] + // [_SSSSSSSS________________________] + // [___SSSSSSSS______________________] + template + static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) { + auto aligned_after_begin = align(dst, src, runtime_size); + // We move pointers forward by Size so we can perform HeadTail. 
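    // Worked example (assuming AlignOn == Arg::Dst and ALIGN_OP_SIZE == 16):
    // with runtime_size == 100 and dst % 16 == 10, align() skips 6 bytes and
    // yields size 94; stepForward() drops another 16, leaving 78. The HeadTail
    // below therefore moves the first 100 - 78 = 22 bytes (always within
    // [ALIGN_OP_SIZE, 2 * ALIGN_OP_SIZE]), and NextT::move then handles the
    // remaining 78 bytes starting from a 16-byte aligned destination.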
+ auto aligned = aligned_after_begin.stepForward(); + HeadTail::move(dst, src, runtime_size - aligned.size); + NextT::move(aligned.arg1, aligned.arg2, aligned.size); + } + + // Move backward suitable when dst > src. The alignment is performed with + // an HeadTail operation of size ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes backward and making sure src is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _________________LLLLLLLL_______] + // [ ___________________LLLLLLLL_____] + // [____________________SSSSSSSS_____] + // [______________________SSSSSSSS___] + // + // e.g. Moving two bytes and making sure dst is then aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _______________LLLLLLLL_________] + // [ ___________________LLLLLLLL_____] + // [__________________SSSSSSSS_______] + // [______________________SSSSSSSS___] + template + static inline void move_backward(DstAddrT dst, SrcAddrT src, + size_t runtime_size) { + const auto dst_end = offsetAddrAssumeAligned<1>(dst, runtime_size); + const auto src_end = offsetAddrAssumeAligned<1>(src, runtime_size); + auto aligned_after_end = align(dst_end, src_end, 0); + // We move pointers back by 2 x Size so we can perform HeadTail. + auto aligned = aligned_after_end.stepBack().stepBack(); + HeadTail::move(aligned.arg1, aligned.arg2, aligned.size); + NextT::move_backward(dst, src, runtime_size - aligned.size); + } + + template + static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) { + SizedOpT::set(dst, value); + DstAddrT _(nullptr); + auto aligned = align(dst, _, runtime_size); + NextT::set(aligned.arg1, value, aligned.size); + } + + template + static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2, + size_t runtime_size) { + if (const uint64_t res = SizedOpT::isDifferent(src1, src2)) + return res; + auto aligned = align(src1, src2, runtime_size); + return NextT::isDifferent(aligned.arg1, aligned.arg2, aligned.size); + } + + template + static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2, + size_t runtime_size) { + if (const int32_t res = SizedOpT::threeWayCmp(src1, src2)) + return res; + auto aligned = align(src1, src2, runtime_size); + return NextT::threeWayCmp(aligned.arg1, aligned.arg2, aligned.size); + } + }; + +private: + static constexpr size_t ALIGN_OP_SIZE = SizedOpT::SIZE; + static_assert(ALIGN_OP_SIZE > 1); + + template struct Aligned { + Arg1AddrT arg1; + Arg2AddrT arg2; + size_t size; + + Aligned stepForward() const { + return Aligned{offsetAddrMultiplesOf(arg1, ALIGN_OP_SIZE), + offsetAddrMultiplesOf(arg2, ALIGN_OP_SIZE), + size - ALIGN_OP_SIZE}; + } + + Aligned stepBack() const { + return Aligned{offsetAddrMultiplesOf(arg1, -ALIGN_OP_SIZE), + offsetAddrMultiplesOf(arg2, -ALIGN_OP_SIZE), + size + ALIGN_OP_SIZE}; + } + }; + + template + static auto makeAligned(Arg1AddrT arg1, Arg2AddrT arg2, size_t size) { + return Aligned{arg1, arg2, size}; + } + + template + static auto align(Arg1AddrT arg1, Arg2AddrT arg2, size_t runtime_size) { + static_assert(IsAddressType::Value); + static_assert(IsAddressType::Value); + if constexpr (AlignOn == Arg::_1) { + auto offset = offset_to_next_aligned(arg1.ptr_); + return makeAligned(offsetAddrAssumeAligned(arg1, offset), + offsetAddrAssumeAligned<1>(arg2, offset), + runtime_size - offset); + } else if constexpr (AlignOn == Arg::_2) { + auto offset = offset_to_next_aligned(arg2.ptr_); + return makeAligned(offsetAddrAssumeAligned<1>(arg1, offset), + offsetAddrAssumeAligned(arg2, offset), + 
runtime_size - offset); + } else { + DeferredStaticAssert("AlignOn must be either Arg::_1 or Arg::_2"); + } + } +}; + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H diff --git a/libc/src/string/memory_utils/sized_op.h b/libc/src/string/memory_utils/sized_op.h index 12ace7c..d58d393 100644 --- a/libc/src/string/memory_utils/sized_op.h +++ b/libc/src/string/memory_utils/sized_op.h @@ -33,6 +33,9 @@ namespace __llvm_libc { template struct SizedOp { static constexpr size_t SIZE = Size; + // Define instantiations of SizedOp as a fixed size operation. + // i.e. an operation that is composable by types in algorithm.h + static constexpr bool IS_FIXED_SIZE = true; private: static_assert(Backend::IS_BACKEND_TYPE); diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt index 83926e4..9b777a7 100644 --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -4,6 +4,7 @@ add_libc_unittest( libc_string_unittests SRCS address_test.cpp + algorithm_test.cpp backend_test.cpp elements_test.cpp memory_access_test.cpp diff --git a/libc/test/src/string/memory_utils/algorithm_test.cpp b/libc/test/src/string/memory_utils/algorithm_test.cpp new file mode 100644 index 0000000..e4fd3e3 --- /dev/null +++ b/libc/test/src/string/memory_utils/algorithm_test.cpp @@ -0,0 +1,565 @@ + +#define LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE 0 + +#include "utils/UnitTest/Test.h" +#include +#include +#include + +#include + +namespace __llvm_libc { + +struct alignas(64) Buffer : cpp::Array { + bool contains(const char *ptr) const { + return ptr >= data() && ptr < (data() + size()); + } + size_t getOffset(const char *ptr) const { return ptr - data(); } + void fill(char c) { + for (auto itr = begin(); itr != end(); ++itr) + *itr = c; + } +}; + +static Buffer buffer1; +static Buffer buffer2; +static std::ostringstream LOG; + +struct TestBackend { + static constexpr bool IS_BACKEND_TYPE = true; + + template static void log(const char *Action, const char *ptr) { + LOG << Action << "<" << sizeof(T) << "> "; + if (buffer1.contains(ptr)) + LOG << "a[" << buffer1.getOffset(ptr) << "]"; + else if (buffer2.contains(ptr)) + LOG << "b[" << buffer2.getOffset(ptr) << "]"; + LOG << "\n"; + } + + template + static T load(const T *src) { + log((AS == Aligned::YES ? "LdA" : "LdU"), + reinterpret_cast(src)); + return Scalar64BitBackend::load(src); + } + + template + static void store(T *dst, T value) { + log((AS == Aligned::YES ? 
"StA" : "StU"), + reinterpret_cast(dst)); + Scalar64BitBackend::store(dst, value); + } + + template static inline T splat(ubyte value) { + LOG << "Splat<" << sizeof(T) << "> " << (unsigned)value << '\n'; + return Scalar64BitBackend::splat(value); + } + + template static inline uint64_t notEquals(T v1, T v2) { + LOG << "Neq<" << sizeof(T) << ">\n"; + return Scalar64BitBackend::notEquals(v1, v2); + } + + template static inline int32_t threeWayCmp(T v1, T v2) { + LOG << "Diff<" << sizeof(T) << ">\n"; + return Scalar64BitBackend::threeWayCmp(v1, v2); + } + + template + using getNextType = Scalar64BitBackend::getNextType; +}; + +struct LlvmLibcAlgorithm : public testing::Test { + void SetUp() override { + LOG = std::ostringstream(); + LOG << '\n'; + } + + void fillEqual() { + buffer1.fill('a'); + buffer2.fill('a'); + } + + void fillDifferent() { + buffer1.fill('a'); + buffer2.fill('b'); + } + + const char *getTrace() { + trace_ = LOG.str(); + return trace_.c_str(); + } + + const char *stripComments(const char *expected) { + expected_.clear(); + std::stringstream ss(expected); + std::string line; + while (std::getline(ss, line, '\n')) { + const auto pos = line.find('#'); + if (pos == std::string::npos) { + expected_ += line; + } else { + auto log = line.substr(0, pos); + while (!log.empty() && std::isspace(log.back())) + log.pop_back(); + expected_ += log; + } + expected_ += '\n'; + } + return expected_.c_str(); + } + + template SrcAddr buf1(size_t offset = 0) const { + return buffer1.data() + offset; + } + template SrcAddr buf2(size_t offset = 0) const { + return buffer2.data() + offset; + } + template DstAddr dst(size_t offset = 0) const { + return buffer1.data() + offset; + } + template SrcAddr src(size_t offset = 0) const { + return buffer2.data() + offset; + } + +private: + std::string trace_; + std::string expected_; +}; + +using _8 = SizedOp; + +/////////////////////////////////////////////////////////////////////////////// +//// Testing fixed fized forward operations +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Copy + +TEST_F(LlvmLibcAlgorithm, copy_1) { + SizedOp::copy(dst(), src()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<1> b[0] +StU<1> a[0] +)")); +} + +TEST_F(LlvmLibcAlgorithm, copy_15) { + SizedOp::copy(dst(), src()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[0] +StU<8> a[0] +LdU<4> b[8] +StU<4> a[8] +LdU<2> b[12] +StU<2> a[12] +LdU<1> b[14] +StU<1> a[14] +)")); +} + +TEST_F(LlvmLibcAlgorithm, copy_16) { + SizedOp::copy(dst(), src()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[0] +StU<8> a[0] +LdU<8> b[8] +StU<8> a[8] +)")); +} +/////////////////////////////////////////////////////////////////////////////// +// Move + +TEST_F(LlvmLibcAlgorithm, move_1) { + SizedOp::move(dst(), src()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<1> b[0] +StU<1> a[0] +)")); +} + +TEST_F(LlvmLibcAlgorithm, move_15) { + SizedOp::move(dst(), src()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[0] +LdU<4> b[8] +LdU<2> b[12] +LdU<1> b[14] +StU<1> a[14] +StU<2> a[12] +StU<4> a[8] +StU<8> a[0] +)")); +} + +TEST_F(LlvmLibcAlgorithm, move_16) { + SizedOp::move(dst(), src()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[0] +LdU<8> b[8] +StU<8> a[8] +StU<8> a[0] +)")); +} + +/////////////////////////////////////////////////////////////////////////////// +// set + +TEST_F(LlvmLibcAlgorithm, set_1) { + SizedOp::set(dst(), 
ubyte{42}); + EXPECT_STREQ(getTrace(), stripComments(R"( +Splat<1> 42 +StU<1> a[0] +)")); +} + +TEST_F(LlvmLibcAlgorithm, set_15) { + SizedOp::set(dst(), ubyte{42}); + EXPECT_STREQ(getTrace(), stripComments(R"( +Splat<8> 42 +StU<8> a[0] +Splat<4> 42 +StU<4> a[8] +Splat<2> 42 +StU<2> a[12] +Splat<1> 42 +StU<1> a[14] +)")); +} + +TEST_F(LlvmLibcAlgorithm, set_16) { + SizedOp::set(dst(), ubyte{42}); + EXPECT_STREQ(getTrace(), stripComments(R"( +Splat<8> 42 +StU<8> a[0] +Splat<8> 42 +StU<8> a[8] +)")); +} + +/////////////////////////////////////////////////////////////////////////////// +// different + +TEST_F(LlvmLibcAlgorithm, different_1) { + fillEqual(); + SizedOp::isDifferent(buf1(), buf2()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<1> a[0] +LdU<1> b[0] +Neq<1> +)")); +} + +TEST_F(LlvmLibcAlgorithm, different_15) { + fillEqual(); + SizedOp::isDifferent(buf1(), buf2()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> a[0] +LdU<8> b[0] +Neq<8> +LdU<4> a[8] +LdU<4> b[8] +Neq<4> +LdU<2> a[12] +LdU<2> b[12] +Neq<2> +LdU<1> a[14] +LdU<1> b[14] +Neq<1> +)")); +} + +TEST_F(LlvmLibcAlgorithm, different_15_no_shortcircuit) { + fillDifferent(); + SizedOp::isDifferent(buf1(), buf2()); + // If buffer compare isDifferent we continue to aggregate. + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> a[0] +LdU<8> b[0] +Neq<8> +LdU<4> a[8] +LdU<4> b[8] +Neq<4> +LdU<2> a[12] +LdU<2> b[12] +Neq<2> +LdU<1> a[14] +LdU<1> b[14] +Neq<1> +)")); +} + +TEST_F(LlvmLibcAlgorithm, different_16) { + fillEqual(); + SizedOp::isDifferent(buf1(), buf2()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> a[0] +LdU<8> b[0] +Neq<8> +LdU<8> a[8] +LdU<8> b[8] +Neq<8> +)")); +} + +/////////////////////////////////////////////////////////////////////////////// +// three_way_cmp + +TEST_F(LlvmLibcAlgorithm, three_way_cmp_eq_1) { + fillEqual(); + SizedOp::threeWayCmp(buf1(), buf2()); + // Buffer compare equal, returning 0 and no call to Diff. + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<1> a[0] +LdU<1> b[0] +Diff<1> +)")); +} + +TEST_F(LlvmLibcAlgorithm, three_way_cmp_eq_15) { + fillEqual(); + SizedOp::threeWayCmp(buf1(), buf2()); + // Buffer compare equal, returning 0 and no call to Diff. + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> a[0] +LdU<8> b[0] +Diff<8> +LdU<4> a[8] +LdU<4> b[8] +Diff<4> +LdU<2> a[12] +LdU<2> b[12] +Diff<2> +LdU<1> a[14] +LdU<1> b[14] +Diff<1> +)")); +} + +TEST_F(LlvmLibcAlgorithm, three_way_cmp_neq_15_shortcircuit) { + fillDifferent(); + SizedOp::threeWayCmp(buf1(), buf2()); + // If buffer compare isDifferent we stop early. + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> a[0] +LdU<8> b[0] +Diff<8> +)")); +} + +TEST_F(LlvmLibcAlgorithm, three_way_cmp_eq_16) { + fillEqual(); + SizedOp::threeWayCmp(buf1(), buf2()); + // Buffer compare equal, returning 0 and no call to Diff. 
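  // How to read these traces (illustrative note): LdU<N>/LdA<N> are
  // unaligned/aligned N-byte loads and StU<N>/StA<N> the matching stores,
  // Splat<N> is a set() value broadcast, Neq<N> an isDifferent() comparison,
  // Diff<N> a threeWayCmp() comparison, and a[i]/b[i] are byte offsets into
  // buffer1/buffer2.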
+ EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> a[0] +LdU<8> b[0] +Diff<8> +LdU<8> a[8] +LdU<8> b[8] +Diff<8> +)")); +} + +/////////////////////////////////////////////////////////////////////////////// +//// Testing skip operations +/////////////////////////////////////////////////////////////////////////////// + +TEST_F(LlvmLibcAlgorithm, skip_and_set) { + Skip<11>::Then>::set(dst(), ubyte{42}); + EXPECT_STREQ(getTrace(), stripComments(R"( +Splat<1> 42 +StU<1> a[11] +)")); +} + +TEST_F(LlvmLibcAlgorithm, skip_and_different_1) { + Skip<11>::Then>::isDifferent(buf1(), buf2()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<1> a[11] +LdU<1> b[11] +Neq<1> +)")); +} + +TEST_F(LlvmLibcAlgorithm, skip_and_three_way_cmp_8) { + Skip<11>::Then>::threeWayCmp(buf1(), buf2()); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<1> a[11] +LdU<1> b[11] +Diff<1> +)")); +} + +/////////////////////////////////////////////////////////////////////////////// +//// Testing tail operations +/////////////////////////////////////////////////////////////////////////////// + +TEST_F(LlvmLibcAlgorithm, tail_copy_8) { + Tail<_8>::copy(dst(), src(), 16); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[8] +StU<8> a[8] +)")); +} + +TEST_F(LlvmLibcAlgorithm, tail_move_8) { + Tail<_8>::move(dst(), src(), 16); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[8] +StU<8> a[8] +)")); +} + +TEST_F(LlvmLibcAlgorithm, tail_set_8) { + Tail<_8>::set(dst(), ubyte{42}, 16); + EXPECT_STREQ(getTrace(), stripComments(R"( +Splat<8> 42 +StU<8> a[8] +)")); +} + +TEST_F(LlvmLibcAlgorithm, tail_different_8) { + fillEqual(); + Tail<_8>::isDifferent(buf1(), buf2(), 16); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> a[8] +LdU<8> b[8] +Neq<8> +)")); +} + +TEST_F(LlvmLibcAlgorithm, tail_three_way_cmp_8) { + fillEqual(); + Tail<_8>::threeWayCmp(buf1(), buf2(), 16); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> a[8] +LdU<8> b[8] +Diff<8> +)")); +} + +/////////////////////////////////////////////////////////////////////////////// +//// Testing HeadTail operations +/////////////////////////////////////////////////////////////////////////////// + +TEST_F(LlvmLibcAlgorithm, head_tail_copy_8) { + HeadTail<_8>::copy(dst(), src(), 16); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[0] +StU<8> a[0] +LdU<8> b[8] +StU<8> a[8] +)")); +} + +/////////////////////////////////////////////////////////////////////////////// +//// Testing Loop operations +/////////////////////////////////////////////////////////////////////////////// + +TEST_F(LlvmLibcAlgorithm, loop_copy_one_iteration_and_tail) { + Loop<_8>::copy(dst(), src(), 10); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[0] +StU<8> a[0] # covers 0-7 +LdU<8> b[2] +StU<8> a[2] # covers 2-9 +)")); +} + +TEST_F(LlvmLibcAlgorithm, loop_copy_two_iteration_and_tail) { + Loop<_8>::copy(dst(), src(), 17); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[0] +StU<8> a[0] # covers 0-7 +LdU<8> b[8] +StU<8> a[8] # covers 8-15 +LdU<8> b[9] +StU<8> a[9] # covers 9-16 +)")); +} + +TEST_F(LlvmLibcAlgorithm, loop_with_one_turn_is_inefficient_but_ok) { + Loop<_8>::copy(dst(), src(), 8); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[0] +StU<8> a[0] # first iteration covers 0-7 +LdU<8> b[0] # tail also covers 0-7 but since Loop is supposed to be used +StU<8> a[0] # with a sufficient number of iterations the tail cost is amortised +)")); +} + +TEST_F(LlvmLibcAlgorithm, loop_with_round_number_of_turn) { + Loop<_8>::copy(dst(), src(), 24); + EXPECT_STREQ(getTrace(), 
stripComments(R"( +LdU<8> b[0] +StU<8> a[0] # first iteration covers 0-7 +LdU<8> b[8] +StU<8> a[8] # second iteration covers 8-15 +LdU<8> b[16] +StU<8> a[16] +)")); +} + +TEST_F(LlvmLibcAlgorithm, dst_aligned_loop) { + Loop<_8>::copy(dst<16>(), src(), 23); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[0] +StA<8> a[0] # store is aligned on 16B +LdU<8> b[8] +StA<8> a[8] # subsequent stores are aligned +LdU<8> b[15] +StU<8> a[15] # Tail is always unaligned +)")); +} + +TEST_F(LlvmLibcAlgorithm, aligned_loop) { + Loop<_8>::copy(dst<16>(), src<8>(), 23); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdA<8> b[0] # load is aligned on 8B +StA<8> a[0] # store is aligned on 16B +LdA<8> b[8] # subsequent loads are aligned +StA<8> a[8] # subsequent stores are aligned +LdU<8> b[15] # Tail is always unaligned +StU<8> a[15] # Tail is always unaligned +)")); +} + +/////////////////////////////////////////////////////////////////////////////// +//// Testing Align operations +/////////////////////////////////////////////////////////////////////////////// + +TEST_F(LlvmLibcAlgorithm, align_dst_copy_8) { + Align<_8, Arg::Dst>::Then>::copy(dst(2), src(3), 31); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[3] +StU<8> a[2] # First store covers unaligned bytes +LdU<8> b[9] +StA<8> a[8] # First aligned store +LdU<8> b[17] +StA<8> a[16] # Subsequent stores are aligned +LdU<8> b[25] +StA<8> a[24] # Subsequent stores are aligned +LdU<8> b[26] +StU<8> a[25] # Last store covers remaining bytes +)")); +} + +TEST_F(LlvmLibcAlgorithm, align_src_copy_8) { + Align<_8, Arg::Src>::Then>::copy(dst(2), src(3), 31); + EXPECT_STREQ(getTrace(), stripComments(R"( +LdU<8> b[3] # First load covers unaligned bytes +StU<8> a[2] +LdA<8> b[8] # First aligned load +StU<8> a[7] +LdA<8> b[16] # Subsequent loads are aligned +StU<8> a[15] +LdA<8> b[24] # Subsequent loads are aligned +StU<8> a[23] +LdU<8> b[26] # Last load covers remaining bytes +StU<8> a[25] +)")); +} + +} // namespace __llvm_libc -- 2.7.4
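For context, these building blocks are meant to be composed into size-dispatching
memory functions, as done for the old API in `elements.h` and in the parent patch
D125768. The sketch below, which is not part of this patch, shows one way a
dynamically sized copy could be assembled from the operations above; the
`Backend`, the address parameter types and the exact size thresholds are
illustrative assumptions only.

// Illustrative only: composes SizedOp/HeadTail/Align/Loop from this patch.
// `Backend`, `DstAddrT`, `SrcAddrT` and the thresholds are assumptions.
template <typename Backend, typename DstAddrT, typename SrcAddrT>
static inline void example_copy(DstAddrT dst, SrcAddrT src, size_t size) {
  using _1 = SizedOp<Backend, 1>;
  using _2 = SizedOp<Backend, 2>;
  using _4 = SizedOp<Backend, 4>;
  using _8 = SizedOp<Backend, 8>;
  using _16 = SizedOp<Backend, 16>;
  using _32 = SizedOp<Backend, 32>;
  if (size == 0)
    return;
  if (size == 1)
    return _1::copy(dst, src);
  if (size <= 4)
    return HeadTail<_2>::copy(dst, src, size); // 2 <= size <= 4
  if (size <= 8)
    return HeadTail<_4>::copy(dst, src, size); // 5 <= size <= 8
  if (size <= 16)
    return HeadTail<_8>::copy(dst, src, size); // 9 <= size <= 16
  if (size <= 32)
    return HeadTail<_16>::copy(dst, src, size); // 17 <= size <= 32
  if (size <= 64)
    return HeadTail<_32>::copy(dst, src, size); // 33 <= size <= 64
  // Align the destination, then loop 32 bytes at a time; the Tail inside Loop
  // covers the last (possibly overlapping) block.
  return Align<_32, Arg::Dst>::Then<Loop<_32>>::copy(dst, src, size);
}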