[libc] Use different alignment for memcpy between ARM and x86.
author Guillaume Chatelet <gchatelet@google.com>
Mon, 26 Apr 2021 19:30:00 +0000 (19:30 +0000)
committer Guillaume Chatelet <gchatelet@google.com>
Mon, 26 Apr 2021 19:30:00 +0000 (19:30 +0000)
Aligned copy used to be 'destination aligned' for x86, but this decision was reverted in D93457, where we noticed that it was better for ARM to be 'source aligned'.
Further benchmarking confirmed that aligning the copy to the destination can be up to 30% faster on x86. This patch offers both implementations and switches x86 back to 'destination aligned'.
It also pins the alignment to 32 bytes on x86 (it previously defaulted to the loop block size).

Differential Revision: https://reviews.llvm.org/D101296

libc/src/string/aarch64/memcpy.cpp
libc/src/string/memcpy.cpp
libc/src/string/memory_utils/memcpy_utils.h
libc/src/string/x86/memcpy.cpp
libc/test/src/string/memory_utils/memcpy_utils_test.cpp

index 63ed5fd..78988ec 100644 (file)
@@ -54,7 +54,7 @@ static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
     return CopyBlockOverlap<32>(dst, src, count);
   if (count < 128)
     return CopyBlockOverlap<64>(dst, src, count);
-  return CopyAlignedBlocks<64, 16>(dst, src, count);
+  return CopySrcAlignedBlocks<64, 16>(dst, src, count);
 }
 
 LLVM_LIBC_FUNCTION(void *, memcpy,
index a145b90..e050d7f 100644 (file)
@@ -52,7 +52,7 @@ static void memcpy_impl(char *__restrict dst, const char *__restrict src,
     return CopyBlockOverlap<32>(dst, src, count);
   if (count < 128)
     return CopyBlockOverlap<64>(dst, src, count);
-  return CopyAlignedBlocks<32>(dst, src, count);
+  return CopySrcAlignedBlocks<32>(dst, src, count);
 }
 
 LLVM_LIBC_FUNCTION(void *, memcpy,
index 8fb0491..23836bb 100644 (file)
@@ -98,8 +98,8 @@ static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src,
 //               `count > 2 * kBlockSize` for efficiency.
 //               `count >= kAlignment` for correctness.
 template <size_t kBlockSize, size_t kAlignment = kBlockSize>
-static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src,
-                              size_t count) {
+static void CopySrcAlignedBlocks(char *__restrict dst,
+                                 const char *__restrict src, size_t count) {
   static_assert(is_power2(kAlignment), "kAlignment must be a power of two");
   static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two");
   static_assert(kAlignment <= kBlockSize,
@@ -116,6 +116,25 @@ static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src,
   CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
 }
 
+template <size_t kBlockSize, size_t kAlignment = kBlockSize>
+static void CopyDstAlignedBlocks(char *__restrict dst,
+                                 const char *__restrict src, size_t count) {
+  static_assert(is_power2(kAlignment), "kAlignment must be a power of two");
+  static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two");
+  static_assert(kAlignment <= kBlockSize,
+                "kAlignment must be less or equal to block size");
+  CopyBlock<kAlignment>(dst, src); // Copy first block
+
+  // Copy aligned blocks
+  const size_t ofla = offset_from_last_aligned<kAlignment>(dst);
+  const size_t limit = count + ofla - kBlockSize;
+  for (size_t offset = kAlignment; offset < limit; offset += kBlockSize)
+    CopyBlock<kBlockSize>(assume_aligned<kAlignment>(dst - ofla + offset),
+                          src - ofla + offset);
+
+  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
 } // namespace __llvm_libc
 
 #endif //  LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H
index b9163d9..bbd8fe9 100644 (file)
@@ -87,7 +87,7 @@ static void memcpy_x86(char *__restrict dst, const char *__restrict src,
   if (kHasAvx && count < 256)
     return CopyBlockOverlap<128>(dst, src, count);
   if (count <= kRepMovsBSize)
-    return CopyAlignedBlocks<kLoopCopyBlockSize>(dst, src, count);
+    return CopyDstAlignedBlocks<kLoopCopyBlockSize, 32>(dst, src, count);
   return CopyRepMovsb(dst, src, count);
 }
 
index be749c1..37529ae 100644 (file)
@@ -160,12 +160,12 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyBlockOverlap) {
   EXPECT_STREQ(trace.Read(), "01112111");
 }
 
-TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
+TEST(LlvmLibcMemcpyUtilsTest, CopySrcAlignedBlocks) {
   auto &trace = GetTrace();
   // Source is aligned and multiple of alignment.
   //   "1111"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(0), 4);
+  CopySrcAlignedBlocks<4>(I(0), I(0), 4);
   EXPECT_STREQ(trace.Write(), "2222");
   EXPECT_STREQ(trace.Read(), "2222");
 
@@ -174,7 +174,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
   // + "00001111"
   // = "11111111"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(0), 8);
+  CopySrcAlignedBlocks<4>(I(0), I(0), 8);
   EXPECT_STREQ(trace.Write(), "11111111");
   EXPECT_STREQ(trace.Read(), "11111111");
 
@@ -185,7 +185,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
   // + "0000000001111"
   // = "1111111112221"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(0), 13);
+  CopySrcAlignedBlocks<4>(I(0), I(0), 13);
   EXPECT_STREQ(trace.Write(), "1111111112221");
   EXPECT_STREQ(trace.Read(), "1111111112221");
 
@@ -196,7 +196,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
   // + "00000000001111"
   // = "01112111112211"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(1), 13);
+  CopySrcAlignedBlocks<4>(I(0), I(1), 13);
   EXPECT_STREQ(trace.Write(), "1112111112211");
   EXPECT_STREQ(trace.Read(), "01112111112211");
 
@@ -206,24 +206,89 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
   // + "000000001111"
   // = "011121111111"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(1), 11);
+  CopySrcAlignedBlocks<4>(I(0), I(1), 11);
   EXPECT_STREQ(trace.Write(), "11121111111");
   EXPECT_STREQ(trace.Read(), "011121111111");
 }
 
+TEST(LlvmLibcMemcpyUtilsTest, CopyDstAlignedBlocks) {
+  auto &trace = GetTrace();
+  // Destination is aligned and multiple of alignment.
+  //   "1111"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(0), I(0), 4);
+  EXPECT_STREQ(trace.Write(), "2222");
+  EXPECT_STREQ(trace.Read(), "2222");
+
+  // Destination is aligned and multiple of alignment.
+  //   "11110000"
+  // + "00001111"
+  // = "11111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(0), I(0), 8);
+  EXPECT_STREQ(trace.Write(), "11111111");
+  EXPECT_STREQ(trace.Read(), "11111111");
+
+  // Destination is aligned already overlap at end.
+  //   "1111000000000"
+  // + "0000111100000"
+  // + "0000000011110"
+  // + "0000000001111"
+  // = "1111111112221"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(0), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "1111111112221");
+  EXPECT_STREQ(trace.Read(), "1111111112221");
+
+  // Misaligned destination.
+  //   "01111000000000"
+  // + "00001111000000"
+  // + "00000000111100"
+  // + "00000000001111"
+  // = "01112111112211"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(1), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "01112111112211");
+  EXPECT_STREQ(trace.Read(), "1112111112211");
+
+  // Misaligned destination aligned at end.
+  //   "011110000000"
+  // + "000011110000"
+  // + "000000001111"
+  // = "011121111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(1), I(0), 11);
+  EXPECT_STREQ(trace.Write(), "011121111111");
+  EXPECT_STREQ(trace.Read(), "11121111111");
+}
+
 TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignment) {
   auto &trace = GetTrace();
   // Source is aligned and multiple of alignment.
   //   "11111111"
   trace.Clear();
-  CopyAlignedBlocks<8, 4>(I(0), I(0), 8);
+  CopySrcAlignedBlocks<8, 4>(I(0), I(0), 8);
+  EXPECT_STREQ(trace.Write(), "22221111");
+  EXPECT_STREQ(trace.Read(), "22221111");
+
+  // Destination is aligned and multiple of alignment.
+  //   "11111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<8, 4>(I(0), I(0), 8);
   EXPECT_STREQ(trace.Write(), "22221111");
   EXPECT_STREQ(trace.Read(), "22221111");
 
   // Source is aligned and multiple of alignment.
   //   "111111111"
   trace.Clear();
-  CopyAlignedBlocks<8, 4>(I(0), I(0), 9);
+  CopySrcAlignedBlocks<8, 4>(I(0), I(0), 9);
+  EXPECT_STREQ(trace.Write(), "122211111");
+  EXPECT_STREQ(trace.Read(), "122211111");
+
+  // Destination is aligned and multiple of alignment.
+  //   "111111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<8, 4>(I(0), I(0), 9);
   EXPECT_STREQ(trace.Write(), "122211111");
   EXPECT_STREQ(trace.Read(), "122211111");
 }
@@ -234,7 +299,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksMaxReloads) {
     for (size_t count = 64; count < 768; ++count) {
       trace.Clear();
       // We should never reload more than twice when copying from count = 2x32.
-      CopyAlignedBlocks<32>(I(alignment), I(0), count);
+      CopySrcAlignedBlocks<32>(I(alignment), I(0), count);
       const char *const written = trace.Write();
       // First bytes are untouched.
       for (size_t i = 0; i < alignment; ++i)
@@ -254,7 +319,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignmentMaxReloads) {
     for (size_t count = 64; count < 768; ++count) {
       trace.Clear();
       // We should never reload more than twice when copying from count = 2x32.
-      CopyAlignedBlocks<32, 16>(I(alignment), I(0), count);
+      CopySrcAlignedBlocks<32, 16>(I(alignment), I(0), count);
       const char *const written = trace.Write();
       // First bytes are untouched.
       for (size_t i = 0; i < alignment; ++i)