From 32fe06348631c9e146832e5afac2ce97bb2df43e Mon Sep 17 00:00:00 2001 From: James Ko Date: Fri, 5 Aug 2016 22:06:59 -0400 Subject: [PATCH] Decrease writes to local variables in Buffer.MemoryCopy (#6627) In `Buffer.MemoryCopy` currently we are making 4 writes every time we copy some data; 1 to update `*dest`, 1 to update `dest`, 1 to update `src` and 1 to update `len`. I've decreased it to 2; one to update a new local variable `i`, which keeps track of how many bytes we are into the buffer. All writes are now made using ```cs *(dest + i + x) = *(src + i + x) ``` which has no additional overhead since they're converted to using memory addressing operands by the jit. Another change I made was to add a few extra cases for the switch-case at the beginning that does copying for small sizes without any branches. It now covers sizes 0-22. This is beneficial to the main codepath, since we can convert the unrolled loop to a `do..while` loop and save an extra branch at the beginning. (max 7 bytes for alignment, 16 for 1 iteration of the loop, so the min bytes we can copy without checking whether we should stop is 23.) This PR increases the performance of `MemoryCopy` by 10-20% for most buffer sizes on x86; you can see the performance test/results (and the generated assembly for each version) [here](https://gist.github.com/jamesqo/337852c8ce09205a8289ce1f1b9b5382). (Note that this codepath is also used by `wstrcpy` at the moment, so this directly affects many common String operations.) 
--- src/mscorlib/src/System/Buffer.cs | 262 +++++++++++++++++++++++++------------- 1 file changed, 176 insertions(+), 86 deletions(-) diff --git a/src/mscorlib/src/System/Buffer.cs b/src/mscorlib/src/System/Buffer.cs index d19de32..ea647f1 100644 --- a/src/mscorlib/src/System/Buffer.cs +++ b/src/mscorlib/src/System/Buffer.cs @@ -15,6 +15,12 @@ namespace System { using System.Security; using System.Runtime; +#if BIT64 + using nuint = System.UInt64; +#else // BIT64 + using nuint = System.UInt32; +#endif // BIT64 + [System.Runtime.InteropServices.ComVisible(true)] public static class Buffer { @@ -264,25 +270,23 @@ namespace System { // This method has different signature for x64 and other platforms and is done for performance reasons. [System.Security.SecurityCritical] [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)] -#if BIT64 - internal unsafe static void Memmove(byte* dest, byte* src, ulong len) -#else - internal unsafe static void Memmove(byte* dest, byte* src, uint len) -#endif + internal unsafe static void Memmove(byte* dest, byte* src, nuint len) { // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards // This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway. -#if BIT64 - if ((ulong)dest - (ulong)src < len) goto PInvoke; -#else - if (((uint)dest - (uint)src) < len) goto PInvoke; -#endif - // + + if ((nuint)dest - (nuint)src < len) goto PInvoke; + // This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do. // // Ideally, we would just use the cpblk IL instruction here. Unfortunately, cpblk IL instruction is not as efficient as // possible yet and so we have this implementation here for now. - // + + // Note: It's important that this switch handles lengths at least up to 22. + // See notes below near the main loop for why. 
+ + // The switch will be very fast since it can be implemented using a jump + // table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info. switch (len) { @@ -292,14 +296,14 @@ namespace System { *dest = *src; return; case 2: - *(short *)dest = *(short *)src; + *(short*)dest = *(short*)src; return; case 3: - *(short *)dest = *(short *)src; + *(short*)dest = *(short*)src; *(dest + 2) = *(src + 2); return; case 4: - *(int *)dest = *(int *)src; + *(int*)dest = *(int*)src; return; case 5: *(int*)dest = *(int*)src; @@ -401,87 +405,186 @@ namespace System { *(int*)(dest + 12) = *(int*)(src + 12); #endif return; - default: - break; + case 17: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(dest + 16) = *(src + 16); + return; + case 18: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(short*)(dest + 16) = *(short*)(src + 16); + return; + case 19: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(short*)(dest + 16) = *(short*)(src + 16); + *(dest + 18) = *(src + 18); + return; + case 20: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(int*)(dest + 16) = *(int*)(src + 16); + return; + case 21: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = 
*(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(int*)(dest + 16) = *(int*)(src + 16); + *(dest + 20) = *(src + 20); + return; + case 22: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(int*)(dest + 16) = *(int*)(src + 16); + *(short*)(dest + 20) = *(short*)(src + 20); + return; } // P/Invoke into the native version for large lengths if (len >= 512) goto PInvoke; + nuint i = 0; // byte offset at which we're copying + if (((int)dest & 3) != 0) { if (((int)dest & 1) != 0) { - *dest = *src; - src++; - dest++; - len--; - if (((int)dest & 2) == 0) - goto Aligned; + *(dest + i) = *(src + i); + i += 1; + if (((int)dest & 2) != 0) + goto IntAligned; } - *(short *)dest = *(short *)src; - src += 2; - dest += 2; - len -= 2; - Aligned: ; + *(short*)(dest + i) = *(short*)(src + i); + i += 2; } + IntAligned: + #if BIT64 - if (((int)dest & 4) != 0) + // On 64-bit IntPtr.Size == 8, so we want to advance to the next 8-aligned address. If + // (int)dest % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1 + // bytes to the next aligned address (respectively), so do nothing. On the other hand, + // if it is 1, 2, 3, or 4 we will want to copy-and-advance another 4 bytes until + // we're aligned. + // The thing 1, 2, 3, and 4 have in common that the others don't is that if you + // subtract one from them, their 3rd lsb will not be set. Hence, the below check. 
+ + if ((((int)dest - 1) & 4) == 0) { - *(int *)dest = *(int *)src; - src += 4; - dest += 4; - len -= 4; + *(int*)(dest + i) = *(int*)(src + i); + i += 4; } -#endif +#endif // BIT64 -#if BIT64 - ulong count = len / 16; -#else - uint count = len / 16; -#endif - while (count > 0) + nuint end = len - 16; + len -= i; // lower 4 bits of len represent how many bytes are left *after* the unrolled loop + + // We know due to the above switch-case that this loop will always run 1 iteration; max + // bytes we copy before checking is 23 (7 to align the pointers, 16 for 1 iteration) so + // the switch handles lengths 0-22. + Contract.Assert(end >= 7 && i <= end); + + // This is separated out into a different variable, so the i + 16 addition can be + // performed at the start of the pipeline and the loop condition does not have + // a dependency on the writes. + nuint counter; + + do { + counter = i + 16; + + // This loop looks very costly since there appear to be a bunch of temporary values + // being created with the adds, but the jit (for x86 anyways) will convert each of + // these to use memory addressing operands. + + // So the only cost is a bit of code size, which is made up for by the fact that + // we save on writes to dest/src. 
+ #if BIT64 - ((long*)dest)[0] = ((long*)src)[0]; - ((long*)dest)[1] = ((long*)src)[1]; + *(long*)(dest + i) = *(long*)(src + i); + *(long*)(dest + i + 8) = *(long*)(src + i + 8); #else - ((int*)dest)[0] = ((int*)src)[0]; - ((int*)dest)[1] = ((int*)src)[1]; - ((int*)dest)[2] = ((int*)src)[2]; - ((int*)dest)[3] = ((int*)src)[3]; + *(int*)(dest + i) = *(int*)(src + i); + *(int*)(dest + i + 4) = *(int*)(src + i + 4); + *(int*)(dest + i + 8) = *(int*)(src + i + 8); + *(int*)(dest + i + 12) = *(int*)(src + i + 12); #endif - dest += 16; - src += 16; - count--; + + i = counter; + + // See notes above for why this wasn't used instead + // i += 16; } + while (counter <= end); if ((len & 8) != 0) { #if BIT64 - ((long*)dest)[0] = ((long*)src)[0]; + *(long*)(dest + i) = *(long*)(src + i); #else - ((int*)dest)[0] = ((int*)src)[0]; - ((int*)dest)[1] = ((int*)src)[1]; + *(int*)(dest + i) = *(int*)(src + i); + *(int*)(dest + i + 4) = *(int*)(src + i + 4); #endif - dest += 8; - src += 8; - } - if ((len & 4) != 0) - { - ((int*)dest)[0] = ((int*)src)[0]; - dest += 4; - src += 4; - } - if ((len & 2) != 0) - { - ((short*)dest)[0] = ((short*)src)[0]; - dest += 2; - src += 2; - } - if ((len & 1) != 0) - *dest = *src; + i += 8; + } + if ((len & 4) != 0) + { + *(int*)(dest + i) = *(int*)(src + i); + i += 4; + } + if ((len & 2) != 0) + { + *(short*)(dest + i) = *(short*)(src + i); + i += 2; + } + if ((len & 1) != 0) + { + *(dest + i) = *(src + i); + // We're not using i after this, so not needed + // i += 1; + } return; @@ -495,11 +598,7 @@ namespace System { [SecurityCritical] [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)] [MethodImplAttribute(MethodImplOptions.NoInlining)] -#if BIT64 - private unsafe static void _Memmove(byte* dest, byte* src, ulong len) -#else - private unsafe static void _Memmove(byte* dest, byte* src, uint len) -#endif + private unsafe static void _Memmove(byte* dest, byte* src, nuint len) { __Memmove(dest, src, len); } @@ -508,12 +607,7 @@ 
namespace System { [SuppressUnmanagedCodeSecurity] [SecurityCritical] [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)] -#if BIT64 - extern private unsafe static void __Memmove(byte* dest, byte* src, ulong len); -#else - extern private unsafe static void __Memmove(byte* dest, byte* src, uint len); -#endif - + extern private unsafe static void __Memmove(byte* dest, byte* src, nuint len); // The attributes on this method are chosen for best JIT performance. // Please do not edit unless intentional. @@ -526,11 +620,7 @@ namespace System { { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy); } -#if BIT64 - Memmove((byte*)destination, (byte*)source, checked((ulong) sourceBytesToCopy)); -#else - Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy)); -#endif // BIT64 + Memmove((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy)); } @@ -547,7 +637,7 @@ namespace System { } #if BIT64 Memmove((byte*)destination, (byte*)source, sourceBytesToCopy); -#else +#else // BIT64 Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy)); #endif // BIT64 } -- 2.7.4