From 32fe06348631c9e146832e5afac2ce97bb2df43e Mon Sep 17 00:00:00 2001 From: James Ko Date: Fri, 5 Aug 2016 22:06:59 -0400 Subject: [PATCH] Decrease writes to local variables in Buffer.MemoryCopy (#6627) In `Buffer.MemoryCopy` currently we are making 4 writes every time we copy some data; 1 to update `*dest`, 1 to update `dest`, 1 to update `src` and 1 to update `len`. I've decreased it to 2; one to update a new local variable `i`, which keeps track of how many bytes we are into the buffer. All writes are now made using ```cs *(dest + i + x) = *(src + i + x) ``` which has no additional overhead since they're converted to using memory addressing operands by the jit. Another change I made was to add a few extra cases for the switch-case at the beginning that does copying for small sizes without any branches. It now covers sizes 0-22. This is beneficial to the main codepath, since we can convert the unrolled loop to a `do..while` loop and save an extra branch at the beginning. (max 7 bytes for alignment, 16 for 1 iteration of the loop, so the min bytes we can copy without checking whether we should stop is 23.) This PR increases the performance of `MemoryCopy` by 10-20% for most buffer sizes on x86; you can see the performance test/results (and the generated assembly for each version) [here](https://gist.github.com/jamesqo/337852c8ce09205a8289ce1f1b9b5382). (Note that this codepath is also used by `wstrcpy` at the moment, so this directly affects many common String operations.) 
--- src/mscorlib/src/System/Buffer.cs | 262 +++++++++++++++++++++++++------------- 1 file changed, 176 insertions(+), 86 deletions(-) diff --git a/src/mscorlib/src/System/Buffer.cs b/src/mscorlib/src/System/Buffer.cs index d19de32..ea647f1 100644 --- a/src/mscorlib/src/System/Buffer.cs +++ b/src/mscorlib/src/System/Buffer.cs @@ -15,6 +15,12 @@ namespace System { using System.Security; using System.Runtime; +#if BIT64 + using nuint = System.UInt64; +#else // BIT64 + using nuint = System.UInt32; +#endif // BIT64 + [System.Runtime.InteropServices.ComVisible(true)] public static class Buffer { @@ -264,25 +270,23 @@ namespace System { // This method has different signature for x64 and other platforms and is done for performance reasons. [System.Security.SecurityCritical] [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)] -#if BIT64 - internal unsafe static void Memmove(byte* dest, byte* src, ulong len) -#else - internal unsafe static void Memmove(byte* dest, byte* src, uint len) -#endif + internal unsafe static void Memmove(byte* dest, byte* src, nuint len) { // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards // This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway. -#if BIT64 - if ((ulong)dest - (ulong)src < len) goto PInvoke; -#else - if (((uint)dest - (uint)src) < len) goto PInvoke; -#endif - // + + if ((nuint)dest - (nuint)src < len) goto PInvoke; + // This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do. // // Ideally, we would just use the cpblk IL instruction here. Unfortunately, cpblk IL instruction is not as efficient as // possible yet and so we have this implementation here for now. - // + + // Note: It's important that this switch handles lengths at least up to 22. + // See notes below near the main loop for why. 
+ + // The switch will be very fast since it can be implemented using a jump + // table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info. switch (len) { @@ -292,14 +296,14 @@ namespace System { *dest = *src; return; case 2: - *(short *)dest = *(short *)src; + *(short*)dest = *(short*)src; return; case 3: - *(short *)dest = *(short *)src; + *(short*)dest = *(short*)src; *(dest + 2) = *(src + 2); return; case 4: - *(int *)dest = *(int *)src; + *(int*)dest = *(int*)src; return; case 5: *(int*)dest = *(int*)src; @@ -401,87 +405,186 @@ namespace System { *(int*)(dest + 12) = *(int*)(src + 12); #endif return; - default: - break; + case 17: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(dest + 16) = *(src + 16); + return; + case 18: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(short*)(dest + 16) = *(short*)(src + 16); + return; + case 19: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(short*)(dest + 16) = *(short*)(src + 16); + *(dest + 18) = *(src + 18); + return; + case 20: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(int*)(dest + 16) = *(int*)(src + 16); + return; + case 21: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = 
*(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(int*)(dest + 16) = *(int*)(src + 16); + *(dest + 20) = *(src + 20); + return; + case 22: +#if BIT64 + *(long*)dest = *(long*)src; + *(long*)(dest + 8) = *(long*)(src + 8); +#else + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(dest + 8) = *(int*)(src + 8); + *(int*)(dest + 12) = *(int*)(src + 12); +#endif + *(int*)(dest + 16) = *(int*)(src + 16); + *(short*)(dest + 20) = *(short*)(src + 20); + return; } // P/Invoke into the native version for large lengths if (len >= 512) goto PInvoke; + nuint i = 0; // byte offset at which we're copying + if (((int)dest & 3) != 0) { if (((int)dest & 1) != 0) { - *dest = *src; - src++; - dest++; - len--; - if (((int)dest & 2) == 0) - goto Aligned; + *(dest + i) = *(src + i); + i += 1; + if (((int)dest & 2) != 0) + goto IntAligned; } - *(short *)dest = *(short *)src; - src += 2; - dest += 2; - len -= 2; - Aligned: ; + *(short*)(dest + i) = *(short*)(src + i); + i += 2; } + IntAligned: + #if BIT64 - if (((int)dest & 4) != 0) + // On 64-bit IntPtr.Size == 8, so we want to advance to the next 8-aligned address. If + // (int)dest % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1 + // bytes to the next aligned address (respectively), so do nothing. On the other hand, + // if it is 1, 2, 3, or 4 we will want to copy-and-advance another 4 bytes until + // we're aligned. + // The thing 1, 2, 3, and 4 have in common that the others don't is that if you + // subtract one from them, their 3rd lsb will not be set. Hence, the below check. 
+ + if ((((int)dest - 1) & 4) == 0) { - *(int *)dest = *(int *)src; - src += 4; - dest += 4; - len -= 4; + *(int*)(dest + i) = *(int*)(src + i); + i += 4; } -#endif +#endif // BIT64 -#if BIT64 - ulong count = len / 16; -#else - uint count = len / 16; -#endif - while (count > 0) + nuint end = len - 16; + len -= i; // lower 4 bits of len represent how many bytes are left *after* the unrolled loop + + // We know due to the above switch-case that this loop will always run 1 iteration; max + // bytes we copy before checking is 23 (7 to align the pointers, 16 for 1 iteration) so + // the switch handles lengths 0-22. + Contract.Assert(end >= 7 && i <= end); + + // This is separated out into a different variable, so the i + 16 addition can be + // performed at the start of the pipeline and the loop condition does not have + // a dependency on the writes. + nuint counter; + + do { + counter = i + 16; + + // This loop looks very costly since there appear to be a bunch of temporary values + // being created with the adds, but the jit (for x86 anyways) will convert each of + // these to use memory addressing operands. + + // So the only cost is a bit of code size, which is made up for by the fact that + // we save on writes to dest/src. 
+ #if BIT64 - ((long*)dest)[0] = ((long*)src)[0]; - ((long*)dest)[1] = ((long*)src)[1]; + *(long*)(dest + i) = *(long*)(src + i); + *(long*)(dest + i + 8) = *(long*)(src + i + 8); #else - ((int*)dest)[0] = ((int*)src)[0]; - ((int*)dest)[1] = ((int*)src)[1]; - ((int*)dest)[2] = ((int*)src)[2]; - ((int*)dest)[3] = ((int*)src)[3]; + *(int*)(dest + i) = *(int*)(src + i); + *(int*)(dest + i + 4) = *(int*)(src + i + 4); + *(int*)(dest + i + 8) = *(int*)(src + i + 8); + *(int*)(dest + i + 12) = *(int*)(src + i + 12); #endif - dest += 16; - src += 16; - count--; + + i = counter; + + // See notes above for why this wasn't used instead + // i += 16; } + while (counter <= end); if ((len & 8) != 0) { #if BIT64 - ((long*)dest)[0] = ((long*)src)[0]; + *(long*)(dest + i) = *(long*)(src + i); #else - ((int*)dest)[0] = ((int*)src)[0]; - ((int*)dest)[1] = ((int*)src)[1]; + *(int*)(dest + i) = *(int*)(src + i); + *(int*)(dest + i + 4) = *(int*)(src + i + 4); #endif - dest += 8; - src += 8; - } - if ((len & 4) != 0) - { - ((int*)dest)[0] = ((int*)src)[0]; - dest += 4; - src += 4; - } - if ((len & 2) != 0) - { - ((short*)dest)[0] = ((short*)src)[0]; - dest += 2; - src += 2; - } - if ((len & 1) != 0) - *dest = *src; + i += 8; + } + if ((len & 4) != 0) + { + *(int*)(dest + i) = *(int*)(src + i); + i += 4; + } + if ((len & 2) != 0) + { + *(short*)(dest + i) = *(short*)(src + i); + i += 2; + } + if ((len & 1) != 0) + { + *(dest + i) = *(src + i); + // We're not using i after this, so not needed + // i += 1; + } return; @@ -495,11 +598,7 @@ namespace System { [SecurityCritical] [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)] [MethodImplAttribute(MethodImplOptions.NoInlining)] -#if BIT64 - private unsafe static void _Memmove(byte* dest, byte* src, ulong len) -#else - private unsafe static void _Memmove(byte* dest, byte* src, uint len) -#endif + private unsafe static void _Memmove(byte* dest, byte* src, nuint len) { __Memmove(dest, src, len); } @@ -508,12 +607,7 @@ 
namespace System { [SuppressUnmanagedCodeSecurity] [SecurityCritical] [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)] -#if BIT64 - extern private unsafe static void __Memmove(byte* dest, byte* src, ulong len); -#else - extern private unsafe static void __Memmove(byte* dest, byte* src, uint len); -#endif - + extern private unsafe static void __Memmove(byte* dest, byte* src, nuint len); // The attributes on this method are chosen for best JIT performance. // Please do not edit unless intentional. @@ -526,11 +620,7 @@ namespace System { { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy); } -#if BIT64 - Memmove((byte*)destination, (byte*)source, checked((ulong) sourceBytesToCopy)); -#else - Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy)); -#endif // BIT64 + Memmove((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy)); } @@ -547,7 +637,7 @@ namespace System { } #if BIT64 Memmove((byte*)destination, (byte*)source, sourceBytesToCopy); -#else +#else // BIT64 Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy)); #endif // BIT64 } -- 2.7.4