From 47a4d73cbad6ebbaed6886793be90fd9a72b79e3 Mon Sep 17 00:00:00 2001
From: Varun Venkatesan
Date: Sat, 4 Mar 2017 09:24:30 -0800
Subject: [PATCH] Extending optimized JIT helpers to Buffer.MemoryCopy (dotnet/coreclr#9786)

Commit migrated from https://github.com/dotnet/coreclr/commit/c6372c5bfebd61470ea5d111f224d035b1d2ebdf
---
 src/coreclr/src/mscorlib/src/System/Buffer.cs | 441 +++++++++-----------
 1 file changed, 151 insertions(+), 290 deletions(-)

diff --git a/src/coreclr/src/mscorlib/src/System/Buffer.cs b/src/coreclr/src/mscorlib/src/System/Buffer.cs
index 8b4e98b..5c14d73 100644
--- a/src/coreclr/src/mscorlib/src/System/Buffer.cs
+++ b/src/coreclr/src/mscorlib/src/System/Buffer.cs
@@ -2,6 +2,10 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+#if AMD64 || (BIT32 && !ARM)
+#define HAS_CUSTOM_BLOCKS
+#endif
+
 namespace System
 {
     //Only contains static methods. Does not require serialization
@@ -256,326 +260,175 @@ namespace System
         // This method has different signature for x64 and other platforms and is done for performance reasons.
         internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
         {
-            // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards
-            // This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway.
+#if AMD64 || (BIT32 && !ARM)
+            const nuint CopyThreshold = 2048;
+#else
+            const nuint CopyThreshold = 512;
+#endif // AMD64 || (BIT32 && !ARM)
 
-            if ((nuint)dest - (nuint)src < len) goto PInvoke;
+            // P/Invoke into the native version when the buffers are overlapping.
 
-            // This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do.
-            //
-            // Ideally, we would just use the cpblk IL instruction here. Unfortunately, cpblk IL instruction is not as efficient as
-            // possible yet and so we have this implementation here for now.
+            if (((nuint)dest - (nuint)src < len) || ((nuint)src - (nuint)dest < len)) goto PInvoke;
 
-            // Note: It's important that this switch handles lengths at least up to 22.
-            // See notes below near the main loop for why.
+            byte* srcEnd = src + len;
+            byte* destEnd = dest + len;
 
-            // The switch will be very fast since it can be implemented using a jump
-            // table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info.
+            if (len <= 16) goto MCPY02;
+            if (len > 64) goto MCPY05;
 
-            switch (len)
-            {
-                case 0:
-                    return;
-                case 1:
-                    *dest = *src;
-                    return;
-                case 2:
-                    *(short*)dest = *(short*)src;
-                    return;
-                case 3:
-                    *(short*)dest = *(short*)src;
-                    *(dest + 2) = *(src + 2);
-                    return;
-                case 4:
-                    *(int*)dest = *(int*)src;
-                    return;
-                case 5:
-                    *(int*)dest = *(int*)src;
-                    *(dest + 4) = *(src + 4);
-                    return;
-                case 6:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
-                    return;
-                case 7:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
-                    *(dest + 6) = *(src + 6);
-                    return;
-                case 8:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    return;
-                case 9:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(dest + 8) = *(src + 8);
-                    return;
-                case 10:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
-                    return;
-                case 11:
-#if BIT64
-                    *(long*)dest = *(long*)src;
+            MCPY00:
+            // Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
+            Debug.Assert(len > 16 && len <= 64);
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)dest = *(Block16*)src; // [0,16]
+#elif BIT64
+            *(long*)dest = *(long*)src;
+            *(long*)(dest + 8) = *(long*)(src + 8); // [0,16]
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)dest = *(int*)src;
+            *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(dest + 8) = *(int*)(src + 8);
+            *(int*)(dest + 12) = *(int*)(src + 12); // [0,16]
 #endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
-                    *(dest + 10) = *(src + 10);
-                    return;
-                case 12:
-#if BIT64
-                    *(long*)dest = *(long*)src;
+            if (len <= 32) goto MCPY01;
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(dest + 16) = *(Block16*)(src + 16); // [0,32]
+#elif BIT64
+            *(long*)(dest + 16) = *(long*)(src + 16);
+            *(long*)(dest + 24) = *(long*)(src + 24); // [0,32]
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(dest + 16) = *(int*)(src + 16);
+            *(int*)(dest + 20) = *(int*)(src + 20);
+            *(int*)(dest + 24) = *(int*)(src + 24);
+            *(int*)(dest + 28) = *(int*)(src + 28); // [0,32]
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    return;
-                case 13:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(dest + 12) = *(src + 12);
-                    return;
-                case 14:
-#if BIT64
-                    *(long*)dest = *(long*)src;
+            if (len <= 48) goto MCPY01;
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(dest + 32) = *(Block16*)(src + 32); // [0,48]
+#elif BIT64
+            *(long*)(dest + 32) = *(long*)(src + 32);
+            *(long*)(dest + 40) = *(long*)(src + 40); // [0,48]
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(dest + 32) = *(int*)(src + 32);
+            *(int*)(dest + 36) = *(int*)(src + 36);
+            *(int*)(dest + 40) = *(int*)(src + 40);
+            *(int*)(dest + 44) = *(int*)(src + 44); // [0,48]
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
-                    return;
-                case 15:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
-                    *(dest + 14) = *(src + 14);
-                    return;
-                case 16:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
-#endif
-                    return;
-                case 17:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
-#endif
-                    *(dest + 16) = *(src + 16);
-                    return;
-                case 18:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
-#endif
-                    *(short*)(dest + 16) = *(short*)(src + 16);
-                    return;
-                case 19:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
-#endif
-                    *(short*)(dest + 16) = *(short*)(src + 16);
-                    *(dest + 18) = *(src + 18);
-                    return;
-                case 20:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
-#endif
-                    *(int*)(dest + 16) = *(int*)(src + 16);
-                    return;
-                case 21:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+
+            MCPY01:
+            // Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
+            Debug.Assert(len > 16 && len <= 64);
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
+#elif BIT64
+            *(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
+            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+            *(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
+            *(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
+            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
 #endif
-                    *(int*)(dest + 16) = *(int*)(src + 16);
-                    *(dest + 20) = *(src + 20);
-                    return;
-                case 22:
+            return;
+
+            MCPY02:
+            // Copy the first 8 bytes and then unconditionally copy the last 8 bytes and return.
+ if ((len & 24) == 0) goto MCPY03; + Debug.Assert(len >= 8 && len <= 16); #if BIT64 - *(long*)dest = *(long*)src; - *(long*)(dest + 8) = *(long*)(src + 8); + *(long*)dest = *(long*)src; + *(long*)(destEnd - 8) = *(long*)(srcEnd - 8); #else - *(int*)dest = *(int*)src; - *(int*)(dest + 4) = *(int*)(src + 4); - *(int*)(dest + 8) = *(int*)(src + 8); - *(int*)(dest + 12) = *(int*)(src + 12); + *(int*)dest = *(int*)src; + *(int*)(dest + 4) = *(int*)(src + 4); + *(int*)(destEnd - 8) = *(int*)(srcEnd - 8); + *(int*)(destEnd - 4) = *(int*)(srcEnd - 4); #endif - *(int*)(dest + 16) = *(int*)(src + 16); - *(short*)(dest + 20) = *(short*)(src + 20); - return; - } - - // P/Invoke into the native version for large lengths - if (len >= 512) goto PInvoke; - - nuint i = 0; // byte offset at which we're copying + return; - if (((int)dest & 3) != 0) - { - if (((int)dest & 1) != 0) - { - *(dest + i) = *(src + i); - i += 1; - if (((int)dest & 2) != 0) - goto IntAligned; - } - *(short*)(dest + i) = *(short*)(src + i); - i += 2; - } + MCPY03: + // Copy the first 4 bytes and then unconditionally copy the last 4 bytes and return. + if ((len & 4) == 0) goto MCPY04; + Debug.Assert(len >= 4 && len < 8); + *(int*)dest = *(int*)src; + *(int*)(destEnd - 4) = *(int*)(srcEnd - 4); + return; - IntAligned: + MCPY04: + // Copy the first byte. For pending bytes, do an unconditionally copy of the last 2 bytes and return. + Debug.Assert(len < 4); + if (len == 0) return; + *dest = *src; + if ((len & 2) == 0) return; + *(short*)(destEnd - 2) = *(short*)(srcEnd - 2); + return; -#if BIT64 - // On 64-bit IntPtr.Size == 8, so we want to advance to the next 8-aligned address. If - // (int)dest % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1 - // bytes to the next aligned address (respectively), so do nothing. On the other hand, - // if it is 1, 2, 3, or 4 we will want to copy-and-advance another 4 bytes until - // we're aligned. - // The thing 1, 2, 3, and 4 have in common that the others don't is that if you - // subtract one from them, their 3rd lsb will not be set. Hence, the below check. - - if ((((int)dest - 1) & 4) == 0) + MCPY05: + // PInvoke to the native version when the copy length exceeds the threshold. + if (len > CopyThreshold) { - *(int*)(dest + i) = *(int*)(src + i); - i += 4; + goto PInvoke; } -#endif // BIT64 - - nuint end = len - 16; - len -= i; // lower 4 bits of len represent how many bytes are left *after* the unrolled loop - - // We know due to the above switch-case that this loop will always run 1 iteration; max - // bytes we copy before checking is 23 (7 to align the pointers, 16 for 1 iteration) so - // the switch handles lengths 0-22. - Debug.Assert(end >= 7 && i <= end); - - // This is separated out into a different variable, so the i + 16 addition can be - // performed at the start of the pipeline and the loop condition does not have - // a dependency on the writes. - nuint counter; - - do - { - counter = i + 16; - - // This loop looks very costly since there appear to be a bunch of temporary values - // being created with the adds, but the jit (for x86 anyways) will convert each of - // these to use memory addressing operands. - - // So the only cost is a bit of code size, which is made up for by the fact that - // we save on writes to dest/src. -#if BIT64 - *(long*)(dest + i) = *(long*)(src + i); - *(long*)(dest + i + 8) = *(long*)(src + i + 8); + // Copy 64-bytes at a time until the remainder is less than 64. + // If remainder is greater than 16 bytes, then jump to MCPY00. 
+            Debug.Assert(len > 64 && len <= CopyThreshold);
+            nuint n = len >> 6;
+
+            MCPY06:
+#if HAS_CUSTOM_BLOCKS
+            *(Block64*)dest = *(Block64*)src;
+#elif BIT64
+            *(long*)dest = *(long*)src;
+            *(long*)(dest + 8) = *(long*)(src + 8);
+            *(long*)(dest + 16) = *(long*)(src + 16);
+            *(long*)(dest + 24) = *(long*)(src + 24);
+            *(long*)(dest + 32) = *(long*)(src + 32);
+            *(long*)(dest + 40) = *(long*)(src + 40);
+            *(long*)(dest + 48) = *(long*)(src + 48);
+            *(long*)(dest + 56) = *(long*)(src + 56);
 #else
-                *(int*)(dest + i) = *(int*)(src + i);
-                *(int*)(dest + i + 4) = *(int*)(src + i + 4);
-                *(int*)(dest + i + 8) = *(int*)(src + i + 8);
-                *(int*)(dest + i + 12) = *(int*)(src + i + 12);
+            *(int*)dest = *(int*)src;
+            *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(dest + 8) = *(int*)(src + 8);
+            *(int*)(dest + 12) = *(int*)(src + 12);
+            *(int*)(dest + 16) = *(int*)(src + 16);
+            *(int*)(dest + 20) = *(int*)(src + 20);
+            *(int*)(dest + 24) = *(int*)(src + 24);
+            *(int*)(dest + 28) = *(int*)(src + 28);
+            *(int*)(dest + 32) = *(int*)(src + 32);
+            *(int*)(dest + 36) = *(int*)(src + 36);
+            *(int*)(dest + 40) = *(int*)(src + 40);
+            *(int*)(dest + 44) = *(int*)(src + 44);
+            *(int*)(dest + 48) = *(int*)(src + 48);
+            *(int*)(dest + 52) = *(int*)(src + 52);
+            *(int*)(dest + 56) = *(int*)(src + 56);
+            *(int*)(dest + 60) = *(int*)(src + 60);
 #endif
-
-                i = counter;
-
-                // See notes above for why this wasn't used instead
-                // i += 16;
-            }
-            while (counter <= end);
-
-            if ((len & 8) != 0)
-            {
-#if BIT64
-                *(long*)(dest + i) = *(long*)(src + i);
+            dest += 64;
+            src += 64;
+            n--;
+            if (n != 0) goto MCPY06;
+
+            len %= 64;
+            if (len > 16) goto MCPY00;
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
+#elif BIT64
+            *(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
+            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
 #else
-                *(int*)(dest + i) = *(int*)(src + i);
-                *(int*)(dest + i + 4) = *(int*)(src + i + 4);
+            *(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
+            *(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
+            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
 #endif
-                i += 8;
-            }
-            if ((len & 4) != 0)
-            {
-                *(int*)(dest + i) = *(int*)(src + i);
-                i += 4;
-            }
-            if ((len & 2) != 0)
-            {
-                *(short*)(dest + i) = *(short*)(src + i);
-                i += 2;
-            }
-            if ((len & 1) != 0)
-            {
-                *(dest + i) = *(src + i);
-                // We're not using i after this, so not needed
-                // i += 1;
-            }
-
             return;
 
-        PInvoke:
+            PInvoke:
             _Memmove(dest, src, len);
         }
-
+
         // Non-inlinable wrapper around the QCall that avoids polluting the fast path
         // with P/Invoke prolog/epilog.
         [MethodImplAttribute(MethodImplOptions.NoInlining)]
@@ -618,5 +471,13 @@ namespace System
             Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
 #endif // BIT64
         }
+
+#if HAS_CUSTOM_BLOCKS
+        [StructLayout(LayoutKind.Sequential, Size = 16)]
+        private struct Block16 { }
+
+        [StructLayout(LayoutKind.Sequential, Size = 64)]
+        private struct Block64 { }
+#endif // HAS_CUSTOM_BLOCKS
     }
 }
--
2.7.4
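
A few of the tricks in this patch reward a closer look. The widened overlap check near the top of Memmove relies on unsigned arithmetic: when dest sits inside [src, src + len), the difference (nuint)dest - (nuint)src is smaller than len; when dest is below src, the subtraction wraps around to a huge value and the comparison fails, which is why the mirrored (nuint)src - (nuint)dest test is added. The removed one-sided check was enough when the old code could still copy forward safely for dest below src; the new code writes tails through destEnd, so it must bail out for overlap in either direction. A minimal standalone sketch of the predicate, using ulong in place of nuint (the Overlaps name is illustrative, not from the patch):

    using System;

    class OverlapCheckDemo
    {
        // Mirror of the patch's predicate with ulong standing in for nuint:
        // the ranges [src, src + len) and [dest, dest + len) intersect exactly
        // when either unsigned difference is smaller than len.
        static bool Overlaps(ulong src, ulong dest, ulong len)
            => (dest - src < len) || (src - dest < len);

        static void Main()
        {
            Console.WriteLine(Overlaps(100, 110, 16)); // True: dest starts inside [100, 116)
            Console.WriteLine(Overlaps(110, 100, 16)); // True: src starts inside [100, 116)
            Console.WriteLine(Overlaps(100, 116, 16)); // False: the ranges are adjacent
            // When dest < src, dest - src wraps to a value near 2^64, far larger
            // than len, so the first comparison alone would miss that case;
            // hence the second, mirrored test.
        }
    }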
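
MCPY00 and MCPY01 replace the old 23-way switch with a rounding trick: for any len in (16, 64], copy 16, 32, or 48 bytes from the front, then unconditionally copy the final 16 bytes relative to srcEnd and destEnd. The two writes may overlap in the middle, but the buffers were already proven disjoint, so rewriting a few bytes with identical data is harmless, and every length in the range is covered without a jump table. A standalone sketch of the same idea scaled down to 8-byte blocks (the Copy helper is hypothetical, compiled with /unsafe; it is not the patch's code):

    using System;

    class HeadTailCopyDemo
    {
        // Copies count bytes (count >= 8) using only 8-byte moves: whole blocks
        // from the front, then one final block aligned to the end of the buffer.
        // The final block may overlap the last front block; for disjoint buffers
        // that merely rewrites identical bytes.
        static unsafe void Copy(byte* dest, byte* src, int count)
        {
            int i = 0;
            for (; i < count - 8; i += 8)
                *(ulong*)(dest + i) = *(ulong*)(src + i);             // front blocks
            *(ulong*)(dest + count - 8) = *(ulong*)(src + count - 8); // tail block
        }

        static unsafe void Main()
        {
            byte[] src = new byte[32], dst = new byte[32];
            for (int i = 0; i < src.Length; i++) src[i] = (byte)i;
            fixed (byte* s = src, d = dst)
                Copy(d, s, 21); // front blocks cover [0, 16), tail covers [13, 21)
            Console.WriteLine(dst[20]); // 20 -> all 21 bytes arrived
            Console.WriteLine(dst[21]); // 0  -> nothing past count was written
        }
    }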
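
The short-length paths never compare len against each possible value; they classify it by bits. On entry to MCPY02, len is at most 16, so (len & 24) != 0 holds exactly when len >= 8: bit 3 (value 8) is set for 8 through 15, and bit 4 (value 16) is set for 16. MCPY03 and MCPY04 peel off the 4-byte and 2-byte classes the same way. A small check of the dispatch, assuming the same ranges the patch asserts:

    using System;

    class LenDispatchDemo
    {
        static void Main()
        {
            // Reproduces the branch structure of MCPY02..MCPY04 for len in [0, 16].
            for (uint len = 0; len <= 16; len++)
            {
                string path = (len & 24) != 0 ? "8..16 bytes (MCPY02 body)"
                            : (len & 4) != 0  ? "4..7 bytes  (MCPY03)"
                            :                   "0..3 bytes  (MCPY04)";
                Console.WriteLine($"len = {len,2} -> {path}");
            }
        }
    }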
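
In MCPY05 the loop count and the epilogue are pure arithmetic: n = len >> 6 is the number of whole 64-byte blocks, and len %= 64 is what remains once the loop retires. A remainder above 16 re-enters MCPY00; a remainder of 0 to 16 is finished by the single 16-byte copy against destEnd and srcEnd, which may re-cover bytes the last block already wrote. For example:

    using System;

    class LoopMathDemo
    {
        static void Main()
        {
            uint len = 200;
            Console.WriteLine(len >> 6); // 3 whole 64-byte blocks
            Console.WriteLine(len % 64); // 8 bytes remain; 8 <= 16, so the single
                                         // epilogue copy of [len - 16, len) finishes
        }
    }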
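
Finally, Block16 and Block64 are empty structs whose only property is their StructLayout size. Assigning *(Block16*)dest = *(Block16*)src presents the JIT with a single fixed-width struct copy, which it can lower to wide SIMD moves on the platforms where HAS_CUSTOM_BLOCKS is defined, instead of a sequence of scalar loads and stores. The pattern can be tried in isolation; a minimal sketch (the Copy16 name is illustrative, and whether wide moves are actually emitted depends on the JIT and target):

    using System;
    using System.Runtime.InteropServices;

    class BlockCopyDemo
    {
        // An empty struct with an explicit size: copying one by assignment
        // asks the JIT for a single fixed-width block move.
        [StructLayout(LayoutKind.Sequential, Size = 16)]
        private struct Block16 { }

        static unsafe void Copy16(byte* dest, byte* src)
        {
            *(Block16*)dest = *(Block16*)src; // one 16-byte copy, no field access
        }

        static unsafe void Main()
        {
            byte[] src = new byte[16], dst = new byte[16];
            for (int i = 0; i < 16; i++) src[i] = (byte)(i + 1);
            fixed (byte* s = src, d = dst) Copy16(d, s);
            Console.WriteLine(dst[15]); // 16
        }
    }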