From: Levi Broderick
Date: Sat, 27 Jan 2018 06:35:46 +0000 (-0800)
Subject: Optimize Span.Copy and Span.TryCopyTo (#15947)
X-Git-Tag: accepted/tizen/unified/20190422.045933~3192
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e07292d009d47a4920c19c669796b6893b307ec4;p=platform%2Fupstream%2Fcoreclr.git

Optimize Span.Copy and Span.TryCopyTo (#15947)

* Introduce a ref-based version of Buffer.Memmove
* Remove pinning logic from Span.CopyTo
* Tweak flow graph of Span.CopyTo / TryCopyTo in order to encourage better codegen
* Push some uncommon logic (one-element buffers, perfectly overlapping buffers) down to Memmove
---

diff --git a/src/mscorlib/shared/System/ReadOnlySpan.cs b/src/mscorlib/shared/System/ReadOnlySpan.cs
index 06af661..4ee07a7 100644
--- a/src/mscorlib/shared/System/ReadOnlySpan.cs
+++ b/src/mscorlib/shared/System/ReadOnlySpan.cs
@@ -10,6 +10,12 @@ using Internal.Runtime.CompilerServices;
 
 #pragma warning disable 0809  //warning CS0809: Obsolete member 'Span<T>.Equals(object)' overrides non-obsolete member 'object.Equals(object)'
 
+#if BIT64
+using nuint = System.UInt64;
+#else
+using nuint = System.UInt32;
+#endif
+
 namespace System
 {
     /// <summary>
@@ -198,8 +204,18 @@ namespace System
         /// </exception>
         public void CopyTo(Span<T> destination)
         {
-            if (!TryCopyTo(destination))
+            // Using "if (!TryCopyTo(...))" results in two branches: one for the length
+            // check, and one for the result of TryCopyTo. Since these checks are equivalent,
+            // we can optimize by performing the check once ourselves then calling Memmove directly.
+
+            if ((uint)_length <= (uint)destination.Length)
+            {
+                Buffer.Memmove(ref destination.DangerousGetPinnableReference(), ref _pointer.Value, (nuint)_length);
+            }
+            else
+            {
                 ThrowHelper.ThrowArgumentException_DestinationTooShort();
+            }
         }
 
         /// Copies the contents of this read-only span into destination span. If the source
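As an aside, a minimal standalone sketch of the two call shapes the new comment above describes; the class name, method names, and exception text are assumptions for illustration, not APIs from this patch:

    using System;

    static class CopyToShapes
    {
        // Old shape: the length check happens inside TryCopyTo, and its boolean
        // result is branched on a second time by the caller.
        public static void ViaTryCopyTo(ReadOnlySpan<byte> source, Span<byte> destination)
        {
            if (!source.TryCopyTo(destination))
                throw new ArgumentException("Destination too short.");
        }

        // New shape: one unsigned length comparison decides both the success and
        // the failure path, so a single branch guards the copy.
        public static void ViaMergedCheck(ReadOnlySpan<byte> source, Span<byte> destination)
        {
            if ((uint)source.Length <= (uint)destination.Length)
                source.CopyTo(destination); // stands in for the direct Buffer.Memmove call
            else
                throw new ArgumentException("Destination too short.");
        }
    }
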
@@ -211,11 +227,13 @@ namespace System
         /// <param name="destination">The span to copy items into.</param>
         public bool TryCopyTo(Span<T> destination)
         {
-            if ((uint)_length > (uint)destination.Length)
-                return false;
-
-            Span.CopyTo<T>(ref destination.DangerousGetPinnableReference(), ref _pointer.Value, _length);
-            return true;
+            bool retVal = false;
+            if ((uint)_length <= (uint)destination.Length)
+            {
+                Buffer.Memmove(ref destination.DangerousGetPinnableReference(), ref _pointer.Value, (nuint)_length);
+                retVal = true;
+            }
+            return retVal;
         }
 
         /// <summary>
@@ -319,7 +337,7 @@ namespace System
                 return Array.Empty<T>();
 
             var destination = new T[_length];
-            Span.CopyTo<T>(ref Unsafe.As<byte, T>(ref destination.GetRawSzArrayData()), ref _pointer.Value, _length);
+            Buffer.Memmove(ref Unsafe.As<byte, T>(ref destination.GetRawSzArrayData()), ref _pointer.Value, (nuint)_length);
             return destination;
         }
diff --git a/src/mscorlib/shared/System/Span.NonGeneric.cs b/src/mscorlib/shared/System/Span.NonGeneric.cs
index f6cd939..7c942f8 100644
--- a/src/mscorlib/shared/System/Span.NonGeneric.cs
+++ b/src/mscorlib/shared/System/Span.NonGeneric.cs
@@ -241,41 +241,7 @@ namespace System
 
             return new ReadOnlySpan<char>(ref Unsafe.Add(ref text.GetRawStringData(), start), length);
         }
-
-        internal static unsafe void CopyTo<T>(ref T destination, ref T source, int elementsCount)
-        {
-            if (Unsafe.AreSame(ref destination, ref source))
-                return;
-
-            if (elementsCount <= 1)
-            {
-                if (elementsCount == 1)
-                {
-                    destination = source;
-                }
-                return;
-            }
-
-            nuint byteCount = (nuint)elementsCount * (nuint)Unsafe.SizeOf<T>();
-            if (!RuntimeHelpers.IsReferenceOrContainsReferences<T>())
-            {
-                fixed (byte* pDestination = &Unsafe.As<T, byte>(ref destination))
-                {
-                    fixed (byte* pSource = &Unsafe.As<T, byte>(ref source))
-                    {
-                        Buffer.Memmove(pDestination, pSource, byteCount);
-                    }
-                }
-            }
-            else
-            {
-                RuntimeImports.RhBulkMoveWithWriteBarrier(
-                    ref Unsafe.As<T, byte>(ref destination),
-                    ref Unsafe.As<T, byte>(ref source),
-                    byteCount);
-            }
-        }
-
+
         internal static unsafe void ClearWithoutReferences(ref byte b, nuint byteLength)
         {
             if (byteLength == 0)
diff --git a/src/mscorlib/shared/System/Span.cs b/src/mscorlib/shared/System/Span.cs
index 8c57ab4..851e6fe 100644
--- a/src/mscorlib/shared/System/Span.cs
+++ b/src/mscorlib/shared/System/Span.cs
@@ -278,8 +278,18 @@ namespace System
         /// </exception>
         public void CopyTo(Span<T> destination)
         {
-            if (!TryCopyTo(destination))
+            // Using "if (!TryCopyTo(...))" results in two branches: one for the length
+            // check, and one for the result of TryCopyTo. Since these checks are equivalent,
+            // we can optimize by performing the check once ourselves then calling Memmove directly.
+
+            if ((uint)_length <= (uint)destination.Length)
+            {
+                Buffer.Memmove(ref destination.DangerousGetPinnableReference(), ref _pointer.Value, (nuint)_length);
+            }
+            else
+            {
                 ThrowHelper.ThrowArgumentException_DestinationTooShort();
+            }
         }
 
         /// <summary>
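The TryCopyTo rewrite above (and its twin for Span<T> just below) funnels both outcomes through a single retVal local and one return statement, a shape chosen to encourage better codegen per the commit message. A small self-contained usage sketch, with element type and sizes chosen arbitrarily:

    using System;

    class TryCopyToDemo
    {
        static void Main()
        {
            Span<int> source = stackalloc int[8];
            Span<int> destination = stackalloc int[4];

            // TryCopyTo reports failure instead of throwing, so callers can fall
            // back cheaply; on failure nothing is written to the destination.
            if (!source.TryCopyTo(destination))
            {
                Console.WriteLine("destination too short");
            }
        }
    }
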
@@ -292,11 +302,13 @@ namespace System
         /// <returns>If the destination span is shorter than the source span, this method
         /// return false and no data is written to the destination.</returns>
         public bool TryCopyTo(Span<T> destination)
         {
-            if ((uint)_length > (uint)destination.Length)
-                return false;
-
-            Span.CopyTo<T>(ref destination._pointer.Value, ref _pointer.Value, _length);
-            return true;
+            bool retVal = false;
+            if ((uint)_length <= (uint)destination.Length)
+            {
+                Buffer.Memmove(ref destination.DangerousGetPinnableReference(), ref _pointer.Value, (nuint)_length);
+                retVal = true;
+            }
+            return retVal;
         }
 
         /// <summary>
@@ -406,7 +418,7 @@ namespace System
                 return Array.Empty<T>();
 
             var destination = new T[_length];
-            Span.CopyTo<T>(ref Unsafe.As<byte, T>(ref destination.GetRawSzArrayData()), ref _pointer.Value, _length);
+            Buffer.Memmove(ref Unsafe.As<byte, T>(ref destination.GetRawSzArrayData()), ref _pointer.Value, (nuint)_length);
             return destination;
         }
diff --git a/src/mscorlib/src/System/Buffer.cs b/src/mscorlib/src/System/Buffer.cs
index 9d4d693..185e612 100644
--- a/src/mscorlib/src/System/Buffer.cs
+++ b/src/mscorlib/src/System/Buffer.cs
@@ -18,10 +18,13 @@ namespace System
     using System.Diagnostics;
     using System.Security;
     using System.Runtime;
+    using Internal.Runtime.CompilerServices;
 
 #if BIT64
+    using nint = System.Int64;
     using nuint = System.UInt64;
 #else // BIT64
+    using nint = System.Int32;
     using nuint = System.UInt32;
 #endif // BIT64
 
@@ -429,7 +432,240 @@ namespace System
             _Memmove(dest, src, len);
         }
 
-        // Non-inlinable wrapper around the QCall that avoids poluting the fast path
+        // This method has different signature for x64 and other platforms and is done for performance reasons.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void Memmove<T>(ref T destination, ref T source, nuint elementCount)
+        {
+            if (!RuntimeHelpers.IsReferenceOrContainsReferences<T>())
+            {
+                // Blittable memmove
+
+                Memmove(
+                    new ByReference<byte>(ref Unsafe.As<T, byte>(ref destination)),
+                    new ByReference<byte>(ref Unsafe.As<T, byte>(ref source)),
+                    elementCount * (nuint)Unsafe.SizeOf<T>());
+            }
+            else
+            {
+                // Non-blittable memmove
+
+                // Try to avoid calling RhBulkMoveWithWriteBarrier if we can get away
+                // with a no-op.
+                if (!Unsafe.AreSame(ref destination, ref source) && elementCount != 0)
+                {
+                    RuntimeImports.RhBulkMoveWithWriteBarrier(
+                        ref Unsafe.As<T, byte>(ref destination),
+                        ref Unsafe.As<T, byte>(ref source),
+                        elementCount * (nuint)Unsafe.SizeOf<T>());
+                }
+            }
+        }
+
+        // This method has different signature for x64 and other platforms and is done for performance reasons.
+        private static void Memmove(ByReference<byte> dest, ByReference<byte> src, nuint len)
+        {
+#if AMD64 || (BIT32 && !ARM)
+            const nuint CopyThreshold = 2048;
+#elif ARM64
+#if PLATFORM_WINDOWS
+            // Determined optimal value for Windows.
+            // https://github.com/dotnet/coreclr/issues/13843
+            const nuint CopyThreshold = UInt64.MaxValue;
+#else // PLATFORM_WINDOWS
+            // Managed code is currently faster than glibc unoptimized memmove
+            // TODO-ARM64-UNIX-OPT revisit when glibc optimized memmove is in Linux distros
+            // https://github.com/dotnet/coreclr/issues/13844
+            const nuint CopyThreshold = UInt64.MaxValue;
+#endif // PLATFORM_WINDOWS
+#else
+            const nuint CopyThreshold = 512;
+#endif // AMD64 || (BIT32 && !ARM)
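+
+            // Illustrative note, not part of commit #15947: the overlap test below is
+            // an unsigned-distance trick. Unsafe.ByteOffset(ref a, ref b) computes
+            // b - a; casting to nuint maps a negative distance to a huge value, so
+            // "(nuint)(b - a) < len" holds exactly when b lies inside [a, a + len).
+            // Example: src = 100, dest = 108, len = 16 gives (nuint)8 < 16 (overlap),
+            // while dest = 90 fails that test but passes the mirrored one, (nuint)10 < 16.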
+
+            // P/Invoke into the native version when the buffers are overlapping.
+
+            if (((nuint)Unsafe.ByteOffset(ref src.Value, ref dest.Value) < len) || ((nuint)Unsafe.ByteOffset(ref dest.Value, ref src.Value) < len))
+            {
+                goto BuffersOverlap;
+            }
+
+            // Use "(IntPtr)(nint)len" to avoid overflow checking on the explicit cast to IntPtr
+
+            ref byte srcEnd = ref Unsafe.Add(ref src.Value, (IntPtr)(nint)len);
+            ref byte destEnd = ref Unsafe.Add(ref dest.Value, (IntPtr)(nint)len);
+
+            if (len <= 16)
+                goto MCPY02;
+            if (len > 64)
+                goto MCPY05;
+
+MCPY00:
+// Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
+            Debug.Assert(len > 16 && len <= 64);
+#if HAS_CUSTOM_BLOCKS
+            Unsafe.As<byte, Block16>(ref dest.Value) = Unsafe.As<byte, Block16>(ref src.Value); // [0,16]
+#elif BIT64
+            Unsafe.As<byte, long>(ref dest.Value) = Unsafe.As<byte, long>(ref src.Value);
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 8)); // [0,16]
+#else
+            Unsafe.As<byte, int>(ref dest.Value) = Unsafe.As<byte, int>(ref src.Value);
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 4));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 8));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 12)); // [0,16]
+#endif
+            if (len <= 32)
+                goto MCPY01;
+#if HAS_CUSTOM_BLOCKS
+            Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest.Value, 16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src.Value, 16)); // [0,32]
+#elif BIT64
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 16));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 24)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 24)); // [0,32]
+#else
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 16));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 20)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 20));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 24)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 24));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 28)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 28)); // [0,32]
+#endif
+            if (len <= 48)
+                goto MCPY01;
+#if HAS_CUSTOM_BLOCKS
+            Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest.Value, 32)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src.Value, 32)); // [0,48]
+#elif BIT64
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 32)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 32));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 40)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 40)); // [0,48]
+#else
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 32)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 32));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 36)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 36));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 40)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 40));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 44)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 44)); // [0,48]
+#endif
+
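+            // Illustrative note, not part of commit #15947: MCPY00 copies forward in
+            // 16-byte chunks and MCPY01 then copies the final 16 bytes measured from
+            // the buffer ends. For len = 40 the chunks above cover [0,32) and MCPY01
+            // covers [24,40); bytes [24,32) are written twice, which is cheaper than
+            // branching on the exact remainder.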
+MCPY01:
+// Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
+            Debug.Assert(len > 16 && len <= 64);
+#if HAS_CUSTOM_BLOCKS
+            Unsafe.As<byte, Block16>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref srcEnd, -16));
+#elif BIT64
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -16));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -8));
+#else
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -16));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -12));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -8));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
+#endif
+            return;
+
+MCPY02:
+// Copy the first 8 bytes and then unconditionally copy the last 8 bytes and return.
+            if ((len & 24) == 0)
+                goto MCPY03;
+            Debug.Assert(len >= 8 && len <= 16);
+#if BIT64
+            Unsafe.As<byte, long>(ref dest.Value) = Unsafe.As<byte, long>(ref src.Value);
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -8));
+#else
+            Unsafe.As<byte, int>(ref dest.Value) = Unsafe.As<byte, int>(ref src.Value);
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 4));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -8));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
+#endif
+            return;
+
+MCPY03:
+// Copy the first 4 bytes and then unconditionally copy the last 4 bytes and return.
+            if ((len & 4) == 0)
+                goto MCPY04;
+            Debug.Assert(len >= 4 && len < 8);
+            Unsafe.As<byte, int>(ref dest.Value) = Unsafe.As<byte, int>(ref src.Value);
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
+            return;
+
+MCPY04:
+// Copy the first byte. For pending bytes, do an unconditionally copy of the last 2 bytes and return.
+            Debug.Assert(len < 4);
+            if (len == 0)
+                return;
+            dest.Value = src.Value;
+            if ((len & 2) == 0)
+                return;
+            Unsafe.As<byte, short>(ref Unsafe.Add(ref destEnd, -2)) = Unsafe.As<byte, short>(ref Unsafe.Add(ref srcEnd, -2));
+            return;
+
+MCPY05:
+// PInvoke to the native version when the copy length exceeds the threshold.
+            if (len > CopyThreshold)
+            {
+                goto PInvoke;
+            }
+
+            // Copy 64-bytes at a time until the remainder is less than 64.
+            // If remainder is greater than 16 bytes, then jump to MCPY00. Otherwise, unconditionally copy the last 16 bytes and return.
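+            // Illustrative note, not part of commit #15947: len >> 6 below is len / 64,
+            // the number of whole 64-byte blocks. E.g. len = 200 gives n = 3 block
+            // copies covering [0,192), leaving len % 64 = 8 bytes for the tail logic.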
+            Debug.Assert(len > 64 && len <= CopyThreshold);
+            nuint n = len >> 6;
+
+MCPY06:
+#if HAS_CUSTOM_BLOCKS
+            Unsafe.As<byte, Block64>(ref dest.Value) = Unsafe.As<byte, Block64>(ref src.Value);
+#elif BIT64
+            Unsafe.As<byte, long>(ref dest.Value) = Unsafe.As<byte, long>(ref src.Value);
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 8));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 16));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 24)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 24));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 32)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 32));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 40)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 40));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 48)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 48));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest.Value, 56)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src.Value, 56));
+#else
+            Unsafe.As<byte, int>(ref dest.Value) = Unsafe.As<byte, int>(ref src.Value);
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 4));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 8));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 12));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 16));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 20)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 20));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 24)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 24));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 28)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 28));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 32)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 32));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 36)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 36));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 40)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 40));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 44)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 44));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 48)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 48));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 52)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 52));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 56)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 56));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest.Value, 60)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src.Value, 60));
+#endif
+            dest = new ByReference<byte>(ref Unsafe.Add(ref dest.Value, 64));
+            src = new ByReference<byte>(ref Unsafe.Add(ref src.Value, 64));
+            n--;
+            if (n != 0)
+                goto MCPY06;
+
+            len %= 64;
+            if (len > 16)
+                goto MCPY00;
+#if HAS_CUSTOM_BLOCKS
+            Unsafe.As<byte, Block16>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref srcEnd, -16));
+#elif BIT64
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -16));
+            Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -8));
+#else
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -16));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -12));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -8));
+            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
+#endif
+            return;
+
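+            // Illustrative note, not part of commit #15947: after the 64-byte loop,
+            // len %= 64 leaves 0..63 bytes. A remainder above 16 re-enters MCPY00
+            // (e.g. len = 150: two block copies cover [0,128), then the 22-byte
+            // remainder goes through MCPY00/MCPY01); a remainder of 16 or less is
+            // finished by the unconditional 16-byte tail copy above.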
+BuffersOverlap:
+            // If the buffers overlap perfectly, there's no point to copying the data.
+            if (Unsafe.AreSame(ref dest.Value, ref src.Value))
+            {
+                return;
+            }
+
+PInvoke:
+            _Memmove(ref dest.Value, ref src.Value, len);
+        }
+
+        // Non-inlinable wrapper around the QCall that avoids polluting the fast path
         // with P/Invoke prolog/epilog.
         [MethodImplAttribute(MethodImplOptions.NoInlining)]
         private unsafe static void _Memmove(byte* dest, byte* src, nuint len)
         {
@@ -437,6 +673,16 @@ namespace System
             __Memmove(dest, src, len);
         }
 
+        // Non-inlinable wrapper around the QCall that avoids polluting the fast path
+        // with P/Invoke prolog/epilog.
+        [MethodImplAttribute(MethodImplOptions.NoInlining)]
+        private unsafe static void _Memmove(ref byte dest, ref byte src, nuint len)
+        {
+            fixed (byte* pDest = &dest)
+            fixed (byte* pSrc = &src)
+                __Memmove(pDest, pSrc, len);
+        }
+
         [DllImport(JitHelpers.QCall, CharSet = CharSet.Unicode)]
         extern private unsafe static void __Memmove(byte* dest, byte* src, nuint len);
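
Taken together, a CopyTo on a span of a blittable element type now flows through one length check into Buffer.Memmove<T> and the ref-based Memmove, with pinning deferred to the rare PInvoke path, while element types containing references take the RhBulkMoveWithWriteBarrier path so GC write barriers are preserved. A hedged usage sketch of both paths; buffer sizes and element types are chosen arbitrarily for illustration:

    using System;

    class MemmovePaths
    {
        static void Main()
        {
            Span<byte> source = stackalloc byte[40];
            Span<byte> destination = stackalloc byte[64];

            // Blittable element type: a single length check, then the managed
            // copy loop (40 bytes lands in the MCPY00/MCPY01 range).
            source.CopyTo(destination);

            // Element type containing references: Memmove<T> routes to
            // RhBulkMoveWithWriteBarrier instead of the byte-wise loop.
            var words = new string[] { "a", "b", "c" };
            var copy = new string[3];
            new ReadOnlySpan<string>(words).CopyTo(copy);
        }
    }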