From: Ben Adams
Date: Mon, 27 Apr 2020 22:19:24 +0000 (+0100)
Subject: Use intrinsics for SequenceEqual vectorization to emit at R2R (#32371)
X-Git-Tag: submit/tizen/20210909.063632~8334
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=535b998621ae5fb02b80c79b71912c62f1177ccb;p=platform%2Fupstream%2Fdotnet%2Fruntime.git

Use intrinsics for SequenceEqual vectorization to emit at R2R (#32371)
---

diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
index 025d22e..a657ceb 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
@@ -13,8 +13,10 @@ using Internal.Runtime.CompilerServices;
 #pragma warning disable SA1121 // explicitly using type aliases instead of built-in types
 #if TARGET_64BIT
 using nuint = System.UInt64;
+using nint = System.Int64;
 #else
 using nuint = System.UInt32;
+using nint = System.Int32;
 #endif // TARGET_64BIT
 
 namespace System
@@ -1309,85 +1311,210 @@ namespace System
         // Optimized byte-based SequenceEquals. The "length" parameter for this one is declared a nuint rather than int as we also use it for types other than byte
         // where the length can exceed 2Gb once scaled by sizeof(T).
-        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
-        public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
+        public static bool SequenceEqual(ref byte first, ref byte second, nuint length)
         {
-            IntPtr offset = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations
-            IntPtr lengthToExamine = (IntPtr)(void*)length;
+            bool result;
+            // Use nint for arithmetic to avoid unnecessary 64->32->64 truncations
+            if (length >= sizeof(nuint))
+            {
+                // Conditional jmp forward to favor shorter lengths. (See comment at "Equal:" label)
+                // The longer lengths can make back the time due to branch misprediction
+                // better than shorter lengths.
+                goto Longer;
+            }
 
-            if ((byte*)lengthToExamine >= (byte*)sizeof(UIntPtr))
+#if TARGET_64BIT
+            // On 32-bit, this will always be true since sizeof(nuint) == 4
+            if (length < sizeof(uint))
+#endif
             {
-                // Only check that the ref is the same if buffers are large, and hence
-                // its worth avoiding doing unnecessary comparisons
-                if (Unsafe.AreSame(ref first, ref second))
-                    goto Equal;
+                uint differentBits = 0;
+                nuint offset = (length & 2);
+                if (offset != 0)
+                {
+                    differentBits = LoadUShort(ref first);
+                    differentBits -= LoadUShort(ref second);
+                }
+                if ((length & 1) != 0)
+                {
+                    differentBits |= (uint)Unsafe.AddByteOffset(ref first, offset) - (uint)Unsafe.AddByteOffset(ref second, offset);
+                }
+                result = (differentBits == 0);
+                goto Result;
+            }
+#if TARGET_64BIT
+            else
+            {
+                nuint offset = length - sizeof(uint);
+                uint differentBits = LoadUInt(ref first) - LoadUInt(ref second);
+                differentBits |= LoadUInt(ref first, offset) - LoadUInt(ref second, offset);
+                result = (differentBits == 0);
+                goto Result;
+            }
+#endif
+        Longer:
+            // Only check that the ref is the same if buffers are large,
+            // and hence its worth avoiding doing unnecessary comparisons
+            if (!Unsafe.AreSame(ref first, ref second))
+            {
+                // C# compiler inverts this test, making the outer goto the conditional jmp.
+                goto Vector;
+            }
 
-                if (Vector.IsHardwareAccelerated && (byte*)lengthToExamine >= (byte*)Vector<byte>.Count)
+            // This becomes a conditional jmp forward to not favor it.
+            goto Equal;
+
+        Result:
+            return result;
+        // When the sequence is equal; which is the longest execution, we want it to determine that
+        // as fast as possible so we do not want the early outs to be "predicted not taken" branches.
+        Equal:
+            return true;
+
+        Vector:
+            if (Sse2.IsSupported)
+            {
+                if (Avx2.IsSupported && length >= (nuint)Vector256<byte>.Count)
                 {
-                    lengthToExamine -= Vector<byte>.Count;
-                    while ((byte*)lengthToExamine > (byte*)offset)
+                    Vector256<byte> vecResult;
+                    nuint offset = 0;
+                    nuint lengthToExamine = length - (nuint)Vector256<byte>.Count;
+                    // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
+                    Debug.Assert(lengthToExamine < length);
+                    if (lengthToExamine != 0)
                     {
+                        do
                         {
-                        if (LoadVector(ref first, offset) != LoadVector(ref second, offset))
+                            vecResult = Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset));
+                            if (Avx2.MoveMask(vecResult) != -1)
+                            {
+                                goto NotEqual;
+                            }
-                        offset += Vector<byte>.Count;
+                            offset += (nuint)Vector256<byte>.Count;
+                        } while (lengthToExamine > offset);
                     }
-                    return LoadVector(ref first, lengthToExamine) == LoadVector(ref second, lengthToExamine);
-                }
-                Debug.Assert((byte*)lengthToExamine >= (byte*)sizeof(UIntPtr));
 
+                    // Do final compare as Vector256<byte>.Count from end rather than start
+                    vecResult = Avx2.CompareEqual(LoadVector256(ref first, lengthToExamine), LoadVector256(ref second, lengthToExamine));
+                    if (Avx2.MoveMask(vecResult) == -1)
+                    {
+                        // C# compiler inverts this test, making the outer goto the conditional jmp.
+                        goto Equal;
+                    }
 
-                lengthToExamine -= sizeof(UIntPtr);
-                while ((byte*)lengthToExamine > (byte*)offset)
+                    // This becomes a conditional jmp forward to not favor it.
+                    goto NotEqual;
+                }
+                // Use Vector128<byte>.Size as Vector128<byte>.Count doesn't inline at R2R time
+                // https://github.com/dotnet/runtime/issues/32714
+                else if (length >= Vector128<byte>.Size)
                 {
-                    if (LoadUIntPtr(ref first, offset) != LoadUIntPtr(ref second, offset))
+                    Vector128<byte> vecResult;
+                    nuint offset = 0;
+                    nuint lengthToExamine = length - Vector128<byte>.Size;
+                    // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
+                    Debug.Assert(lengthToExamine < length);
+                    if (lengthToExamine != 0)
                     {
-                        goto NotEqual;
+                        do
+                        {
+                            // We use intrinsics directly as .Equals calls .AsByte() which doesn't inline at R2R time
+                            // https://github.com/dotnet/runtime/issues/32714
+                            vecResult = Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset));
+                            if (Sse2.MoveMask(vecResult) != 0xFFFF)
+                            {
+                                goto NotEqual;
+                            }
+                            offset += Vector128<byte>.Size;
+                        } while (lengthToExamine > offset);
                     }
-                    offset += sizeof(UIntPtr);
-                }
-                return LoadUIntPtr(ref first, lengthToExamine) == LoadUIntPtr(ref second, lengthToExamine);
-            }
 
-            Debug.Assert((byte*)lengthToExamine < (byte*)sizeof(UIntPtr));
+                    // Do final compare as Vector128<byte>.Count from end rather than start
+                    vecResult = Sse2.CompareEqual(LoadVector128(ref first, lengthToExamine), LoadVector128(ref second, lengthToExamine));
+                    if (Sse2.MoveMask(vecResult) == 0xFFFF)
+                    {
+                        // C# compiler inverts this test, making the outer goto the conditional jmp.
+                        goto Equal;
+                    }
 
-            // On 32-bit, this will never be true since sizeof(UIntPtr) == 4
-#if TARGET_64BIT
-            if ((byte*)lengthToExamine >= (byte*)sizeof(int))
-            {
-                if (LoadInt(ref first, offset) != LoadInt(ref second, offset))
-                {
+                    // This becomes a conditional jmp forward to not favor it.
                     goto NotEqual;
                 }
-                offset += sizeof(int);
-                lengthToExamine -= sizeof(int);
             }
-#endif
-
-            if ((byte*)lengthToExamine >= (byte*)sizeof(short))
+            else if (Vector.IsHardwareAccelerated && length >= (nuint)Vector<byte>.Count)
             {
-                if (LoadShort(ref first, offset) != LoadShort(ref second, offset))
+                nuint offset = 0;
+                nuint lengthToExamine = length - (nuint)Vector<byte>.Count;
+                // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
+                Debug.Assert(lengthToExamine < length);
+                if (lengthToExamine > 0)
                 {
-                    goto NotEqual;
+                    do
+                    {
+                        if (LoadVector(ref first, offset) != LoadVector(ref second, offset))
+                        {
+                            goto NotEqual;
+                        }
+                        offset += (nuint)Vector<byte>.Count;
+                    } while (lengthToExamine > offset);
                 }
-                offset += sizeof(short);
-                lengthToExamine -= sizeof(short);
+
+                // Do final compare as Vector<byte>.Count from end rather than start
+                if (LoadVector(ref first, lengthToExamine) == LoadVector(ref second, lengthToExamine))
+                {
+                    // C# compiler inverts this test, making the outer goto the conditional jmp.
+                    goto Equal;
+                }
+
+                // This becomes a conditional jmp forward to not favor it.
+                goto NotEqual;
             }
 
-            if (lengthToExamine != IntPtr.Zero)
+#if TARGET_64BIT
+            if (Sse2.IsSupported)
             {
-                Debug.Assert((int)lengthToExamine == 1);
+                Debug.Assert(length <= sizeof(nuint) * 2);
 
-                if (Unsafe.AddByteOffset(ref first, offset) != Unsafe.AddByteOffset(ref second, offset))
+                nuint offset = length - sizeof(nuint);
+                nuint differentBits = LoadNUInt(ref first) - LoadNUInt(ref second);
+                differentBits |= LoadNUInt(ref first, offset) - LoadNUInt(ref second, offset);
+                result = (differentBits == 0);
+                goto Result;
+            }
+            else
+#endif
+            {
+                Debug.Assert(length >= sizeof(nuint));
                 {
-                    goto NotEqual;
+                    nuint offset = 0;
+                    nuint lengthToExamine = length - sizeof(nuint);
+                    // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
+                    Debug.Assert(lengthToExamine < length);
+                    if (lengthToExamine > 0)
+                    {
+                        do
+                        {
+                            // Compare unsigned so not do a sign extend mov on 64 bit
+                            if (LoadNUInt(ref first, offset) != LoadNUInt(ref second, offset))
+                            {
+                                goto NotEqual;
+                            }
+                            offset += sizeof(nuint);
+                        } while (lengthToExamine > offset);
+                    }
+
+                    // Do final compare as sizeof(nuint) from end rather than start
+                    result = (LoadNUInt(ref first, lengthToExamine) == LoadNUInt(ref second, lengthToExamine));
+                    goto Result;
                 }
             }
 
-        Equal:
-            return true;
-        NotEqual: // Workaround for https://github.com/dotnet/runtime/issues/8795
+            // As there are so many true/false exit points the Jit will coalesce them to one location.
+            // We want them at the end so the conditional early exit jmps are all jmp forwards so the
+            // branch predictor in an uninitialized state will not take them e.g.
+            // - loops are conditional jmps backwards and predicted
+            // - exceptions are conditional forwards jmps and not predicted
+        NotEqual:
             return false;
         }
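The vectorized paths in the hunk above rest on one idiom: CompareEqual turns every matching byte lane into 0xFF, and MoveMask collapses the lane sign bits into an integer, so a 16-byte Vector128 block is equal exactly when the mask is 0xFFFF (and a 32-byte Avx2 block when the mask is -1). Below is a minimal standalone sketch of that idiom, assuming .NET Core 3.x or later on x86/x64; Blocks16Equal is an illustrative name, not an API introduced by this diff.

    // Standalone sketch, not CoreLib code: shows the CompareEqual + MoveMask
    // pattern used by the new Vector128 path of SequenceEqual.
    using System;
    using System.Runtime.CompilerServices;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    static class MoveMaskSketch
    {
        // Compares one 16-byte block: equal byte lanes become 0xFF, MoveMask
        // gathers the 16 lane MSBs, so an all-equal block yields exactly 0xFFFF.
        static bool Blocks16Equal(ref byte first, ref byte second)
        {
            if (!Sse2.IsSupported)
                throw new PlatformNotSupportedException("Sketch assumes SSE2.");

            Vector128<byte> a = Unsafe.ReadUnaligned<Vector128<byte>>(ref first);
            Vector128<byte> b = Unsafe.ReadUnaligned<Vector128<byte>>(ref second);
            return Sse2.MoveMask(Sse2.CompareEqual(a, b)) == 0xFFFF;
        }

        static void Main()
        {
            byte[] x = new byte[16];
            byte[] y = new byte[16];
            for (int i = 0; i < x.Length; i++) x[i] = y[i] = (byte)i;

            Console.WriteLine(Blocks16Equal(ref x[0], ref y[0])); // True
            y[5] ^= 0x80;
            Console.WriteLine(Blocks16Equal(ref x[0], ref y[0])); // False
        }
    }

Checking the mask once per block keeps the hot loop to two loads, a compare, a move-mask and a single predictable branch, which is what the new long-length path relies on.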
@@ -1644,27 +1771,55 @@ namespace System
                                                        0x01ul << 48) + 1;
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe short LoadShort(ref byte start, IntPtr offset)
-            => Unsafe.ReadUnaligned<short>(ref Unsafe.AddByteOffset(ref start, offset));
+        private static ushort LoadUShort(ref byte start)
+            => Unsafe.ReadUnaligned<ushort>(ref start);
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static ushort LoadUShort(ref byte start, nuint offset)
+            => Unsafe.ReadUnaligned<ushort>(ref Unsafe.AddByteOffset(ref start, offset));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe int LoadInt(ref byte start, IntPtr offset)
-            => Unsafe.ReadUnaligned<int>(ref Unsafe.AddByteOffset(ref start, offset));
+        private static uint LoadUInt(ref byte start)
+            => Unsafe.ReadUnaligned<uint>(ref start);
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe UIntPtr LoadUIntPtr(ref byte start, IntPtr offset)
+        private static uint LoadUInt(ref byte start, nuint offset)
+            => Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref start, offset));
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static nuint LoadNUInt(ref byte start)
+            => Unsafe.ReadUnaligned<nuint>(ref start);
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static nuint LoadNUInt(ref byte start, nuint offset)
+            => Unsafe.ReadUnaligned<nuint>(ref Unsafe.AddByteOffset(ref start, offset));
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static UIntPtr LoadUIntPtr(ref byte start, IntPtr offset)
             => Unsafe.ReadUnaligned<UIntPtr>(ref Unsafe.AddByteOffset(ref start, offset));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector<byte> LoadVector(ref byte start, IntPtr offset)
+        private static Vector<byte> LoadVector(ref byte start, IntPtr offset)
+            => Unsafe.ReadUnaligned<Vector<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector<byte> LoadVector(ref byte start, nuint offset)
             => Unsafe.ReadUnaligned<Vector<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<byte> LoadVector128(ref byte start, IntPtr offset)
+        private static Vector128<byte> LoadVector128(ref byte start, IntPtr offset)
             => Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector256<byte> LoadVector256(ref byte start, IntPtr offset)
+        private static Vector128<byte> LoadVector128(ref byte start, nuint offset)
+            => Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<byte> LoadVector256(ref byte start, IntPtr offset)
+            => Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<byte> LoadVector256(ref byte start, nuint offset)
            => Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
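For lengths shorter than sizeof(nuint), the new scalar head of SequenceEqual avoids a byte-by-byte loop by folding any mismatch into a single accumulator: an optional 2-byte load covers lengths 2-3, an optional trailing byte covers an odd length, and one test of the accumulator decides the result (the 4-7 byte case on 64-bit uses the same idea with two overlapping uint loads). The sketch below reproduces only the 0-3 byte case outside CoreLib; UpTo3BytesEqual and its int length parameter are illustrative, not part of the diff.

    // Standalone sketch, not CoreLib code: the overlapping-load trick used by
    // the diff's short-length path, with all differences OR-ed into one value.
    using System;
    using System.Runtime.CompilerServices;

    static class ShortCompareSketch
    {
        static bool UpTo3BytesEqual(ref byte first, ref byte second, int length)
        {
            uint differentBits = 0;
            int offset = length & 2;                 // 2 if length is 2 or 3, else 0
            if (offset != 0)
            {
                // Unaligned 2-byte loads compare the first two bytes in one step.
                differentBits = (uint)(Unsafe.ReadUnaligned<ushort>(ref first)
                                     - Unsafe.ReadUnaligned<ushort>(ref second));
            }
            if ((length & 1) != 0)
            {
                // Compare the trailing odd byte, if any.
                differentBits |= (uint)Unsafe.Add(ref first, offset)
                               - (uint)Unsafe.Add(ref second, offset);
            }
            // Equal iff no difference was accumulated; a single final branch.
            return differentBits == 0;
        }

        static void Main()
        {
            byte[] a = { 1, 2, 3 };
            byte[] b = { 1, 2, 4 };
            Console.WriteLine(UpTo3BytesEqual(ref a[0], ref b[0], 3)); // False
            Console.WriteLine(UpTo3BytesEqual(ref a[0], ref b[0], 2)); // True
        }
    }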
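At the call site nothing changes: the public span APIs keep routing bitwise-equatable element types such as byte into SpanHelpers.SequenceEqual, so ordinary code picks up the R2R-friendly vectorized path automatically. A small usage sketch, assuming the standard MemoryExtensions API:

    using System;

    class Usage
    {
        static void Main()
        {
            byte[] left  = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
            byte[] right = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };

            // MemoryExtensions.SequenceEqual forwards byte-sized comparisons to
            // SpanHelpers.SequenceEqual(ref byte, ref byte, nuint).
            Console.WriteLine(left.AsSpan().SequenceEqual(right)); // True

            right[8] = 0;
            Console.WriteLine(left.AsSpan().SequenceEqual(right)); // False
        }
    }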