From 80ffe5c713b87524aa87085cff81d09daf8c65b4 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Mon, 15 May 2023 16:12:16 +0200 Subject: [PATCH] Reduce overhead of calling into SearchValues (#86046) --- .../src/System/SearchValues/AnyByteSearchValues.cs | 14 +- .../System/SearchValues/AsciiByteSearchValues.cs | 6 +- .../System/SearchValues/AsciiCharSearchValues.cs | 8 +- .../System/SearchValues/IndexOfAnyAsciiSearcher.cs | 722 +++++++++++---------- .../src/System/SearchValues/SearchValues.cs | 2 +- 5 files changed, 380 insertions(+), 372 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AnyByteSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AnyByteSearchValues.cs index ce99ac0..a510214 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AnyByteSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AnyByteSearchValues.cs @@ -9,12 +9,14 @@ namespace System.Buffers { internal sealed class AnyByteSearchValues : SearchValues { - private readonly Vector128 _bitmap0; - private readonly Vector128 _bitmap1; + private Vector512 _bitmaps; private readonly BitVector256 _lookup; - public AnyByteSearchValues(ReadOnlySpan values) => - IndexOfAnyAsciiSearcher.ComputeBitmap256(values, out _bitmap0, out _bitmap1, out _lookup); + public AnyByteSearchValues(ReadOnlySpan values) + { + IndexOfAnyAsciiSearcher.ComputeBitmap256(values, out Vector256 bitmap0, out Vector256 bitmap1, out _lookup); + _bitmaps = Vector512.Create(bitmap0, bitmap1); + } internal override byte[] GetValues() => _lookup.GetByteValues(); @@ -43,7 +45,7 @@ namespace System.Buffers where TNegator : struct, IndexOfAnyAsciiSearcher.INegator { return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= sizeof(ulong) - ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized(ref searchSpace, searchSpaceLength, _bitmap0, _bitmap1) + ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorizedAnyByte(ref searchSpace, searchSpaceLength, ref _bitmaps) : IndexOfAnyScalar(ref searchSpace, searchSpaceLength); } @@ -52,7 +54,7 @@ namespace System.Buffers where TNegator : struct, IndexOfAnyAsciiSearcher.INegator { return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= sizeof(ulong) - ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized(ref searchSpace, searchSpaceLength, _bitmap0, _bitmap1) + ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorizedAnyByte(ref searchSpace, searchSpaceLength, ref _bitmaps) : LastIndexOfAnyScalar(ref searchSpace, searchSpaceLength); } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiByteSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiByteSearchValues.cs index b71ebbd..371695e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiByteSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiByteSearchValues.cs @@ -9,7 +9,7 @@ namespace System.Buffers { internal sealed class AsciiByteSearchValues : SearchValues { - private readonly Vector128 _bitmap; + private Vector256 _bitmap; private readonly BitVector256 _lookup; public AsciiByteSearchValues(ReadOnlySpan values) => @@ -42,7 +42,7 @@ namespace System.Buffers where TNegator : struct, IndexOfAnyAsciiSearcher.INegator { return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= sizeof(ulong) - ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized(ref searchSpace, searchSpaceLength, _bitmap) + ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized(ref searchSpace, searchSpaceLength, ref _bitmap) : IndexOfAnyScalar(ref searchSpace, searchSpaceLength); } @@ -51,7 +51,7 @@ namespace System.Buffers where TNegator : struct, IndexOfAnyAsciiSearcher.INegator { return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= sizeof(ulong) - ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized(ref searchSpace, searchSpaceLength, _bitmap) + ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized(ref searchSpace, searchSpaceLength, ref _bitmap) : LastIndexOfAnyScalar(ref searchSpace, searchSpaceLength); } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiCharSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiCharSearchValues.cs index f56541b..1505126 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiCharSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiCharSearchValues.cs @@ -10,10 +10,10 @@ namespace System.Buffers internal sealed class AsciiCharSearchValues : SearchValues where TOptimizations : struct, IndexOfAnyAsciiSearcher.IOptimizations { - private readonly Vector128 _bitmap; + private Vector256 _bitmap; private readonly BitVector256 _lookup; - public AsciiCharSearchValues(Vector128 bitmap, BitVector256 lookup) + public AsciiCharSearchValues(Vector256 bitmap, BitVector256 lookup) { _bitmap = bitmap; _lookup = lookup; @@ -46,7 +46,7 @@ namespace System.Buffers where TNegator : struct, IndexOfAnyAsciiSearcher.INegator { return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= Vector128.Count - ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized(ref Unsafe.As(ref searchSpace), searchSpaceLength, _bitmap) + ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized(ref Unsafe.As(ref searchSpace), searchSpaceLength, ref _bitmap) : IndexOfAnyScalar(ref searchSpace, searchSpaceLength); } @@ -55,7 +55,7 @@ namespace System.Buffers where TNegator : struct, IndexOfAnyAsciiSearcher.INegator { return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= Vector128.Count - ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized(ref Unsafe.As(ref searchSpace), searchSpaceLength, _bitmap) + ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized(ref Unsafe.As(ref searchSpace), searchSpaceLength, ref _bitmap) : LastIndexOfAnyScalar(ref searchSpace, searchSpaceLength); } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs index 45c5849..3ffa188 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs @@ -19,7 +19,7 @@ namespace System.Buffers { internal static bool IsVectorizationSupported => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported; - internal static unsafe void ComputeBitmap256(ReadOnlySpan values, out Vector128 bitmap0, out Vector128 bitmap1, out BitVector256 lookup) + internal static unsafe void ComputeBitmap256(ReadOnlySpan values, out Vector256 bitmap0, out Vector256 bitmap1, out BitVector256 lookup) { // The exact format of these bitmaps differs from the other ComputeBitmap overloads as it's meant for the full [0, 255] range algorithm. // See http://0x80.pl/articles/simd-byte-lookup.html#universal-algorithm @@ -47,12 +47,12 @@ namespace System.Buffers } } - bitmap0 = bitmapSpace0; - bitmap1 = bitmapSpace1; + bitmap0 = Vector256.Create(bitmapSpace0, bitmapSpace0); + bitmap1 = Vector256.Create(bitmapSpace1, bitmapSpace1); lookup = lookupLocal; } - internal static unsafe void ComputeBitmap(ReadOnlySpan values, out Vector128 bitmap, out BitVector256 lookup) + internal static unsafe void ComputeBitmap(ReadOnlySpan values, out Vector256 bitmap, out BitVector256 lookup) where T : struct, IUnsignedNumber { Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(char)); @@ -79,7 +79,7 @@ namespace System.Buffers bitmapLocal[(uint)lowNibble] |= (byte)(1 << highNibble); } - bitmap = bitmapSpace; + bitmap = Vector256.Create(bitmapSpace, bitmapSpace); lookup = lookupLocal; } @@ -133,9 +133,11 @@ namespace System.Buffers Vector128 bitmap = default; if (TryComputeBitmap(asciiValues, (byte*)&bitmap, out bool needleContainsZero)) { + Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); + index = (Ssse3.IsSupported || PackedSimd.IsSupported) && needleContainsZero - ? IndexOfAnyVectorized(ref searchSpace, searchSpaceLength, bitmap) - : IndexOfAnyVectorized(ref searchSpace, searchSpaceLength, bitmap); + ? IndexOfAnyVectorized(ref searchSpace, searchSpaceLength, ref bitmap256) + : IndexOfAnyVectorized(ref searchSpace, searchSpaceLength, ref bitmap256); return true; } } @@ -155,9 +157,11 @@ namespace System.Buffers Vector128 bitmap = default; if (TryComputeBitmap(asciiValues, (byte*)&bitmap, out bool needleContainsZero)) { + Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); + index = (Ssse3.IsSupported || PackedSimd.IsSupported) && needleContainsZero - ? LastIndexOfAnyVectorized(ref searchSpace, searchSpaceLength, bitmap) - : LastIndexOfAnyVectorized(ref searchSpace, searchSpaceLength, bitmap); + ? LastIndexOfAnyVectorized(ref searchSpace, searchSpaceLength, ref bitmap256) + : LastIndexOfAnyVectorized(ref searchSpace, searchSpaceLength, ref bitmap256); return true; } } @@ -166,91 +170,91 @@ namespace System.Buffers return false; } - internal static int IndexOfAnyVectorized(ref short searchSpace, int searchSpaceLength, Vector128 bitmap) + internal static int IndexOfAnyVectorized(ref short searchSpace, int searchSpaceLength, ref Vector256 bitmapRef) where TNegator : struct, INegator where TOptimizations : struct, IOptimizations { ref short currentSearchSpace = ref searchSpace; - if (searchSpaceLength > 2 * Vector128.Count) + if (Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) { - if (Avx2.IsSupported) - { - Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); + Vector256 bitmap256 = bitmapRef; - if (searchSpaceLength > 2 * Vector256.Count) - { - // Process the input in chunks of 32 characters (2 * Vector256). - // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. - // As packing two Vector256s into a Vector256 is cheap compared to the lookup, we can effectively double the throughput. - // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector256.Count)); - - do - { - Vector256 source0 = Vector256.LoadUnsafe(ref currentSearchSpace); - Vector256 source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256.Count); - - Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); - if (result != Vector256.Zero) - { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); - } - - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256.Count); - } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); - } + if (searchSpaceLength > 2 * Vector256.Count) + { + // Process the input in chunks of 32 characters (2 * Vector256). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. + // As packing two Vector256s into a Vector256 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector256.Count)); - // We have 1-32 characters remaining. Process the first and last vector in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a whole vector."); + do { - ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); - - ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) - ? ref oneVectorAwayFromEnd - : ref currentSearchSpace; - - Vector256 source0 = Vector256.LoadUnsafe(ref firstVector); - Vector256 source1 = Vector256.LoadUnsafe(ref oneVectorAwayFromEnd); + Vector256 source0 = Vector256.LoadUnsafe(ref currentSearchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256.Count); Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); if (result != Vector256.Zero) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); } - } - return -1; + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); } - else + + // We have 1-32 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a whole vector."); { - // Process the input in chunks of 16 characters (2 * Vector128). - // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128. - // As packing two Vector128s into a Vector128 is cheap compared to the lookup, we can effectively double the throughput. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector128.Count)); + ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); - do + ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) + ? ref oneVectorAwayFromEnd + : ref currentSearchSpace; + + Vector256 source0 = Vector256.LoadUnsafe(ref firstVector); + Vector256 source1 = Vector256.LoadUnsafe(ref oneVectorAwayFromEnd); + + Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); + if (result != Vector256.Zero) { - Vector128 source0 = Vector128.LoadUnsafe(ref currentSearchSpace); - Vector128 source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128.Count); + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + } + } - Vector128 result = IndexOfAnyLookup(source0, source1, bitmap); - if (result != Vector128.Zero) - { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); - } + return -1; + } + + Vector128 bitmap = bitmapRef._lower; - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector128.Count); + if (!Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) + { + // Process the input in chunks of 16 characters (2 * Vector128). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128. + // As packing two Vector128s into a Vector128 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector128.Count)); + + do + { + Vector128 source0 = Vector128.LoadUnsafe(ref currentSearchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128.Count); + + Vector128 result = IndexOfAnyLookup(source0, source1, bitmap); + if (result != Vector128.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector128.Count); } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); } // We have 1-16 characters remaining. Process the first and last vector in the search space. @@ -276,91 +280,91 @@ namespace System.Buffers return -1; } - internal static int LastIndexOfAnyVectorized(ref short searchSpace, int searchSpaceLength, Vector128 bitmap) + internal static int LastIndexOfAnyVectorized(ref short searchSpace, int searchSpaceLength, ref Vector256 bitmapRef) where TNegator : struct, INegator where TOptimizations : struct, IOptimizations { ref short currentSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceLength); - if (searchSpaceLength > 2 * Vector128.Count) + if (Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) { - if (Avx2.IsSupported) - { - Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); - - if (searchSpaceLength > 2 * Vector256.Count) - { - // Process the input in chunks of 32 characters (2 * Vector256). - // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. - // As packing two Vector256s into a Vector256 is cheap compared to the lookup, we can effectively double the throughput. - // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector256.Count); - - do - { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector256.Count); + Vector256 bitmap256 = bitmapRef; - Vector256 source0 = Vector256.LoadUnsafe(ref currentSearchSpace); - Vector256 source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256.Count); - - Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); - if (result != Vector256.Zero) - { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); - } - } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); - } + if (searchSpaceLength > 2 * Vector256.Count) + { + // Process the input in chunks of 32 characters (2 * Vector256). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. + // As packing two Vector256s into a Vector256 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector256.Count); - // We have 1-32 characters remaining. Process the first and last vector in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a whole vector."); + do { - ref short oneVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector256.Count); - ref short secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAfterStart) - ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count) - : ref searchSpace; - - Vector256 source0 = Vector256.LoadUnsafe(ref searchSpace); - Vector256 source1 = Vector256.LoadUnsafe(ref secondVector); + Vector256 source0 = Vector256.LoadUnsafe(ref currentSearchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256.Count); Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); if (result != Vector256.Zero) { - return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); } } - - return -1; + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); } - else + + // We have 1-32 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a whole vector."); { - // Process the input in chunks of 16 characters (2 * Vector128). - // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128. - // As packing two Vector128s into a Vector128 is cheap compared to the lookup, we can effectively double the throughput. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector128.Count); + ref short oneVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); - do + ref short secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAfterStart) + ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count) + : ref searchSpace; + + Vector256 source0 = Vector256.LoadUnsafe(ref searchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref secondVector); + + Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); + if (result != Vector256.Zero) { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector128.Count); + return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + } + } - Vector128 source0 = Vector128.LoadUnsafe(ref currentSearchSpace); - Vector128 source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128.Count); + return -1; + } - Vector128 result = IndexOfAnyLookup(source0, source1, bitmap); - if (result != Vector128.Zero) - { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); - } + Vector128 bitmap = bitmapRef._lower; + + if (!Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) + { + // Process the input in chunks of 16 characters (2 * Vector128). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128. + // As packing two Vector128s into a Vector128 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector128.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector128.Count); + + Vector128 source0 = Vector128.LoadUnsafe(ref currentSearchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128.Count); + + Vector128 result = IndexOfAnyLookup(source0, source1, bitmap); + if (result != Vector128.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); } // We have 1-16 characters remaining. Process the first and last vector in the search space. @@ -386,85 +390,85 @@ namespace System.Buffers return -1; } - internal static int IndexOfAnyVectorized(ref byte searchSpace, int searchSpaceLength, Vector128 bitmap) + internal static int IndexOfAnyVectorized(ref byte searchSpace, int searchSpaceLength, ref Vector256 bitmapRef) where TNegator : struct, INegator { ref byte currentSearchSpace = ref searchSpace; - if (searchSpaceLength > Vector128.Count) + if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) { - if (Avx2.IsSupported) + Vector256 bitmap256 = bitmapRef; + + if (searchSpaceLength > Vector256.Count) { - Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); + // Process the input in chunks of 32 bytes. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); - if (searchSpaceLength > Vector256.Count) + do { - // Process the input in chunks of 32 bytes. - // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); + Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); - do + Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); + if (result != Vector256.Zero) { - Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); - - Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); - if (result != Vector256.Zero) - { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); - } - - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256.Count); + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256.Count); } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + } - // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); - { - ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); + // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); + { + ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); - ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) - ? ref halfVectorAwayFromEnd - : ref currentSearchSpace; + ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) + ? ref halfVectorAwayFromEnd + : ref currentSearchSpace; - Vector128 source0 = Vector128.LoadUnsafe(ref firstVector); - Vector128 source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd); - Vector256 source = Vector256.Create(source0, source1); + Vector128 source0 = Vector128.LoadUnsafe(ref firstVector); + Vector128 source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd); + Vector256 source = Vector256.Create(source0, source1); - Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); - if (result != Vector256.Zero) - { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); - } + Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); + if (result != Vector256.Zero) + { + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); } - - return -1; } - else - { - // Process the input in chunks of 16 bytes. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); - do - { - Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + return -1; + } - Vector128 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap)); - if (result != Vector128.Zero) - { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); - } + Vector128 bitmap = bitmapRef._lower; + + if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) + { + // Process the input in chunks of 16 bytes. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128.Count); + do + { + Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + + Vector128 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap)); + if (result != Vector128.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128.Count); } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); } // We have 1-16 bytes remaining. Process the first and last half vectors in the search space. @@ -491,85 +495,85 @@ namespace System.Buffers return -1; } - internal static int LastIndexOfAnyVectorized(ref byte searchSpace, int searchSpaceLength, Vector128 bitmap) + internal static int LastIndexOfAnyVectorized(ref byte searchSpace, int searchSpaceLength, ref Vector256 bitmapRef) where TNegator : struct, INegator { ref byte currentSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceLength); - if (searchSpaceLength > Vector128.Count) + if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) { - if (Avx2.IsSupported) - { - Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); - - if (searchSpaceLength > Vector256.Count) - { - // Process the input in chunks of 32 bytes. - // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); - - do - { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count); + Vector256 bitmap256 = bitmapRef; - Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); - - Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); - if (result != Vector256.Zero) - { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); - } - } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); - } + if (searchSpaceLength > Vector256.Count) + { + // Process the input in chunks of 32 bytes. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); - // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); + do { - ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count); - ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) - ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count) - : ref searchSpace; - - Vector128 source0 = Vector128.LoadUnsafe(ref searchSpace); - Vector128 source1 = Vector128.LoadUnsafe(ref secondVector); - Vector256 source = Vector256.Create(source0, source1); + Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); if (result != Vector256.Zero) { - return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); } } - - return -1; + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } - else + + // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); { - // Process the input in chunks of 16 bytes. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); + ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); - do + ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) + ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count) + : ref searchSpace; + + Vector128 source0 = Vector128.LoadUnsafe(ref searchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref secondVector); + Vector256 source = Vector256.Create(source0, source1); + + Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); + if (result != Vector256.Zero) { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count); + return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + } + } - Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + return -1; + } - Vector128 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap)); - if (result != Vector128.Zero) - { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); - } + Vector128 bitmap = bitmapRef._lower; + + if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) + { + // Process the input in chunks of 16 bytes. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count); + + Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + + Vector128 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap)); + if (result != Vector128.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } // We have 1-16 bytes remaining. Process the first and last half vectors in the search space. @@ -596,86 +600,87 @@ namespace System.Buffers return -1; } - internal static int IndexOfAnyVectorized(ref byte searchSpace, int searchSpaceLength, Vector128 bitmap0, Vector128 bitmap1) + internal static int IndexOfAnyVectorizedAnyByte(ref byte searchSpace, int searchSpaceLength, ref Vector512 bitmapsRef) where TNegator : struct, INegator { ref byte currentSearchSpace = ref searchSpace; - if (searchSpaceLength > Vector128.Count) + if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) { - if (Avx2.IsSupported) + Vector256 bitmap256_0 = bitmapsRef._lower; + Vector256 bitmap256_1 = bitmapsRef._upper; + + if (searchSpaceLength > Vector256.Count) { - Vector256 bitmap256_0 = Vector256.Create(bitmap0, bitmap0); - Vector256 bitmap256_1 = Vector256.Create(bitmap1, bitmap1); + // Process the input in chunks of 32 bytes. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); - if (searchSpaceLength > Vector256.Count) + do { - // Process the input in chunks of 32 bytes. - // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); + Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); - do + Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); + if (result != Vector256.Zero) { - Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); - - Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); - if (result != Vector256.Zero) - { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); - } - - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256.Count); + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256.Count); } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + } - // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); - { - ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); + // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); + { + ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); - ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) - ? ref halfVectorAwayFromEnd - : ref currentSearchSpace; + ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) + ? ref halfVectorAwayFromEnd + : ref currentSearchSpace; - Vector128 source0 = Vector128.LoadUnsafe(ref firstVector); - Vector128 source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd); - Vector256 source = Vector256.Create(source0, source1); + Vector128 source0 = Vector128.LoadUnsafe(ref firstVector); + Vector128 source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd); + Vector256 source = Vector256.Create(source0, source1); - Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); - if (result != Vector256.Zero) - { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); - } + Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); + if (result != Vector256.Zero) + { + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); } - - return -1; } - else - { - // Process the input in chunks of 16 bytes. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); - do - { - Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + return -1; + } - Vector128 result = IndexOfAnyLookup(source, bitmap0, bitmap1); - if (result != Vector128.Zero) - { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); - } + Vector128 bitmap0 = bitmapsRef._lower._lower; + Vector128 bitmap1 = bitmapsRef._upper._lower; + + if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) + { + // Process the input in chunks of 16 bytes. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128.Count); + do + { + Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + + Vector128 result = IndexOfAnyLookup(source, bitmap0, bitmap1); + if (result != Vector128.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128.Count); } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); } // We have 1-16 bytes remaining. Process the first and last half vectors in the search space. @@ -702,86 +707,87 @@ namespace System.Buffers return -1; } - internal static int LastIndexOfAnyVectorized(ref byte searchSpace, int searchSpaceLength, Vector128 bitmap0, Vector128 bitmap1) + internal static int LastIndexOfAnyVectorizedAnyByte(ref byte searchSpace, int searchSpaceLength, ref Vector512 bitmapsRef) where TNegator : struct, INegator { ref byte currentSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceLength); - if (searchSpaceLength > Vector128.Count) + if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) { - if (Avx2.IsSupported) - { - Vector256 bitmap256_0 = Vector256.Create(bitmap0, bitmap0); - Vector256 bitmap256_1 = Vector256.Create(bitmap1, bitmap1); - - if (searchSpaceLength > Vector256.Count) - { - // Process the input in chunks of 32 bytes. - // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); - - do - { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count); - - Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); + Vector256 bitmap256_0 = bitmapsRef._lower; + Vector256 bitmap256_1 = bitmapsRef._upper; - Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); - if (result != Vector256.Zero) - { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); - } - } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); - } + if (searchSpaceLength > Vector256.Count) + { + // Process the input in chunks of 32 bytes. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); - // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); + do { - ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); - - ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) - ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count) - : ref searchSpace; + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count); - Vector128 source0 = Vector128.LoadUnsafe(ref searchSpace); - Vector128 source1 = Vector128.LoadUnsafe(ref secondVector); - Vector256 source = Vector256.Create(source0, source1); + Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); if (result != Vector256.Zero) { - return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); } } - - return -1; + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } - else + + // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); { - // Process the input in chunks of 16 bytes. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); + ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); - do + ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) + ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count) + : ref searchSpace; + + Vector128 source0 = Vector128.LoadUnsafe(ref searchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref secondVector); + Vector256 source = Vector256.Create(source0, source1); + + Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); + if (result != Vector256.Zero) { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count); + return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + } + } - Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + return -1; + } - Vector128 result = IndexOfAnyLookup(source, bitmap0, bitmap1); - if (result != Vector128.Zero) - { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); - } + Vector128 bitmap0 = bitmapsRef._lower._lower; + Vector128 bitmap1 = bitmapsRef._upper._lower; + + if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) + { + // Process the input in chunks of 16 bytes. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count); + + Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + + Vector128 result = IndexOfAnyLookup(source, bitmap0, bitmap1); + if (result != Vector128.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } // We have 1-16 bytes remaining. Process the first and last half vectors in the search space. diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs index c53ff57..9070e51 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs @@ -112,7 +112,7 @@ namespace System.Buffers // IndexOfAnyAsciiSearcher for chars is slower than Any3CharSearchValues, but faster than Any4SearchValues if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && maxInclusive < 128) { - IndexOfAnyAsciiSearcher.ComputeBitmap(values, out Vector128 bitmap, out BitVector256 lookup); + IndexOfAnyAsciiSearcher.ComputeBitmap(values, out Vector256 bitmap, out BitVector256 lookup); return (Ssse3.IsSupported || PackedSimd.IsSupported) && lookup.Contains(0) ? new AsciiCharSearchValues(bitmap, lookup) -- 2.7.4