Reduce overhead of calling into SearchValues (#86046)
author Miha Zupan <mihazupan.zupan1@gmail.com>
Mon, 15 May 2023 14:12:16 +0000 (16:12 +0200)
committer GitHub <noreply@github.com>
Mon, 15 May 2023 14:12:16 +0000 (10:12 -0400)
src/libraries/System.Private.CoreLib/src/System/SearchValues/AnyByteSearchValues.cs
src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiByteSearchValues.cs
src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiCharSearchValues.cs
src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs
src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs

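diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AnyByteSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AnyByteSearchValues.cs
index ce99ac0..a510214 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AnyByteSearchValues.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AnyByteSearchValues.cs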
@@ -9,12 +9,14 @@ namespace System.Buffers
 {
     internal sealed class AnyByteSearchValues : SearchValues<byte>
     {
-        private readonly Vector128<byte> _bitmap0;
-        private readonly Vector128<byte> _bitmap1;
+        private Vector512<byte> _bitmaps;
         private readonly BitVector256 _lookup;
 
-        public AnyByteSearchValues(ReadOnlySpan<byte> values) =>
-            IndexOfAnyAsciiSearcher.ComputeBitmap256(values, out _bitmap0, out _bitmap1, out _lookup);
+        public AnyByteSearchValues(ReadOnlySpan<byte> values)
+        {
+            IndexOfAnyAsciiSearcher.ComputeBitmap256(values, out Vector256<byte> bitmap0, out Vector256<byte> bitmap1, out _lookup);
+            _bitmaps = Vector512.Create(bitmap0, bitmap1);
+        }
 
         internal override byte[] GetValues() => _lookup.GetByteValues();
 
@@ -43,7 +45,7 @@ namespace System.Buffers
             where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
         {
             return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= sizeof(ulong)
-                ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized<TNegator>(ref searchSpace, searchSpaceLength, _bitmap0, _bitmap1)
+                ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorizedAnyByte<TNegator>(ref searchSpace, searchSpaceLength, ref _bitmaps)
                 : IndexOfAnyScalar<TNegator>(ref searchSpace, searchSpaceLength);
         }
 
@@ -52,7 +54,7 @@ namespace System.Buffers
             where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
         {
             return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= sizeof(ulong)
-                ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized<TNegator>(ref searchSpace, searchSpaceLength, _bitmap0, _bitmap1)
+                ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorizedAnyByte<TNegator>(ref searchSpace, searchSpaceLength, ref _bitmaps)
                 : LastIndexOfAnyScalar<TNegator>(ref searchSpace, searchSpaceLength);
         }
 
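diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiByteSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiByteSearchValues.cs
index b71ebbd..371695e 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiByteSearchValues.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiByteSearchValues.cs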
@@ -9,7 +9,7 @@ namespace System.Buffers
 {
     internal sealed class AsciiByteSearchValues : SearchValues<byte>
     {
-        private readonly Vector128<byte> _bitmap;
+        private Vector256<byte> _bitmap;
         private readonly BitVector256 _lookup;
 
         public AsciiByteSearchValues(ReadOnlySpan<byte> values) =>
@@ -42,7 +42,7 @@ namespace System.Buffers
             where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
         {
             return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= sizeof(ulong)
-                ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized<TNegator>(ref searchSpace, searchSpaceLength, _bitmap)
+                ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized<TNegator>(ref searchSpace, searchSpaceLength, ref _bitmap)
                 : IndexOfAnyScalar<TNegator>(ref searchSpace, searchSpaceLength);
         }
 
@@ -51,7 +51,7 @@ namespace System.Buffers
             where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
         {
             return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= sizeof(ulong)
-                ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized<TNegator>(ref searchSpace, searchSpaceLength, _bitmap)
+                ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized<TNegator>(ref searchSpace, searchSpaceLength, ref _bitmap)
                 : LastIndexOfAnyScalar<TNegator>(ref searchSpace, searchSpaceLength);
         }
 
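diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiCharSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiCharSearchValues.cs
index f56541b..1505126 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiCharSearchValues.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/AsciiCharSearchValues.cs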
@@ -10,10 +10,10 @@ namespace System.Buffers
     internal sealed class AsciiCharSearchValues<TOptimizations> : SearchValues<char>
         where TOptimizations : struct, IndexOfAnyAsciiSearcher.IOptimizations
     {
-        private readonly Vector128<byte> _bitmap;
+        private Vector256<byte> _bitmap;
         private readonly BitVector256 _lookup;
 
-        public AsciiCharSearchValues(Vector128<byte> bitmap, BitVector256 lookup)
+        public AsciiCharSearchValues(Vector256<byte> bitmap, BitVector256 lookup)
         {
             _bitmap = bitmap;
             _lookup = lookup;
@@ -46,7 +46,7 @@ namespace System.Buffers
             where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
         {
             return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= Vector128<short>.Count
-                ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized<TNegator, TOptimizations>(ref Unsafe.As<char, short>(ref searchSpace), searchSpaceLength, _bitmap)
+                ? IndexOfAnyAsciiSearcher.IndexOfAnyVectorized<TNegator, TOptimizations>(ref Unsafe.As<char, short>(ref searchSpace), searchSpaceLength, ref _bitmap)
                 : IndexOfAnyScalar<TNegator>(ref searchSpace, searchSpaceLength);
         }
 
@@ -55,7 +55,7 @@ namespace System.Buffers
             where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
         {
             return IndexOfAnyAsciiSearcher.IsVectorizationSupported && searchSpaceLength >= Vector128<short>.Count
-                ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized<TNegator, TOptimizations>(ref Unsafe.As<char, short>(ref searchSpace), searchSpaceLength, _bitmap)
+                ? IndexOfAnyAsciiSearcher.LastIndexOfAnyVectorized<TNegator, TOptimizations>(ref Unsafe.As<char, short>(ref searchSpace), searchSpaceLength, ref _bitmap)
                 : LastIndexOfAnyScalar<TNegator>(ref searchSpace, searchSpaceLength);
         }
 
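diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs
index 45c5849..3ffa188 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs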
@@ -19,7 +19,7 @@ namespace System.Buffers
     {
         internal static bool IsVectorizationSupported => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported;
 
-        internal static unsafe void ComputeBitmap256(ReadOnlySpan<byte> values, out Vector128<byte> bitmap0, out Vector128<byte> bitmap1, out BitVector256 lookup)
+        internal static unsafe void ComputeBitmap256(ReadOnlySpan<byte> values, out Vector256<byte> bitmap0, out Vector256<byte> bitmap1, out BitVector256 lookup)
         {
             // The exact format of these bitmaps differs from the other ComputeBitmap overloads as it's meant for the full [0, 255] range algorithm.
             // See http://0x80.pl/articles/simd-byte-lookup.html#universal-algorithm
@@ -47,12 +47,12 @@ namespace System.Buffers
                 }
             }
 
-            bitmap0 = bitmapSpace0;
-            bitmap1 = bitmapSpace1;
+            bitmap0 = Vector256.Create(bitmapSpace0, bitmapSpace0);
+            bitmap1 = Vector256.Create(bitmapSpace1, bitmapSpace1);
             lookup = lookupLocal;
         }
 
-        internal static unsafe void ComputeBitmap<T>(ReadOnlySpan<T> values, out Vector128<byte> bitmap, out BitVector256 lookup)
+        internal static unsafe void ComputeBitmap<T>(ReadOnlySpan<T> values, out Vector256<byte> bitmap, out BitVector256 lookup)
             where T : struct, IUnsignedNumber<T>
         {
             Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(char));
@@ -79,7 +79,7 @@ namespace System.Buffers
                 bitmapLocal[(uint)lowNibble] |= (byte)(1 << highNibble);
             }
 
-            bitmap = bitmapSpace;
+            bitmap = Vector256.Create(bitmapSpace, bitmapSpace);
             lookup = lookupLocal;
         }
 
@@ -133,9 +133,11 @@ namespace System.Buffers
                 Vector128<byte> bitmap = default;
                 if (TryComputeBitmap(asciiValues, (byte*)&bitmap, out bool needleContainsZero))
                 {
+                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
+
                     index = (Ssse3.IsSupported || PackedSimd.IsSupported) && needleContainsZero
-                        ? IndexOfAnyVectorized<TNegator, Ssse3AndWasmHandleZeroInNeedle>(ref searchSpace, searchSpaceLength, bitmap)
-                        : IndexOfAnyVectorized<TNegator, Default>(ref searchSpace, searchSpaceLength, bitmap);
+                        ? IndexOfAnyVectorized<TNegator, Ssse3AndWasmHandleZeroInNeedle>(ref searchSpace, searchSpaceLength, ref bitmap256)
+                        : IndexOfAnyVectorized<TNegator, Default>(ref searchSpace, searchSpaceLength, ref bitmap256);
                     return true;
                 }
             }
@@ -155,9 +157,11 @@ namespace System.Buffers
                 Vector128<byte> bitmap = default;
                 if (TryComputeBitmap(asciiValues, (byte*)&bitmap, out bool needleContainsZero))
                 {
+                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
+
                     index = (Ssse3.IsSupported || PackedSimd.IsSupported) && needleContainsZero
-                        ? LastIndexOfAnyVectorized<TNegator, Ssse3AndWasmHandleZeroInNeedle>(ref searchSpace, searchSpaceLength, bitmap)
-                        : LastIndexOfAnyVectorized<TNegator, Default>(ref searchSpace, searchSpaceLength, bitmap);
+                        ? LastIndexOfAnyVectorized<TNegator, Ssse3AndWasmHandleZeroInNeedle>(ref searchSpace, searchSpaceLength, ref bitmap256)
+                        : LastIndexOfAnyVectorized<TNegator, Default>(ref searchSpace, searchSpaceLength, ref bitmap256);
                     return true;
                 }
             }
@@ -166,91 +170,91 @@ namespace System.Buffers
             return false;
         }
 
-        internal static int IndexOfAnyVectorized<TNegator, TOptimizations>(ref short searchSpace, int searchSpaceLength, Vector128<byte> bitmap)
+        internal static int IndexOfAnyVectorized<TNegator, TOptimizations>(ref short searchSpace, int searchSpaceLength, ref Vector256<byte> bitmapRef)
             where TNegator : struct, INegator
             where TOptimizations : struct, IOptimizations
         {
             ref short currentSearchSpace = ref searchSpace;
 
-            if (searchSpaceLength > 2 * Vector128<short>.Count)
+            if (Avx2.IsSupported && searchSpaceLength > 2 * Vector128<short>.Count)
             {
-                if (Avx2.IsSupported)
-                {
-                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
+                Vector256<byte> bitmap256 = bitmapRef;
 
-                    if (searchSpaceLength > 2 * Vector256<short>.Count)
-                    {
-                        // Process the input in chunks of 32 characters (2 * Vector256<short>).
-                        // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
-                        // As packing two Vector256<short>s into a Vector256<byte> is cheap compared to the lookup, we can effectively double the throughput.
-                        // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
-                        // Let the fallback below handle it instead. This is why the condition is
-                        // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
-                        ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector256<short>.Count));
-
-                        do
-                        {
-                            Vector256<short> source0 = Vector256.LoadUnsafe(ref currentSearchSpace);
-                            Vector256<short> source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256<short>.Count);
-
-                            Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
-                            if (result != Vector256<byte>.Zero)
-                            {
-                                return ComputeFirstIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                            }
-
-                            currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256<short>.Count);
-                        }
-                        while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
-                    }
+                if (searchSpaceLength > 2 * Vector256<short>.Count)
+                {
+                    // Process the input in chunks of 32 characters (2 * Vector256<short>).
+                    // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
+                    // As packing two Vector256<short>s into a Vector256<byte> is cheap compared to the lookup, we can effectively double the throughput.
+                    // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
+                    // Let the fallback below handle it instead. This is why the condition is
+                    // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+                    ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector256<short>.Count));
 
-                    // We have 1-32 characters remaining. Process the first and last vector in the search space.
-                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
-                    Debug.Assert(searchSpaceLength >= Vector256<short>.Count, "We expect that the input is long enough for us to load a whole vector.");
+                    do
                     {
-                        ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256<short>.Count);
-
-                        ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)
-                            ? ref oneVectorAwayFromEnd
-                            : ref currentSearchSpace;
-
-                        Vector256<short> source0 = Vector256.LoadUnsafe(ref firstVector);
-                        Vector256<short> source1 = Vector256.LoadUnsafe(ref oneVectorAwayFromEnd);
+                        Vector256<short> source0 = Vector256.LoadUnsafe(ref currentSearchSpace);
+                        Vector256<short> source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256<short>.Count);
 
                         Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
                         if (result != Vector256<byte>.Zero)
                         {
-                            return ComputeFirstIndexOverlapped<short, TNegator>(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result);
+                            return ComputeFirstIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                         }
-                    }
 
-                    return -1;
+                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256<short>.Count);
+                    }
+                    while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
                 }
-                else
+
+                // We have 1-32 characters remaining. Process the first and last vector in the search space.
+                // They may overlap, but we'll handle that in the index calculation if we do get a match.
+                Debug.Assert(searchSpaceLength >= Vector256<short>.Count, "We expect that the input is long enough for us to load a whole vector.");
                 {
-                    // Process the input in chunks of 16 characters (2 * Vector128<short>).
-                    // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128<byte>.
-                    // As packing two Vector128<short>s into a Vector128<byte> is cheap compared to the lookup, we can effectively double the throughput.
-                    // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                    // Let the fallback below handle it instead. This is why the condition is
-                    // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
-                    ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector128<short>.Count));
+                    ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256<short>.Count);
 
-                    do
+                    ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)
+                        ? ref oneVectorAwayFromEnd
+                        : ref currentSearchSpace;
+
+                    Vector256<short> source0 = Vector256.LoadUnsafe(ref firstVector);
+                    Vector256<short> source1 = Vector256.LoadUnsafe(ref oneVectorAwayFromEnd);
+
+                    Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
+                    if (result != Vector256<byte>.Zero)
                     {
-                        Vector128<short> source0 = Vector128.LoadUnsafe(ref currentSearchSpace);
-                        Vector128<short> source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128<short>.Count);
+                        return ComputeFirstIndexOverlapped<short, TNegator>(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result);
+                    }
+                }
 
-                        Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap);
-                        if (result != Vector128<byte>.Zero)
-                        {
-                            return ComputeFirstIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                        }
+                return -1;
+            }
+
+            Vector128<byte> bitmap = bitmapRef._lower;
 
-                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector128<short>.Count);
+            if (!Avx2.IsSupported && searchSpaceLength > 2 * Vector128<short>.Count)
+            {
+                // Process the input in chunks of 16 characters (2 * Vector128<short>).
+                // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128<byte>.
+                // As packing two Vector128<short>s into a Vector128<byte> is cheap compared to the lookup, we can effectively double the throughput.
+                // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
+                // Let the fallback below handle it instead. This is why the condition is
+                // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+                ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector128<short>.Count));
+
+                do
+                {
+                    Vector128<short> source0 = Vector128.LoadUnsafe(ref currentSearchSpace);
+                    Vector128<short> source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128<short>.Count);
+
+                    Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap);
+                    if (result != Vector128<byte>.Zero)
+                    {
+                        return ComputeFirstIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                     }
-                    while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
+
+                    currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector128<short>.Count);
                 }
+                while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
             }
 
             // We have 1-16 characters remaining. Process the first and last vector in the search space.
@@ -276,91 +280,91 @@ namespace System.Buffers
             return -1;
         }
 
-        internal static int LastIndexOfAnyVectorized<TNegator, TOptimizations>(ref short searchSpace, int searchSpaceLength, Vector128<byte> bitmap)
+        internal static int LastIndexOfAnyVectorized<TNegator, TOptimizations>(ref short searchSpace, int searchSpaceLength, ref Vector256<byte> bitmapRef)
             where TNegator : struct, INegator
             where TOptimizations : struct, IOptimizations
         {
             ref short currentSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
 
-            if (searchSpaceLength > 2 * Vector128<short>.Count)
+            if (Avx2.IsSupported && searchSpaceLength > 2 * Vector128<short>.Count)
             {
-                if (Avx2.IsSupported)
-                {
-                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
-
-                    if (searchSpaceLength > 2 * Vector256<short>.Count)
-                    {
-                        // Process the input in chunks of 32 characters (2 * Vector256<short>).
-                        // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
-                        // As packing two Vector256<short>s into a Vector256<byte> is cheap compared to the lookup, we can effectively double the throughput.
-                        // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
-                        // Let the fallback below handle it instead. This is why the condition is
-                        // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
-                        ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector256<short>.Count);
-
-                        do
-                        {
-                            currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector256<short>.Count);
+                Vector256<byte> bitmap256 = bitmapRef;
 
-                            Vector256<short> source0 = Vector256.LoadUnsafe(ref currentSearchSpace);
-                            Vector256<short> source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256<short>.Count);
-
-                            Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
-                            if (result != Vector256<byte>.Zero)
-                            {
-                                return ComputeLastIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                            }
-                        }
-                        while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart));
-                    }
+                if (searchSpaceLength > 2 * Vector256<short>.Count)
+                {
+                    // Process the input in chunks of 32 characters (2 * Vector256<short>).
+                    // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
+                    // As packing two Vector256<short>s into a Vector256<byte> is cheap compared to the lookup, we can effectively double the throughput.
+                    // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
+                    // Let the fallback below handle it instead. This is why the condition is
+                    // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                    ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector256<short>.Count);
 
-                    // We have 1-32 characters remaining. Process the first and last vector in the search space.
-                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
-                    Debug.Assert(searchSpaceLength >= Vector256<short>.Count, "We expect that the input is long enough for us to load a whole vector.");
+                    do
                     {
-                        ref short oneVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256<short>.Count);
+                        currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector256<short>.Count);
 
-                        ref short secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAfterStart)
-                            ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256<short>.Count)
-                            : ref searchSpace;
-
-                        Vector256<short> source0 = Vector256.LoadUnsafe(ref searchSpace);
-                        Vector256<short> source1 = Vector256.LoadUnsafe(ref secondVector);
+                        Vector256<short> source0 = Vector256.LoadUnsafe(ref currentSearchSpace);
+                        Vector256<short> source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256<short>.Count);
 
                         Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
                         if (result != Vector256<byte>.Zero)
                         {
-                            return ComputeLastIndexOverlapped<short, TNegator>(ref searchSpace, ref secondVector, result);
+                            return ComputeLastIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                         }
                     }
-
-                    return -1;
+                    while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart));
                 }
-                else
+
+                // We have 1-32 characters remaining. Process the first and last vector in the search space.
+                // They may overlap, but we'll handle that in the index calculation if we do get a match.
+                Debug.Assert(searchSpaceLength >= Vector256<short>.Count, "We expect that the input is long enough for us to load a whole vector.");
                 {
-                    // Process the input in chunks of 16 characters (2 * Vector128<short>).
-                    // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128<byte>.
-                    // As packing two Vector128<short>s into a Vector128<byte> is cheap compared to the lookup, we can effectively double the throughput.
-                    // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                    // Let the fallback below handle it instead. This is why the condition is
-                    // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
-                    ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector128<short>.Count);
+                    ref short oneVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256<short>.Count);
 
-                    do
+                    ref short secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAfterStart)
+                        ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256<short>.Count)
+                        : ref searchSpace;
+
+                    Vector256<short> source0 = Vector256.LoadUnsafe(ref searchSpace);
+                    Vector256<short> source1 = Vector256.LoadUnsafe(ref secondVector);
+
+                    Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
+                    if (result != Vector256<byte>.Zero)
                     {
-                        currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector128<short>.Count);
+                        return ComputeLastIndexOverlapped<short, TNegator>(ref searchSpace, ref secondVector, result);
+                    }
+                }
 
-                        Vector128<short> source0 = Vector128.LoadUnsafe(ref currentSearchSpace);
-                        Vector128<short> source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128<short>.Count);
+                return -1;
+            }
 
-                        Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap);
-                        if (result != Vector128<byte>.Zero)
-                        {
-                            return ComputeLastIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                        }
+            Vector128<byte> bitmap = bitmapRef._lower;
+
+            if (!Avx2.IsSupported && searchSpaceLength > 2 * Vector128<short>.Count)
+            {
+                // Process the input in chunks of 16 characters (2 * Vector128<short>).
+                // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128<byte>.
+                // As packing two Vector128<short>s into a Vector128<byte> is cheap compared to the lookup, we can effectively double the throughput.
+                // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
+                // Let the fallback below handle it instead. This is why the condition is
+                // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector128<short>.Count);
+
+                do
+                {
+                    currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector128<short>.Count);
+
+                    Vector128<short> source0 = Vector128.LoadUnsafe(ref currentSearchSpace);
+                    Vector128<short> source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128<short>.Count);
+
+                    Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap);
+                    if (result != Vector128<byte>.Zero)
+                    {
+                        return ComputeLastIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                     }
-                    while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart));
                 }
+                while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart));
             }
 
             // We have 1-16 characters remaining. Process the first and last vector in the search space.
@@ -386,85 +390,85 @@ namespace System.Buffers
             return -1;
         }
 
-        internal static int IndexOfAnyVectorized<TNegator>(ref byte searchSpace, int searchSpaceLength, Vector128<byte> bitmap)
+        internal static int IndexOfAnyVectorized<TNegator>(ref byte searchSpace, int searchSpaceLength, ref Vector256<byte> bitmapRef) // Forward scan; bitmapRef is copied into a local on whichever path runs below.
             where TNegator : struct, INegator
         {
             ref byte currentSearchSpace = ref searchSpace;
 
-            if (searchSpaceLength > Vector128<byte>.Count)
+            if (Avx2.IsSupported && searchSpaceLength > Vector128<byte>.Count)
             {
-                if (Avx2.IsSupported)
+                Vector256<byte> bitmap256 = bitmapRef; // The AVX2 path uses the whole 256-bit bitmap.
+
+                if (searchSpaceLength > Vector256<byte>.Count)
                 {
-                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
+                    // Walk forward through the input 32 bytes at a time.
+                    // When the length is an exact multiple of 32, the final 32 bytes are deliberately not consumed here;
+                    // the overlapped half-vector step below covers them instead. That is why the length check above
+                    // uses ">" rather than ">=", and why the loop tests "IsAddressLessThan" rather than "!IsAddressGreaterThan".
+                    ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256<byte>.Count);
 
-                    if (searchSpaceLength > Vector256<byte>.Count)
+                    do
                     {
-                        // Process the input in chunks of 32 bytes.
-                        // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
-                        // Let the fallback below handle it instead. This is why the condition is
-                        // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
-                        ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256<byte>.Count);
+                        Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
 
-                        do
+                        Vector256<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256));
+                        if (result != Vector256<byte>.Zero)
                         {
-                            Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
-
-                            Vector256<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256));
-                            if (result != Vector256<byte>.Zero)
-                            {
-                                return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                            }
-
-                            currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<byte>.Count);
+                            return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                         }
-                        while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
+
+                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<byte>.Count);
                     }
+                    while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
+                }
 
-                    // We have 1-32 bytes remaining. Process the first and last half vectors in the search space.
-                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
-                    Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
-                    {
-                        ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
+                // We have 1-32 bytes remaining. Load one 16-byte half at the current position (clamped so it starts no later
+                // than the last 16 bytes) and one covering the final 16 bytes. They may overlap, but we'll handle that in the index calculation if we do get a match.
+                Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
+                {
+                    ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
 
-                        ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd)
-                            ? ref halfVectorAwayFromEnd
-                            : ref currentSearchSpace;
+                    ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd)
+                        ? ref halfVectorAwayFromEnd
+                        : ref currentSearchSpace;
 
-                        Vector128<byte> source0 = Vector128.LoadUnsafe(ref firstVector);
-                        Vector128<byte> source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd);
-                        Vector256<byte> source = Vector256.Create(source0, source1);
+                    Vector128<byte> source0 = Vector128.LoadUnsafe(ref firstVector);
+                    Vector128<byte> source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd);
+                    Vector256<byte> source = Vector256.Create(source0, source1);
 
-                        Vector256<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256));
-                        if (result != Vector256<byte>.Zero)
-                        {
-                            return ComputeFirstIndexOverlapped<byte, TNegator>(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result);
-                        }
+                    Vector256<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256));
+                    if (result != Vector256<byte>.Zero)
+                    {
+                        return ComputeFirstIndexOverlapped<byte, TNegator>(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result);
                     }
-
-                    return -1;
                 }
-                else
-                {
-                    // Process the input in chunks of 16 bytes.
-                    // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                    // Let the fallback below handle it instead. This is why the condition is
-                    // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
-                    ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
 
-                    do
-                    {
-                        Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+                return -1; // No chunk on the AVX2 path produced a non-zero lookup result.
+            }
 
-                        Vector128<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap));
-                        if (result != Vector128<byte>.Zero)
-                        {
-                            return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                        }
+            Vector128<byte> bitmap = bitmapRef._lower; // The paths below use only the lower 128 bits of the bitmap.
+
+            if (!Avx2.IsSupported && searchSpaceLength > Vector128<byte>.Count)
+            {
+                // Walk forward through the input 16 bytes at a time.
+                // When the length is an exact multiple of 16, the final 16 bytes are deliberately not consumed here;
+                // the fallback below covers them instead. That is why the length check above
+                // uses ">" rather than ">=", and why the loop tests "IsAddressLessThan" rather than "!IsAddressGreaterThan".
+                ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
 
-                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128<byte>.Count);
+                do
+                {
+                    Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+
+                    Vector128<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap));
+                    if (result != Vector128<byte>.Zero)
+                    {
+                        return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                     }
-                    while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
+
+                    currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128<byte>.Count);
                 }
+                while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
             }
 
             // We have 1-16 bytes remaining. Process the first and last half vectors in the search space.
@@ -491,85 +495,85 @@ namespace System.Buffers
             return -1;
         }
 
-        internal static int LastIndexOfAnyVectorized<TNegator>(ref byte searchSpace, int searchSpaceLength, Vector128<byte> bitmap)
+        internal static int LastIndexOfAnyVectorized<TNegator>(ref byte searchSpace, int searchSpaceLength, ref Vector256<byte> bitmapRef) // Backward scan; bitmapRef is copied into a local on whichever path runs below.
             where TNegator : struct, INegator
         {
             ref byte currentSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
 
-            if (searchSpaceLength > Vector128<byte>.Count)
+            if (Avx2.IsSupported && searchSpaceLength > Vector128<byte>.Count)
             {
-                if (Avx2.IsSupported)
-                {
-                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
-
-                    if (searchSpaceLength > Vector256<byte>.Count)
-                    {
-                        // Process the input in chunks of 32 bytes.
-                        // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
-                        // Let the fallback below handle it instead. This is why the condition is
-                        // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
-                        ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256<byte>.Count);
-
-                        do
-                        {
-                            currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256<byte>.Count);
+                Vector256<byte> bitmap256 = bitmapRef; // The AVX2 path uses the whole 256-bit bitmap.
 
-                            Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
-
-                            Vector256<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256));
-                            if (result != Vector256<byte>.Zero)
-                            {
-                                return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                            }
-                        }
-                        while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
-                    }
+                if (searchSpaceLength > Vector256<byte>.Count)
+                {
+                    // Walk backward from the end of the input 32 bytes at a time.
+                    // When the length is an exact multiple of 32, the 32 bytes at the start of the input are deliberately not consumed here;
+                    // the overlapped half-vector step below covers them instead. That is why the length check above
+                    // uses ">" rather than ">=", and why the loop tests "IsAddressGreaterThan" rather than "!IsAddressLessThan".
+                    ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256<byte>.Count);
 
-                    // We have 1-32 bytes remaining. Process the first and last half vectors in the search space.
-                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
-                    Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
+                    do
                     {
-                        ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
+                        currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256<byte>.Count);
 
-                        ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart)
-                            ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count)
-                            : ref searchSpace;
-
-                        Vector128<byte> source0 = Vector128.LoadUnsafe(ref searchSpace);
-                        Vector128<byte> source1 = Vector128.LoadUnsafe(ref secondVector);
-                        Vector256<byte> source = Vector256.Create(source0, source1);
+                        Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
 
                         Vector256<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256));
                         if (result != Vector256<byte>.Zero)
                         {
-                            return ComputeLastIndexOverlapped<byte, TNegator>(ref searchSpace, ref secondVector, result);
+                            return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                         }
                     }
-
-                    return -1;
+                    while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
                 }
-                else
+
+                // We have 1-32 bytes remaining. Load the first 16 bytes of the input and the 16 bytes that end at the current position
+                // (or the first 16 again when the current position is within them). They may overlap, but we'll handle that in the index calculation if we do get a match.
+                Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
                 {
-                    // Process the input in chunks of 16 bytes.
-                    // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                    // Let the fallback below handle it instead. This is why the condition is
-                    // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
-                    ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
+                    ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
 
-                    do
+                    ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart)
+                        ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count)
+                        : ref searchSpace;
+
+                    Vector128<byte> source0 = Vector128.LoadUnsafe(ref searchSpace);
+                    Vector128<byte> source1 = Vector128.LoadUnsafe(ref secondVector);
+                    Vector256<byte> source = Vector256.Create(source0, source1);
+
+                    Vector256<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256));
+                    if (result != Vector256<byte>.Zero)
                     {
-                        currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count);
+                        return ComputeLastIndexOverlapped<byte, TNegator>(ref searchSpace, ref secondVector, result);
+                    }
+                }
 
-                        Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+                return -1; // No chunk on the AVX2 path produced a non-zero lookup result.
+            }
 
-                        Vector128<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap));
-                        if (result != Vector128<byte>.Zero)
-                        {
-                            return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                        }
+            Vector128<byte> bitmap = bitmapRef._lower; // The paths below use only the lower 128 bits of the bitmap.
+
+            if (!Avx2.IsSupported && searchSpaceLength > Vector128<byte>.Count)
+            {
+                // Walk backward from the end of the input 16 bytes at a time.
+                // When the length is an exact multiple of 16, the 16 bytes at the start of the input are deliberately not consumed here;
+                // the fallback below covers them instead. That is why the length check above
+                // uses ">" rather than ">=", and why the loop tests "IsAddressGreaterThan" rather than "!IsAddressLessThan".
+                ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
+
+                do
+                {
+                    currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count);
+
+                    Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+
+                    Vector128<byte> result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap));
+                    if (result != Vector128<byte>.Zero)
+                    {
+                        return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                     }
-                    while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
                 }
+                while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
             }
 
             // We have 1-16 bytes remaining. Process the first and last half vectors in the search space.
@@ -596,86 +600,87 @@ namespace System.Buffers
             return -1;
         }
 
-        internal static int IndexOfAnyVectorized<TNegator>(ref byte searchSpace, int searchSpaceLength, Vector128<byte> bitmap0, Vector128<byte> bitmap1)
+        internal static int IndexOfAnyVectorizedAnyByte<TNegator>(ref byte searchSpace, int searchSpaceLength, ref Vector512<byte> bitmapsRef) // Forward scan; bitmapsRef holds two bitmaps that are split into locals on whichever path runs below.
             where TNegator : struct, INegator
         {
             ref byte currentSearchSpace = ref searchSpace;
 
-            if (searchSpaceLength > Vector128<byte>.Count)
+            if (Avx2.IsSupported && searchSpaceLength > Vector128<byte>.Count)
             {
-                if (Avx2.IsSupported)
+                Vector256<byte> bitmap256_0 = bitmapsRef._lower; // First bitmap: lower 256 bits of the Vector512.
+                Vector256<byte> bitmap256_1 = bitmapsRef._upper; // Second bitmap: upper 256 bits of the Vector512.
+
+                if (searchSpaceLength > Vector256<byte>.Count)
                 {
-                    Vector256<byte> bitmap256_0 = Vector256.Create(bitmap0, bitmap0);
-                    Vector256<byte> bitmap256_1 = Vector256.Create(bitmap1, bitmap1);
+                    // Walk forward through the input 32 bytes at a time.
+                    // When the length is an exact multiple of 32, the final 32 bytes are deliberately not consumed here;
+                    // the overlapped half-vector step below covers them instead. That is why the length check above
+                    // uses ">" rather than ">=", and why the loop tests "IsAddressLessThan" rather than "!IsAddressGreaterThan".
+                    ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256<byte>.Count);
 
-                    if (searchSpaceLength > Vector256<byte>.Count)
+                    do
                     {
-                        // Process the input in chunks of 32 bytes.
-                        // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
-                        // Let the fallback below handle it instead. This is why the condition is
-                        // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
-                        ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256<byte>.Count);
+                        Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
 
-                        do
+                        Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
+                        if (result != Vector256<byte>.Zero)
                         {
-                            Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
-
-                            Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
-                            if (result != Vector256<byte>.Zero)
-                            {
-                                return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                            }
-
-                            currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<byte>.Count);
+                            return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                         }
-                        while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
+
+                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<byte>.Count);
                     }
+                    while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
+                }
 
-                    // We have 1-32 bytes remaining. Process the first and last half vectors in the search space.
-                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
-                    Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
-                    {
-                        ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
+                // We have 1-32 bytes remaining. Load one 16-byte half at the current position (clamped so it starts no later
+                // than the last 16 bytes) and one covering the final 16 bytes. They may overlap, but we'll handle that in the index calculation if we do get a match.
+                Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
+                {
+                    ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
 
-                        ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd)
-                            ? ref halfVectorAwayFromEnd
-                            : ref currentSearchSpace;
+                    ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd)
+                        ? ref halfVectorAwayFromEnd
+                        : ref currentSearchSpace;
 
-                        Vector128<byte> source0 = Vector128.LoadUnsafe(ref firstVector);
-                        Vector128<byte> source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd);
-                        Vector256<byte> source = Vector256.Create(source0, source1);
+                    Vector128<byte> source0 = Vector128.LoadUnsafe(ref firstVector);
+                    Vector128<byte> source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd);
+                    Vector256<byte> source = Vector256.Create(source0, source1);
 
-                        Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
-                        if (result != Vector256<byte>.Zero)
-                        {
-                            return ComputeFirstIndexOverlapped<byte, TNegator>(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result);
-                        }
+                    Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
+                    if (result != Vector256<byte>.Zero)
+                    {
+                        return ComputeFirstIndexOverlapped<byte, TNegator>(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result);
                     }
-
-                    return -1;
                 }
-                else
-                {
-                    // Process the input in chunks of 16 bytes.
-                    // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                    // Let the fallback below handle it instead. This is why the condition is
-                    // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
-                    ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
 
-                    do
-                    {
-                        Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+                return -1; // No chunk on the AVX2 path produced a non-zero lookup result.
+            }
 
-                        Vector128<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap0, bitmap1);
-                        if (result != Vector128<byte>.Zero)
-                        {
-                            return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                        }
+            Vector128<byte> bitmap0 = bitmapsRef._lower._lower; // Lower 128 bits of the first bitmap.
+            Vector128<byte> bitmap1 = bitmapsRef._upper._lower; // Lower 128 bits of the second bitmap.
+
+            if (!Avx2.IsSupported && searchSpaceLength > Vector128<byte>.Count)
+            {
+                // Walk forward through the input 16 bytes at a time.
+                // When the length is an exact multiple of 16, the final 16 bytes are deliberately not consumed here;
+                // the fallback below covers them instead. That is why the length check above
+                // uses ">" rather than ">=", and why the loop tests "IsAddressLessThan" rather than "!IsAddressGreaterThan".
+                ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
 
-                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128<byte>.Count);
+                do
+                {
+                    Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+
+                    Vector128<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap0, bitmap1);
+                    if (result != Vector128<byte>.Zero)
+                    {
+                        return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                     }
-                    while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
+
+                    currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128<byte>.Count);
                 }
+                while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
             }
 
             // We have 1-16 bytes remaining. Process the first and last half vectors in the search space.
@@ -702,86 +707,87 @@ namespace System.Buffers
             return -1;
         }
 
-        internal static int LastIndexOfAnyVectorized<TNegator>(ref byte searchSpace, int searchSpaceLength, Vector128<byte> bitmap0, Vector128<byte> bitmap1)
+        internal static int LastIndexOfAnyVectorizedAnyByte<TNegator>(ref byte searchSpace, int searchSpaceLength, ref Vector512<byte> bitmapsRef)
             where TNegator : struct, INegator
         {
             ref byte currentSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
 
-            if (searchSpaceLength > Vector128<byte>.Count)
+            if (Avx2.IsSupported && searchSpaceLength > Vector128<byte>.Count)
             {
-                if (Avx2.IsSupported)
-                {
-                    Vector256<byte> bitmap256_0 = Vector256.Create(bitmap0, bitmap0);
-                    Vector256<byte> bitmap256_1 = Vector256.Create(bitmap1, bitmap1);
-
-                    if (searchSpaceLength > Vector256<byte>.Count)
-                    {
-                        // Process the input in chunks of 32 bytes.
-                        // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
-                        // Let the fallback below handle it instead. This is why the condition is
-                        // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
-                        ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256<byte>.Count);
-
-                        do
-                        {
-                            currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256<byte>.Count);
-
-                            Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
+                Vector256<byte> bitmap256_0 = bitmapsRef._lower;
+                Vector256<byte> bitmap256_1 = bitmapsRef._upper;
 
-                            Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
-                            if (result != Vector256<byte>.Zero)
-                            {
-                                return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                            }
-                        }
-                        while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
-                    }
+                if (searchSpaceLength > Vector256<byte>.Count)
+                {
+                    // Process the input in chunks of 32 bytes.
+                    // If the input length is a multiple of 32, don't consume the last 32 bytes in this loop.
+                    // Let the fallback below handle it instead. This is why the condition is
+                    // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                    ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256<byte>.Count);
 
-                    // We have 1-32 bytes remaining. Process the first and last half vectors in the search space.
-                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
-                    Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
+                    do
                     {
-                        ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
-
-                        ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart)
-                            ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count)
-                            : ref searchSpace;
+                        currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256<byte>.Count);
 
-                        Vector128<byte> source0 = Vector128.LoadUnsafe(ref searchSpace);
-                        Vector128<byte> source1 = Vector128.LoadUnsafe(ref secondVector);
-                        Vector256<byte> source = Vector256.Create(source0, source1);
+                        Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
 
                         Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
                         if (result != Vector256<byte>.Zero)
                         {
-                            return ComputeLastIndexOverlapped<byte, TNegator>(ref searchSpace, ref secondVector, result);
+                            return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                         }
                     }
-
-                    return -1;
+                    while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
                 }
-                else
+
+                // We have 1-32 bytes remaining. Process the first and last half vectors in the search space.
+                // They may overlap, but we'll handle that in the index calculation if we do get a match.
+                Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
                 {
-                    // Process the input in chunks of 16 bytes.
-                    // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                    // Let the fallback below handle it instead. This is why the condition is
-                    // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
-                    ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
+                    ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
 
-                    do
+                    ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart)
+                        ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count)
+                        : ref searchSpace;
+
+                    Vector128<byte> source0 = Vector128.LoadUnsafe(ref searchSpace);
+                    Vector128<byte> source1 = Vector128.LoadUnsafe(ref secondVector);
+                    Vector256<byte> source = Vector256.Create(source0, source1);
+
+                    Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
+                    if (result != Vector256<byte>.Zero)
                     {
-                        currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count);
+                        return ComputeLastIndexOverlapped<byte, TNegator>(ref searchSpace, ref secondVector, result);
+                    }
+                }
 
-                        Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+                return -1;
+            }
 
-                        Vector128<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap0, bitmap1);
-                        if (result != Vector128<byte>.Zero)
-                        {
-                            return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
-                        }
+            Vector128<byte> bitmap0 = bitmapsRef._lower._lower;
+            Vector128<byte> bitmap1 = bitmapsRef._upper._lower;
+
+            if (!Avx2.IsSupported && searchSpaceLength > Vector128<byte>.Count)
+            {
+                // Process the input in chunks of 16 bytes.
+                // If the input length is a multiple of 16, don't consume the last 16 bytes in this loop.
+                // Let the fallback below handle it instead. This is why the condition is
+                // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
+
+                do
+                {
+                    currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count);
+
+                    Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+
+                    Vector128<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap0, bitmap1);
+                    if (result != Vector128<byte>.Zero)
+                    {
+                        return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
                     }
-                    while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
                 }
+                while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
             }
 
             // We have 1-16 bytes remaining. Process the first and last half vectors in the search space.
index c53ff57..9070e51 100644 (file)
@@ -112,7 +112,7 @@ namespace System.Buffers
             // IndexOfAnyAsciiSearcher for chars is slower than Any3CharSearchValues, but faster than Any4SearchValues
             if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && maxInclusive < 128)
             {
-                IndexOfAnyAsciiSearcher.ComputeBitmap(values, out Vector128<byte> bitmap, out BitVector256 lookup);
+                IndexOfAnyAsciiSearcher.ComputeBitmap(values, out Vector256<byte> bitmap, out BitVector256 lookup);
 
                 return (Ssse3.IsSupported || PackedSimd.IsSupported) && lookup.Contains(0)
                     ? new AsciiCharSearchValues<IndexOfAnyAsciiSearcher.Ssse3AndWasmHandleZeroInNeedle>(bitmap, lookup)