[json] Use S.R.I vectors in JsonReaderHelper (#81758)
author Radek Doulik <radek.doulik@gmail.com>
Wed, 8 Feb 2023 12:20:29 +0000 (13:20 +0100)
committer GitHub <noreply@github.com>
Wed, 8 Feb 2023 12:20:29 +0000 (13:20 +0100)
* [json] Use System.Runtime.Intrinsics (S.R.I) vectors in JsonReaderHelper

wasm performance improvement

| measurement | new simd | old simd | nosimd |
|-:|-:|-:|-:|
|       Json, non-ASCII text deserialize |     0.4234ms |     0.4280ms |     0.4234ms |
|                Json, small deserialize |     0.0376ms |     0.0402ms |     0.0397ms |
|                Json, large deserialize |    10.4231ms |    11.1614ms |    11.0021ms |

microbenchmarks perf

    --filter *Text.Json*Read*
    summary:
    better: 170, geomean: 1.120
    worse: 15, geomean: 1.080
    total diff: 185

* Use System.Runtime.Intrinsics (S.R.I) vectors only on net7.0 and later

src/libraries/System.Text.Json/src/System.Text.Json.csproj
src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.cs
src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.sn.cs [new file with mode: 0644]
src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.sri.cs [new file with mode: 0644]

index 1402d29..16003dc 100644 (file)
@@ -349,6 +349,11 @@ The System.Text.Json library is built-in as part of the shared framework in .NET
   <ItemGroup Condition="!$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net7.0'))">
     <Compile Include="$(CoreLibSharedDir)System\Diagnostics\CodeAnalysis\StringSyntaxAttribute.cs" />
     <Compile Include="$(CoreLibSharedDir)System\Diagnostics\CodeAnalysis\RequiresDynamicCodeAttribute.cs" />
+    <Compile Include="System\Text\Json\Reader\JsonReaderHelper.sn.cs" />
+  </ItemGroup>
+
+  <ItemGroup Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net7.0'))">
+    <Compile Include="System\Text\Json\Reader\JsonReaderHelper.sri.cs" />
   </ItemGroup>
 
   <!-- Application tfms (.NETCoreApp, .NETFramework) need to use the same or higher version of .NETStandard's dependencies. -->
@@ -366,6 +371,7 @@ The System.Text.Json library is built-in as part of the shared framework in .NET
     <Reference Include="System.Reflection.Primitives" />
     <Reference Include="System.Runtime" />
     <Reference Include="System.Runtime.InteropServices" />
+    <Reference Include="System.Runtime.Intrinsics" />
     <Reference Include="System.Runtime.Loader" />
     <Reference Include="System.Text.Encoding.Extensions" />
     <Reference Include="System.Threading" />
index 03d89f9..7172be8 100644 (file)
@@ -3,7 +3,6 @@
 
 using System.Buffers.Text;
 using System.Diagnostics;
-using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 
@@ -84,177 +83,6 @@ namespace System.Text.Json
                     span.Length);
         }
 
-        private static unsafe int IndexOfOrLessThan(ref byte searchSpace, byte value0, byte value1, byte lessThan, int length)
-        {
-            Debug.Assert(length >= 0);
-
-            uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions
-            uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions
-            uint uLessThan = lessThan; // Use uint for comparisons to avoid unnecessary 8->32 extensions
-            IntPtr index = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations
-            IntPtr nLength = (IntPtr)length;
-
-            if (Vector.IsHardwareAccelerated && length >= Vector<byte>.Count * 2)
-            {
-                int unaligned = (int)Unsafe.AsPointer(ref searchSpace) & (Vector<byte>.Count - 1);
-                nLength = (IntPtr)((Vector<byte>.Count - unaligned) & (Vector<byte>.Count - 1));
-            }
-        SequentialScan:
-            uint lookUp;
-            while ((byte*)nLength >= (byte*)8)
-            {
-                nLength -= 8;
-
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 1);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found1;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 2);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found2;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 3);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found3;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 4);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found4;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 5);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found5;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 6);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found6;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 7);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found7;
-
-                index += 8;
-            }
-
-            if ((byte*)nLength >= (byte*)4)
-            {
-                nLength -= 4;
-
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 1);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found1;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 2);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found2;
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 3);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found3;
-
-                index += 4;
-            }
-
-            while ((byte*)nLength > (byte*)0)
-            {
-                nLength -= 1;
-
-                lookUp = Unsafe.AddByteOffset(ref searchSpace, index);
-                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
-                    goto Found;
-
-                index += 1;
-            }
-
-            if (Vector.IsHardwareAccelerated && ((int)(byte*)index < length))
-            {
-                nLength = (IntPtr)((length - (int)(byte*)index) & ~(Vector<byte>.Count - 1));
-
-                // Get comparison Vector
-                Vector<byte> values0 = new Vector<byte>(value0);
-                Vector<byte> values1 = new Vector<byte>(value1);
-                Vector<byte> valuesLessThan = new Vector<byte>(lessThan);
-
-                while ((byte*)nLength > (byte*)index)
-                {
-                    Vector<byte> vData = Unsafe.ReadUnaligned<Vector<byte>>(ref Unsafe.AddByteOffset(ref searchSpace, index));
-
-                    var vMatches = Vector.BitwiseOr(
-                                    Vector.BitwiseOr(
-                                        Vector.Equals(vData, values0),
-                                        Vector.Equals(vData, values1)),
-                                    Vector.LessThan(vData, valuesLessThan));
-
-                    if (Vector<byte>.Zero.Equals(vMatches))
-                    {
-                        index += Vector<byte>.Count;
-                        continue;
-                    }
-                    // Find offset of first match
-                    return (int)(byte*)index + LocateFirstFoundByte(vMatches);
-                }
-
-                if ((int)(byte*)index < length)
-                {
-                    nLength = (IntPtr)(length - (int)(byte*)index);
-                    goto SequentialScan;
-                }
-            }
-            return -1;
-        Found: // Workaround for https://github.com/dotnet/runtime/issues/8795
-            return (int)(byte*)index;
-        Found1:
-            return (int)(byte*)(index + 1);
-        Found2:
-            return (int)(byte*)(index + 2);
-        Found3:
-            return (int)(byte*)(index + 3);
-        Found4:
-            return (int)(byte*)(index + 4);
-        Found5:
-            return (int)(byte*)(index + 5);
-        Found6:
-            return (int)(byte*)(index + 6);
-        Found7:
-            return (int)(byte*)(index + 7);
-        }
-
-        // Vector sub-search adapted from https://github.com/aspnet/KestrelHttpServer/pull/1138
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int LocateFirstFoundByte(Vector<byte> match)
-        {
-            var vector64 = Vector.AsVectorUInt64(match);
-            ulong candidate = 0;
-            int i = 0;
-            // Pattern unrolled by jit https://github.com/dotnet/coreclr/pull/8001
-            for (; i < Vector<ulong>.Count; i++)
-            {
-                candidate = vector64[i];
-                if (candidate != 0)
-                {
-                    break;
-                }
-            }
-
-            // Single LEA instruction with jitted const (using function result)
-            return i * 8 + LocateFirstFoundByte(candidate);
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int LocateFirstFoundByte(ulong match)
-        {
-            // Flag least significant power of two bit
-            var powerOfTwoFlag = match ^ (match - 1);
-            // Shift all powers of two into the high byte and extract
-            return (int)((powerOfTwoFlag * XorPowerOfTwoToHighByte) >> 57);
-        }
-
-        private const ulong XorPowerOfTwoToHighByte = (0x07ul |
-                                               0x06ul << 8 |
-                                               0x05ul << 16 |
-                                               0x04ul << 24 |
-                                               0x03ul << 32 |
-                                               0x02ul << 40 |
-                                               0x01ul << 48) + 1;
-
         public static bool TryGetEscapedDateTime(ReadOnlySpan<byte> source, out DateTime value)
         {
             Debug.Assert(source.Length <= JsonConstants.MaximumEscapedDateTimeOffsetParseLength);
diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.sn.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.sn.cs
new file mode 100644 (file)
index 0000000..59728a4
--- /dev/null
@@ -0,0 +1,183 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+
+namespace System.Text.Json
+{
+    internal static partial class JsonReaderHelper
+    {
+        private static unsafe int IndexOfOrLessThan(ref byte searchSpace, byte value0, byte value1, byte lessThan, int length)
+        {
+            Debug.Assert(length >= 0);
+
+            uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions
+            uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions
+            uint uLessThan = lessThan; // Use uint for comparisons to avoid unnecessary 8->32 extensions
+            IntPtr index = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations
+            IntPtr nLength = (IntPtr)length;
+
+            if (Vector.IsHardwareAccelerated && length >= Vector<byte>.Count * 2)
+            {
+                int unaligned = (int)Unsafe.AsPointer(ref searchSpace) & (Vector<byte>.Count - 1);
+                nLength = (IntPtr)((Vector<byte>.Count - unaligned) & (Vector<byte>.Count - 1));
+            }
+        SequentialScan:
+            uint lookUp;
+            while ((byte*)nLength >= (byte*)8)
+            {
+                nLength -= 8;
+
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 1);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found1;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 2);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found2;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 3);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found3;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 4);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found4;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 5);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found5;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 6);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found6;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 7);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found7;
+
+                index += 8;
+            }
+
+            if ((byte*)nLength >= (byte*)4)
+            {
+                nLength -= 4;
+
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 1);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found1;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 2);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found2;
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 3);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found3;
+
+                index += 4;
+            }
+
+            while ((byte*)nLength > (byte*)0)
+            {
+                nLength -= 1;
+
+                lookUp = Unsafe.AddByteOffset(ref searchSpace, index);
+                if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                    goto Found;
+
+                index += 1;
+            }
+
+            if (Vector.IsHardwareAccelerated && ((int)(byte*)index < length))
+            {
+                nLength = (IntPtr)((length - (int)(byte*)index) & ~(Vector<byte>.Count - 1));
+
+                // Get comparison Vector
+                Vector<byte> values0 = new Vector<byte>(value0);
+                Vector<byte> values1 = new Vector<byte>(value1);
+                Vector<byte> valuesLessThan = new Vector<byte>(lessThan);
+
+                while ((byte*)nLength > (byte*)index)
+                {
+                    Vector<byte> vData = Unsafe.ReadUnaligned<Vector<byte>>(ref Unsafe.AddByteOffset(ref searchSpace, index));
+
+                    var vMatches = Vector.BitwiseOr(
+                                    Vector.BitwiseOr(
+                                        Vector.Equals(vData, values0),
+                                        Vector.Equals(vData, values1)),
+                                    Vector.LessThan(vData, valuesLessThan));
+
+                    if (Vector<byte>.Zero.Equals(vMatches))
+                    {
+                        index += Vector<byte>.Count;
+                        continue;
+                    }
+                    // Find offset of first match
+                    return (int)(byte*)index + LocateFirstFoundByte(vMatches);
+                }
+
+                if ((int)(byte*)index < length)
+                {
+                    nLength = (IntPtr)(length - (int)(byte*)index);
+                    goto SequentialScan;
+                }
+            }
+            return -1;
+        Found: // Workaround for https://github.com/dotnet/runtime/issues/8795
+            return (int)(byte*)index;
+        Found1:
+            return (int)(byte*)(index + 1);
+        Found2:
+            return (int)(byte*)(index + 2);
+        Found3:
+            return (int)(byte*)(index + 3);
+        Found4:
+            return (int)(byte*)(index + 4);
+        Found5:
+            return (int)(byte*)(index + 5);
+        Found6:
+            return (int)(byte*)(index + 6);
+        Found7:
+            return (int)(byte*)(index + 7);
+        }
+
+        // Vector sub-search adapted from https://github.com/aspnet/KestrelHttpServer/pull/1138
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int LocateFirstFoundByte(Vector<byte> match)
+        {
+            var vector64 = Vector.AsVectorUInt64(match);
+            ulong candidate = 0;
+            int i = 0;
+            // Pattern unrolled by jit https://github.com/dotnet/coreclr/pull/8001
+            for (; i < Vector<ulong>.Count; i++)
+            {
+                candidate = vector64[i];
+                if (candidate != 0)
+                {
+                    break;
+                }
+            }
+
+            // Single LEA instruction with jitted const (using function result)
+            return i * 8 + LocateFirstFoundByte(candidate);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int LocateFirstFoundByte(ulong match)
+        {
+            // Flag least significant power of two bit
+            var powerOfTwoFlag = match ^ (match - 1);
+            // Shift all powers of two into the high byte and extract
+            return (int)((powerOfTwoFlag * XorPowerOfTwoToHighByte) >> 57);
+        }
+
+        private const ulong XorPowerOfTwoToHighByte = (0x07ul |
+                                               0x06ul << 8 |
+                                               0x05ul << 16 |
+                                               0x04ul << 24 |
+                                               0x03ul << 32 |
+                                               0x02ul << 40 |
+                                               0x01ul << 48) + 1;
+    }
+}
diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.sri.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.sri.cs
new file mode 100644 (file)
index 0000000..ed6abe7
--- /dev/null
@@ -0,0 +1,216 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.CompilerServices;
+
+namespace System.Text.Json
+{
+    internal static partial class JsonReaderHelper
+    {
+        private static unsafe int IndexOfOrLessThan(ref byte searchSpace, byte value0, byte value1, byte lessThan, int length)
+        {
+            Debug.Assert(length >= 0);
+
+            uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions
+            uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions
+            uint uLessThan = lessThan; // Use uint for comparisons to avoid unnecessary 8->32 extensions
+            IntPtr index = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations
+            IntPtr nLength = (IntPtr)length;
+
+            if (!Vector128.IsHardwareAccelerated || length < Vector128<byte>.Count)
+            {
+                uint lookUp;
+                while ((byte*)nLength >= (byte*)8)
+                {
+                    nLength -= 8;
+
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 1);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found1;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 2);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found2;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 3);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found3;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 4);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found4;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 5);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found5;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 6);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found6;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 7);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found7;
+
+                    index += 8;
+                }
+
+                if ((byte*)nLength >= (byte*)4)
+                {
+                    nLength -= 4;
+
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 1);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found1;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 2);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found2;
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index + 3);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found3;
+
+                    index += 4;
+                }
+
+                while ((byte*)nLength > (byte*)0)
+                {
+                    nLength -= 1;
+
+                    lookUp = Unsafe.AddByteOffset(ref searchSpace, index);
+                    if (uValue0 == lookUp || uValue1 == lookUp || uLessThan > lookUp)
+                        goto Found;
+
+                    index += 1;
+                }
+            }
+            else if (Vector256.IsHardwareAccelerated && length >= Vector256<byte>.Count)
+            {
+                // Get comparison Vectors
+                Vector256<byte> values0 = Vector256.Create(value0);
+                Vector256<byte> values1 = Vector256.Create(value1);
+                Vector256<byte> valuesLessThan = Vector256.Create(lessThan);
+
+                ref byte currentSearchSpace = ref searchSpace;
+                ref byte oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector256<byte>.Count);
+
+                // Loop until either we've finished all elements or there's less than a vector's-worth remaining.
+                do
+                {
+                    var vData = Vector256.LoadUnsafe(ref currentSearchSpace);
+                    var vMatches = Vector256.BitwiseOr(
+                                    Vector256.BitwiseOr(
+                                        Vector256.Equals(vData, values0),
+                                        Vector256.Equals(vData, values1)),
+                                    Vector256.LessThan(vData, valuesLessThan));
+
+                    if (vMatches == Vector256<byte>.Zero)
+                    {
+                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<byte>.Count);
+                        continue;
+                    }
+
+                    return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, vMatches);
+                }
+                while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd));
+
+                // If any elements remain, process the last vector in the search space.
+                if ((uint)length % Vector256<byte>.Count != 0)
+                {
+                    var vData = Vector256.LoadUnsafe(ref oneVectorAwayFromEnd);
+                    var vMatches = Vector256.BitwiseOr(
+                                    Vector256.BitwiseOr(
+                                        Vector256.Equals(vData, values0),
+                                        Vector256.Equals(vData, values1)),
+                                    Vector256.LessThan(vData, valuesLessThan));
+
+                    if (vMatches != Vector256<byte>.Zero)
+                    {
+                        return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, vMatches);
+                    }
+                }
+            }
+            else
+            {
+                // Get comparison Vectors
+                Vector128<byte> values0 = Vector128.Create(value0);
+                Vector128<byte> values1 = Vector128.Create(value1);
+                Vector128<byte> valuesLessThan = Vector128.Create(lessThan);
+
+                ref byte currentSearchSpace = ref searchSpace;
+                ref byte oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector128<byte>.Count);
+
+                // Loop until either we've finished all elements or there's less than a vector's-worth remaining.
+                do
+                {
+                    var vData = Vector128.LoadUnsafe(ref currentSearchSpace);
+                    var vMatches = Vector128.BitwiseOr(
+                                    Vector128.BitwiseOr(
+                                        Vector128.Equals(vData, values0),
+                                        Vector128.Equals(vData, values1)),
+                                    Vector128.LessThan(vData, valuesLessThan));
+
+                    if (vMatches == Vector128<byte>.Zero)
+                    {
+                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128<byte>.Count);
+                        continue;
+                    }
+
+                    return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, vMatches);
+                }
+                while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd));
+
+                // If any elements remain, process the last vector in the search space.
+                if ((uint)length % Vector128<byte>.Count != 0)
+                {
+                    var vData = Vector128.LoadUnsafe(ref oneVectorAwayFromEnd);
+                    var vMatches = Vector128.BitwiseOr(
+                                    Vector128.BitwiseOr(
+                                        Vector128.Equals(vData, values0),
+                                        Vector128.Equals(vData, values1)),
+                                    Vector128.LessThan(vData, valuesLessThan));
+
+                    if (vMatches != Vector128<byte>.Zero)
+                    {
+                        return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, vMatches);
+                    }
+                }
+            }
+            return -1;
+        Found: // Workaround for https://github.com/dotnet/runtime/issues/8795
+            return (int)(byte*)index;
+        Found1:
+            return (int)(byte*)(index + 1);
+        Found2:
+            return (int)(byte*)(index + 2);
+        Found3:
+            return (int)(byte*)(index + 3);
+        Found4:
+            return (int)(byte*)(index + 4);
+        Found5:
+            return (int)(byte*)(index + 5);
+        Found6:
+            return (int)(byte*)(index + 6);
+        Found7:
+            return (int)(byte*)(index + 7);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe int ComputeFirstIndex(ref byte searchSpace, ref byte current, Vector256<byte> equals)
+        {
+            uint notEqualsElements = equals.ExtractMostSignificantBits();
+            int index = BitOperations.TrailingZeroCount(notEqualsElements);
+            return index + (int)(Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(byte));
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe int ComputeFirstIndex(ref byte searchSpace, ref byte current, Vector128<byte> equals)
+        {
+            uint notEqualsElements = equals.ExtractMostSignificantBits();
+            int index = BitOperations.TrailingZeroCount(notEqualsElements);
+            return index + (int)(Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(byte));
+        }
+    }
+}