From 442a42147ef23c3b9742abcd8b997e8f472af68a Mon Sep 17 00:00:00 2001
From: Egor Bogatov <egorbo@gmail.com>
Date: Tue, 25 Jan 2022 21:54:35 +0300
Subject: [PATCH] Faster IndexOf for substrings (#63285)

* Improve "lastChar == firstChar" case, also, use IndexOf directly if value.Length == 1

* Try plain IndexOf first, to optimize cases where even first char of value is never met

* add 1-byte implementation

* copyrights

* fix copy-paste mistake

* Initial LastIndexOf impl

* More efficient LastIndexOf

* fix bug in Char version (we need two clear two lowest bits in the mask) & temporarily remove AdvSimd impl

* use ResetLowestSetBit

* Fix bug

* Add two-byte LastIndexOf

* Fix build

* Minor optimizations

* optimize cases with two-byte/two-char values

* Remove gotos, fix build

* fix bug in LastIndexOf

* Make sure String.LastIndexOf is optimized

* Use xplat simd helpers - implicit ARM support

* fix arm

* Delete \

* Use Vector128.IsHardwareAccelerated

* Fix build

* Use IsAllZero

* Address feedback

* Address feedback

* micro-optimization, do-while is better here since mask is guaranteed to be non-zero

* Address feedabc

* Use clever trick I borrowed from IndexOfAny for trailing elements

* give up on +1 bump for SequenceCompare

* Clean up

* Clean up

* fix build

* Add debug asserts

* Clean up: give up on the unrolled trick - too little value from code bloat

* Add a test

* Fix build

* Add byte-specific test

* Fix build

* Update IndexOfSequence.byte.cs
---
 THIRD-PARTY-NOTICES.TXT                            |  29 ++
 .../tests/Span/IndexOfSequence.byte.cs             | 101 +++++++
 .../tests/Span/IndexOfSequence.char.cs             | 101 +++++++
 .../src/System/MemoryExtensions.cs                 |  25 +-
 .../src/System/Numerics/BitOperations.cs           |  20 ++
 .../src/System/SpanHelpers.Byte.cs                 | 310 ++++++++++++++++++---
 .../src/System/SpanHelpers.Char.cs                 | 299 +++++++++++++++++++-
 .../src/System/SpanHelpers.T.cs                    |  10 +-
 8 files changed, 833 insertions(+), 62 deletions(-)

diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT
index e38f6ef..55329a8 100644
--- a/THIRD-PARTY-NOTICES.TXT
+++ b/THIRD-PARTY-NOTICES.TXT
@@ -697,6 +697,35 @@ License for fastmod (https://github.com/lemire/fastmod) and ibm-fpgen (https://g
    See the License for the specific language governing permissions and
    limitations under the License.
 
+License for sse4-strstr (https://github.com/WojciechMula/sse4-strstr)
+--------------------------------------
+
+   Copyright (c) 2008-2016, Wojciech MuÅa
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 License notice for The C++ REST SDK
 -----------------------------------
 
diff --git a/src/libraries/System.Memory/tests/Span/IndexOfSequence.byte.cs b/src/libraries/System.Memory/tests/Span/IndexOfSequence.byte.cs
index 3325056..1a9027d 100644
--- a/src/libraries/System.Memory/tests/Span/IndexOfSequence.byte.cs
+++ b/src/libraries/System.Memory/tests/Span/IndexOfSequence.byte.cs
@@ -2,6 +2,8 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using Xunit;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
 
 namespace System.SpanTests
 {
@@ -115,5 +117,104 @@ namespace System.SpanTests
             int index = span.IndexOf(value);
             Assert.Equal(-1, index);
         }
+
+        public static IEnumerable<object[]> IndexOfSubSeqData_Byte()
+        {
+            // searchSpace, value, expected IndexOf value, expected LastIndexOf value
+            yield return new object[] { new byte[]{0,0,0,0,0},new byte[]{0,0,0}, 0, 2};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0},new byte[]{0,71,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0},new byte[]{0,0,0}, 0, 7};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0},new byte[]{0,71,0,1,0}, 10, 10};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0},new byte[]{0,0,0,1,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0},new byte[]{0,0,0,1,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0},new byte[]{0,0,0,1,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0},new byte[]{0,71,1,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0},new byte[]{0,0,1,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0},new byte[]{0,0,1,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0},new byte[]{0,0,1,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0},new byte[]{0,1,0,0,0}, 12, 12};
+            yield return new object[] { new byte[]{0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0},new byte[]{0,1,0,0,0}, 11, 11};
+            yield return new object[] { new byte[]{0,0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0},new byte[]{0,1,0,0,0}, 6, 6};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,0,0,1,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,0,0,1,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,0,0,0,1,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,0,0,0,0,1,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,0,0,0,0,1,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,0,0,0,0,1,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,1,0,0,1,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,1,0,0,0,0,0}, 5, 5};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,1,0,0,0,0,0}, 5, 5};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1},new byte[]{0,1,0,0,0,0,0}, 5, 5};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0}, 0, 44};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0}, 0, 43};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0}, 7, 42};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0}, 7, 41};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0}, 7, 11};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,0}, 7, 10};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,0,0}, 7, 9};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,0,0,0,0}, 7, 7};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,1,0,0}, 5, 48};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,1,0}, 44, 44};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,1,0,0,0,0}, 5, 19};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,1,0,0,0,1,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0}, 7, 11};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,1,0,0,1,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,1,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,1,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,1,0,0,0,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{0,0,0,0,71,0,1,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,1,1,0,2,0,1,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0},new byte[]{0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0}, -1, -1};
+            yield return new object[] { new byte[]{159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133},new byte[]{159,133,159,133,159,133}, 0, 22};
+            yield return new object[] { new byte[]{159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133},new byte[]{159,133,255,159,133}, -1, -1};
+            yield return new object[] { new byte[]{159,133,159,133,159,133,159,133,159,133,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,160,86,160,86,160,86,160,86,160,80},new byte[]{160,86,160,86,160,86,160,86,160,80}, 40, 40};
+            yield return new object[] { new byte[]{159,133,159,133,159,133,159,133,159,133,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,159,127,160,86,160,86,160,86,160,86,160,80,160,80,160,80,160,80,160,80,160,80,160,80,160,86,160,86,160,86,160,86},new byte[]{160,86,160,86,160,86,160,86}, 40, 62};
+            yield return new object[] { new byte[]{159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133,159,133},new byte[]{0,0,0,1}, -1, -1};
+            yield return new object[] { new byte[]{255,160,82,159,134,159,127,255,159,141,160,85,160,82,160,88,255,159,141,159,127,159,134,255,160,88,160,85,160,82,159,141,159,127,159,134,160,88,160,85,160,82,159,141,255,159,127,159,134,160,88,160,85,160,82,159,141,159,127,159,134,160,88,159,141,160,85,255,160,82,159,134,159,141,159,134,160,85,160,82,159,141,159,127,159,134,160,82,159,141,160,85,159,134,255,160,88,159,127,160,82,160,85,159,134,255,159,141,159,127,159,134,160,85,159,141},new byte[]{255,159,141,159,127,159,134,255}, 16, 16};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49,50},new byte[]{49,49}, 29, 29};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49},new byte[]{49,49}, 29, 29};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49,50},new byte[]{49,49}, 29, 29};
+            yield return new object[] { new byte[]{49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,50},new byte[]{49,49}, -1, -1};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49,49},new byte[]{49,49,49}, 29, 29};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49,49,50},new byte[]{49,49,49}, 29, 29};
+            yield return new object[] { new byte[]{49,49,49,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,50},new byte[]{49,49,49}, 0, 1};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49,50},new byte[]{48,48}, -1, -1};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49},new byte[]{48,48}, -1, -1};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49,50},new byte[]{48,48}, -1, -1};
+            yield return new object[] { new byte[]{49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,50},new byte[]{48,48}, -1, -1};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49,49},new byte[]{48,48,48}, -1, -1};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49,49,50},new byte[]{48,48,48}, -1, -1};
+            yield return new object[] { new byte[]{49,49,49,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,50},new byte[]{48,48,48}, -1, -1};
+            yield return new object[] { new byte[]{48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,49,50},new byte[]{48,49,48,48}, -1, -1};
+            yield return new object[] { new byte[]{49,48,49,49,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,50},new byte[]{49,48,49,49}, 0, 0};
+            yield return new object[] { new byte[]{49,48,49,49,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,48,49,50},new byte[]{160,80,48,160,80,160,80}, -1, -1};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,48,49,49,49,49,49,49,49,49,49,49,49,49}, 0, 0};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,48,49,49,49,49,49,49,49,49,49,49,49,49}, 0, 15};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,48,49,49,49,49,49,49,49,49,49,49,49,49}, 0, 17};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,48,49,49,49,49,49,49,49,49,49,49,49,49}, 18, 32};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,49,49,49,49,49,49,49,49,49,49,49,49}, 20, 20};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49}, 0, 0};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49}, 0, 0};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49}, -1, -1};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49}, 0, 0};
+            yield return new object[] { new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49},new byte[]{49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49}, 0, 0};
+            yield return new object[] { new byte[]{71,71,71,71,71,71,71,71,71,71,71,71,71,71,49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71},new byte[]{71,71,71,71,71,71,71,71,71,71,71,71,71,71,71}, 60, 60};
+            yield return new object[] { new byte[]{71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,49,48,49,49,49,49,49,49,71,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,49,48,49,49,49,49,49,49,49,49,49,49,49,49,71,71,71,71,71,71,71,71,71,71,71,71,71,71},new byte[]{71,71,71,71,71,71,71,71,71,71,71,71,71,71,71}, 0, 0};
+        }
+
+        [Theory]
+        [MemberData(nameof(IndexOfSubSeqData_Byte))]
+        public static void ValueStartsAndEndsWithTheSameBytes(byte[] searchSpace, byte[] value, int expectedIndexOfValue, int expectedLastIndexOfValue)
+        {
+            Assert.Equal(expectedIndexOfValue, searchSpace.AsSpan().IndexOf(value));
+            Assert.Equal(expectedLastIndexOfValue, searchSpace.AsSpan().LastIndexOf(value));
+        }
     }
 }
diff --git a/src/libraries/System.Memory/tests/Span/IndexOfSequence.char.cs b/src/libraries/System.Memory/tests/Span/IndexOfSequence.char.cs
index bda6261..b341a79 100644
--- a/src/libraries/System.Memory/tests/Span/IndexOfSequence.char.cs
+++ b/src/libraries/System.Memory/tests/Span/IndexOfSequence.char.cs
@@ -2,6 +2,8 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using Xunit;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
 
 namespace System.SpanTests
 {
@@ -115,5 +117,104 @@ namespace System.SpanTests
             int index = span.IndexOf(value);
             Assert.Equal(-1, index);
         }
+
+        public static IEnumerable<object[]> IndexOfSubSeqData_Char()
+        {
+            // searchSpace, value, expected IndexOf value, expected LastIndexOf value
+            yield return new object[] { "11111", "111", 0, 2 };
+            yield return new object[] { "1111111111", "1x1", -1, -1 };
+            yield return new object[] { "1111111111", "111", 0, 7 };
+            yield return new object[] { "11111111111x12111", "1x121", 10, 10 };
+            yield return new object[] { "11111111111x12111", "11121", -1, -1 };
+            yield return new object[] { "1111111111x121111", "11121", -1, -1 };
+            yield return new object[] { "11111x12111111111", "11121", -1, -1 };
+            yield return new object[] { "11111111111x12111", "1x211", -1, -1 };
+            yield return new object[] { "11111111111x12111", "11211", -1, -1 };
+            yield return new object[] { "1111111111x121111", "11211", -1, -1 };
+            yield return new object[] { "11111x12111111111", "11211", -1, -1 };
+            yield return new object[] { "11111111111x12111", "12111", 12, 12 };
+            yield return new object[] { "1111111111x121111", "12111", 11, 11 };
+            yield return new object[] { "11111x12111111111", "12111", 6, 6 };
+            yield return new object[] { "1111x1211111111111x12", "11121", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12", "11121", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12", "111121", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12", "1111121", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12", "1111121", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12", "1111121", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12", "1211211", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12", "1211111", 5, 5 };
+            yield return new object[] { "1111x1211111111111x12", "1211111", 5, 5 };
+            yield return new object[] { "1111x1211111111111x12", "1211111", 5, 5 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "111", 0, 44 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "1111", 0, 43 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "11111", 7, 42 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "111111", 7, 41 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "1111111", 7, 11 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "11111111", 7, 10 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "111111111", 7, 9 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "11111111111", 7, 7 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "111111111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "1111111111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "11111111111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "111111111111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "11111111111111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "111111111111111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "1211", 5, 48 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "11121", 44, 44 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "121111", 5, 19 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "12111211", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "1111111", 7, 11 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "1121121111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "1111211111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "111111211111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "1121111111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "11122111112111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "1111111211111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "111211111111111111", -1, -1 };
+            yield return new object[] { "1111x1211111111111x12111122131221221211221111112121121", "11111211111121111111", -1, -1 };
+            yield return new object[] { "Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶", "Ð¶Ð¶Ð¶", 0, 11 };
+            yield return new object[] { "Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶", "Ð¶0Ð¶", -1, -1 };
+            yield return new object[] { "Ð¶Ð¶Ð¶Ð¶Ð¶Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°ÑÑÑÑÑ", "ÑÑÑÑÑ", 20, 20 };
+            yield return new object[] { "Ð¶Ð¶Ð¶Ð¶Ð¶Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°Ð°ÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑ", "ÑÑÑÑ", 20, 31 };
+            yield return new object[] { "Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶Ð¶", "1112", -1, -1 };
+            yield return new object[] { "0ÑÐ·Ð°0Ð¾ÑÑÑ0Ð¾Ð°Ð·0ÑÑÑÐ¾Ð°Ð·ÑÑÑÐ¾0Ð°Ð·ÑÑÑÐ¾Ð°Ð·ÑÐ¾Ñ0ÑÐ·Ð¾Ð·ÑÑÐ¾Ð°Ð·ÑÐ¾ÑÐ·0ÑÐ°ÑÑÐ·0Ð¾Ð°Ð·ÑÐ¾", "0Ð¾Ð°Ð·0", 9, 9 };
+            yield return new object[] { "abababababababababababababababbc", "bb", 29, 29 };
+            yield return new object[] { "abababababababababababababababb", "bb", 29, 29 };
+            yield return new object[] { "abababababababababababababababbc", "bb", 29, 29 };
+            yield return new object[] { "babababababababababababababababc", "bb", -1, -1 };
+            yield return new object[] { "abababababababababababababababbb", "bbb", 29, 29 };
+            yield return new object[] { "abababababababababababababababbbc", "bbb", 29, 29 };
+            yield return new object[] { "bbbbabababababababababababababababc", "bbb", 0, 1 };
+            yield return new object[] { "abababababababababababababababbc", "aa", -1, -1 };
+            yield return new object[] { "abababababababababababababababb", "aa", -1, -1 };
+            yield return new object[] { "abababababababababababababababbc", "aa", -1, -1 };
+            yield return new object[] { "babababababababababababababababc", "aa", -1, -1 };
+            yield return new object[] { "abababababababababababababababbb", "aaa", -1, -1 };
+            yield return new object[] { "abababababababababababababababbbc", "aaa", -1, -1 };
+            yield return new object[] { "bbbbabababababababababababababababc", "aaa", -1, -1 };
+            yield return new object[] { "ababababababababababababababababbc", "abaa", -1, -1 };
+            yield return new object[] { "babbbabababababababababababababababc", "babb", 0, 0 };
+            yield return new object[] { "babbbabababababababababababababababc", "ÑaÑÑ", -1, -1 };
+            yield return new object[] { "babbbbbbbbbbbbb", "babbbbbbbbbbbb", 0, 0 };
+            yield return new object[] { "babbbbbbbbbbbbbbabbbbbbbbbbbb", "babbbbbbbbbbbb", 0, 15 };
+            yield return new object[] { "babbbbbbbbbbbbbbbbabbbbbbbbbbbb", "babbbbbbbbbbbb", 0, 17 };
+            yield return new object[] { "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbb", "babbbbbbbbbbbb", 18, 32 };
+            yield return new object[] { "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbb", "bbbbbbbbbbbbb", 20, 20 };
+            yield return new object[] { "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbb", "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbb", 0, 0 };
+            yield return new object[] { "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbb", "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbb", 0, 0 };
+            yield return new object[] { "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbb", "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbbb", -1, -1 };
+            yield return new object[] { "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbb", "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbb", 0, 0 };
+            yield return new object[] { "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbb", "babbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbb", 0, 0 };
+            yield return new object[] { "xxxxxxxxxxxxxxbabbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbbxxxxxxxxxxxxxxx", "xxxxxxxxxxxxxxx", 60, 60 };
+            yield return new object[] { "xxxxxxxxxxxxxxxbabbbbbbxbbbbbbbbbbabbbbbbbbbbbbbabbbbbbbbbbbbxxxxxxxxxxxxxx", "xxxxxxxxxxxxxxx", 0, 0 };
+        }
+
+        [Theory]
+        [MemberData(nameof(IndexOfSubSeqData_Char))]
+        public static void ValueStartsAndEndsWithTheSameChars(string searchSpace, string value, int expectedIndexOfValue, int expectedLastIndexOfValue)
+        {
+            Assert.Equal(expectedIndexOfValue, searchSpace.AsSpan().IndexOf(value));
+            Assert.Equal(expectedLastIndexOfValue, searchSpace.AsSpan().LastIndexOf(value));
+        }
     }
 }
diff --git a/src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs b/src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs
index b49dea6..3c7c13f 100644
--- a/src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs
@@ -580,12 +580,25 @@ namespace System
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static int LastIndexOf<T>(this ReadOnlySpan<T> span, ReadOnlySpan<T> value) where T : IEquatable<T>
         {
-            if (Unsafe.SizeOf<T>() == sizeof(byte) && RuntimeHelpers.IsBitwiseEquatable<T>())
-                return SpanHelpers.LastIndexOf(
-                    ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(span)),
-                    span.Length,
-                    ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(value)),
-                    value.Length);
+            if (RuntimeHelpers.IsBitwiseEquatable<T>())
+            {
+                if (Unsafe.SizeOf<T>() == sizeof(byte))
+                {
+                    return SpanHelpers.LastIndexOf(
+                        ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(span)),
+                        span.Length,
+                        ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(value)),
+                        value.Length);
+                }
+                if (Unsafe.SizeOf<T>() == sizeof(char))
+                {
+                    return SpanHelpers.LastIndexOf(
+                        ref Unsafe.As<T, char>(ref MemoryMarshal.GetReference(span)),
+                        span.Length,
+                        ref Unsafe.As<T, char>(ref MemoryMarshal.GetReference(value)),
+                        value.Length);
+                }
+            }
 
             return SpanHelpers.LastIndexOf<T>(ref MemoryMarshal.GetReference(span), span.Length, ref MemoryMarshal.GetReference(value), value.Length);
         }
diff --git a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs
index 6a7141c..419a1c9 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs
@@ -708,5 +708,25 @@ namespace System.Numerics
             return (nuint)RotateRight((uint)value, offset);
 #endif
         }
+
+        /// <summary>
+        /// Reset the lowest significant bit in the given value
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static uint ResetLowestSetBit(uint value)
+        {
+            // It's lowered to BLSR on x86
+            return value & (value - 1);
+        }
+
+        /// <summary>
+        /// Reset specific bit in the given value
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static uint ResetBit(uint value, int bitPos)
+        {
+            // TODO: Recognize BTR on x86 and LSL+BIC on ARM
+            return value & ~(uint)(1 << bitPos);
+        }
     }
 }
diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
index e94b1df..3a3bc58 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
@@ -22,12 +22,21 @@ namespace System
             if (valueLength == 0)
                 return 0;  // A zero-length sequence is always treated as "found" at the start of the search space.
 
-            byte valueHead = value;
-            ref byte valueTail = ref Unsafe.Add(ref value, 1);
             int valueTailLength = valueLength - 1;
-            int remainingSearchSpaceLength = searchSpaceLength - valueTailLength;
+            if (valueTailLength == 0)
+                return IndexOf(ref searchSpace, value, searchSpaceLength); // for single-byte values use plain IndexOf
 
             int offset = 0;
+            byte valueHead = value;
+            int searchSpaceMinusValueTailLength = searchSpaceLength - valueTailLength;
+            if (Vector128.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector128<byte>.Count)
+            {
+                goto SEARCH_TWO_BYTES;
+            }
+
+            ref byte valueTail = ref Unsafe.Add(ref value, 1);
+            int remainingSearchSpaceLength = searchSpaceMinusValueTailLength;
+
             while (remainingSearchSpaceLength > 0)
             {
                 // Do a quick search for the first element of "value".
@@ -42,13 +51,264 @@ namespace System
                     break;  // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there.
 
                 // Found the first element of "value". See if the tail matches.
-                if (SequenceEqual(ref Unsafe.Add(ref searchSpace, offset + 1), ref valueTail, (nuint)valueTailLength))  // The (nuint)-cast is necessary to pick the correct overload
+                if (SequenceEqual(
+                        ref Unsafe.Add(ref searchSpace, offset + 1),
+                        ref valueTail, (nuint)(uint)valueTailLength))  // The (nuint)-cast is necessary to pick the correct overload
                     return offset;  // The tail matched. Return a successful find.
 
                 remainingSearchSpaceLength--;
                 offset++;
             }
             return -1;
+
+            // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech MuÅa
+            // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
+        SEARCH_TWO_BYTES:
+            if (Avx2.IsSupported && searchSpaceMinusValueTailLength - Vector256<byte>.Count >= 0)
+            {
+                // Find the last unique (which is not equal to ch1) byte
+                // the algorithm is fine if both are equal, just a little bit less efficient
+                byte ch2Val = Unsafe.Add(ref value, valueTailLength);
+                int ch1ch2Distance = valueTailLength;
+                while (ch2Val == value && ch1ch2Distance > 1)
+                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+                Vector256<byte> ch1 = Vector256.Create(value);
+                Vector256<byte> ch2 = Vector256.Create(ch2Val);
+
+                do
+                {
+                    Debug.Assert(offset >= 0);
+                    // Make sure we don't go out of bounds
+                    Debug.Assert(offset + ch1ch2Distance + Vector256<byte>.Count <= searchSpaceLength);
+
+                    Vector256<byte> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
+                    Vector256<byte> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+                    // Early out: cmpAnd is all zeros
+                    if (cmpAnd != Vector256<byte>.Zero)
+                    {
+                        uint mask = cmpAnd.ExtractMostSignificantBits();
+                        do
+                        {
+                            int bitPos = BitOperations.TrailingZeroCount(mask);
+                            if (valueLength == 2 || // we already matched two bytes
+                                SequenceEqual(
+                                    ref Unsafe.Add(ref searchSpace, offset + bitPos),
+                                    ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
+                            {
+                                return offset + bitPos;
+                            }
+                            mask = BitOperations.ResetLowestSetBit(mask); // Clear the lowest set bit
+                        } while (mask != 0);
+                    }
+                    offset += Vector256<byte>.Count;
+
+                    if (offset == searchSpaceMinusValueTailLength)
+                        return -1;
+
+                    // Overlap with the current chunk for trailing elements
+                    if (offset > searchSpaceMinusValueTailLength - Vector256<byte>.Count)
+                        offset = searchSpaceMinusValueTailLength - Vector256<byte>.Count;
+                } while (true);
+            }
+            else // 128bit vector path (SSE2 or AdvSimd)
+            {
+                // Find the last unique (which is not equal to ch1) byte
+                // the algorithm is fine if both are equal, just a little bit less efficient
+                byte ch2Val = Unsafe.Add(ref value, valueTailLength);
+                int ch1ch2Distance = valueTailLength;
+                while (ch2Val == value && ch1ch2Distance > 1)
+                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+                Vector128<byte> ch1 = Vector128.Create(value);
+                Vector128<byte> ch2 = Vector128.Create(ch2Val);
+
+                do
+                {
+                    Debug.Assert(offset >= 0);
+                    // Make sure we don't go out of bounds
+                    Debug.Assert(offset + ch1ch2Distance + Vector128<byte>.Count <= searchSpaceLength);
+
+                    Vector128<byte> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
+                    Vector128<byte> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+                    // Early out: cmpAnd is all zeros
+                    if (cmpAnd != Vector128<byte>.Zero)
+                    {
+                        uint mask = cmpAnd.ExtractMostSignificantBits();
+                        do
+                        {
+                            int bitPos = BitOperations.TrailingZeroCount(mask);
+                            if (valueLength == 2 || // we already matched two bytes
+                                SequenceEqual(
+                                    ref Unsafe.Add(ref searchSpace, offset + bitPos),
+                                    ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
+                            {
+                                return offset + bitPos;
+                            }
+                            // Clear the lowest set bit
+                            mask = BitOperations.ResetLowestSetBit(mask);
+                        } while (mask != 0);
+                    }
+                    offset += Vector128<byte>.Count;
+
+                    if (offset == searchSpaceMinusValueTailLength)
+                        return -1;
+
+                    // Overlap with the current chunk for trailing elements
+                    if (offset > searchSpaceMinusValueTailLength - Vector128<byte>.Count)
+                        offset = searchSpaceMinusValueTailLength - Vector128<byte>.Count;
+                } while (true);
+            }
+        }
+
+        public static int LastIndexOf(ref byte searchSpace, int searchSpaceLength, ref byte value, int valueLength)
+        {
+            Debug.Assert(searchSpaceLength >= 0);
+            Debug.Assert(valueLength >= 0);
+
+            if (valueLength == 0)
+                return searchSpaceLength;  // A zero-length sequence is always treated as "found" at the end of the search space.
+
+            int valueTailLength = valueLength - 1;
+            if (valueTailLength == 0)
+                return LastIndexOf(ref searchSpace, value, searchSpaceLength); // for single-byte values use plain LastIndexOf
+
+            int offset = 0;
+            byte valueHead = value;
+            int searchSpaceMinusValueTailLength = searchSpaceLength - valueTailLength;
+            if (Vector128.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector128<byte>.Count)
+            {
+                goto SEARCH_TWO_BYTES;
+            }
+
+            ref byte valueTail = ref Unsafe.Add(ref value, 1);
+
+            while (true)
+            {
+                Debug.Assert(0 <= offset && offset <= searchSpaceLength); // Ensures no deceptive underflows in the computation of "remainingSearchSpaceLength".
+                int remainingSearchSpaceLength = searchSpaceLength - offset - valueTailLength;
+                if (remainingSearchSpaceLength <= 0)
+                    break;  // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there.
+
+                // Do a quick search for the first element of "value".
+                int relativeIndex = LastIndexOf(ref searchSpace, valueHead, remainingSearchSpaceLength);
+                if (relativeIndex < 0)
+                    break;
+
+                // Found the first element of "value". See if the tail matches.
+                if (SequenceEqual(
+                        ref Unsafe.Add(ref searchSpace, relativeIndex + 1),
+                        ref valueTail, (nuint)(uint)valueTailLength)) // The (nuint)-cast is necessary to pick the correct overload
+                    return relativeIndex;  // The tail matched. Return a successful find.
+
+                offset += remainingSearchSpaceLength - relativeIndex;
+            }
+            return -1;
+
+        // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech MuÅa
+        // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
+        SEARCH_TWO_BYTES:
+            if (Avx2.IsSupported && searchSpaceMinusValueTailLength >= Vector256<byte>.Count)
+            {
+                offset = searchSpaceMinusValueTailLength - Vector256<byte>.Count;
+
+                // Find the last unique (which is not equal to ch1) byte
+                // the algorithm is fine if both are equal, just a little bit less efficient
+                byte ch2Val = Unsafe.Add(ref value, valueTailLength);
+                int ch1ch2Distance = valueTailLength;
+                while (ch2Val == value && ch1ch2Distance > 1)
+                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+                Vector256<byte> ch1 = Vector256.Create(value);
+                Vector256<byte> ch2 = Vector256.Create(ch2Val);
+                do
+                {
+                    Vector256<byte> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
+                    Vector256<byte> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+                    // Early out: cmpAnd is all zeros
+                    if (cmpAnd != Vector256<byte>.Zero)
+                    {
+                        uint mask = cmpAnd.ExtractMostSignificantBits();
+                        do
+                        {
+                            // unlike IndexOf, here we use LZCNT to process matches starting from the end
+                            int bitPos = 31 - BitOperations.LeadingZeroCount(mask);
+                            if (valueLength == 2 || // we already matched two bytes
+                                SequenceEqual(
+                                    ref Unsafe.Add(ref searchSpace, offset + bitPos),
+                                    ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
+                            {
+                                return bitPos + offset;
+                            }
+                            // Clear the highest set bit.
+                            mask = BitOperations.ResetBit(mask, bitPos);
+                        } while (mask != 0);
+                    }
+
+                    offset -= Vector256<byte>.Count;
+                    if (offset == -Vector256<byte>.Count)
+                        return -1;
+                    // Overlap with the current chunk if there is not enough room for the next one
+                    if (offset < 0)
+                        offset = 0;
+                } while (true);
+            }
+            else // 128bit vector path (SSE2 or AdvSimd)
+            {
+                offset = searchSpaceMinusValueTailLength - Vector128<byte>.Count;
+
+                // Find the last unique (which is not equal to ch1) byte
+                // the algorithm is fine if both are equal, just a little bit less efficient
+                byte ch2Val = Unsafe.Add(ref value, valueTailLength);
+                int ch1ch2Distance = valueTailLength;
+                while (ch2Val == value && ch1ch2Distance > 1)
+                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+                Vector128<byte> ch1 = Vector128.Create(value);
+                Vector128<byte> ch2 = Vector128.Create(ch2Val);
+
+                do
+                {
+                    Vector128<byte> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
+                    Vector128<byte> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+                    // Early out: cmpAnd is all zeros
+                    // it's especially important for ARM where ExtractMostSignificantBits is not cheap
+                    if (cmpAnd != Vector128<byte>.Zero)
+                    {
+                        uint mask = cmpAnd.ExtractMostSignificantBits();
+                        do
+                        {
+                            // unlike IndexOf, here we use LZCNT to process matches starting from the end
+                            int bitPos = 31 - BitOperations.LeadingZeroCount(mask);
+                            if (valueLength == 2 || // we already matched two bytes
+                                SequenceEqual(
+                                    ref Unsafe.Add(ref searchSpace, offset + bitPos),
+                                    ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
+                            {
+                                return bitPos + offset;
+                            }
+                            // Clear the highest set bit.
+                            mask = BitOperations.ResetBit(mask, bitPos);
+                        } while (mask != 0);
+                    }
+
+                    offset -= Vector128<byte>.Count;
+                    if (offset == -Vector128<byte>.Count)
+                        return -1;
+                    // Overlap with the current chunk if there is not enough room for the next one
+                    if (offset < 0)
+                        offset = 0;
+
+                } while (true);
+            }
         }
 
         // Adapted from IndexOf(...)
@@ -408,40 +668,6 @@ namespace System
             return (int)(offset + 7);
         }
 
-        public static int LastIndexOf(ref byte searchSpace, int searchSpaceLength, ref byte value, int valueLength)
-        {
-            Debug.Assert(searchSpaceLength >= 0);
-            Debug.Assert(valueLength >= 0);
-
-            if (valueLength == 0)
-                return searchSpaceLength;  // A zero-length sequence is always treated as "found" at the end of the search space.
-
-            byte valueHead = value;
-            ref byte valueTail = ref Unsafe.Add(ref value, 1);
-            int valueTailLength = valueLength - 1;
-
-            int offset = 0;
-            while (true)
-            {
-                Debug.Assert(0 <= offset && offset <= searchSpaceLength); // Ensures no deceptive underflows in the computation of "remainingSearchSpaceLength".
-                int remainingSearchSpaceLength = searchSpaceLength - offset - valueTailLength;
-                if (remainingSearchSpaceLength <= 0)
-                    break;  // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there.
-
-                // Do a quick search for the first element of "value".
-                int relativeIndex = LastIndexOf(ref searchSpace, valueHead, remainingSearchSpaceLength);
-                if (relativeIndex < 0)
-                    break;
-
-                // Found the first element of "value". See if the tail matches.
-                if (SequenceEqual(ref Unsafe.Add(ref searchSpace, relativeIndex + 1), ref valueTail, (nuint)(uint)valueTailLength))  // The (nunit)-cast is necessary to pick the correct overload
-                    return relativeIndex;  // The tail matched. Return a successful find.
-
-                offset += remainingSearchSpaceLength - relativeIndex;
-            }
-            return -1;
-        }
-
         [MethodImpl(MethodImplOptions.AggressiveOptimization)]
         public static int LastIndexOf(ref byte searchSpace, byte value, int length)
         {
@@ -1564,13 +1790,11 @@ namespace System
                     // This becomes a conditional jmp foward to not favor it.
                     goto NotEqual;
                 }
-                // Use Vector128.Size as Vector128<byte>.Count doesn't inline at R2R time
-                // https://github.com/dotnet/runtime/issues/32714
-                else if (length >= Vector128.Size)
+                else if (length >= (nuint)Vector128<byte>.Count)
                 {
                     Vector128<byte> vecResult;
                     nuint offset = 0;
-                    nuint lengthToExamine = length - Vector128.Size;
+                    nuint lengthToExamine = length - (nuint)Vector128<byte>.Count;
                     // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
                     Debug.Assert(lengthToExamine < length);
                     if (lengthToExamine != 0)
@@ -1584,7 +1808,7 @@ namespace System
                             {
                                 goto NotEqual;
                             }
-                            offset += Vector128.Size;
+                            offset += (nuint)Vector128<byte>.Count;
                         } while (lengthToExamine > offset);
                     }
 
diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
index 753e530..fb00b06 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
@@ -22,38 +22,315 @@ namespace System
             if (valueLength == 0)
                 return 0;  // A zero-length sequence is always treated as "found" at the start of the search space.
 
-            char valueHead = value;
-            ref char valueTail = ref Unsafe.Add(ref value, 1);
             int valueTailLength = valueLength - 1;
-            int remainingSearchSpaceLength = searchSpaceLength - valueTailLength;
+            if (valueTailLength == 0)
+            {
+                // for single-char values use plain IndexOf
+                return IndexOf(ref searchSpace, value, searchSpaceLength);
+            }
+
+            int offset = 0;
+            char valueHead = value;
+            int searchSpaceMinusValueTailLength = searchSpaceLength - valueTailLength;
+            if (Vector128.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector128<ushort>.Count)
+            {
+                goto SEARCH_TWO_CHARS;
+            }
+
+            ref byte valueTail = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref value, 1));
+            int remainingSearchSpaceLength = searchSpaceMinusValueTailLength;
 
-            int index = 0;
             while (remainingSearchSpaceLength > 0)
             {
                 // Do a quick search for the first element of "value".
-                int relativeIndex = IndexOf(ref Unsafe.Add(ref searchSpace, index), valueHead, remainingSearchSpaceLength);
+                int relativeIndex = IndexOf(ref Unsafe.Add(ref searchSpace, offset), valueHead, remainingSearchSpaceLength);
                 if (relativeIndex < 0)
                     break;
 
                 remainingSearchSpaceLength -= relativeIndex;
-                index += relativeIndex;
+                offset += relativeIndex;
 
                 if (remainingSearchSpaceLength <= 0)
                     break;  // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there.
 
                 // Found the first element of "value". See if the tail matches.
                 if (SequenceEqual(
-                    ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, index + 1)),
-                    ref Unsafe.As<char, byte>(ref valueTail),
-                    (nuint)(uint)valueTailLength * 2))
+                        ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + 1)),
+                        ref valueTail,
+                        (nuint)(uint)valueTailLength * 2))
                 {
-                    return index;  // The tail matched. Return a successful find.
+                    return offset;  // The tail matched. Return a successful find.
                 }
 
                 remainingSearchSpaceLength--;
-                index++;
+                offset++;
             }
             return -1;
+
+            // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech MuÅa
+            // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
+        SEARCH_TWO_CHARS:
+            if (Avx2.IsSupported && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
+            {
+                // Find the last unique (which is not equal to ch1) character
+                // the algorithm is fine if both are equal, just a little bit less efficient
+                ushort ch2Val = Unsafe.Add(ref value, valueTailLength);
+                int ch1ch2Distance = valueTailLength;
+                while (ch2Val == valueHead && ch1ch2Distance > 1)
+                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+                Vector256<ushort> ch1 = Vector256.Create((ushort)valueHead);
+                Vector256<ushort> ch2 = Vector256.Create(ch2Val);
+
+                do
+                {
+                    // Make sure we don't go out of bounds
+                    Debug.Assert(offset + ch1ch2Distance + Vector256<ushort>.Count <= searchSpaceLength);
+
+                    Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, LoadVector256(ref searchSpace, offset));
+                    Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, LoadVector256(ref searchSpace, offset + ch1ch2Distance));
+                    Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+                    // Early out: cmpAnd is all zeros
+                    if (cmpAnd != Vector256<byte>.Zero)
+                    {
+                        uint mask = cmpAnd.ExtractMostSignificantBits();
+                        do
+                        {
+                            int bitPos = BitOperations.TrailingZeroCount(mask);
+                            // div by 2 (shr) because we work with 2-byte chars
+                            int charPos = (int)((uint)bitPos / 2);
+                            if (valueLength == 2 || // we already matched two chars
+                                SequenceEqual(
+                                    ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
+                                    ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
+                            {
+                                return offset + charPos;
+                            }
+
+                            // Clear two the lowest set bits
+                            if (Bmi1.IsSupported)
+                                mask = Bmi1.ResetLowestSetBit(Bmi1.ResetLowestSetBit(mask));
+                            else
+                                mask &= ~(uint)(0b11 << bitPos);
+                        } while (mask != 0);
+                    }
+                    offset += Vector256<ushort>.Count;
+
+                    if (offset == searchSpaceMinusValueTailLength)
+                        return -1;
+
+                    // Overlap with the current chunk for trailing elements
+                    if (offset > searchSpaceMinusValueTailLength - Vector256<ushort>.Count)
+                        offset = searchSpaceMinusValueTailLength - Vector256<ushort>.Count;
+                } while (true);
+            }
+            else // 128bit vector path (SSE2 or AdvSimd)
+            {
+                // Find the last unique (which is not equal to ch1) character
+                // the algorithm is fine if both are equal, just a little bit less efficient
+                ushort ch2Val = Unsafe.Add(ref value, valueTailLength);
+                int ch1ch2Distance = valueTailLength;
+                while (ch2Val == valueHead && ch1ch2Distance > 1)
+                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+                Vector128<ushort> ch1 = Vector128.Create((ushort)valueHead);
+                Vector128<ushort> ch2 = Vector128.Create(ch2Val);
+
+                do
+                {
+                    // Make sure we don't go out of bounds
+                    Debug.Assert(offset + ch1ch2Distance + Vector128<ushort>.Count <= searchSpaceLength);
+
+                    Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, LoadVector128(ref searchSpace, offset));
+                    Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, LoadVector128(ref searchSpace, offset + ch1ch2Distance));
+                    Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+                    // Early out: cmpAnd is all zeros
+                    if (cmpAnd != Vector128<byte>.Zero)
+                    {
+                        uint mask = cmpAnd.ExtractMostSignificantBits();
+                        do
+                        {
+                            int bitPos = BitOperations.TrailingZeroCount(mask);
+                            // div by 2 (shr) because we work with 2-byte chars
+                            int charPos = (int)((uint)bitPos / 2);
+                            if (valueLength == 2 || // we already matched two chars
+                                SequenceEqual(
+                                    ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
+                                    ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
+                            {
+                                return offset + charPos;
+                            }
+
+                            // Clear two lowest set bits
+                            if (Bmi1.IsSupported)
+                                mask = Bmi1.ResetLowestSetBit(Bmi1.ResetLowestSetBit(mask));
+                            else
+                                mask &= ~(uint)(0b11 << bitPos);
+                        } while (mask != 0);
+                    }
+                    offset += Vector128<ushort>.Count;
+
+                    if (offset == searchSpaceMinusValueTailLength)
+                        return -1;
+
+                    // Overlap with the current chunk for trailing elements
+                    if (offset > searchSpaceMinusValueTailLength - Vector128<ushort>.Count)
+                        offset = searchSpaceMinusValueTailLength - Vector128<ushort>.Count;
+                } while (true);
+            }
+        }
+
+        public static int LastIndexOf(ref char searchSpace, int searchSpaceLength, ref char value, int valueLength)
+        {
+            Debug.Assert(searchSpaceLength >= 0);
+            Debug.Assert(valueLength >= 0);
+
+            if (valueLength == 0)
+                return searchSpaceLength;  // A zero-length sequence is always treated as "found" at the end of the search space.
+
+            int valueTailLength = valueLength - 1;
+            if (valueTailLength == 0)
+                return LastIndexOf(ref searchSpace, value, searchSpaceLength); // for single-char values use plain LastIndexOf
+
+            int offset = 0;
+            char valueHead = value;
+            int searchSpaceMinusValueTailLength = searchSpaceLength - valueTailLength;
+            if (Vector128.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector128<ushort>.Count)
+            {
+                goto SEARCH_TWO_CHARS;
+            }
+
+            ref byte valueTail = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref value, 1));
+
+            while (true)
+            {
+                Debug.Assert(0 <= offset && offset <= searchSpaceLength); // Ensures no deceptive underflows in the computation of "remainingSearchSpaceLength".
+                int remainingSearchSpaceLength = searchSpaceLength - offset - valueTailLength;
+                if (remainingSearchSpaceLength <= 0)
+                    break;  // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there.
+
+                // Do a quick search for the first element of "value".
+                int relativeIndex = LastIndexOf(ref searchSpace, valueHead, remainingSearchSpaceLength);
+                if (relativeIndex == -1)
+                    break;
+
+                // Found the first element of "value". See if the tail matches.
+                if (SequenceEqual(
+                        ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, relativeIndex + 1)),
+                        ref valueTail, (nuint)(uint)valueTailLength * 2))
+                {
+                    return relativeIndex; // The tail matched. Return a successful find.
+                }
+
+                offset += remainingSearchSpaceLength - relativeIndex;
+            }
+            return -1;
+
+            // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech MuÅa
+            // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
+        SEARCH_TWO_CHARS:
+            if (Avx2.IsSupported && searchSpaceMinusValueTailLength >= Vector256<ushort>.Count)
+            {
+                offset = searchSpaceMinusValueTailLength - Vector256<ushort>.Count;
+
+                // Find the last unique (which is not equal to ch1) char
+                // the algorithm is fine if both are equal, just a little bit less efficient
+                char ch2Val = Unsafe.Add(ref value, valueTailLength);
+                int ch1ch2Distance = valueTailLength;
+                while (ch2Val == valueHead && ch1ch2Distance > 1)
+                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+                Vector256<ushort> ch1 = Vector256.Create((ushort)valueHead);
+                Vector256<ushort> ch2 = Vector256.Create((ushort)ch2Val);
+
+                do
+                {
+
+                    Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, LoadVector256(ref searchSpace, (nuint)offset));
+                    Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, LoadVector256(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+                    // Early out: cmpAnd is all zeros
+                    if (cmpAnd != Vector256<byte>.Zero)
+                    {
+                        uint mask = cmpAnd.ExtractMostSignificantBits();
+                        do
+                        {
+                            // unlike IndexOf, here we use LZCNT to process matches starting from the end
+                            int bitPos = 30 - BitOperations.LeadingZeroCount(mask);
+                            int charPos = (int)((uint)bitPos / 2);
+
+                            if (valueLength == 2 || // we already matched two chars
+                                SequenceEqual(
+                                    ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
+                                    ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
+                            {
+                                return charPos + offset;
+                            }
+                            mask &= ~(uint)(0b11 << bitPos); // clear two highest set bits.
+                        } while (mask != 0);
+                    }
+
+                    offset -= Vector256<ushort>.Count;
+                    if (offset == -Vector256<ushort>.Count)
+                        return -1;
+                    // Overlap with the current chunk if there is not enough room for the next one
+                    if (offset < 0)
+                        offset = 0;
+                } while (true);
+            }
+            else // 128bit vector path (SSE2 or AdvSimd)
+            {
+                offset = searchSpaceMinusValueTailLength - Vector128<ushort>.Count;
+
+                // Find the last unique (which is not equal to ch1) char
+                // the algorithm is fine if both are equal, just a little bit less efficient
+                char ch2Val = Unsafe.Add(ref value, valueTailLength);
+                int ch1ch2Distance = valueTailLength;
+                while (ch2Val == value && ch1ch2Distance > 1)
+                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+                Vector128<ushort> ch1 = Vector128.Create((ushort)value);
+                Vector128<ushort> ch2 = Vector128.Create((ushort)ch2Val);
+
+                do
+                {
+                    Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, LoadVector128(ref searchSpace, (nuint)offset));
+                    Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, LoadVector128(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+                    // Early out: cmpAnd is all zeros
+                    // it's especially important for ARM where ExtractMostSignificantBits is not cheap
+                    if (cmpAnd != Vector128<byte>.Zero)
+                    {
+                        uint mask = cmpAnd.ExtractMostSignificantBits();
+                        do
+                        {
+                            // unlike IndexOf, here we use LZCNT to process matches starting from the end
+                            int bitPos = 30 - BitOperations.LeadingZeroCount(mask);
+                            int charPos = (int)((uint)bitPos / 2);
+
+                            if (valueLength == 2 || // we already matched two chars
+                                SequenceEqual(
+                                    ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
+                                    ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
+                            {
+                                return charPos + offset;
+                            }
+                            mask &= ~(uint)(0b11 << bitPos); // clear two the highest set bits.
+                        } while (mask != 0);
+                    }
+
+                    offset -= Vector128<ushort>.Count;
+                    if (offset == -Vector128<ushort>.Count)
+                        return -1;
+                    // Overlap with the current chunk if there is not enough room for the next one
+                    if (offset < 0)
+                        offset = 0;
+                } while (true);
+            }
         }
 
         [MethodImpl(MethodImplOptions.AggressiveOptimization)]
diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs
index 91d2a34..27944af 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs
@@ -774,11 +774,17 @@ namespace System
             if (valueLength == 0)
                 return searchSpaceLength;  // A zero-length sequence is always treated as "found" at the end of the search space.
 
-            T valueHead = value;
-            ref T valueTail = ref Unsafe.Add(ref value, 1);
             int valueTailLength = valueLength - 1;
+            if (valueTailLength == 0)
+            {
+                return LastIndexOf(ref searchSpace, value, searchSpaceLength);
+            }
 
             int index = 0;
+
+            T valueHead = value;
+            ref T valueTail = ref Unsafe.Add(ref value, 1);
+
             while (true)
             {
                 Debug.Assert(0 <= index && index <= searchSpaceLength); // Ensures no deceptive underflows in the computation of "remainingSearchSpaceLength".
-- 
2.7.4