Use StartsWith(..., OrdinalIgnoreCase) in RegexCompiler / source generator (#66339)
authorStephen Toub <stoub@microsoft.com>
Thu, 10 Mar 2022 19:11:56 +0000 (14:11 -0500)
committerGitHub <noreply@github.com>
Thu, 10 Mar 2022 19:11:56 +0000 (14:11 -0500)
* Use StartsWith(..., OrdinalIgnoreCase) in RegexCompiler / source generator

When we encounter a sequence of sets representing case-insensitive ASCII, we can simplify the code generated to just call StartsWith, which both makes it more readable but also takes advantage of the new JIT optimization to lower that into efficient vectorized comparisons based on the supplied literal.

This also cleans up some formatting in the source generator emitted code to make things much more concise and less noisy.

* Address PR feedback

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

index 0220a77..88812c2 100644 (file)
@@ -1021,6 +1021,9 @@ namespace System.Text.RegularExpressions.Generator
             }
 
             // Whether the node has RegexOptions.IgnoreCase set.
+            // TODO: https://github.com/dotnet/runtime/issues/61048. We should be able to delete this and all usage sites once
+            // IgnoreCase is erradicated from the tree.  The only place it should possibly be left after that work is in a Backreference,
+            // and that can do this check directly as needed.
             static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0;
 
             // Slices the inputSpan starting at pos until end and stores it into slice.
@@ -1127,7 +1130,7 @@ namespace System.Text.RegularExpressions.Generator
                         // If it's not a One, Multi, or Set, we can't apply this optimization.
                         // If it's IgnoreCase (and wasn't reduced to a non-IgnoreCase set), also ignore it to keep the logic simple.
                         if (node.Child(i).FindBranchOneMultiOrSetStart() is not RegexNode oneMultiOrSet ||
-                            (oneMultiOrSet.Options & RegexOptions.IgnoreCase) != 0) // TODO: https://github.com/dotnet/runtime/issues/61048
+                            IsCaseInsensitive(oneMultiOrSet))
                         {
                             useSwitchedBranches = false;
                             break;
@@ -1208,7 +1211,7 @@ namespace System.Text.RegularExpressions.Generator
 
                             RegexNode? childStart = child.FindBranchOneMultiOrSetStart();
                             Debug.Assert(childStart is not null, "Unexpectedly couldn't find the branch starting node.");
-                            Debug.Assert((childStart.Options & RegexOptions.IgnoreCase) == 0, "Expected only to find non-IgnoreCase branch starts");
+                            Debug.Assert(!IsCaseInsensitive(childStart), "Expected only to find non-IgnoreCase branch starts");
 
                             if (childStart.Kind is RegexNodeKind.Set)
                             {
@@ -2051,14 +2054,15 @@ namespace System.Text.RegularExpressions.Generator
                     TransferSliceStaticPosToPos();
                 }
 
-                // Separate out several node types that, for conciseness, don't need a header and scope written into the source.
+                // Separate out several node types that, for conciseness, don't need a header nor scope written into the source.
+                // Effectively these either evaporate, are completely self-explanatory, or only exist for their children to be rendered.
                 switch (node.Kind)
                 {
-                    // Nothing is written for an empty
+                    // Nothing is written for an empty.
                     case RegexNodeKind.Empty:
                         return;
 
-                    // A match failure doesn't need a scope.
+                    // A single-line goto for a failure doesn't need a scope or comment.
                     case RegexNodeKind.Nothing:
                         Goto(doneLabel);
                         return;
@@ -2075,79 +2079,95 @@ namespace System.Text.RegularExpressions.Generator
                         return;
                 }
 
-                // Put the node's code into its own scope. If the node contains labels that may need to
-                // be visible outside of its scope, the scope is still emitted for clarity but is commented out.
-                using (EmitScope(writer, DescribeNode(node, analysis), faux: analysis.MayBacktrack(node)))
+                // For everything else, output a comment about what the node is.
+                writer.WriteLine($"// {DescribeNode(node, analysis)}");
+
+                // Separate out several node types that, for conciseness, don't need a scope written into the source as they're
+                // always a single statement / block.
+                switch (node.Kind)
                 {
-                    switch (node.Kind)
-                    {
-                        case RegexNodeKind.Beginning:
-                        case RegexNodeKind.Start:
-                        case RegexNodeKind.Bol:
-                        case RegexNodeKind.Eol:
-                        case RegexNodeKind.End:
-                        case RegexNodeKind.EndZ:
-                            EmitAnchors(node);
-                            break;
+                    case RegexNodeKind.Beginning:
+                    case RegexNodeKind.Start:
+                    case RegexNodeKind.Bol:
+                    case RegexNodeKind.Eol:
+                    case RegexNodeKind.End:
+                    case RegexNodeKind.EndZ:
+                        EmitAnchors(node);
+                        return;
 
-                        case RegexNodeKind.Boundary:
-                        case RegexNodeKind.NonBoundary:
-                        case RegexNodeKind.ECMABoundary:
-                        case RegexNodeKind.NonECMABoundary:
-                            EmitBoundary(node);
-                            break;
+                    case RegexNodeKind.Boundary:
+                    case RegexNodeKind.NonBoundary:
+                    case RegexNodeKind.ECMABoundary:
+                    case RegexNodeKind.NonECMABoundary:
+                        EmitBoundary(node);
+                        return;
 
+                    case RegexNodeKind.One:
+                    case RegexNodeKind.Notone:
+                    case RegexNodeKind.Set:
+                        EmitSingleChar(node, emitLengthChecksIfRequired);
+                        return;
+
+                    case RegexNodeKind.Multi when !IsCaseInsensitive(node) && (node.Options & RegexOptions.RightToLeft) == 0:
+                        EmitMultiChar(node, emitLengthChecksIfRequired);
+                        return;
+
+                    case RegexNodeKind.UpdateBumpalong:
+                        EmitUpdateBumpalong(node);
+                        return;
+                }
+
+                // For everything else, put the node's code into its own scope, purely for readability. If the node contains labels
+                // that may need to be visible outside of its scope, the scope is still emitted for clarity but is commented out.
+                using (EmitScope(writer, null, faux: analysis.MayBacktrack(node)))
+                {
+                    switch (node.Kind)
+                    {
                         case RegexNodeKind.Multi:
                             EmitMultiChar(node, emitLengthChecksIfRequired);
-                            break;
-
-                        case RegexNodeKind.One:
-                        case RegexNodeKind.Notone:
-                        case RegexNodeKind.Set:
-                            EmitSingleChar(node, emitLengthChecksIfRequired);
-                            break;
+                            return;
 
                         case RegexNodeKind.Oneloop:
                         case RegexNodeKind.Notoneloop:
                         case RegexNodeKind.Setloop:
                             EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired);
-                            break;
+                            return;
 
                         case RegexNodeKind.Onelazy:
                         case RegexNodeKind.Notonelazy:
                         case RegexNodeKind.Setlazy:
                             EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
-                            break;
+                            return;
 
                         case RegexNodeKind.Oneloopatomic:
                         case RegexNodeKind.Notoneloopatomic:
                         case RegexNodeKind.Setloopatomic:
                             EmitSingleCharAtomicLoop(node, emitLengthChecksIfRequired);
-                            break;
+                            return;
 
                         case RegexNodeKind.Loop:
                             EmitLoop(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.Lazyloop:
                             EmitLazy(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.Alternate:
                             EmitAlternation(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.Backreference:
                             EmitBackreference(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.BackreferenceConditional:
                             EmitBackreferenceConditional(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.ExpressionConditional:
                             EmitExpressionConditional(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.Atomic:
                             Debug.Assert(analysis.MayBacktrack(node.Child(0)));
@@ -2156,25 +2176,20 @@ namespace System.Text.RegularExpressions.Generator
 
                         case RegexNodeKind.Capture:
                             EmitCapture(node, subsequent);
-                            break;
+                            return;
 
                         case RegexNodeKind.PositiveLookaround:
                             EmitPositiveLookaroundAssertion(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.NegativeLookaround:
                             EmitNegativeLookaroundAssertion(node);
-                            break;
-
-                        case RegexNodeKind.UpdateBumpalong:
-                            EmitUpdateBumpalong(node);
-                            break;
-
-                        default:
-                            Debug.Fail($"Unexpected node type: {node.Kind}");
-                            break;
+                            return;
                     }
                 }
+
+                // All nodes should have been handled.
+                Debug.Fail($"Unexpected node type: {node.Kind}");
             }
 
             // Emits the node for an atomic.
@@ -2230,7 +2245,8 @@ namespace System.Text.RegularExpressions.Generator
                 for (int i = 0; i < childCount; i++)
                 {
                     // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence
-                    // and then skip the individual length checks for each.  We also want to minimize the repetition of if blocks,
+                    // and then skip the individual length checks for each.  We can also discover case-insensitive sequences that
+                    // can be checked efficiently with methods like StartsWith. We also want to minimize the repetition of if blocks,
                     // and so we try to emit a series of clauses all part of the same if block rather than one if block per child.
                     if ((node.Options & RegexOptions.RightToLeft) == 0 &&
                         emitLengthChecksIfRequired &&
@@ -2243,7 +2259,7 @@ namespace System.Text.RegularExpressions.Generator
                         {
                             for (; i < exclusiveEnd; i++)
                             {
-                                void WriteSingleCharChild(RegexNode child, bool includeDescription = true)
+                                void WritePrefix()
                                 {
                                     if (wroteClauses)
                                     {
@@ -2254,31 +2270,41 @@ namespace System.Text.RegularExpressions.Generator
                                     {
                                         writer.Write("if (");
                                     }
-                                    EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true);
-                                    prevDescription = includeDescription ? DescribeNode(child, analysis) : null;
-                                    wroteClauses = true;
                                 }
 
                                 RegexNode child = node.Child(i);
-                                if (child.Kind is RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set)
+                                if (node.TryGetOrdinalCaseInsensitiveString(i, exclusiveEnd, out int nodesConsumed, out string? caseInsensitiveString))
                                 {
-                                    WriteSingleCharChild(child);
+                                    WritePrefix();
+                                    string sourceSpan = sliceStaticPos > 0 ? $"{sliceSpan}.Slice({sliceStaticPos})" : sliceSpan;
+                                    writer.Write($"!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(caseInsensitiveString)}, global::System.StringComparison.OrdinalIgnoreCase)");
+                                    prevDescription = $"Match the string {Literal(caseInsensitiveString)} (ordinal case-insensitive)";
+                                    wroteClauses = true;
+
+                                    sliceStaticPos += caseInsensitiveString.Length;
+                                    i += nodesConsumed - 1;
                                 }
-                                else if (child.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Onelazy or RegexNodeKind.Oneloopatomic or
-                                                       RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic or
-                                                       RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloopatomic &&
+                                else if (child.Kind is RegexNodeKind.Multi && !IsCaseInsensitive(child))
+                                {
+                                    WritePrefix();
+                                    EmitMultiCharString(child.Str!, caseInsensitive: false, emitLengthCheck: false, clauseOnly: true, rightToLeft: false);
+                                    prevDescription = DescribeNode(child, analysis);
+                                    wroteClauses = true;
+                                }
+                                else if ((child.IsOneFamily || child.IsNotoneFamily || child.IsSetFamily) &&
                                          child.M == child.N &&
                                          child.M <= MaxUnrollSize)
                                 {
-                                    for (int c = 0; c < child.M; c++)
+                                    int repeatCount = child.Kind is RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set ? 1 : child.M;
+                                    for (int c = 0; c < repeatCount; c++)
                                     {
-                                        WriteSingleCharChild(child, includeDescription: c == 0);
+                                        WritePrefix();
+                                        EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true);
+                                        prevDescription = c == 0 ? DescribeNode(child, analysis) : null;
+                                        wroteClauses = true;
                                     }
                                 }
-                                else
-                                {
-                                    break;
-                                }
+                                else break;
                             }
 
                             if (wroteClauses)
@@ -2486,12 +2512,13 @@ namespace System.Text.RegularExpressions.Generator
             {
                 Debug.Assert(node.Kind is RegexNodeKind.Multi, $"Unexpected type: {node.Kind}");
                 Debug.Assert(node.Str is not null);
-                EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck, (node.Options & RegexOptions.RightToLeft) != 0);
+                EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck, clauseOnly: false, (node.Options & RegexOptions.RightToLeft) != 0);
             }
 
-            void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool rightToLeft)
+            void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool clauseOnly, bool rightToLeft)
             {
                 Debug.Assert(str.Length >= 2);
+                Debug.Assert(!clauseOnly || (!emitLengthCheck && !caseInsensitive && !rightToLeft));
 
                 if (rightToLeft)
                 {
@@ -2534,9 +2561,17 @@ namespace System.Text.RegularExpressions.Generator
                 else
                 {
                     string sourceSpan = sliceStaticPos > 0 ? $"{sliceSpan}.Slice({sliceStaticPos})" : sliceSpan;
-                    using (EmitBlock(writer, $"if (!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(str)}))"))
+                    string clause = $"!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(str)})";
+                    if (clauseOnly)
                     {
-                        Goto(doneLabel);
+                        writer.Write(clause);
+                    }
+                    else
+                    {
+                        using (EmitBlock(writer, $"if ({clause})"))
+                        {
+                            Goto(doneLabel);
+                        }
                     }
                 }
 
@@ -3061,7 +3096,7 @@ namespace System.Text.RegularExpressions.Generator
                     case <= RegexNode.MultiVsRepeaterLimit when node.IsOneFamily && !IsCaseInsensitive(node):
                         // This is a repeated case-sensitive character; emit it as a multi in order to get all the optimizations
                         // afforded to a multi, e.g. unrolling the loop with multi-char reads/comparisons at a time.
-                        EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck, rtl);
+                        EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck, clauseOnly: false, rtl);
                         return;
                 }
 
index 63eddea..a85ecef 100644 (file)
@@ -887,6 +887,20 @@ namespace System.Text.RegularExpressions
             return false;
         }
 
+        /// <summary>Gets whether the specified span contains only ASCII.</summary>
+        public static bool IsAscii(ReadOnlySpan<char> s) // TODO https://github.com/dotnet/runtime/issues/28230: Replace once Ascii is available
+        {
+            foreach (char c in s)
+            {
+                if (c >= 128)
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
         /// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
         /// <remarks>This may enumerate negated characters if the set is negated.</remarks>
         private static bool CanEasilyEnumerateSetContents(string set) =>
index 573e5e4..aa0b223 100644 (file)
@@ -55,7 +55,8 @@ namespace System.Text.RegularExpressions
         private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
         private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int) })!;
         private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!;
-        private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
+        private static readonly MethodInfo s_spanStartsWithSpan = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
+        private static readonly MethodInfo s_spanStartsWithSpanComparison = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<char>), typeof(ReadOnlySpan<char>), typeof(StringComparison) })!;
         private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!;
         private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
         private static readonly MethodInfo s_textInfoToLowerMethod = typeof(TextInfo).GetMethod("ToLower", new Type[] { typeof(char) })!;
@@ -1404,6 +1405,10 @@ namespace System.Text.RegularExpressions
             // Generated code successfully.
             return;
 
+            // Whether the node has RegexOptions.IgnoreCase set.
+            // TODO: https://github.com/dotnet/runtime/issues/61048. We should be able to delete this and all usage sites once
+            // IgnoreCase is erradicated from the tree.  The only place it should possibly be left after that work is in a Backreference,
+            // and that can do this check directly as needed.
             static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0;
 
             // Slices the inputSpan starting at pos until end and stores it into slice.
@@ -2367,103 +2372,102 @@ namespace System.Text.RegularExpressions
                     case RegexNodeKind.End:
                     case RegexNodeKind.EndZ:
                         EmitAnchors(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.Boundary:
                     case RegexNodeKind.NonBoundary:
                     case RegexNodeKind.ECMABoundary:
                     case RegexNodeKind.NonECMABoundary:
                         EmitBoundary(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.Multi:
                         EmitMultiChar(node, emitLengthChecksIfRequired);
-                        break;
+                        return;
 
                     case RegexNodeKind.One:
                     case RegexNodeKind.Notone:
                     case RegexNodeKind.Set:
                         EmitSingleChar(node, emitLengthChecksIfRequired);
-                        break;
+                        return;
 
                     case RegexNodeKind.Oneloop:
                     case RegexNodeKind.Notoneloop:
                     case RegexNodeKind.Setloop:
                         EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired);
-                        break;
+                        return;
 
                     case RegexNodeKind.Onelazy:
                     case RegexNodeKind.Notonelazy:
                     case RegexNodeKind.Setlazy:
                         EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
-                        break;
+                        return;
 
                     case RegexNodeKind.Oneloopatomic:
                     case RegexNodeKind.Notoneloopatomic:
                     case RegexNodeKind.Setloopatomic:
                         EmitSingleCharAtomicLoop(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.Loop:
                         EmitLoop(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.Lazyloop:
                         EmitLazy(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.Alternate:
                         EmitAlternation(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.Concatenate:
                         EmitConcatenation(node, subsequent, emitLengthChecksIfRequired);
-                        break;
+                        return;
 
                     case RegexNodeKind.Atomic:
                         EmitAtomic(node, subsequent);
-                        break;
+                        return;
 
                     case RegexNodeKind.Backreference:
                         EmitBackreference(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.BackreferenceConditional:
                         EmitBackreferenceConditional(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.ExpressionConditional:
                         EmitExpressionConditional(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.Capture:
                         EmitCapture(node, subsequent);
-                        break;
+                        return;
 
                     case RegexNodeKind.PositiveLookaround:
                         EmitPositiveLookaroundAssertion(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.NegativeLookaround:
                         EmitNegativeLookaroundAssertion(node);
-                        break;
+                        return;
 
                     case RegexNodeKind.Nothing:
                         BrFar(doneLabel);
-                        break;
+                        return;
 
                     case RegexNodeKind.Empty:
                         // Emit nothing.
-                        break;
+                        return;
 
                     case RegexNodeKind.UpdateBumpalong:
                         EmitUpdateBumpalong(node);
-                        break;
-
-                    default:
-                        Debug.Fail($"Unexpected node type: {node.Kind}");
-                        break;
+                        return;
                 }
+
+                // All nodes should have been handled.
+                Debug.Fail($"Unexpected node type: {node.Kind}");
             }
 
             // Emits the node for an atomic.
@@ -2534,19 +2538,44 @@ namespace System.Text.RegularExpressions
                 Debug.Assert(node.Kind is RegexNodeKind.Concatenate, $"Unexpected type: {node.Kind}");
                 Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}");
 
-                bool rtl = (node.Options & RegexOptions.RightToLeft) != 0;
-
                 // Emit the code for each child one after the other.
                 int childCount = node.ChildCount();
                 for (int i = 0; i < childCount; i++)
                 {
                     // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence
-                    // and then skip the individual length checks for each.
-                    if (!rtl && emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd))
+                    // and then skip the individual length checks for each. We can also discover case-insensitive sequences that
+                    // can be checked efficiently with methods like StartsWith.
+                    if ((node.Options & RegexOptions.RightToLeft) == 0 &&
+                        emitLengthChecksIfRequired &&
+                        node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd))
                     {
                         EmitSpanLengthCheck(requiredLength);
                         for (; i < exclusiveEnd; i++)
                         {
+                            if (node.TryGetOrdinalCaseInsensitiveString(i, exclusiveEnd, out int nodesConsumed, out string? caseInsensitiveString))
+                            {
+                                // if (!sliceSpan.Slice(sliceStaticPause).StartsWith(caseInsensitiveString, StringComparison.OrdinalIgnoreCase)) goto doneLabel;
+                                if (sliceStaticPos > 0)
+                                {
+                                    Ldloca(slice);
+                                    Ldc(sliceStaticPos);
+                                    Call(s_spanSliceIntMethod);
+                                }
+                                else
+                                {
+                                    Ldloc(slice);
+                                }
+                                Ldstr(caseInsensitiveString);
+                                Call(s_stringAsSpanMethod);
+                                Ldc((int)StringComparison.OrdinalIgnoreCase);
+                                Call(s_spanStartsWithSpanComparison);
+                                BrfalseFar(doneLabel);
+
+                                sliceStaticPos += caseInsensitiveString.Length;
+                                i += nodesConsumed - 1;
+                                continue;
+                            }
+
                             EmitNode(node.Child(i), GetSubsequent(i, node, subsequent), emitLengthChecksIfRequired: false);
                         }
 
@@ -2910,7 +2939,7 @@ namespace System.Text.RegularExpressions
                     Call(s_spanSliceIntMethod);
                     Ldstr(str);
                     Call(s_stringAsSpanMethod);
-                    Call(s_spanStartsWith);
+                    Call(s_spanStartsWithSpan);
                     BrfalseFar(doneLabel);
                     sliceStaticPos += str.Length;
                 }
index 3a0ead3..12afbb4 100644 (file)
@@ -2385,6 +2385,103 @@ namespace System.Text.RegularExpressions
         }
 
         /// <summary>
+        /// Determines whether the specified child index of a concatenation begins a sequence whose values
+        /// should be used to perform an ordinal case-insensitive comparison.
+        /// </summary>
+        /// <param name="childIndex">The index of the child with which to start the sequence.</param>
+        /// <param name="exclusiveChildBound">The exclusive upper bound on the child index to iterate to.</param>
+        /// <param name="nodesConsumed">How many nodes make up the sequence, if any.</param>
+        /// <param name="caseInsensitiveString">The string to use for an ordinal case-insensitive comparison, if any.</param>
+        /// <returns>true if a sequence was found; otherwise, false.</returns>
+        public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString)
+        {
+            Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}");
+
+            var vsb = new ValueStringBuilder(stackalloc char[32]);
+
+            // We're looking in particular for sets of ASCII characters, so we focus only on sets with two characters in them, e.g. [Aa].
+            Span<char> twoChars = stackalloc char[2];
+
+            // Iterate from the child index to the exclusive upper bound.
+            int i = childIndex;
+            for ( ; i < exclusiveChildBound; i++)
+            {
+                RegexNode child = Child(i);
+                if ((child.Options & RegexOptions.IgnoreCase) != 0)
+                {
+                    // TODO https://github.com/dotnet/runtime/issues/61048: Remove this block once fixed.
+                    // We don't want any nodes that are still IgnoreCase, as they'd no longer be IgnoreCase if
+                    // they were applicable to this optimization.
+                    break;
+                }
+
+                if (child.Kind is RegexNodeKind.One)
+                {
+                    // We only want to include ASCII characters, and only if they don't participate in case conversion
+                    // such that they only case to themselves and nothing other cases to them.  Otherwise, including
+                    // them would potentially cause us to match against things not allowed by the pattern.
+                    if (child.Ch >= 128 ||
+                        RegexCharClass.ParticipatesInCaseConversion(child.Ch))
+                    {
+                        break;
+                    }
+
+                    vsb.Append(child.Ch);
+                }
+                else if (child.Kind is RegexNodeKind.Multi)
+                {
+                    // As with RegexNodeKind.One, the string needs to be composed solely of ASCII characters that
+                    // don't participate in case conversion.
+                    if (!RegexCharClass.IsAscii(child.Str.AsSpan()) ||
+                        RegexCharClass.ParticipatesInCaseConversion(child.Str.AsSpan()))
+                    {
+                        break;
+                    }
+
+                    vsb.Append(child.Str);
+                }
+                else if (child.Kind is RegexNodeKind.Set ||
+                         (child.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic && child.M == child.N))
+                {
+                    // In particular we want to look for sets that contain only the upper and lowercase variant
+                    // of the same ASCII letter.
+                    if (RegexCharClass.IsNegated(child.Str!) ||
+                        RegexCharClass.GetSetChars(child.Str!, twoChars) != 2 ||
+                        twoChars[0] >= 128 ||
+                        twoChars[1] >= 128 ||
+                        twoChars[0] == twoChars[1] ||
+                        !char.IsLetter(twoChars[0]) ||
+                        !char.IsLetter(twoChars[1]) ||
+                        ((twoChars[0] | 0x20) != (twoChars[1] | 0x20)))
+                    {
+                        break;
+                    }
+
+                    vsb.Append((char)(twoChars[0] | 0x20), child.Kind is RegexNodeKind.Set ? 1 : child.M);
+                }
+                else
+                {
+                    break;
+                }
+            }
+
+            // If we found at least two characters, consider it a sequence found.  It's possible
+            // they all came from the same node, so this could be a sequence of just one node.
+            if (vsb.Length >= 2)
+            {
+                caseInsensitiveString = vsb.ToString();
+                nodesConsumed = i - childIndex;
+                return true;
+            }
+
+            // No sequence found.
+            caseInsensitiveString = null;
+            nodesConsumed = 0;
+            vsb.Dispose();
+            return false;
+        }
+
+        /// <summary>
         /// Determine whether the specified child node is the beginning of a sequence that can
         /// trivially have length checks combined in order to avoid bounds checks.
         /// </summary>
@@ -2403,6 +2500,8 @@ namespace System.Text.RegularExpressions
         /// </remarks>
         public bool TryGetJoinableLengthCheckChildRange(int childIndex, out int requiredLength, out int exclusiveEnd)
         {
+            Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}");
+
             static bool CanJoinLengthCheck(RegexNode node) => node.Kind switch
             {
                 RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set => true,