Use StartsWith(..., OrdinalIgnoreCase) in RegexCompiler / source generator (#66339)

author Stephen Toub <stoub@microsoft.com>

Thu, 10 Mar 2022 19:11:56 +0000 (14:11 -0500)

committer GitHub <noreply@github.com>

Thu, 10 Mar 2022 19:11:56 +0000 (14:11 -0500)
author Stephen Toub <stoub@microsoft.com>
Thu, 10 Mar 2022 19:11:56 +0000 (14:11 -0500)
committer GitHub <noreply@github.com>
Thu, 10 Mar 2022 19:11:56 +0000 (14:11 -0500)
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

index 0220a77..88812c2 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -1021,6 +1021,9 @@ namespace System.Text.RegularExpressions.Generator
              }
  
              // Whether the node has RegexOptions.IgnoreCase set.
+            // TODO: https://github.com/dotnet/runtime/issues/61048. We should be able to delete this and all usage sites once
+            // IgnoreCase is erradicated from the tree.  The only place it should possibly be left after that work is in a Backreference,
+            // and that can do this check directly as needed.
              static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0;
  
              // Slices the inputSpan starting at pos until end and stores it into slice.
@@ -1127,7 +1130,7 @@ namespace System.Text.RegularExpressions.Generator
                          // If it's not a One, Multi, or Set, we can't apply this optimization.
                          // If it's IgnoreCase (and wasn't reduced to a non-IgnoreCase set), also ignore it to keep the logic simple.
                          if (node.Child(i).FindBranchOneMultiOrSetStart() is not RegexNode oneMultiOrSet ||
-                            (oneMultiOrSet.Options & RegexOptions.IgnoreCase) != 0) // TODO: https://github.com/dotnet/runtime/issues/61048
+                            IsCaseInsensitive(oneMultiOrSet))
                          {
                              useSwitchedBranches = false;
                              break;
@@ -1208,7 +1211,7 @@ namespace System.Text.RegularExpressions.Generator
  
                              RegexNode? childStart = child.FindBranchOneMultiOrSetStart();
                              Debug.Assert(childStart is not null, "Unexpectedly couldn't find the branch starting node.");
-                            Debug.Assert((childStart.Options & RegexOptions.IgnoreCase) == 0, "Expected only to find non-IgnoreCase branch starts");
+                            Debug.Assert(!IsCaseInsensitive(childStart), "Expected only to find non-IgnoreCase branch starts");
  
                              if (childStart.Kind is RegexNodeKind.Set)
                              {
@@ -2051,14 +2054,15 @@ namespace System.Text.RegularExpressions.Generator
                      TransferSliceStaticPosToPos();
                  }
  
-                // Separate out several node types that, for conciseness, don't need a header and scope written into the source.
+                // Separate out several node types that, for conciseness, don't need a header nor scope written into the source.
+                // Effectively these either evaporate, are completely self-explanatory, or only exist for their children to be rendered.
                  switch (node.Kind)
                  {
-                    // Nothing is written for an empty
+                    // Nothing is written for an empty.
                      case RegexNodeKind.Empty:
                          return;
  
-                    // A match failure doesn't need a scope.
+                    // A single-line goto for a failure doesn't need a scope or comment.
                      case RegexNodeKind.Nothing:
                          Goto(doneLabel);
                          return;
@@ -2075,79 +2079,95 @@ namespace System.Text.RegularExpressions.Generator
                          return;
                  }
  
-                // Put the node's code into its own scope. If the node contains labels that may need to
-                // be visible outside of its scope, the scope is still emitted for clarity but is commented out.
-                using (EmitScope(writer, DescribeNode(node, analysis), faux: analysis.MayBacktrack(node)))
+                // For everything else, output a comment about what the node is.
+                writer.WriteLine($"// {DescribeNode(node, analysis)}");
+
+                // Separate out several node types that, for conciseness, don't need a scope written into the source as they're
+                // always a single statement / block.
+                switch (node.Kind)
                  {
-                    switch (node.Kind)
-                    {
-                        case RegexNodeKind.Beginning:
-                        case RegexNodeKind.Start:
-                        case RegexNodeKind.Bol:
-                        case RegexNodeKind.Eol:
-                        case RegexNodeKind.End:
-                        case RegexNodeKind.EndZ:
-                            EmitAnchors(node);
-                            break;
+                    case RegexNodeKind.Beginning:
+                    case RegexNodeKind.Start:
+                    case RegexNodeKind.Bol:
+                    case RegexNodeKind.Eol:
+                    case RegexNodeKind.End:
+                    case RegexNodeKind.EndZ:
+                        EmitAnchors(node);
+                        return;
  
-                        case RegexNodeKind.Boundary:
-                        case RegexNodeKind.NonBoundary:
-                        case RegexNodeKind.ECMABoundary:
-                        case RegexNodeKind.NonECMABoundary:
-                            EmitBoundary(node);
-                            break;
+                    case RegexNodeKind.Boundary:
+                    case RegexNodeKind.NonBoundary:
+                    case RegexNodeKind.ECMABoundary:
+                    case RegexNodeKind.NonECMABoundary:
+                        EmitBoundary(node);
+                        return;
  
+                    case RegexNodeKind.One:
+                    case RegexNodeKind.Notone:
+                    case RegexNodeKind.Set:
+                        EmitSingleChar(node, emitLengthChecksIfRequired);
+                        return;
+
+                    case RegexNodeKind.Multi when !IsCaseInsensitive(node) && (node.Options & RegexOptions.RightToLeft) == 0:
+                        EmitMultiChar(node, emitLengthChecksIfRequired);
+                        return;
+
+                    case RegexNodeKind.UpdateBumpalong:
+                        EmitUpdateBumpalong(node);
+                        return;
+                }
+
+                // For everything else, put the node's code into its own scope, purely for readability. If the node contains labels
+                // that may need to be visible outside of its scope, the scope is still emitted for clarity but is commented out.
+                using (EmitScope(writer, null, faux: analysis.MayBacktrack(node)))
+                {
+                    switch (node.Kind)
+                    {
                          case RegexNodeKind.Multi:
                              EmitMultiChar(node, emitLengthChecksIfRequired);
-                            break;
-
-                        case RegexNodeKind.One:
-                        case RegexNodeKind.Notone:
-                        case RegexNodeKind.Set:
-                            EmitSingleChar(node, emitLengthChecksIfRequired);
-                            break;
+                            return;
  
                          case RegexNodeKind.Oneloop:
                          case RegexNodeKind.Notoneloop:
                          case RegexNodeKind.Setloop:
                              EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired);
-                            break;
+                            return;
  
                          case RegexNodeKind.Onelazy:
                          case RegexNodeKind.Notonelazy:
                          case RegexNodeKind.Setlazy:
                              EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
-                            break;
+                            return;
  
                          case RegexNodeKind.Oneloopatomic:
                          case RegexNodeKind.Notoneloopatomic:
                          case RegexNodeKind.Setloopatomic:
                              EmitSingleCharAtomicLoop(node, emitLengthChecksIfRequired);
-                            break;
+                            return;
  
                          case RegexNodeKind.Loop:
                              EmitLoop(node);
-                            break;
+                            return;
  
                          case RegexNodeKind.Lazyloop:
                              EmitLazy(node);
-                            break;
+                            return;
  
                          case RegexNodeKind.Alternate:
                              EmitAlternation(node);
-                            break;
+                            return;
  
                          case RegexNodeKind.Backreference:
                              EmitBackreference(node);
-                            break;
+                            return;
  
                          case RegexNodeKind.BackreferenceConditional:
                              EmitBackreferenceConditional(node);
-                            break;
+                            return;
  
                          case RegexNodeKind.ExpressionConditional:
                              EmitExpressionConditional(node);
-                            break;
+                            return;
  
                          case RegexNodeKind.Atomic:
                              Debug.Assert(analysis.MayBacktrack(node.Child(0)));
@@ -2156,25 +2176,20 @@ namespace System.Text.RegularExpressions.Generator
  
                          case RegexNodeKind.Capture:
                              EmitCapture(node, subsequent);
-                            break;
+                            return;
  
                          case RegexNodeKind.PositiveLookaround:
                              EmitPositiveLookaroundAssertion(node);
-                            break;
+                            return;
  
                          case RegexNodeKind.NegativeLookaround:
                              EmitNegativeLookaroundAssertion(node);
-                            break;
-
-                        case RegexNodeKind.UpdateBumpalong:
-                            EmitUpdateBumpalong(node);
-                            break;
-
-                        default:
-                            Debug.Fail($"Unexpected node type: {node.Kind}");
-                            break;
+                            return;
                      }
                  }
+
+                // All nodes should have been handled.
+                Debug.Fail($"Unexpected node type: {node.Kind}");
              }
  
              // Emits the node for an atomic.
@@ -2230,7 +2245,8 @@ namespace System.Text.RegularExpressions.Generator
                  for (int i = 0; i < childCount; i++)
                  {
                      // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence
-                    // and then skip the individual length checks for each.  We also want to minimize the repetition of if blocks,
+                    // and then skip the individual length checks for each.  We can also discover case-insensitive sequences that
+                    // can be checked efficiently with methods like StartsWith. We also want to minimize the repetition of if blocks,
                      // and so we try to emit a series of clauses all part of the same if block rather than one if block per child.
                      if ((node.Options & RegexOptions.RightToLeft) == 0 &&
                          emitLengthChecksIfRequired &&
@@ -2243,7 +2259,7 @@ namespace System.Text.RegularExpressions.Generator
                          {
                              for (; i < exclusiveEnd; i++)
                              {
-                                void WriteSingleCharChild(RegexNode child, bool includeDescription = true)
+                                void WritePrefix()
                                  {
                                      if (wroteClauses)
                                      {
@@ -2254,31 +2270,41 @@ namespace System.Text.RegularExpressions.Generator
                                      {
                                          writer.Write("if (");
                                      }
-                                    EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true);
-                                    prevDescription = includeDescription ? DescribeNode(child, analysis) : null;
-                                    wroteClauses = true;
                                  }
  
                                  RegexNode child = node.Child(i);
-                                if (child.Kind is RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set)
+                                if (node.TryGetOrdinalCaseInsensitiveString(i, exclusiveEnd, out int nodesConsumed, out string? caseInsensitiveString))
                                  {
-                                    WriteSingleCharChild(child);
+                                    WritePrefix();
+                                    string sourceSpan = sliceStaticPos > 0 ? $"{sliceSpan}.Slice({sliceStaticPos})" : sliceSpan;
+                                    writer.Write($"!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(caseInsensitiveString)}, global::System.StringComparison.OrdinalIgnoreCase)");
+                                    prevDescription = $"Match the string {Literal(caseInsensitiveString)} (ordinal case-insensitive)";
+                                    wroteClauses = true;
+
+                                    sliceStaticPos += caseInsensitiveString.Length;
+                                    i += nodesConsumed - 1;
                                  }
-                                else if (child.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Onelazy or RegexNodeKind.Oneloopatomic or
-                                                       RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic or
-                                                       RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloopatomic &&
+                                else if (child.Kind is RegexNodeKind.Multi && !IsCaseInsensitive(child))
+                                {
+                                    WritePrefix();
+                                    EmitMultiCharString(child.Str!, caseInsensitive: false, emitLengthCheck: false, clauseOnly: true, rightToLeft: false);
+                                    prevDescription = DescribeNode(child, analysis);
+                                    wroteClauses = true;
+                                }
+                                else if ((child.IsOneFamily || child.IsNotoneFamily || child.IsSetFamily) &&
                                           child.M == child.N &&
                                           child.M <= MaxUnrollSize)
                                  {
-                                    for (int c = 0; c < child.M; c++)
+                                    int repeatCount = child.Kind is RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set ? 1 : child.M;
+                                    for (int c = 0; c < repeatCount; c++)
                                      {
-                                        WriteSingleCharChild(child, includeDescription: c == 0);
+                                        WritePrefix();
+                                        EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true);
+                                        prevDescription = c == 0 ? DescribeNode(child, analysis) : null;
+                                        wroteClauses = true;
                                      }
                                  }
-                                else
-                                {
-                                    break;
-                                }
+                                else break;
                              }
  
                              if (wroteClauses)
@@ -2486,12 +2512,13 @@ namespace System.Text.RegularExpressions.Generator
              {
                  Debug.Assert(node.Kind is RegexNodeKind.Multi, $"Unexpected type: {node.Kind}");
                  Debug.Assert(node.Str is not null);
-                EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck, (node.Options & RegexOptions.RightToLeft) != 0);
+                EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck, clauseOnly: false, (node.Options & RegexOptions.RightToLeft) != 0);
              }
  
-            void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool rightToLeft)
+            void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool clauseOnly, bool rightToLeft)
              {
                  Debug.Assert(str.Length >= 2);
+                Debug.Assert(!clauseOnly || (!emitLengthCheck && !caseInsensitive && !rightToLeft));
  
                  if (rightToLeft)
                  {
@@ -2534,9 +2561,17 @@ namespace System.Text.RegularExpressions.Generator
                  else
                  {
                      string sourceSpan = sliceStaticPos > 0 ? $"{sliceSpan}.Slice({sliceStaticPos})" : sliceSpan;
-                    using (EmitBlock(writer, $"if (!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(str)}))"))
+                    string clause = $"!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(str)})";
+                    if (clauseOnly)
                      {
-                        Goto(doneLabel);
+                        writer.Write(clause);
+                    }
+                    else
+                    {
+                        using (EmitBlock(writer, $"if ({clause})"))
+                        {
+                            Goto(doneLabel);
+                        }
                      }
                  }
  
@@ -3061,7 +3096,7 @@ namespace System.Text.RegularExpressions.Generator
                      case <= RegexNode.MultiVsRepeaterLimit when node.IsOneFamily && !IsCaseInsensitive(node):
                          // This is a repeated case-sensitive character; emit it as a multi in order to get all the optimizations
                          // afforded to a multi, e.g. unrolling the loop with multi-char reads/comparisons at a time.
-                        EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck, rtl);
+                        EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck, clauseOnly: false, rtl);
                          return;
                  }
  
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

index 63eddea..a85ecef 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -887,6 +887,20 @@ namespace System.Text.RegularExpressions
              return false;
          }
  
+        /// <summary>Gets whether the specified span contains only ASCII.</summary>
+        public static bool IsAscii(ReadOnlySpan<char> s) // TODO https://github.com/dotnet/runtime/issues/28230: Replace once Ascii is available
+        {
+            foreach (char c in s)
+            {
+                if (c >= 128)
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
          /// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
          /// <remarks>This may enumerate negated characters if the set is negated.</remarks>
          private static bool CanEasilyEnumerateSetContents(string set) =>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

index 573e5e4..aa0b223 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -55,7 +55,8 @@ namespace System.Text.RegularExpressions
          private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
          private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int) })!;
          private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!;
-        private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
+        private static readonly MethodInfo s_spanStartsWithSpan = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
+        private static readonly MethodInfo s_spanStartsWithSpanComparison = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<char>), typeof(ReadOnlySpan<char>), typeof(StringComparison) })!;
          private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!;
          private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
          private static readonly MethodInfo s_textInfoToLowerMethod = typeof(TextInfo).GetMethod("ToLower", new Type[] { typeof(char) })!;
@@ -1404,6 +1405,10 @@ namespace System.Text.RegularExpressions
              // Generated code successfully.
              return;
  
+            // Whether the node has RegexOptions.IgnoreCase set.
+            // TODO: https://github.com/dotnet/runtime/issues/61048. We should be able to delete this and all usage sites once
+            // IgnoreCase is erradicated from the tree.  The only place it should possibly be left after that work is in a Backreference,
+            // and that can do this check directly as needed.
              static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0;
  
              // Slices the inputSpan starting at pos until end and stores it into slice.
@@ -2367,103 +2372,102 @@ namespace System.Text.RegularExpressions
                      case RegexNodeKind.End:
                      case RegexNodeKind.EndZ:
                          EmitAnchors(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.Boundary:
                      case RegexNodeKind.NonBoundary:
                      case RegexNodeKind.ECMABoundary:
                      case RegexNodeKind.NonECMABoundary:
                          EmitBoundary(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.Multi:
                          EmitMultiChar(node, emitLengthChecksIfRequired);
-                        break;
+                        return;
  
                      case RegexNodeKind.One:
                      case RegexNodeKind.Notone:
                      case RegexNodeKind.Set:
                          EmitSingleChar(node, emitLengthChecksIfRequired);
-                        break;
+                        return;
  
                      case RegexNodeKind.Oneloop:
                      case RegexNodeKind.Notoneloop:
                      case RegexNodeKind.Setloop:
                          EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired);
-                        break;
+                        return;
  
                      case RegexNodeKind.Onelazy:
                      case RegexNodeKind.Notonelazy:
                      case RegexNodeKind.Setlazy:
                          EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
-                        break;
+                        return;
  
                      case RegexNodeKind.Oneloopatomic:
                      case RegexNodeKind.Notoneloopatomic:
                      case RegexNodeKind.Setloopatomic:
                          EmitSingleCharAtomicLoop(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.Loop:
                          EmitLoop(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.Lazyloop:
                          EmitLazy(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.Alternate:
                          EmitAlternation(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.Concatenate:
                          EmitConcatenation(node, subsequent, emitLengthChecksIfRequired);
-                        break;
+                        return;
  
                      case RegexNodeKind.Atomic:
                          EmitAtomic(node, subsequent);
-                        break;
+                        return;
  
                      case RegexNodeKind.Backreference:
                          EmitBackreference(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.BackreferenceConditional:
                          EmitBackreferenceConditional(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.ExpressionConditional:
                          EmitExpressionConditional(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.Capture:
                          EmitCapture(node, subsequent);
-                        break;
+                        return;
  
                      case RegexNodeKind.PositiveLookaround:
                          EmitPositiveLookaroundAssertion(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.NegativeLookaround:
                          EmitNegativeLookaroundAssertion(node);
-                        break;
+                        return;
  
                      case RegexNodeKind.Nothing:
                          BrFar(doneLabel);
-                        break;
+                        return;
  
                      case RegexNodeKind.Empty:
                          // Emit nothing.
-                        break;
+                        return;
  
                      case RegexNodeKind.UpdateBumpalong:
                          EmitUpdateBumpalong(node);
-                        break;
-
-                    default:
-                        Debug.Fail($"Unexpected node type: {node.Kind}");
-                        break;
+                        return;
                  }
+
+                // All nodes should have been handled.
+                Debug.Fail($"Unexpected node type: {node.Kind}");
              }
  
              // Emits the node for an atomic.
@@ -2534,19 +2538,44 @@ namespace System.Text.RegularExpressions
                  Debug.Assert(node.Kind is RegexNodeKind.Concatenate, $"Unexpected type: {node.Kind}");
                  Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}");
  
-                bool rtl = (node.Options & RegexOptions.RightToLeft) != 0;
-
                  // Emit the code for each child one after the other.
                  int childCount = node.ChildCount();
                  for (int i = 0; i < childCount; i++)
                  {
                      // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence
-                    // and then skip the individual length checks for each.
-                    if (!rtl && emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd))
+                    // and then skip the individual length checks for each. We can also discover case-insensitive sequences that
+                    // can be checked efficiently with methods like StartsWith.
+                    if ((node.Options & RegexOptions.RightToLeft) == 0 &&
+                        emitLengthChecksIfRequired &&
+                        node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd))
                      {
                          EmitSpanLengthCheck(requiredLength);
                          for (; i < exclusiveEnd; i++)
                          {
+                            if (node.TryGetOrdinalCaseInsensitiveString(i, exclusiveEnd, out int nodesConsumed, out string? caseInsensitiveString))
+                            {
+                                // if (!sliceSpan.Slice(sliceStaticPause).StartsWith(caseInsensitiveString, StringComparison.OrdinalIgnoreCase)) goto doneLabel;
+                                if (sliceStaticPos > 0)
+                                {
+                                    Ldloca(slice);
+                                    Ldc(sliceStaticPos);
+                                    Call(s_spanSliceIntMethod);
+                                }
+                                else
+                                {
+                                    Ldloc(slice);
+                                }
+                                Ldstr(caseInsensitiveString);
+                                Call(s_stringAsSpanMethod);
+                                Ldc((int)StringComparison.OrdinalIgnoreCase);
+                                Call(s_spanStartsWithSpanComparison);
+                                BrfalseFar(doneLabel);
+
+                                sliceStaticPos += caseInsensitiveString.Length;
+                                i += nodesConsumed - 1;
+                                continue;
+                            }
+
                              EmitNode(node.Child(i), GetSubsequent(i, node, subsequent), emitLengthChecksIfRequired: false);
                          }
  
@@ -2910,7 +2939,7 @@ namespace System.Text.RegularExpressions
                      Call(s_spanSliceIntMethod);
                      Ldstr(str);
                      Call(s_stringAsSpanMethod);
-                    Call(s_spanStartsWith);
+                    Call(s_spanStartsWithSpan);
                      BrfalseFar(doneLabel);
                      sliceStaticPos += str.Length;
                  }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

index 3a0ead3..12afbb4 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -2385,6 +2385,103 @@ namespace System.Text.RegularExpressions
          }
  
          /// <summary>
+        /// Determines whether the specified child index of a concatenation begins a sequence whose values
+        /// should be used to perform an ordinal case-insensitive comparison.
+        /// </summary>
+        /// <param name="childIndex">The index of the child with which to start the sequence.</param>
+        /// <param name="exclusiveChildBound">The exclusive upper bound on the child index to iterate to.</param>
+        /// <param name="nodesConsumed">How many nodes make up the sequence, if any.</param>
+        /// <param name="caseInsensitiveString">The string to use for an ordinal case-insensitive comparison, if any.</param>
+        /// <returns>true if a sequence was found; otherwise, false.</returns>
+        public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString)
+        {
+            Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}");
+
+            var vsb = new ValueStringBuilder(stackalloc char[32]);
+
+            // We're looking in particular for sets of ASCII characters, so we focus only on sets with two characters in them, e.g. [Aa].
+            Span<char> twoChars = stackalloc char[2];
+
+            // Iterate from the child index to the exclusive upper bound.
+            int i = childIndex;
+            for ( ; i < exclusiveChildBound; i++)
+            {
+                RegexNode child = Child(i);
+                if ((child.Options & RegexOptions.IgnoreCase) != 0)
+                {
+                    // TODO https://github.com/dotnet/runtime/issues/61048: Remove this block once fixed.
+                    // We don't want any nodes that are still IgnoreCase, as they'd no longer be IgnoreCase if
+                    // they were applicable to this optimization.
+                    break;
+                }
+
+                if (child.Kind is RegexNodeKind.One)
+                {
+                    // We only want to include ASCII characters, and only if they don't participate in case conversion
+                    // such that they only case to themselves and nothing other cases to them.  Otherwise, including
+                    // them would potentially cause us to match against things not allowed by the pattern.
+                    if (child.Ch >= 128 ||
+                        RegexCharClass.ParticipatesInCaseConversion(child.Ch))
+                    {
+                        break;
+                    }
+
+                    vsb.Append(child.Ch);
+                }
+                else if (child.Kind is RegexNodeKind.Multi)
+                {
+                    // As with RegexNodeKind.One, the string needs to be composed solely of ASCII characters that
+                    // don't participate in case conversion.
+                    if (!RegexCharClass.IsAscii(child.Str.AsSpan()) ||
+                        RegexCharClass.ParticipatesInCaseConversion(child.Str.AsSpan()))
+                    {
+                        break;
+                    }
+
+                    vsb.Append(child.Str);
+                }
+                else if (child.Kind is RegexNodeKind.Set ||
+                         (child.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic && child.M == child.N))
+                {
+                    // In particular we want to look for sets that contain only the upper and lowercase variant
+                    // of the same ASCII letter.
+                    if (RegexCharClass.IsNegated(child.Str!) ||
+                        RegexCharClass.GetSetChars(child.Str!, twoChars) != 2 ||
+                        twoChars[0] >= 128 ||
+                        twoChars[1] >= 128 ||
+                        twoChars[0] == twoChars[1] ||
+                        !char.IsLetter(twoChars[0]) ||
+                        !char.IsLetter(twoChars[1]) ||
+                        ((twoChars[0] | 0x20) != (twoChars[1] | 0x20)))
+                    {
+                        break;
+                    }
+
+                    vsb.Append((char)(twoChars[0] | 0x20), child.Kind is RegexNodeKind.Set ? 1 : child.M);
+                }
+                else
+                {
+                    break;
+                }
+            }
+
+            // If we found at least two characters, consider it a sequence found.  It's possible
+            // they all came from the same node, so this could be a sequence of just one node.
+            if (vsb.Length >= 2)
+            {
+                caseInsensitiveString = vsb.ToString();
+                nodesConsumed = i - childIndex;
+                return true;
+            }
+
+            // No sequence found.
+            caseInsensitiveString = null;
+            nodesConsumed = 0;
+            vsb.Dispose();
+            return false;
+        }
+
+        /// <summary>
          /// Determine whether the specified child node is the beginning of a sequence that can
          /// trivially have length checks combined in order to avoid bounds checks.
          /// </summary>
@@ -2403,6 +2500,8 @@ namespace System.Text.RegularExpressions
          /// </remarks>
          public bool TryGetJoinableLengthCheckChildRange(int childIndex, out int requiredLength, out int exclusiveEnd)
          {
+            Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}");
+
              static bool CanJoinLengthCheck(RegexNode node) => node.Kind switch
              {
                  RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set => true,
author	Stephen Toub <stoub@microsoft.com>
	Thu, 10 Mar 2022 19:11:56 +0000 (14:11 -0500)
committer	GitHub <noreply@github.com>
	Thu, 10 Mar 2022 19:11:56 +0000 (14:11 -0500)
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs		patch \| blob \| history