Improve case-insensitive prefix extraction in Regex (#89449)

author Stephen Toub <stoub@microsoft.com>

Wed, 26 Jul 2023 20:29:06 +0000 (16:29 -0400)

committer GitHub <noreply@github.com>

Wed, 26 Jul 2023 20:29:06 +0000 (16:29 -0400)
author Stephen Toub <stoub@microsoft.com>
Wed, 26 Jul 2023 20:29:06 +0000 (16:29 -0400)
committer GitHub <noreply@github.com>
Wed, 26 Jul 2023 20:29:06 +0000 (16:29 -0400)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

index e579ad1..d488d3d 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -2512,8 +2512,16 @@ namespace System.Text.RegularExpressions
          /// <param name="exclusiveChildBound">The exclusive upper bound on the child index to iterate to.</param>
          /// <param name="nodesConsumed">How many nodes make up the sequence, if any.</param>
          /// <param name="caseInsensitiveString">The string to use for an ordinal case-insensitive comparison, if any.</param>
+        /// <param name="consumeZeroWidthNodes">
+        /// Defaults to false. When false, the consumer needs the semantics of matching the produced string to fully represent
+        /// the semantics of all the consumed nodes, which means nodes can be consumed iff they produce text that's represented
+        /// by the resulting string. When true, the resulting string needs to fully represent all valid matches at that position,
+        /// but it can have false positives, which means the resulting string doesn't need to fully represent all zero-width nodes
+        /// consumed. true is only valid when used as part of a search to determine where to try a full match, not as part of
+        /// actual matching logic.
+        /// </param>
          /// <returns>true if a sequence was found; otherwise, false.</returns>
-        public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString)
+        public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString, bool consumeZeroWidthNodes = false)
          {
              Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}");
  
@@ -2572,6 +2580,31 @@ namespace System.Text.RegularExpressions
  
                      vsb.Append((char)(twoChars[0] | 0x20), child.Kind is RegexNodeKind.Set ? 1 : child.M);
                  }
+                else if (child.Kind is RegexNodeKind.Empty)
+                {
+                    // Skip over empty nodes, as they're pure nops. They would ideally have been optimized away,
+                    // but can still remain in some situations.
+                }
+                else if (consumeZeroWidthNodes &&
+                                       // anchors
+                         child.Kind is RegexNodeKind.Beginning or
+                                       RegexNodeKind.Bol or
+                                       RegexNodeKind.Start or
+                                       // boundaries
+                                       RegexNodeKind.Boundary or
+                                       RegexNodeKind.ECMABoundary or
+                                       RegexNodeKind.NonBoundary or
+                                       RegexNodeKind.NonECMABoundary or
+                                       // lookarounds
+                                       RegexNodeKind.NegativeLookaround or
+                                       RegexNodeKind.PositiveLookaround or
+                                       // logic
+                                       RegexNodeKind.UpdateBumpalong)
+                {
+                    // Skip over zero-width nodes that might be reasonable at the beginning of or within a substring.
+                    // We can only do these if consumeZeroWidthNodes is true, as otherwise we'd be producing a string that
+                    // may not fully represent the semantics of this portion of the pattern.
+                }
                  else
                  {
                      break;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs

index 96c5021..bf3c03a 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -170,7 +170,7 @@ namespace System.Text.RegularExpressions
                          continue;
  
                      case RegexNodeKind.Concatenate:
-                        node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string? caseInsensitiveString);
+                        node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string? caseInsensitiveString, consumeZeroWidthNodes: true);
                          return caseInsensitiveString;
  
                      default:
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs

index a796d2e..acec9ac 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
@@ -91,6 +91,8 @@ namespace System.Text.RegularExpressions.Tests
          [InlineData(@"ab|(abc)|(abcd)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
          [InlineData(@"ab(?=cd)", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
          [InlineData(@"ab(?=cd)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
+        [InlineData(@"\bab(?=\w)(?!=\d)c\b", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")]
+        [InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
          public void LeadingPrefix(string pattern, int options, int expectedMode, string expectedPrefix)
          {
              RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options);
@@ -107,6 +109,7 @@ namespace System.Text.RegularExpressions.Tests
          [InlineData(@"[Aa]", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "Aa")]
          [InlineData(@"a", (int)RegexOptions.IgnoreCase | (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "Aa")]
          [InlineData(@"ab|cd|ef|gh", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "bdfh")]
+        [InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)(RegexOptions.IgnoreCase | RegexOptions.RightToLeft), (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "Cc")]
          public void LeadingSet(string pattern, int options, int expectedMode, string expectedChars)
          {
              RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options);
author	Stephen Toub <stoub@microsoft.com>
	Wed, 26 Jul 2023 20:29:06 +0000 (16:29 -0400)
committer	GitHub <noreply@github.com>
	Wed, 26 Jul 2023 20:29:06 +0000 (16:29 -0400)
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs		patch \| blob \| history