Remove implicit anchoring optimization from Regex (#42408)

author Stephen Toub <stoub@microsoft.com>

Sat, 19 Sep 2020 00:58:44 +0000 (20:58 -0400)

committer GitHub <noreply@github.com>

Sat, 19 Sep 2020 00:58:44 +0000 (17:58 -0700)
author Stephen Toub <stoub@microsoft.com>
Sat, 19 Sep 2020 00:58:44 +0000 (20:58 -0400)
committer GitHub <noreply@github.com>
Sat, 19 Sep 2020 00:58:44 +0000 (17:58 -0700)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

index 8286589..bd55727 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -349,58 +349,6 @@ namespace System.Text.RegularExpressions
                          break;
                      }
                  }
-
-                // Optimization: implicit anchoring.
-                // If the expression begins with a .* loop, add an anchor to the beginning:
-                // - If Singleline is set such that '.' eats anything, the .* will zip to the end of the string and then backtrack through
-                //   the whole thing looking for a match; since it will have examined everything, there's no benefit to examining it all
-                //   again, and we can anchor to beginning.
-                // - If Singleline is not set, then '.' eats anything up until a '\n' and backtracks from there, so we can similarly avoid
-                //   re-examining that content and anchor to the beginning of lines.
-                // We are currently very conservative here, only examining concat nodes.  This could be loosened in the future, e.g. to
-                // explore captures (but think through any implications of there being a back ref to that capture), to explore loops and
-                // lazy loops a positive minimum (but the anchor shouldn't be part of the loop), to explore alternations and support adding
-                // an anchor if all of them begin with appropriate star loops (though this could also be accomplished by factoring out the
-                // loops to be before the alternation), etc.
-                {
-                    RegexNode node = rootNode.Child(0); // skip implicit root capture node
-                    while (true)
-                    {
-                        bool singleline = (node.Options & RegexOptions.Singleline) != 0;
-                        switch (node.Type)
-                        {
-                            case Concatenate:
-                                node = node.Child(0);
-                                continue;
-
-                            case Setloop when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass:
-                            case Setloopatomic when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass:
-                            case Notoneloop when !singleline && node.N == int.MaxValue && node.Ch == '\n':
-                            case Notoneloopatomic when !singleline && node.N == int.MaxValue && node.Ch == '\n':
-                                RegexNode? parent = node.Next;
-                                var anchor = new RegexNode(singleline ? Beginning : Bol, node.Options);
-                                Debug.Assert(parent != null);
-                                if (parent.Type == Concatenate)
-                                {
-                                    Debug.Assert(parent.ChildCount() >= 2);
-                                    Debug.Assert(parent.Children is List<RegexNode>);
-                                    anchor.Next = parent;
-                                    ((List<RegexNode>)parent.Children).Insert(0, anchor);
-                                }
-                                else
-                                {
-                                    Debug.Assert(parent.Type == Capture && parent.Next is null, "Only valid capture is the implicit root capture");
-                                    var concat = new RegexNode(Concatenate, parent.Options);
-                                    concat.AddChild(anchor);
-                                    concat.AddChild(node);
-                                    parent.ReplaceChild(0, concat);
-                                }
-                                break;
-                        }
-
-                        break;
-                    }
-                }
              }
  
              // Optimization: Unnecessary root atomic.
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs

index 70dfee3..a7e7369 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
@@ -127,6 +127,13 @@ namespace System.Text.RegularExpressions.Tests
              yield return new object[] { @"(?>\w+)(?<!a)", "aa", RegexOptions.None, 0, 2, false, string.Empty };
              yield return new object[] { @".+a", "baa", RegexOptions.None, 0, 3, true, "baa" };
              yield return new object[] { @"[ab]+a", "cacbaac", RegexOptions.None, 0, 7, true, "baa" };
+            foreach (RegexOptions lineOption in new[] { RegexOptions.None, RegexOptions.Singleline, RegexOptions.Multiline })
+            {
+                yield return new object[] { @".*", "abc", lineOption, 1, 2, true, "bc" };
+                yield return new object[] { @".*c", "abc", lineOption, 1, 2, true, "bc" };
+                yield return new object[] { @"b.*", "abc", lineOption, 1, 2, true, "bc" };
+                yield return new object[] { @".*", "abc", lineOption, 2, 1, true, "c" };
+            }
  
              // Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z"
              yield return new object[] { @"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzz", RegexOptions.IgnoreCase, 0, 17, true, "aaaasdfajsdlfjzzz" };
@@ -852,6 +859,11 @@ namespace System.Text.RegularExpressions.Tests
                      Assert.True(Regex.IsMatch(input, pattern));
                  }
  
+                // Note: this block will fail if any inputs attempt to look for anchors or lookbehinds at the initial position,
+                // as there is a difference between Match(input, beginning) and Match(input, beginning, input.Length - beginning)
+                // in that the former doesn't modify from 0 what the engine sees as the beginning of the input whereas the latter
+                // is equivalent to taking a substring and then matching on that.  However, as we currently don't have any such inputs,
+                // it's currently a viable way to test the additional overload.  Same goes for the similar case below with options.
                  if (beginning + length == input.Length)
                  {
                      // Use Match(string, int)
@@ -859,11 +871,9 @@ namespace System.Text.RegularExpressions.Tests
  
                      Assert.True(r.IsMatch(input, beginning));
                  }
-                else
-                {
-                    // Use Match(string, int, int)
-                    VerifyMatch(r.Match(input, beginning, length), true, expected);
-                }
+
+                // Use Match(string, int, int)
+                VerifyMatch(r.Match(input, beginning, length), true, expected);
              }
  
              r = new Regex(pattern, options);
@@ -890,6 +900,36 @@ namespace System.Text.RegularExpressions.Tests
              }
          }
  
+        public static IEnumerable<object[]> Match_StartatDiffersFromBeginning_MemberData()
+        {
+            foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline, RegexOptions.Multiline })
+            {
+                // Anchors
+                yield return new object[] { @"^.*", "abc", options, 0, true, true };
+                yield return new object[] { @"^.*", "abc", options, 1, false, true };
+
+                // Positive Lookbehinds
+                yield return new object[] { @"(?<=abc)def", "abcdef", options, 3, true, false };
+
+                // Negative Lookbehinds
+                yield return new object[] { @"(?<!abc)def", "abcdef", options, 3, false, true };
+            }
+        }
+
+        [Theory]
+        [MemberData(nameof(Match_StartatDiffersFromBeginning_MemberData))]
+        [MemberData(nameof(RegexCompilationHelper.TransformRegexOptions), nameof(Match_StartatDiffersFromBeginning_MemberData), 2, MemberType = typeof(RegexCompilationHelper))]
+        public void Match_StartatDiffersFromBeginning(string pattern, string input, RegexOptions options, int startat, bool expectedSuccessStartAt, bool expectedSuccessBeginning)
+        {
+            var r = new Regex(pattern, options);
+
+            Assert.Equal(expectedSuccessStartAt, r.IsMatch(input, startat));
+            Assert.Equal(expectedSuccessStartAt, r.Match(input, startat).Success);
+
+            Assert.Equal(expectedSuccessBeginning, r.Match(input.Substring(startat)).Success);
+            Assert.Equal(expectedSuccessBeginning, r.Match(input, startat, input.Length - startat).Success);
+        }
+
          private static void VerifyMatch(Match match, bool expectedSuccess, CaptureData[] expected)
          {
              Assert.Equal(expectedSuccess, match.Success);
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs

index 5094c8d..05c68bf 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs
@@ -251,6 +251,16 @@ namespace System.Text.RegularExpressions.Tests
                  }
              };
  
+            yield return new object[]
+            {
+                ".*", "abc", RegexOptions.None,
+                new[]
+                {
+                    new CaptureData("abc", 0, 3),
+                    new CaptureData("", 3, 0)
+                }
+            };
+
              if (!PlatformDetection.IsNetFramework)
              {
                  // .NET Framework missing fix in https://github.com/dotnet/runtime/pull/1075
author	Stephen Toub <stoub@microsoft.com>
	Sat, 19 Sep 2020 00:58:44 +0000 (20:58 -0400)
committer	GitHub <noreply@github.com>
	Sat, 19 Sep 2020 00:58:44 +0000 (17:58 -0700)
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs		patch \| blob \| history