From: Stephen Toub <stoub@microsoft.com>
Date: Sat, 19 Sep 2020 00:58:44 +0000 (-0400)
Subject: Remove implicit anchoring optimization from Regex (#42408)
X-Git-Tag: submit/tizen/20210909.063632~5422
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a3f643be8fdd5759f78d982c961959f0074c1288;p=platform%2Fupstream%2Fdotnet%2Fruntime.git

Remove implicit anchoring optimization from Regex (#42408)

* Remove implicit anchoring optimization from Regex

In .NET 5 we added a bunch of optimizations to Regex.  One of them was a transform that optimized for the case where the pattern begins with `.*`.  If it does, then we insert an implicit anchor at the beginning in order to avoid unnecessary backtracking.  Imagine the pattern `.*a` and the pattern `bcdefghijklmnopqrstuvwxyz`.  This is going to start matching at `b`, find the next newline, and then backtrack from there looking for the `a`; it won't find it and will backtrack all the way, failing the match at that position.  At that point it'll bump to the next position, starting at `c`, and do it all over.  It'll fail, backtrack all the way, and bump again, starting at `d`, and doing it all over.  Etc.  The optimization recognizes that since `.` will match anything other than newline, after it fails to match at the first position, we can just skip all subsequent positions until the next newline, as they're all going to fail.

However, the optimization failed to take into account that someone can explicitly start a match in the middle of the provided text.  In that case, the implicitly added anchor will fail the match in the actual "Go" matching logic.

There are safe ways to do this optimization, e.g. introducing a variant of these anchors that let FindFirstChar skip ahead but that aren't considered for Go's matching purposes, but we can look at employing those for .NET 6.  For now for .NET 5, this commit just deletes the faulty optimization and adds a few tests that were failing it.

* Address PR feedback
---

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index 8286589..bd55727 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -349,58 +349,6 @@ namespace System.Text.RegularExpressions
                         break;
                     }
                 }
-
-                // Optimization: implicit anchoring.
-                // If the expression begins with a .* loop, add an anchor to the beginning:
-                // - If Singleline is set such that '.' eats anything, the .* will zip to the end of the string and then backtrack through
-                //   the whole thing looking for a match; since it will have examined everything, there's no benefit to examining it all
-                //   again, and we can anchor to beginning.
-                // - If Singleline is not set, then '.' eats anything up until a '\n' and backtracks from there, so we can similarly avoid
-                //   re-examining that content and anchor to the beginning of lines.
-                // We are currently very conservative here, only examining concat nodes.  This could be loosened in the future, e.g. to
-                // explore captures (but think through any implications of there being a back ref to that capture), to explore loops and
-                // lazy loops a positive minimum (but the anchor shouldn't be part of the loop), to explore alternations and support adding
-                // an anchor if all of them begin with appropriate star loops (though this could also be accomplished by factoring out the
-                // loops to be before the alternation), etc.
-                {
-                    RegexNode node = rootNode.Child(0); // skip implicit root capture node
-                    while (true)
-                    {
-                        bool singleline = (node.Options & RegexOptions.Singleline) != 0;
-                        switch (node.Type)
-                        {
-                            case Concatenate:
-                                node = node.Child(0);
-                                continue;
-
-                            case Setloop when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass:
-                            case Setloopatomic when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass:
-                            case Notoneloop when !singleline && node.N == int.MaxValue && node.Ch == '\n':
-                            case Notoneloopatomic when !singleline && node.N == int.MaxValue && node.Ch == '\n':
-                                RegexNode? parent = node.Next;
-                                var anchor = new RegexNode(singleline ? Beginning : Bol, node.Options);
-                                Debug.Assert(parent != null);
-                                if (parent.Type == Concatenate)
-                                {
-                                    Debug.Assert(parent.ChildCount() >= 2);
-                                    Debug.Assert(parent.Children is List<RegexNode>);
-                                    anchor.Next = parent;
-                                    ((List<RegexNode>)parent.Children).Insert(0, anchor);
-                                }
-                                else
-                                {
-                                    Debug.Assert(parent.Type == Capture && parent.Next is null, "Only valid capture is the implicit root capture");
-                                    var concat = new RegexNode(Concatenate, parent.Options);
-                                    concat.AddChild(anchor);
-                                    concat.AddChild(node);
-                                    parent.ReplaceChild(0, concat);
-                                }
-                                break;
-                        }
-
-                        break;
-                    }
-                }
             }
 
             // Optimization: Unnecessary root atomic.
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
index 70dfee3..a7e7369 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
@@ -127,6 +127,13 @@ namespace System.Text.RegularExpressions.Tests
             yield return new object[] { @"(?>\w+)(?<!a)", "aa", RegexOptions.None, 0, 2, false, string.Empty };
             yield return new object[] { @".+a", "baa", RegexOptions.None, 0, 3, true, "baa" };
             yield return new object[] { @"[ab]+a", "cacbaac", RegexOptions.None, 0, 7, true, "baa" };
+            foreach (RegexOptions lineOption in new[] { RegexOptions.None, RegexOptions.Singleline, RegexOptions.Multiline })
+            {
+                yield return new object[] { @".*", "abc", lineOption, 1, 2, true, "bc" };
+                yield return new object[] { @".*c", "abc", lineOption, 1, 2, true, "bc" };
+                yield return new object[] { @"b.*", "abc", lineOption, 1, 2, true, "bc" };
+                yield return new object[] { @".*", "abc", lineOption, 2, 1, true, "c" };
+            }
 
             // Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z"
             yield return new object[] { @"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzz", RegexOptions.IgnoreCase, 0, 17, true, "aaaasdfajsdlfjzzz" };
@@ -852,6 +859,11 @@ namespace System.Text.RegularExpressions.Tests
                     Assert.True(Regex.IsMatch(input, pattern));
                 }
 
+                // Note: this block will fail if any inputs attempt to look for anchors or lookbehinds at the initial position,
+                // as there is a difference between Match(input, beginning) and Match(input, beginning, input.Length - beginning)
+                // in that the former doesn't modify from 0 what the engine sees as the beginning of the input whereas the latter
+                // is equivalent to taking a substring and then matching on that.  However, as we currently don't have any such inputs,
+                // it's currently a viable way to test the additional overload.  Same goes for the similar case below with options.
                 if (beginning + length == input.Length)
                 {
                     // Use Match(string, int)
@@ -859,11 +871,9 @@ namespace System.Text.RegularExpressions.Tests
 
                     Assert.True(r.IsMatch(input, beginning));
                 }
-                else
-                {
-                    // Use Match(string, int, int)
-                    VerifyMatch(r.Match(input, beginning, length), true, expected);
-                }
+
+                // Use Match(string, int, int)
+                VerifyMatch(r.Match(input, beginning, length), true, expected);
             }
 
             r = new Regex(pattern, options);
@@ -890,6 +900,36 @@ namespace System.Text.RegularExpressions.Tests
             }
         }
 
+        public static IEnumerable<object[]> Match_StartatDiffersFromBeginning_MemberData()
+        {
+            foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline, RegexOptions.Multiline })
+            {
+                // Anchors
+                yield return new object[] { @"^.*", "abc", options, 0, true, true };
+                yield return new object[] { @"^.*", "abc", options, 1, false, true };
+
+                // Positive Lookbehinds
+                yield return new object[] { @"(?<=abc)def", "abcdef", options, 3, true, false };
+
+                // Negative Lookbehinds
+                yield return new object[] { @"(?<!abc)def", "abcdef", options, 3, false, true };
+            }
+        }
+
+        [Theory]
+        [MemberData(nameof(Match_StartatDiffersFromBeginning_MemberData))]
+        [MemberData(nameof(RegexCompilationHelper.TransformRegexOptions), nameof(Match_StartatDiffersFromBeginning_MemberData), 2, MemberType = typeof(RegexCompilationHelper))]
+        public void Match_StartatDiffersFromBeginning(string pattern, string input, RegexOptions options, int startat, bool expectedSuccessStartAt, bool expectedSuccessBeginning)
+        {
+            var r = new Regex(pattern, options);
+
+            Assert.Equal(expectedSuccessStartAt, r.IsMatch(input, startat));
+            Assert.Equal(expectedSuccessStartAt, r.Match(input, startat).Success);
+
+            Assert.Equal(expectedSuccessBeginning, r.Match(input.Substring(startat)).Success);
+            Assert.Equal(expectedSuccessBeginning, r.Match(input, startat, input.Length - startat).Success);
+        }
+
         private static void VerifyMatch(Match match, bool expectedSuccess, CaptureData[] expected)
         {
             Assert.Equal(expectedSuccess, match.Success);
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs
index 5094c8d..05c68bf 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs
@@ -251,6 +251,16 @@ namespace System.Text.RegularExpressions.Tests
                 }
             };
 
+            yield return new object[]
+            {
+                ".*", "abc", RegexOptions.None,
+                new[]
+                {
+                    new CaptureData("abc", 0, 3),
+                    new CaptureData("", 3, 0)
+                }
+            };
+
             if (!PlatformDetection.IsNetFramework)
             {
                 // .NET Framework missing fix in https://github.com/dotnet/runtime/pull/1075