From: Stephen Toub Date: Sat, 19 Sep 2020 00:58:44 +0000 (-0400) Subject: Remove implicit anchoring optimization from Regex (#42408) X-Git-Tag: submit/tizen/20210909.063632~5422 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a3f643be8fdd5759f78d982c961959f0074c1288;p=platform%2Fupstream%2Fdotnet%2Fruntime.git Remove implicit anchoring optimization from Regex (#42408) * Remove implicit anchoring optimization from Regex In .NET 5 we added a bunch of optimizations to Regex. One of them was a transform that optimized for the case where the pattern begins with `.*`. If it does, then we insert an implicit anchor at the beginning in order to avoid unnecessary backtracking. Imagine the pattern `.*a` and the pattern `bcdefghijklmnopqrstuvwxyz`. This is going to start matching at `b`, find the next newline, and then backtrack from there looking for the `a`; it won't find it and will backtrack all the way, failing the match at that position. At that point it'll bump to the next position, starting at `c`, and do it all over. It'll fail, backtrack all the way, and bump again, starting at `d`, and doing it all over. Etc. The optimization recognizes that since `.` will match anything other than newline, after it fails to match at the first position, we can just skip all subsequent positions until the next newline, as they're all going to fail. However, the optimization failed to take into account that someone can explicitly start a match in the middle of the provided text. In that case, the implicitly added anchor will fail the match in the actual "Go" matching logic. There are safe ways to do this optimization, e.g. introducing a variant of these anchors that let FindFirstChar skip ahead but that aren't considered for Go's matching purposes, but we can look at employing those for .NET 6. For now for .NET 5, this commit just deletes the faulty optimization and adds a few tests that were failing it. * Address PR feedback --- diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 8286589..bd55727 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -349,58 +349,6 @@ namespace System.Text.RegularExpressions break; } } - - // Optimization: implicit anchoring. - // If the expression begins with a .* loop, add an anchor to the beginning: - // - If Singleline is set such that '.' eats anything, the .* will zip to the end of the string and then backtrack through - // the whole thing looking for a match; since it will have examined everything, there's no benefit to examining it all - // again, and we can anchor to beginning. - // - If Singleline is not set, then '.' eats anything up until a '\n' and backtracks from there, so we can similarly avoid - // re-examining that content and anchor to the beginning of lines. - // We are currently very conservative here, only examining concat nodes. This could be loosened in the future, e.g. to - // explore captures (but think through any implications of there being a back ref to that capture), to explore loops and - // lazy loops a positive minimum (but the anchor shouldn't be part of the loop), to explore alternations and support adding - // an anchor if all of them begin with appropriate star loops (though this could also be accomplished by factoring out the - // loops to be before the alternation), etc. - { - RegexNode node = rootNode.Child(0); // skip implicit root capture node - while (true) - { - bool singleline = (node.Options & RegexOptions.Singleline) != 0; - switch (node.Type) - { - case Concatenate: - node = node.Child(0); - continue; - - case Setloop when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass: - case Setloopatomic when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass: - case Notoneloop when !singleline && node.N == int.MaxValue && node.Ch == '\n': - case Notoneloopatomic when !singleline && node.N == int.MaxValue && node.Ch == '\n': - RegexNode? parent = node.Next; - var anchor = new RegexNode(singleline ? Beginning : Bol, node.Options); - Debug.Assert(parent != null); - if (parent.Type == Concatenate) - { - Debug.Assert(parent.ChildCount() >= 2); - Debug.Assert(parent.Children is List); - anchor.Next = parent; - ((List)parent.Children).Insert(0, anchor); - } - else - { - Debug.Assert(parent.Type == Capture && parent.Next is null, "Only valid capture is the implicit root capture"); - var concat = new RegexNode(Concatenate, parent.Options); - concat.AddChild(anchor); - concat.AddChild(node); - parent.ReplaceChild(0, concat); - } - break; - } - - break; - } - } } // Optimization: Unnecessary root atomic. diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 70dfee3..a7e7369 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -127,6 +127,13 @@ namespace System.Text.RegularExpressions.Tests yield return new object[] { @"(?>\w+)(? Match_StartatDiffersFromBeginning_MemberData() + { + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline, RegexOptions.Multiline }) + { + // Anchors + yield return new object[] { @"^.*", "abc", options, 0, true, true }; + yield return new object[] { @"^.*", "abc", options, 1, false, true }; + + // Positive Lookbehinds + yield return new object[] { @"(?<=abc)def", "abcdef", options, 3, true, false }; + + // Negative Lookbehinds + yield return new object[] { @"(?