Fix auto-atomicity for loops around required one/notone/set loops (#56735)
authorStephen Toub <stoub@microsoft.com>
Mon, 9 Aug 2021 23:16:46 +0000 (19:16 -0400)
committerGitHub <noreply@github.com>
Mon, 9 Aug 2021 23:16:46 +0000 (17:16 -0600)
Given an expression like `(?:a{2,3}){2}`, we currently erroneously convert that to `(?>a{2,3}){2}`, i.e. making the inner loop atomic, even though that then causes this to fail to match `aaaa` when it should match it (it fails because the first iteration will atomically match `aaa` and thus won't given any back, and then the next iteration will fail to match at least two `a`s).

The simple fix is in FindLastExpressionInLoopForAutoAtomic, removing the special-case that tries to make the body of a loop that's a one/notone/set loop atomic.  There are likely special-cases of this special-case that are still valid, but I'm not convinced it's worth trying to maintain and risk other gaps.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs

index 4604963..25c3a78 100644 (file)
@@ -1398,12 +1398,6 @@ namespace System.Text.RegularExpressions
                 node = node.Child(0);
             }
 
-            // If the loop's body terminates with a {one/notone/set} loop, return it.
-            if (node.Type == Oneloop || node.Type == Notoneloop || node.Type == Setloop)
-            {
-                return node;
-            }
-
             // If the loop's body is a concatenate, we can skip to its last child iff that
             // last child doesn't conflict with the first child, since this whole concatenation
             // could be repeated, such that the first node ends up following the last.  For
index 740d0ef..2dfd4aa 100644 (file)
@@ -128,6 +128,12 @@ namespace System.Text.RegularExpressions.Tests
             yield return new object[] { @"(?>\w+)(?<!a)", "aa", RegexOptions.None, 0, 2, false, string.Empty };
             yield return new object[] { @".+a", "baa", RegexOptions.None, 0, 3, true, "baa" };
             yield return new object[] { @"[ab]+a", "cacbaac", RegexOptions.None, 0, 7, true, "baa" };
+            yield return new object[] { @"^(\d{2,3}){2}$", "1234", RegexOptions.None, 0, 4, true, "1234" };
+            yield return new object[] { @"(\d{2,3}){2}", "1234", RegexOptions.None, 0, 4, true, "1234" };
+            yield return new object[] { @"((\d{2,3})){2}", "1234", RegexOptions.None, 0, 4, true, "1234" };
+            yield return new object[] { @"(\d{2,3})+", "1234", RegexOptions.None, 0, 4, true, "123" };
+            yield return new object[] { @"(\d{2,3})*", "123456", RegexOptions.None, 0, 4, true, "123" };
+            yield return new object[] { @"(abc\d{2,3}){2}", "abc123abc4567", RegexOptions.None, 0, 12, true, "abc123abc456" };
             foreach (RegexOptions lineOption in new[] { RegexOptions.None, RegexOptions.Singleline, RegexOptions.Multiline })
             {
                 yield return new object[] { @".*", "abc", lineOption, 1, 2, true, "bc" };
@@ -1213,10 +1219,10 @@ namespace System.Text.RegularExpressions.Tests
 
         [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework)] // take too long due to backtracking
         [Theory]
-        [InlineData(@"(\w*)+\.", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", false)]
-        [InlineData(@"(a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", false)]
-        [InlineData(@"(x+x+)+y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", false)]
-        public void IsMatch_SucceedQuicklyDueToAutoAtomicity(string regex, string input, bool expected)
+        [InlineData(@"(?:\w*)+\.", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", false)]
+        [InlineData(@"(?:a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", false)]
+        [InlineData(@"(?:x+x+)+y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", false)]
+        public void IsMatch_SucceedQuicklyDueToLoopReduction(string regex, string input, bool expected)
         {
             Assert.Equal(expected, Regex.IsMatch(input, regex, RegexOptions.None));
             Assert.Equal(expected, Regex.IsMatch(input, regex, RegexOptions.Compiled));
index 35545df..2bbdecf 100644 (file)
@@ -379,8 +379,8 @@ namespace System.Text.RegularExpressions.Tests
         [InlineData("(?:a[ce]*|b*)c", "(?:a[ce]*|(?>b*))c")]
         [InlineData("apple|(?:orange|pear)|grape", "apple|orange|pear|grape")]
         [InlineData("(?>(?>(?>(?:abc)*)))", "(?:abc)*")]
-        [InlineData("(w*)+", "((?>w*))+")]
-        [InlineData("(w*)+\\.", "((?>w*))+\\.")]
+        [InlineData("(?:w*)+", "(?>w*)+")]
+        [InlineData("(?:w*)+\\.", "(?>w*)+\\.")]
         [InlineData("(a[bcd]e*)*fg", "(a[bcd](?>e*))*fg")]
         [InlineData("(\\w[bcd]\\s*)*fg", "(\\w[bcd](?>\\s*))*fg")]
         public void PatternsReduceIdentically(string pattern1, string pattern2)
@@ -457,6 +457,8 @@ namespace System.Text.RegularExpressions.Tests
         [InlineData("[a-z]*[\x0000-\xFFFF]+", "(?>[a-z]*)[\x0000-\xFFFF]+")]
         [InlineData("[^a-c]*[e-g]", "(?>[^a-c]*)[e-g]")]
         [InlineData("[^a-c]*[^e-g]", "(?>[^a-c]*)[^e-g]")]
+        [InlineData("(w+)+", "((?>w+))+")]
+        [InlineData("(w{1,2})+", "((?>w{1,2}))+")]
         public void PatternsReduceDifferently(string pattern1, string pattern2)
         {
             var r1 = new Regex(pattern1);