Fix beginning fixups of captures after Regex span support (#66713)

author Stephen Toub <stoub@microsoft.com>

Wed, 16 Mar 2022 21:18:38 +0000 (17:18 -0400)

committer GitHub <noreply@github.com>

Wed, 16 Mar 2022 21:18:38 +0000 (17:18 -0400)
author Stephen Toub <stoub@microsoft.com>
Wed, 16 Mar 2022 21:18:38 +0000 (17:18 -0400)
committer GitHub <noreply@github.com>
Wed, 16 Mar 2022 21:18:38 +0000 (17:18 -0400)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs

index 2eee81f..fba98ab 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs
@@ -19,17 +19,6 @@ namespace System.Text.RegularExpressions
          /// <summary>Returns the position in the original string where the first character of captured substring was found.</summary>
          public int Index { get; private protected set; }
  
-        /// <summary>
-        /// This method should only be called when the text for matching was sliced with a different beginning, so the resulting index of
-        /// the match is not from the start of the text, but instead the start of the slice. This method will add back that extra indices
-        /// to account for the original text beginning.
-        /// </summary>
-        /// <param name="beginning">The original text's beginning offset.</param>
-        internal void AddBeginningToIndex(int beginning)
-        {
-            Index += beginning;
-        }
-
          /// <summary>Returns the length of the captured substring.</summary>
          public int Length { get; private protected set; }
  
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs

index adf727a..882ff11 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs
@@ -45,8 +45,8 @@ namespace System.Text.RegularExpressions
          internal int _textstart;
  
          // output from the match
-        internal int[][] _matches;
-        internal int[] _matchcount;
+        internal readonly int[][] _matches;
+        internal readonly int[] _matchcount;
          internal bool _balancing;        // whether we've done any balancing with this match.  If we
                                           // have done balancing, we'll need to do extra work in Tidy().
  
@@ -89,7 +89,7 @@ namespace System.Text.RegularExpressions
          /// </summary>
          /// <remarks>
          /// The main difference between the public <see cref="Group.Success"/> property and this one, is that <see cref="Group.Success"/> requires
-        /// for a <see cref="Match"/> to call <see cref="Match.Tidy(int)"/> first, in order to report the correct value, while this API will return
+        /// for a <see cref="Match"/> to call <see cref="Tidy"/> first, in order to report the correct value, while this API will return
          /// the correct value right after a Match gets calculated, meaning that it will return <see langword="true"/> right after <see cref="RegexRunner.Capture(int, int, int)"/>
          /// </remarks>
          internal bool FoundMatch => _matchcount[0] > 0;
@@ -272,17 +272,43 @@ namespace System.Text.RegularExpressions
          }
  
          /// <summary>Tidy the match so that it can be used as an immutable result</summary>
-        internal void Tidy(int textpos)
+        internal void Tidy(int textpos, int beginningOfSpanSlice)
          {
+            int[] matchcount = _matchcount;
+            int[][] matches = _matches;
+
              _textpos = textpos;
-            _capcount = _matchcount[0];
-            int[] interval = _matches[0];
+            _capcount = matchcount[0];
+            int[] interval = matches[0];
              Index = interval[0];
              Length = interval[1];
              if (_balancing)
              {
                  TidyBalancing();
              }
+
+            // At this point the Match is consistent for handing back to a caller, with regards to the span that was processed.
+            // However, the caller may have actually provided a string, and may have done so with a non-zero beginning.
+            // In such a case, all offsets need to be shifted by beginning, e.g. if beginning is 5 and a capture occurred at
+            // offset 17, that 17 offset needs to be increased to 22 to account for the fact that it was actually 17 from the
+            // beginning, which the implementation saw as 0 but which from the caller's perspective was 5.
+            Debug.Assert(!_balancing);
+            if (beginningOfSpanSlice != 0)
+            {
+                Index += beginningOfSpanSlice;
+                for (int groupNumber = 0; groupNumber < matches.Length; groupNumber++)
+                {
+                    int[] captures = matches[groupNumber];
+                    if (captures is not null)
+                    {
+                        int capturesLength = matchcount[groupNumber] * 2; // each capture has an offset and a length
+                        for (int c = 0; c < capturesLength; c += 2)
+                        {
+                            captures[c] += beginningOfSpanSlice;
+                        }
+                    }
+                }
+            }
          }
  
          private void TidyBalancing()
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs

index c2cd8e8..07b435a 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
@@ -570,13 +570,7 @@ namespace System.Text.RegularExpressions
                      return null;
                  }
  
-                match.Tidy(runner.runtextpos);
-
-                // If the passed in beginning was not 0 then we need to adjust the offsets on the match object.
-                if (beginning != 0)
-                {
-                    match.AddBeginningToIndex(beginning);
-                }
+                match.Tidy(runner.runtextpos, beginning);
  
                  return match;
              }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs

index 13a20bb..bd25add 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
@@ -184,7 +184,7 @@ namespace System.Text.RegularExpressions
                  }
  
                  runmatch = null;
-                match.Tidy(runtextpos);
+                match.Tidy(runtextpos, 0);
              }
              else
              {
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs

index 58e78b2..3d297ad 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs
@@ -961,22 +961,26 @@ namespace System.Text.RegularExpressions.Tests
                  return;
              }
  
-            Match match = regex.Match(input);
-
-            Assert.True(match.Success);
-            Assert.Equal(expectedGroups[0], match.Value);
+            foreach (string prefix in new[] { "", "IGNORETHIS" })
+            {
+                Match match = prefix.Length == 0 ?
+                    regex.Match(input) : // validate the original input
+                    regex.Match(prefix + input, prefix.Length, input.Length); // validate we handle groups and beginning/length correctly
  
-            Assert.Equal(expectedGroups.Length, match.Groups.Count);
+                Assert.True(match.Success);
+                Assert.Equal(expectedGroups[0], match.Value);
+                Assert.Equal(expectedGroups.Length, match.Groups.Count);
  
-            int[] groupNumbers = regex.GetGroupNumbers();
-            string[] groupNames = regex.GetGroupNames();
-            for (int i = 0; i < expectedGroups.Length; i++)
-            {
-                Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value);
-                Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]);
+                int[] groupNumbers = regex.GetGroupNumbers();
+                string[] groupNames = regex.GetGroupNames();
+                for (int i = 0; i < expectedGroups.Length; i++)
+                {
+                    Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value);
+                    Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]);
  
-                Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i]));
-                Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i]));
+                    Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i]));
+                    Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i]));
+                }
              }
          }
author	Stephen Toub <stoub@microsoft.com>
	Wed, 16 Mar 2022 21:18:38 +0000 (17:18 -0400)
committer	GitHub <noreply@github.com>
	Wed, 16 Mar 2022 21:18:38 +0000 (17:18 -0400)
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs		patch \| blob \| history