Separate capturing from normal states (#65340)
author: Olli Saarikivi <olsaarik@microsoft.com>
Wed, 16 Feb 2022 18:01:46 +0000 (10:01 -0800)
committer: GitHub <noreply@github.com>
Wed, 16 Feb 2022 18:01:46 +0000 (10:01 -0800)
This reduces memory usage for NonBacktracking to what it was before capturing support.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs

index 3e8d1bc..e6e9c46 100644 (file)
@@ -111,7 +111,7 @@ namespace System.Text.RegularExpressions.Symbolic
             // nextCharKind will be the PrevCharKind of the target state
             // use an existing state instead if one exists already
             // otherwise create a new new id for it
-            return Node._builder.MkState(derivative, nextCharKind);
+            return Node._builder.MkState(derivative, nextCharKind, capturing: false);
         }
 
         /// <summary>
@@ -135,7 +135,7 @@ namespace System.Text.RegularExpressions.Symbolic
                 // nextCharKind will be the PrevCharKind of the target state
                 // use an existing state instead if one exists already
                 // otherwise create a new new id for it
-                yield return (Node._builder.MkState(derivative, nextCharKind), effects);
+                yield return (Node._builder.MkState(derivative, nextCharKind, capturing: true), effects);
             }
         }
 
index 343706b..8efb450 100644 (file)
@@ -43,6 +43,9 @@ namespace System.Text.RegularExpressions.Symbolic
         // states that have been created
         internal HashSet<DfaMatchingState<TElement>> _stateCache = new();
 
+        // capturing states that have been created
+        internal HashSet<DfaMatchingState<TElement>> _capturingStateCache = new();
+
         internal readonly Dictionary<(SymbolicRegexKind,
             SymbolicRegexNode<TElement>?, // _left
             SymbolicRegexNode<TElement>?, // _right
@@ -64,6 +67,7 @@ namespace System.Text.RegularExpressions.Symbolic
         /// </summary>
         internal DfaMatchingState<TElement>[]? _statearray;
         internal DfaMatchingState<TElement>[]? _delta;
+        internal DfaMatchingState<TElement>[]? _capturingStatearray;
         internal List<(DfaMatchingState<TElement>, List<DerivativeEffect>)>[]? _capturingDelta;
         private const int InitialStateLimit = 1024;
 
@@ -105,6 +109,7 @@ namespace System.Text.RegularExpressions.Symbolic
             else
             {
                 _statearray = new DfaMatchingState<TElement>[InitialStateLimit];
+                _capturingStatearray = new DfaMatchingState<TElement>[InitialStateLimit];
 
                 // the extra slot with id minterms.Length is reserved for \Z (last occurrence of \n)
                 int mintermsCount = 1;
@@ -407,9 +412,14 @@ namespace System.Text.RegularExpressions.Symbolic
         }
 
         /// <summary>
-        /// Make a state with given node and previous character context
+        /// Make a state with given node and previous character context.
         /// </summary>
-        public DfaMatchingState<TElement> MkState(SymbolicRegexNode<TElement> node, uint prevCharKind, bool antimirov = false)
+        /// <param name="node">the pattern that this state will represent</param>
+        /// <param name="prevCharKind">the kind of the character that led to this state</param>
+        /// <param name="antimirov">if true, then state won't be cached</param>
+        /// <param name="capturing">whether to use the separate space of states with capturing transitions or not</param>
+        /// <returns>the matching state from the selected cache if one exists, otherwise a newly created state</returns>
+        public DfaMatchingState<TElement> MkState(SymbolicRegexNode<TElement> node, uint prevCharKind, bool antimirov = false, bool capturing = false)
         {
             //first prune the anchors in the node
             TElement WLpred = _wordLetterPredicateForAnchors;
@@ -422,7 +432,7 @@ namespace System.Text.RegularExpressions.Symbolic
             bool contWithNWL = node.CanBeNullable || _solver.IsSatisfiable(_solver.And(_solver.Not(WLpred), startSet));
             SymbolicRegexNode<TElement> pruned_node = node.PruneAnchors(prevCharKind, contWithWL, contWithNWL);
             var s = new DfaMatchingState<TElement>(pruned_node, prevCharKind);
-            if (!_stateCache.TryGetValue(s, out DfaMatchingState<TElement>? state))
+            if (!(capturing ? _capturingStateCache : _stateCache).TryGetValue(s, out DfaMatchingState<TElement>? state))
             {
                 // do not cache set of states as states in antimirov mode
                 if (antimirov && pruned_node.Kind == SymbolicRegexKind.Or)
@@ -432,30 +442,43 @@ namespace System.Text.RegularExpressions.Symbolic
                 }
                 else
                 {
-                    state = MakeNewState(s);
+                    state = MakeNewState(s, capturing);
                 }
             }
 
             return state;
         }
 
-        private DfaMatchingState<TElement> MakeNewState(DfaMatchingState<TElement> state)
+        private DfaMatchingState<TElement> MakeNewState(DfaMatchingState<TElement> state, bool capturing)
         {
             lock (this)
             {
-                state.Id = _stateCache.Count;
-                _stateCache.Add(state);
+                HashSet<DfaMatchingState<TElement>> cache = capturing ? _capturingStateCache : _stateCache;
+                state.Id = cache.Count;
+                cache.Add(state);
 
-                Debug.Assert(_statearray is not null);
+                Debug.Assert(_statearray is not null && _capturingStatearray is not null);
 
-                if (state.Id == _statearray.Length)
+                if (capturing)
                 {
-                    int newsize = _statearray.Length + 1024;
-                    Array.Resize(ref _statearray, newsize);
-                    Array.Resize(ref _delta, newsize << _mintermsCount);
-                    Array.Resize(ref _capturingDelta, newsize << _mintermsCount);
+                    if (state.Id == _capturingStatearray.Length)
+                    {
+                        int newsize = _capturingStatearray.Length + 1024;
+                        Array.Resize(ref _capturingStatearray, newsize);
+                        Array.Resize(ref _capturingDelta, newsize << _mintermsCount);
+                    }
+                    _capturingStatearray[state.Id] = state;
+                }
+                else
+                {
+                    if (state.Id == _statearray.Length)
+                    {
+                        int newsize = _statearray.Length + 1024;
+                        Array.Resize(ref _statearray, newsize);
+                        Array.Resize(ref _delta, newsize << _mintermsCount);
+                    }
+                    _statearray[state.Id] = state;
                 }
-                _statearray[state.Id] = state;
                 return state;
             }
         }
index 79e9eb4..a549f9a 100644 (file)
@@ -141,6 +141,9 @@ namespace System.Text.RegularExpressions.Symbolic
         /// <summary>Number of capture groups.</summary>
         private readonly int _capsize;
 
+        /// <summary>This determines whether the matcher uses the special capturing NFA simulation mode.</summary>
+        internal bool HasSubcaptures => _capsize > 1;
+
         /// <summary>Get the minterm of <paramref name="c"/>.</summary>
         /// <param name="c">character code</param>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -181,7 +184,7 @@ namespace System.Text.RegularExpressions.Symbolic
             var initialStates = new DfaMatchingState<TSetType>[statesCount];
             for (uint i = 0; i < initialStates.Length; i++)
             {
-                initialStates[i] = _builder.MkState(_pattern, i);
+                initialStates[i] = _builder.MkState(_pattern, i, capturing: HasSubcaptures);
             }
             _initialStates = initialStates;
 
@@ -196,7 +199,7 @@ namespace System.Text.RegularExpressions.Symbolic
                 // but observe that the behavior from the state may ultimately depend on the previous
                 // input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor,
                 // in that sense there can be several "versions" (not more than StateCount) of the initial state.
-                DfaMatchingState<TSetType> state = _builder.MkState(_dotStarredPattern, i);
+                DfaMatchingState<TSetType> state = _builder.MkState(_dotStarredPattern, i, capturing: false);
                 state.IsInitialState = true;
                 dotstarredInitialStates[i] = state;
             }
@@ -208,7 +211,7 @@ namespace System.Text.RegularExpressions.Symbolic
             var reverseInitialStates = new DfaMatchingState<TSetType>[statesCount];
             for (uint i = 0; i < reverseInitialStates.Length; i++)
             {
-                reverseInitialStates[i] = _builder.MkState(_reversePattern, i);
+                reverseInitialStates[i] = _builder.MkState(_reversePattern, i, capturing: false);
             }
             _reverseInitialStates = reverseInitialStates;
 
@@ -414,7 +417,7 @@ namespace System.Text.RegularExpressions.Symbolic
                 Debug.Assert(currentStates.Node._alts is not null);
                 foreach (SymbolicRegexNode<TSetType> oneState in currentStates.Node._alts)
                 {
-                    DfaMatchingState<TSetType> nextStates = builder.MkState(oneState, currentStates.PrevCharKind);
+                    DfaMatchingState<TSetType> nextStates = builder.MkState(oneState, currentStates.PrevCharKind, capturing: false);
 
                     int offset = (nextStates.Id << builder._mintermsCount) | mintermId;
                     DfaMatchingState<TSetType> p = Volatile.Read(ref builder._delta[offset]) ?? matcher.CreateNewTransition(nextStates, minterm, offset);
@@ -426,7 +429,7 @@ namespace System.Text.RegularExpressions.Symbolic
                     kind = p.PrevCharKind;
                 }
 
-                return builder.MkState(union, kind, true);
+                return builder.MkState(union, kind, capturing: false, antimirov: true);
             }
         }
 
@@ -540,7 +543,7 @@ namespace System.Text.RegularExpressions.Symbolic
                     FindStartPosition(input, i, i_q0_A1); // Walk in reverse to locate the start position of the match
             }
 
-            if (_capsize <= 1)
+            if (!HasSubcaptures)
             {
                 int i_end = FindEndPosition(input, i_start);
                 return new SymbolicMatch(i_start, i_end + 1 - i_start);
@@ -664,8 +667,8 @@ namespace System.Text.RegularExpressions.Symbolic
 
                 foreach (var (sourceId, sourceRegisters) in current.Values)
                 {
-                    Debug.Assert(_builder._statearray is not null);
-                    DfaMatchingState<TSetType> sourceState = _builder._statearray[sourceId];
+                    Debug.Assert(_builder._capturingStatearray is not null);
+                    DfaMatchingState<TSetType> sourceState = _builder._capturingStatearray[sourceId];
 
                     // Find the minterm, handling the special case for the last \n
                     int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ?