// states that have been created
internal HashSet<DfaMatchingState<TElement>> _stateCache = new();
+ // capturing states that have been created
+ internal HashSet<DfaMatchingState<TElement>> _capturingStateCache = new();
+
internal readonly Dictionary<(SymbolicRegexKind,
SymbolicRegexNode<TElement>?, // _left
SymbolicRegexNode<TElement>?, // _right
/// </summary>
internal DfaMatchingState<TElement>[]? _statearray;
internal DfaMatchingState<TElement>[]? _delta;
+ internal DfaMatchingState<TElement>[]? _capturingStatearray;
internal List<(DfaMatchingState<TElement>, List<DerivativeEffect>)>[]? _capturingDelta;
private const int InitialStateLimit = 1024;
else
{
_statearray = new DfaMatchingState<TElement>[InitialStateLimit];
+ _capturingStatearray = new DfaMatchingState<TElement>[InitialStateLimit];
// the extra slot with id minterms.Length is reserved for \Z (last occurrence of \n)
int mintermsCount = 1;
}
/// <summary>
- /// Make a state with given node and previous character context
+ /// Make a state with given node and previous character context.
/// </summary>
- public DfaMatchingState<TElement> MkState(SymbolicRegexNode<TElement> node, uint prevCharKind, bool antimirov = false)
+ /// <param name="node">the pattern that this state will represent</param>
+ /// <param name="prevCharKind">the kind of the character that led to this state</param>
+ /// <param name="antimirov">if true, then state won't be cached</param>
+ /// <param name="capturing">whether to use the separate space of states with capturing transitions or not</param>
+ /// <returns>the state found in the selected cache, or the state produced for the pruned node when no cached one exists</returns>
+ public DfaMatchingState<TElement> MkState(SymbolicRegexNode<TElement> node, uint prevCharKind, bool antimirov = false, bool capturing = false)
{
//first prune the anchors in the node
TElement WLpred = _wordLetterPredicateForAnchors;
bool contWithNWL = node.CanBeNullable || _solver.IsSatisfiable(_solver.And(_solver.Not(WLpred), startSet));
SymbolicRegexNode<TElement> pruned_node = node.PruneAnchors(prevCharKind, contWithWL, contWithNWL);
var s = new DfaMatchingState<TElement>(pruned_node, prevCharKind);
- if (!_stateCache.TryGetValue(s, out DfaMatchingState<TElement>? state))
+ // Look up in the cache that matches the requested state space; this must be the same
+ // cache that MakeNewState inserts into for the same value of capturing.
+ if (!(capturing ? _capturingStateCache : _stateCache).TryGetValue(s, out DfaMatchingState<TElement>? state))
{
// do not cache set of states as states in antimirov mode
if (antimirov && pruned_node.Kind == SymbolicRegexKind.Or)
}
else
{
- state = MakeNewState(s);
+ state = MakeNewState(s, capturing);
}
}
return state;
}
- private DfaMatchingState<TElement> MakeNewState(DfaMatchingState<TElement> state)
+ /// <summary>
+ /// Assigns the next free id to the given state, adds it to the cache of its state space and
+ /// records it in the matching state array, growing the arrays when they are full.
+ /// </summary>
+ /// <param name="state">the state to register; its Id is overwritten</param>
+ /// <param name="capturing">if true the state goes into the capturing state space, otherwise into the regular one</param>
+ /// <returns>the same state instance, now with its Id assigned</returns>
+ private DfaMatchingState<TElement> MakeNewState(DfaMatchingState<TElement> state, bool capturing)
{
lock (this)
{
- state.Id = _stateCache.Count;
- _stateCache.Add(state);
+ // Each state space has its own id sequence: capturing ids index _capturingStatearray,
+ // regular ids index _statearray, so the cache must be picked to match.
+ HashSet<DfaMatchingState<TElement>> cache = capturing ? _capturingStateCache : _stateCache;
+ state.Id = cache.Count;
+ cache.Add(state);
- Debug.Assert(_statearray is not null);
+ Debug.Assert(_statearray is not null && _capturingStatearray is not null);
- if (state.Id == _statearray.Length)
+ if (capturing)
{
- int newsize = _statearray.Length + 1024;
- Array.Resize(ref _statearray, newsize);
- Array.Resize(ref _delta, newsize << _mintermsCount);
- Array.Resize(ref _capturingDelta, newsize << _mintermsCount);
+ // The array is full: grow it by 1024 slots and grow the transition table with it,
+ // which is sized as (number of states) << _mintermsCount.
+ if (state.Id == _capturingStatearray.Length)
+ {
+ int newsize = _capturingStatearray.Length + 1024;
+ Array.Resize(ref _capturingStatearray, newsize);
+ Array.Resize(ref _capturingDelta, newsize << _mintermsCount);
+ }
+ _capturingStatearray[state.Id] = state;
+ }
+ else
+ {
+ // Same growth policy for the regular state space and its transition table.
+ if (state.Id == _statearray.Length)
+ {
+ int newsize = _statearray.Length + 1024;
+ Array.Resize(ref _statearray, newsize);
+ Array.Resize(ref _delta, newsize << _mintermsCount);
+ }
+ _statearray[state.Id] = state;
}
- _statearray[state.Id] = state;
return state;
}
}
/// <summary>Number of capture groups.</summary>
private readonly int _capsize;
+ /// <summary>True when there is more than one capture group. This determines whether the matcher uses the special capturing NFA simulation mode.</summary>
+ internal bool HasSubcaptures => _capsize > 1;
+
/// <summary>Get the minterm of <paramref name="c"/>.</summary>
/// <param name="c">character code</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
var initialStates = new DfaMatchingState<TSetType>[statesCount];
for (uint i = 0; i < initialStates.Length; i++)
{
- initialStates[i] = _builder.MkState(_pattern, i);
+ initialStates[i] = _builder.MkState(_pattern, i, capturing: HasSubcaptures);
}
_initialStates = initialStates;
// but observe that the behavior from the state may ultimately depend on the previous
// input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor,
// in that sense there can be several "versions" (not more than StateCount) of the initial state.
- DfaMatchingState<TSetType> state = _builder.MkState(_dotStarredPattern, i);
+ DfaMatchingState<TSetType> state = _builder.MkState(_dotStarredPattern, i, capturing: false);
state.IsInitialState = true;
dotstarredInitialStates[i] = state;
}
var reverseInitialStates = new DfaMatchingState<TSetType>[statesCount];
for (uint i = 0; i < reverseInitialStates.Length; i++)
{
- reverseInitialStates[i] = _builder.MkState(_reversePattern, i);
+ reverseInitialStates[i] = _builder.MkState(_reversePattern, i, capturing: false);
}
_reverseInitialStates = reverseInitialStates;
Debug.Assert(currentStates.Node._alts is not null);
foreach (SymbolicRegexNode<TSetType> oneState in currentStates.Node._alts)
{
- DfaMatchingState<TSetType> nextStates = builder.MkState(oneState, currentStates.PrevCharKind);
+ DfaMatchingState<TSetType> nextStates = builder.MkState(oneState, currentStates.PrevCharKind, capturing: false);
int offset = (nextStates.Id << builder._mintermsCount) | mintermId;
DfaMatchingState<TSetType> p = Volatile.Read(ref builder._delta[offset]) ?? matcher.CreateNewTransition(nextStates, minterm, offset);
kind = p.PrevCharKind;
}
- return builder.MkState(union, kind, true);
+ return builder.MkState(union, kind, capturing: false, antimirov: true);
}
}
FindStartPosition(input, i, i_q0_A1); // Walk in reverse to locate the start position of the match
}
- if (_capsize <= 1)
+ if (!HasSubcaptures)
{
int i_end = FindEndPosition(input, i_start);
return new SymbolicMatch(i_start, i_end + 1 - i_start);
foreach (var (sourceId, sourceRegisters) in current.Values)
{
- Debug.Assert(_builder._statearray is not null);
- DfaMatchingState<TSetType> sourceState = _builder._statearray[sourceId];
+ Debug.Assert(_builder._capturingStatearray is not null);
+ DfaMatchingState<TSetType> sourceState = _builder._capturingStatearray[sourceId];
// Find the minterm, handling the special case for the last \n
int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ?