Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
- // Lookarounds are implicitly atomic. Store the original done label to reset at the end.
- string originalDoneLabel = doneLabel;
-
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
string startingPos = ReserveName("positivelookahead_starting_pos");
writer.WriteLine($"int {startingPos} = pos;");
int startingSliceStaticPos = sliceStaticPos;
// Emit the child.
- EmitNode(node.Child(0));
+ RegexNode child = node.Child(0);
+ if (analysis.MayBacktrack(child))
+ {
+ // Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
+ EmitAtomic(node, null);
+ }
+ else
+ {
+ EmitNode(child);
+ }
// After the child completes successfully, reset the text positions.
// Do not reset captures, which persist beyond the lookahead.
writer.WriteLine($"pos = {startingPos};");
SliceInputSpan(writer);
sliceStaticPos = startingSliceStaticPos;
-
- doneLabel = originalDoneLabel;
}
// Emits the code to handle a negative lookahead assertion.
Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
- // Lookarounds are implicitly atomic. Store the original done label to reset at the end.
string originalDoneLabel = doneLabel;
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
doneLabel = negativeLookaheadDoneLabel;
// Emit the child.
- EmitNode(node.Child(0));
+ RegexNode child = node.Child(0);
+ if (analysis.MayBacktrack(child))
+ {
+ // Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
+ EmitAtomic(node, null);
+ }
+ else
+ {
+ EmitNode(child);
+ }
// If the generated code ends up here, it matched the lookahead, which actually
// means failure for a _negative_ lookahead, so we need to jump to the original done.
Goto(doneLabel);
return;
- // Atomic is invisible in the generated source, other than its impact on the targets of jumps
- case RegexNodeKind.Atomic:
- EmitAtomic(node, subsequent);
+ // Skip atomic nodes that wrap non-backtracking children; in such a case there's nothing to be made atomic.
+ case RegexNodeKind.Atomic when !analysis.MayBacktrack(node.Child(0)):
+ EmitNode(node.Child(0));
return;
// Concatenate is a simplification in the node tree so that a series of children can be represented as one.
EmitExpressionConditional(node);
break;
+ case RegexNodeKind.Atomic when analysis.MayBacktrack(node.Child(0)):
+ EmitAtomic(node, subsequent);
+ return;
+
case RegexNodeKind.Capture:
EmitCapture(node, subsequent);
break;
// Emits the node for an atomic.
void EmitAtomic(RegexNode node, RegexNode? subsequent)
{
- Debug.Assert(node.Kind is RegexNodeKind.Atomic, $"Unexpected type: {node.Kind}");
+ Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
+ Debug.Assert(analysis.MayBacktrack(node.Child(0)), "Expected child to potentially backtrack");
- // Atomic simply outputs the code for the child, but it ensures that any done label left
- // set by the child is reset to what it was prior to the node's processing. That way,
- // anything later that tries to jump back won't see labels set inside the atomic.
+ // Grab the current done label and the current backtracking position. The purpose of the atomic node
+ // is to ensure that nodes after it that might backtrack skip over the atomic, which means after
+ // rendering the atomic's child, we need to reset the label so that subsequent backtracking doesn't
+ // see any label left set by the atomic's child. We also need to reset the backtracking stack position
+ // so that the state on the stack remains consistent.
string originalDoneLabel = doneLabel;
+ additionalDeclarations.Add("int stackpos = 0;");
+ string startingStackpos = ReserveName("atomic_stackpos");
+ writer.WriteLine($"int {startingStackpos} = stackpos;");
+ writer.WriteLine();
+
+ // Emit the child.
EmitNode(node.Child(0), subsequent);
+ writer.WriteLine();
+
+ // Reset the stack position and done label.
+ writer.WriteLine($"stackpos = {startingStackpos};");
doneLabel = originalDoneLabel;
}
Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
- // Lookarounds are implicitly atomic. Store the original done label to reset at the end.
- Label originalDoneLabel = doneLabel;
-
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
// startingPos = pos;
LocalBuilder startingPos = DeclareInt32();
int startingTextSpanPos = sliceStaticPos;
// Emit the child.
- EmitNode(node.Child(0));
+ RegexNode child = node.Child(0);
+ if (analysis.MayBacktrack(child))
+ {
+ // Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
+ EmitAtomic(node, null);
+ }
+ else
+ {
+ EmitNode(child);
+ }
// After the child completes successfully, reset the text positions.
// Do not reset captures, which persist beyond the lookahead.
Stloc(pos);
SliceInputSpan();
sliceStaticPos = startingTextSpanPos;
-
- doneLabel = originalDoneLabel;
}
// Emits the code to handle a negative lookahead assertion.
Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
- // Lookarounds are implicitly atomic. Store the original done label to reset at the end.
Label originalDoneLabel = doneLabel;
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
doneLabel = negativeLookaheadDoneLabel;
// Emit the child.
- EmitNode(node.Child(0));
+ RegexNode child = node.Child(0);
+ if (analysis.MayBacktrack(child))
+ {
+ // Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
+ EmitAtomic(node, null);
+ }
+ else
+ {
+ EmitNode(child);
+ }
// If the generated code ends up here, it matched the lookahead, which actually
// means failure for a _negative_ lookahead, so we need to jump to the original done.
// Emits the node for an atomic.
void EmitAtomic(RegexNode node, RegexNode? subsequent)
{
- Debug.Assert(node.Kind is RegexNodeKind.Atomic, $"Unexpected type: {node.Kind}");
+ Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
- // Atomic simply outputs the code for the child, but it ensures that any done label left
- // set by the child is reset to what it was prior to the node's processing. That way,
- // anything later that tries to jump back won't see labels set inside the atomic.
+ RegexNode child = node.Child(0);
+
+ if (!analysis.MayBacktrack(child))
+ {
+ // If the child has no backtracking, the atomic is a nop and we can just skip it.
+ // Note that the source generator equivalent for this is in the top-level EmitNode, in order to avoid
+ // outputting some extra comments and scopes. As such formatting isn't a concern for the compiler,
+ // the logic is instead here in EmitAtomic.
+ EmitNode(child, subsequent);
+ return;
+ }
+
+ // Grab the current done label and the current backtracking position. The purpose of the atomic node
+ // is to ensure that nodes after it that might backtrack skip over the atomic, which means after
+ // rendering the atomic's child, we need to reset the label so that subsequent backtracking doesn't
+ // see any label left set by the atomic's child. We also need to reset the backtracking stack position
+ // so that the state on the stack remains consistent.
Label originalDoneLabel = doneLabel;
- EmitNode(node.Child(0), subsequent);
+
+ // int startingStackpos = stackpos;
+ using RentedLocalBuilder startingStackpos = RentInt32Local();
+ Ldloc(stackpos);
+ Stloc(startingStackpos);
+
+ // Emit the child.
+ EmitNode(child, subsequent);
+
+ // Reset the stack position and done label.
+ // stackpos = startingStackpos;
+ Ldloc(startingStackpos);
+ Stloc(stackpos);
doneLabel = originalDoneLabel;
}
yield return (Case("(?>a*)123"), "aa1234", options, 0, 5, true, "aa123");
yield return (Case("(?>(?>a*))123"), "aa1234", options, 0, 5, true, "aa123");
yield return (Case("(?>a{2,})b"), "aaab", options, 0, 4, true, "aaab");
+ yield return (Case("[a-z]{0,4}(?>[x-z]*.)(?=xyz1)"), "abcdxyz1", options, 0, 8, true, "abcd");
+ yield return (Case("[a-z]{0,4}(?=[x-z]*.)(?=cd)"), "abcdxyz1", options, 0, 8, true, "ab");
+ yield return (Case("[a-z]{0,4}(?![x-z]*[wx])(?=cd)"), "abcdxyz1", options, 0, 8, true, "ab");
// Atomic lazy
yield return (Case("(?>[0-9]+?)abc"), "abc12345abc", options, 3, 8, true, "5abc");