Skip to content

Fix handling of atomic nodes in RegexCompiler / source generator #66046

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1840,26 +1840,30 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");

// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
string originalDoneLabel = doneLabel;

// Save off pos. We'll need to reset this upon successful completion of the lookahead.
string startingPos = ReserveName("positivelookahead_starting_pos");
writer.WriteLine($"int {startingPos} = pos;");
writer.WriteLine();
int startingSliceStaticPos = sliceStaticPos;

// Emit the child.
EmitNode(node.Child(0));
RegexNode child = node.Child(0);
if (analysis.MayBacktrack(child))
{
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
EmitAtomic(node, null);
}
else
{
EmitNode(child);
}

// After the child completes successfully, reset the text positions.
// Do not reset captures, which persist beyond the lookahead.
writer.WriteLine();
writer.WriteLine($"pos = {startingPos};");
SliceInputSpan(writer);
sliceStaticPos = startingSliceStaticPos;

doneLabel = originalDoneLabel;
}

// Emits the code to handle a negative lookahead assertion.
Expand All @@ -1868,7 +1872,6 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");

// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
string originalDoneLabel = doneLabel;

// Save off pos. We'll need to reset this upon successful completion of the lookahead.
Expand All @@ -1880,7 +1883,16 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
doneLabel = negativeLookaheadDoneLabel;

// Emit the child.
EmitNode(node.Child(0));
RegexNode child = node.Child(0);
if (analysis.MayBacktrack(child))
{
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
EmitAtomic(node, null);
}
else
{
EmitNode(child);
}

// If the generated code ends up here, it matched the lookahead, which actually
// means failure for a _negative_ lookahead, so we need to jump to the original done.
Expand Down Expand Up @@ -1920,9 +1932,9 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
Goto(doneLabel);
return;

// Atomic is invisible in the generated source, other than its impact on the targets of jumps
case RegexNodeKind.Atomic:
EmitAtomic(node, subsequent);
// Skip atomic nodes that wrap non-backtracking children; in such a case there's nothing to be made atomic.
case RegexNodeKind.Atomic when !analysis.MayBacktrack(node.Child(0)):
EmitNode(node.Child(0));
return;

// Concatenate is a simplification in the node tree so that a series of children can be represented as one.
Expand Down Expand Up @@ -2006,6 +2018,10 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
EmitExpressionConditional(node);
break;

case RegexNodeKind.Atomic when analysis.MayBacktrack(node.Child(0)):
EmitAtomic(node, subsequent);
return;

case RegexNodeKind.Capture:
EmitCapture(node, subsequent);
break;
Expand All @@ -2032,14 +2048,27 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
// Emits the node for an atomic.
void EmitAtomic(RegexNode node, RegexNode? subsequent)
{
Debug.Assert(node.Kind is RegexNodeKind.Atomic, $"Unexpected type: {node.Kind}");
Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
Debug.Assert(analysis.MayBacktrack(node.Child(0)), "Expected child to potentially backtrack");

// Atomic simply outputs the code for the child, but it ensures that any done label left
// set by the child is reset to what it was prior to the node's processing. That way,
// anything later that tries to jump back won't see labels set inside the atomic.
// Grab the current done label and the current backtracking position. The purpose of the atomic node
// is to ensure that nodes after it that might backtrack skip over the atomic, which means after
// rendering the atomic's child, we need to reset the label so that subsequent backtracking doesn't
// see any label left set by the atomic's child. We also need to reset the backtracking stack position
// so that the state on the stack remains consistent.
string originalDoneLabel = doneLabel;
additionalDeclarations.Add("int stackpos = 0;");
string startingStackpos = ReserveName("atomic_stackpos");
writer.WriteLine($"int {startingStackpos} = stackpos;");
writer.WriteLine();

// Emit the child.
EmitNode(node.Child(0), subsequent);
writer.WriteLine();

// Reset the stack position and done label.
writer.WriteLine($"stackpos = {startingStackpos};");
doneLabel = originalDoneLabel;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2015,9 +2015,6 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");

// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
Label originalDoneLabel = doneLabel;

// Save off pos. We'll need to reset this upon successful completion of the lookahead.
// startingPos = pos;
LocalBuilder startingPos = DeclareInt32();
Expand All @@ -2026,7 +2023,16 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
int startingTextSpanPos = sliceStaticPos;

// Emit the child.
EmitNode(node.Child(0));
RegexNode child = node.Child(0);
if (analysis.MayBacktrack(child))
{
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
EmitAtomic(node, null);
}
else
{
EmitNode(child);
}

// After the child completes successfully, reset the text positions.
// Do not reset captures, which persist beyond the lookahead.
Expand All @@ -2036,8 +2042,6 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
Stloc(pos);
SliceInputSpan();
sliceStaticPos = startingTextSpanPos;

doneLabel = originalDoneLabel;
}

// Emits the code to handle a negative lookahead assertion.
Expand All @@ -2046,7 +2050,6 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");

// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
Label originalDoneLabel = doneLabel;

// Save off pos. We'll need to reset this upon successful completion of the lookahead.
Expand All @@ -2060,7 +2063,16 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
doneLabel = negativeLookaheadDoneLabel;

// Emit the child.
EmitNode(node.Child(0));
RegexNode child = node.Child(0);
if (analysis.MayBacktrack(child))
{
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
EmitAtomic(node, null);
}
else
{
EmitNode(child);
}

// If the generated code ends up here, it matched the lookahead, which actually
// means failure for a _negative_ lookahead, so we need to jump to the original done.
Expand Down Expand Up @@ -2204,14 +2216,40 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
// Emits the node for an atomic.
void EmitAtomic(RegexNode node, RegexNode? subsequent)
{
Debug.Assert(node.Kind is RegexNodeKind.Atomic, $"Unexpected type: {node.Kind}");
Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");

// Atomic simply outputs the code for the child, but it ensures that any done label left
// set by the child is reset to what it was prior to the node's processing. That way,
// anything later that tries to jump back won't see labels set inside the atomic.
RegexNode child = node.Child(0);

if (!analysis.MayBacktrack(child))
{
// If the child has no backtracking, the atomic is a nop and we can just skip it.
// Note that the source generator equivalent for this is in the top-level EmitNode, in order to avoid
// outputting some extra comments and scopes. As such formatting isn't a concern for the compiler,
// the logic is instead here in EmitAtomic.
EmitNode(child, subsequent);
return;
}

// Grab the current done label and the current backtracking position. The purpose of the atomic node
// is to ensure that nodes after it that might backtrack skip over the atomic, which means after
// rendering the atomic's child, we need to reset the label so that subsequent backtracking doesn't
// see any label left set by the atomic's child. We also need to reset the backtracking stack position
// so that the state on the stack remains consistent.
Label originalDoneLabel = doneLabel;
EmitNode(node.Child(0), subsequent);

// int startingStackpos = stackpos;
using RentedLocalBuilder startingStackpos = RentInt32Local();
Ldloc(stackpos);
Stloc(startingStackpos);

// Emit the child.
EmitNode(child, subsequent);

// Reset the stack position and done label.
// stackpos = startingStackpos;
Ldloc(startingStackpos);
Stloc(stackpos);
doneLabel = originalDoneLabel;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ public static IEnumerable<object[]> Match_MemberData()
yield return (Case("(?>a*)123"), "aa1234", options, 0, 5, true, "aa123");
yield return (Case("(?>(?>a*))123"), "aa1234", options, 0, 5, true, "aa123");
yield return (Case("(?>a{2,})b"), "aaab", options, 0, 4, true, "aaab");
yield return (Case("[a-z]{0,4}(?>[x-z]*.)(?=xyz1)"), "abcdxyz1", options, 0, 8, true, "abcd");
yield return (Case("[a-z]{0,4}(?=[x-z]*.)(?=cd)"), "abcdxyz1", options, 0, 8, true, "ab");
yield return (Case("[a-z]{0,4}(?![x-z]*[wx])(?=cd)"), "abcdxyz1", options, 0, 8, true, "ab");

// Atomic lazy
yield return (Case("(?>[0-9]+?)abc"), "abc12345abc", options, 3, 8, true, "5abc");
Expand Down