Skip to content

Commit 19f9a34

Browse files
committed
Fix atomic handling for positive/negative lookaheads as well
1 parent 4d1aafe commit 19f9a34

File tree

3 files changed

+44
-18
lines changed

3 files changed

+44
-18
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1840,26 +1840,30 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
18401840
Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}");
18411841
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
18421842

1843-
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
1844-
string originalDoneLabel = doneLabel;
1845-
18461843
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
18471844
string startingPos = ReserveName("positivelookahead_starting_pos");
18481845
writer.WriteLine($"int {startingPos} = pos;");
18491846
writer.WriteLine();
18501847
int startingSliceStaticPos = sliceStaticPos;
18511848

18521849
// Emit the child.
1853-
EmitNode(node.Child(0));
1850+
RegexNode child = node.Child(0);
1851+
if (analysis.MayBacktrack(child))
1852+
{
1853+
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
1854+
EmitAtomic(node, null);
1855+
}
1856+
else
1857+
{
1858+
EmitNode(child);
1859+
}
18541860

18551861
// After the child completes successfully, reset the text positions.
18561862
// Do not reset captures, which persist beyond the lookahead.
18571863
writer.WriteLine();
18581864
writer.WriteLine($"pos = {startingPos};");
18591865
SliceInputSpan(writer);
18601866
sliceStaticPos = startingSliceStaticPos;
1861-
1862-
doneLabel = originalDoneLabel;
18631867
}
18641868

18651869
// Emits the code to handle a negative lookahead assertion.
@@ -1868,7 +1872,6 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
18681872
Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
18691873
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
18701874

1871-
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
18721875
string originalDoneLabel = doneLabel;
18731876

18741877
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
@@ -1880,7 +1883,16 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
18801883
doneLabel = negativeLookaheadDoneLabel;
18811884

18821885
// Emit the child.
1883-
EmitNode(node.Child(0));
1886+
RegexNode child = node.Child(0);
1887+
if (analysis.MayBacktrack(child))
1888+
{
1889+
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
1890+
EmitAtomic(node, null);
1891+
}
1892+
else
1893+
{
1894+
EmitNode(child);
1895+
}
18841896

18851897
// If the generated code ends up here, it matched the lookahead, which actually
18861898
// means failure for a _negative_ lookahead, so we need to jump to the original done.
@@ -2036,7 +2048,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
20362048
// Emits the node for an atomic.
20372049
void EmitAtomic(RegexNode node, RegexNode? subsequent)
20382050
{
2039-
Debug.Assert(node.Kind is RegexNodeKind.Atomic, $"Unexpected type: {node.Kind}");
2051+
Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
20402052
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
20412053
Debug.Assert(analysis.MayBacktrack(node.Child(0)), "Expected child to potentially backtrack");
20422054

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2015,9 +2015,6 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
20152015
Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}");
20162016
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
20172017

2018-
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
2019-
Label originalDoneLabel = doneLabel;
2020-
20212018
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
20222019
// startingPos = pos;
20232020
LocalBuilder startingPos = DeclareInt32();
@@ -2026,7 +2023,16 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
20262023
int startingTextSpanPos = sliceStaticPos;
20272024

20282025
// Emit the child.
2029-
EmitNode(node.Child(0));
2026+
RegexNode child = node.Child(0);
2027+
if (analysis.MayBacktrack(child))
2028+
{
2029+
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
2030+
EmitAtomic(node, null);
2031+
}
2032+
else
2033+
{
2034+
EmitNode(child);
2035+
}
20302036

20312037
// After the child completes successfully, reset the text positions.
20322038
// Do not reset captures, which persist beyond the lookahead.
@@ -2036,8 +2042,6 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
20362042
Stloc(pos);
20372043
SliceInputSpan();
20382044
sliceStaticPos = startingTextSpanPos;
2039-
2040-
doneLabel = originalDoneLabel;
20412045
}
20422046

20432047
// Emits the code to handle a negative lookahead assertion.
@@ -2046,7 +2050,6 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
20462050
Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
20472051
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
20482052

2049-
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
20502053
Label originalDoneLabel = doneLabel;
20512054

20522055
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
@@ -2060,7 +2063,16 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
20602063
doneLabel = negativeLookaheadDoneLabel;
20612064

20622065
// Emit the child.
2063-
EmitNode(node.Child(0));
2066+
RegexNode child = node.Child(0);
2067+
if (analysis.MayBacktrack(child))
2068+
{
2069+
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
2070+
EmitAtomic(node, null);
2071+
}
2072+
else
2073+
{
2074+
EmitNode(child);
2075+
}
20642076

20652077
// If the generated code ends up here, it matched the lookahead, which actually
20662078
// means failure for a _negative_ lookahead, so we need to jump to the original done.
@@ -2204,7 +2216,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
22042216
// Emits the node for an atomic.
22052217
void EmitAtomic(RegexNode node, RegexNode? subsequent)
22062218
{
2207-
Debug.Assert(node.Kind is RegexNodeKind.Atomic, $"Unexpected type: {node.Kind}");
2219+
Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
22082220
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
22092221

22102222
RegexNode child = node.Child(0);

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ public static IEnumerable<object[]> Match_MemberData()
126126
yield return (Case("(?>(?>a*))123"), "aa1234", options, 0, 5, true, "aa123");
127127
yield return (Case("(?>a{2,})b"), "aaab", options, 0, 4, true, "aaab");
128128
yield return (Case("[a-z]{0,4}(?>[x-z]*.)(?=xyz1)"), "abcdxyz1", options, 0, 8, true, "abcd");
129+
yield return (Case("[a-z]{0,4}(?=[x-z]*.)(?=cd)"), "abcdxyz1", options, 0, 8, true, "ab");
130+
yield return (Case("[a-z]{0,4}(?![x-z]*[wx])(?=cd)"), "abcdxyz1", options, 0, 8, true, "ab");
129131

130132
// Atomic lazy
131133
yield return (Case("(?>[0-9]+?)abc"), "abc12345abc", options, 3, 8, true, "5abc");

0 commit comments

Comments
 (0)