Skip to content

Commit 457b1ff

Browse files
authored
Enable RegexOptions.RightToLeft and lookbehinds in compiler / source generator (#66280)
* Enable RegexOptions.RightToLeft and lookbehinds in compiler / source generator For .NET 7 we rewrote RegexCompiler as we were writing the source generator, and in doing so we left out support for RegexOptions.RightToLeft as well as lookbehinds (which are implemented via RightToLeft). This adds support for both. I initially started incrementally adding in support for various constructs in lookbehinds, but from a testing perspective it made more sense to just add it all, as then all of the RightToLeft tests are used to validate the constructs that are also in lookbehinds. * Address PR feedback
1 parent 30d66a2 commit 457b1ff

File tree

10 files changed

+1313
-502
lines changed

10 files changed

+1313
-502
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 433 additions & 204 deletions
Large diffs are not rendered by default.

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
143143
return Diagnostic.Create(DiagnosticDescriptors.InvalidLangVersion, methodSyntax.GetLocation());
144144
}
145145

146-
RegexOptions regexOptions = RegexOptions.Compiled | (options is not null ? (RegexOptions)options : RegexOptions.None);
146+
RegexOptions regexOptions = options is not null ? (RegexOptions)options : RegexOptions.None;
147147

148148
// TODO: This is going to include the culture that's current at the time of compilation.
149149
// What should we do about that? We could:
@@ -181,7 +181,7 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
181181
RegexTree tree;
182182
try
183183
{
184-
tree = RegexParser.Parse(pattern, regexOptions, culture);
184+
tree = RegexParser.Parse(pattern, regexOptions | RegexOptions.Compiled, culture); // make sure Compiled is included to get all optimizations applied to it
185185
}
186186
catch (Exception e)
187187
{

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 769 additions & 218 deletions
Large diffs are not rendered by default.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options, CultureInfo
5050

5151
// Compute any anchor trailing the expression. If there is one, and we can also compute a fixed length
5252
// for the whole expression, we can use that to quickly jump to the right location in the input.
53-
if (!_rightToLeft) // haven't added FindNextStartingPositionMode support for RTL
53+
if (!_rightToLeft) // haven't added FindNextStartingPositionMode trailing anchor support for RTL
5454
{
5555
bool triedToComputeMaxLength = false;
5656

@@ -297,16 +297,18 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
297297
// For others, we can jump to the relevant location.
298298

299299
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning:
300-
if (pos > 0)
300+
if (pos != 0)
301301
{
302+
// If we're not currently at the beginning, we'll never be, so fail immediately.
302303
pos = textSpan.Length;
303304
return false;
304305
}
305306
return true;
306307

307308
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start:
308-
if (pos > start)
309+
if (pos != start)
309310
{
311+
// If we're not currently at the start, we'll never be, so fail immediately.
310312
pos = textSpan.Length;
311313
return false;
312314
}
@@ -315,27 +317,38 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
315317
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ:
316318
if (pos < textSpan.Length - 1)
317319
{
320+
// If we're not currently at the end (or a newline just before it), skip ahead
321+
// since nothing until then can possibly match.
318322
pos = textSpan.Length - 1;
319323
}
320324
return true;
321325

322326
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End:
323327
if (pos < textSpan.Length)
324328
{
329+
// If we're not currently at the end (or a newline just before it), skip ahead
330+
// since nothing until then can possibly match.
325331
pos = textSpan.Length;
326332
}
327333
return true;
328334

329335
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning:
330-
if (pos > 0)
336+
if (pos != 0)
331337
{
338+
// If we're not currently at the beginning, skip ahead (or, rather, backwards)
339+
// since nothing until then can possibly match. (We're iterating from the end
340+
// to the beginning in RightToLeft mode.)
332341
pos = 0;
333342
}
334343
return true;
335344

336345
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start:
337-
if (pos < start)
346+
if (pos != start)
338347
{
348+
// If we're not currently at the starting position, we'll never be, so fail immediately.
349+
// This is different from beginning, since beginning is the fixed location of 0 whereas
350+
// start is wherever the iteration actually starts from; in left-to-right, that's often
351+
// the same as beginning, but in RightToLeft it rarely is.
339352
pos = 0;
340353
return false;
341354
}
@@ -344,6 +357,8 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
344357
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ:
345358
if (pos < textSpan.Length - 1 || ((uint)pos < (uint)textSpan.Length && textSpan[pos] != '\n'))
346359
{
360+
// If we're not currently at the end, we'll never be (we're iterating from end to beginning),
361+
// so fail immediately.
347362
pos = 0;
348363
return false;
349364
}
@@ -352,6 +367,8 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
352367
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End:
353368
if (pos < textSpan.Length)
354369
{
370+
// If we're not currently at the end, we'll never be (we're iterating from end to beginning),
371+
// so fail immediately.
355372
pos = 0;
356373
return false;
357374
}

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ internal sealed class RegexLWCGCompiler : RegexCompiler
5959
EmitTryMatchAtCurrentPosition();
6060

6161
DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(ReadOnlySpan<char>) });
62-
EmitScan(tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod);
62+
EmitScan(options, tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod);
6363

6464
return new CompiledRegexRunnerFactory(scanMethod);
6565
}

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 27 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2558,39 +2558,46 @@ public int ChildCount()
25582558
// there's no need to localize).
25592559
internal bool SupportsCompilation([NotNullWhen(false)] out string? reason)
25602560
{
2561-
if (!StackHelper.TryEnsureSufficientExecutionStack())
2562-
{
2563-
reason = "run-time limits were exceeded";
2564-
return false;
2565-
}
2566-
2567-
// NonBacktracking isn't supported, nor RightToLeft. The latter applies to both the top-level
2568-
// options as well as when used to specify positive and negative lookbehinds.
25692561
if ((Options & RegexOptions.NonBacktracking) != 0)
25702562
{
2571-
reason = "RegexOptions.NonBacktracking was specified";
2563+
reason = "RegexOptions.NonBacktracking isn't supported";
25722564
return false;
25732565
}
25742566

2575-
if ((Options & RegexOptions.RightToLeft) != 0)
2567+
if (ExceedsMaxDepthAllowedDepth(this, allowedDepth: 40))
25762568
{
2577-
reason = "RegexOptions.RightToLeft or a positive/negative lookbehind was used";
2569+
// For the source generator, deep RegexNode trees can result in emitting C# code that exceeds C# compiler
2570+
// limitations, leading to "CS8078: An expression is too long or complex to compile". As such, we place
2571+
// an artificial limit on max tree depth in order to mitigate such issues. The allowed depth can be tweaked
2572+
// as needed; its exceedingly rare to find expressions with such deep trees. And while RegexCompiler doesn't
2573+
// have to deal with C# compiler limitations, we still want to limit max tree depth as we want to limit
2574+
// how deep recursion we'll employ as part of code generation.
2575+
reason = "the expression may result exceeding run-time or compiler limits";
25782576
return false;
25792577
}
25802578

2581-
int childCount = ChildCount();
2582-
for (int i = 0; i < childCount; i++)
2579+
// Supported.
2580+
reason = null;
2581+
return true;
2582+
2583+
static bool ExceedsMaxDepthAllowedDepth(RegexNode node, int allowedDepth)
25832584
{
2584-
// The node isn't supported if any of its children aren't supported.
2585-
if (!Child(i).SupportsCompilation(out reason))
2585+
if (allowedDepth <= 0)
25862586
{
2587-
return false;
2587+
return true;
25882588
}
2589-
}
25902589

2591-
// Supported.
2592-
reason = null;
2593-
return true;
2590+
int childCount = node.ChildCount();
2591+
for (int i = 0; i < childCount; i++)
2592+
{
2593+
if (ExceedsMaxDepthAllowedDepth(node.Child(i), allowedDepth - 1))
2594+
{
2595+
return true;
2596+
}
2597+
}
2598+
2599+
return false;
2600+
}
25942601
}
25952602

25962603
/// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByA
2323
return false;
2424
}
2525

26-
// Track whether we've seen any node with IgnoreCase set.
26+
// Track whether we've seen any nodes with various options set.
2727
results._hasIgnoreCase |= (node.Options & RegexOptions.IgnoreCase) != 0;
28+
results._hasRightToLeft |= (node.Options & RegexOptions.RightToLeft) != 0;
2829

2930
if (isAtomicByAncestor)
3031
{
@@ -149,6 +150,8 @@ internal sealed class AnalysisResults
149150
internal HashSet<RegexNode>? _mayBacktrack;
150151
/// <summary>Whether any node has <see cref="RegexOptions.IgnoreCase"/> set.</summary>
151152
internal bool _hasIgnoreCase;
153+
/// <summary>Whether any node has <see cref="RegexOptions.RightToLeft"/> set.</summary>
154+
internal bool _hasRightToLeft;
152155

153156
/// <summary>Initializes the instance.</summary>
154157
/// <param name="regexTree">The code being analyzed.</param>
@@ -158,23 +161,48 @@ internal sealed class AnalysisResults
158161
public RegexTree RegexTree { get; }
159162

160163
/// <summary>Gets whether a node is considered atomic based on its ancestry.</summary>
164+
/// <remarks>
165+
/// If the whole tree couldn't be examined, this returns false. That could lead to additional
166+
/// code being output as nodes that could have been made atomic aren't, but functionally it's
167+
/// the safe choice.
168+
/// </remarks>
161169
public bool IsAtomicByAncestor(RegexNode node) => _isAtomicByAncestor.Contains(node);
162170

163171
/// <summary>Gets whether a node directly or indirectly contains captures.</summary>
172+
/// <remarks>
173+
/// If the whole tree couldn't be examined, this returns true. That could lead to additional
174+
/// code being emitted to deal with captures that can't occur, but functionally it's the
175+
/// safe choice.
176+
/// </remarks>
164177
public bool MayContainCapture(RegexNode node) => !_complete || _containsCapture.Contains(node);
165178

166179
/// <summary>Gets whether a node is or directory or indirectly contains a backtracking construct that isn't hidden by an internal atomic construct.</summary>
167180
/// <remarks>
168181
/// In most code generation situations, we only need to know after we emit the child code whether
169182
/// the child may backtrack, and that we can see with 100% certainty. This method is useful in situations
170183
/// where we need to predict without traversing the child at code generation time whether it may
171-
/// incur backtracking. This method may have (few) false positives, but won't have any false negatives,
184+
/// incur backtracking. This method may have (few) false positives (return true when it could have
185+
/// returned false), but won't have any false negatives (return false when it should have returned true),
172186
/// meaning it might claim a node requires backtracking even if it doesn't, but it will always return
173-
/// true for any node that requires backtracking.
187+
/// true for any node that requires backtracking. In that vein, if the whole tree couldn't be examined,
188+
/// this returns true.
174189
/// </remarks>
175190
public bool MayBacktrack(RegexNode node) => !_complete || (_mayBacktrack?.Contains(node) ?? false);
176191

177192
/// <summary>Gets whether a node might have <see cref="RegexOptions.IgnoreCase"/> set.</summary>
178-
public bool HasIgnoreCase => _complete && _hasIgnoreCase;
193+
/// <remarks>
194+
/// If the whole tree couldn't be examined, this returns true. That could lead to additional
195+
/// code being emitted to support case-insensitivity in expressions that don't actually need
196+
/// it, but functionally it's the safe choice.
197+
/// </remarks>
198+
public bool HasIgnoreCase => !_complete || _hasIgnoreCase;
199+
200+
/// <summary>Gets whether a node might have <see cref="RegexOptions.RightToLeft"/> set.</summary>
201+
/// <remarks>
202+
/// If the whole tree couldn't be examined, this returns true. That could lead to additional
203+
/// code being emitted to support expressions that don't actually contain any RightToLeft
204+
/// nodes, but functionally it's the safe choice.
205+
/// </remarks>
206+
public bool HasRightToLeft => !_complete || _hasRightToLeft;
179207
}
180208
}

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,36 @@ public static IEnumerable<object[]> Match_MemberData()
8484

8585
if (!RegexHelpers.IsNonBacktracking(engine))
8686
{
87-
// Zero-width negative lookahead assertion: Actual - "abc(?!XXX)\\w+"
87+
// Zero-width negative lookahead assertion
8888
yield return (@"abc(?!XXX)\w+", "abcXXXdef", RegexOptions.None, 0, 9, false, string.Empty);
8989

90-
// Zero-width positive lookbehind assertion: Actual - "(\\w){6}(?<=XXX)def"
90+
// Zero-width positive lookbehind assertion
9191
yield return (@"(\w){6}(?<=XXX)def", "abcXXXdef", RegexOptions.None, 0, 9, true, "abcXXXdef");
92+
yield return (@"(?<=c)def", "123abcdef", RegexOptions.None, 0, 9, true, "def");
93+
yield return (@"(?<=abc)def", "123abcdef", RegexOptions.None, 0, 9, true, "def");
94+
yield return (@"(?<=a\wc)def", "123abcdef", RegexOptions.None, 0, 9, true, "def");
95+
yield return (@"(?<=\ba\wc)def", "123 abcdef", RegexOptions.None, 0, 10, true, "def");
96+
yield return (@"(?<=.\ba\wc\B)def", "123 abcdef", RegexOptions.None, 0, 10, true, "def");
97+
yield return (@"(?<=^123 abc)def", "123 abcdef", RegexOptions.None, 0, 10, true, "def");
98+
yield return (@"(?<=^123 abc)def", "123 abcdef", RegexOptions.Multiline, 0, 10, true, "def");
99+
yield return (@"(?<=123$\nabc)def", "123\nabcdef", RegexOptions.Multiline, 0, 10, true, "def");
100+
yield return (@"123(?<!$) abcdef", "123 abcdef", RegexOptions.None, 0, 10, true, "123 abcdef");
101+
yield return (@"\w{3}(?<=^xyz|^abc)defg", "abcdefg", RegexOptions.None, 0, 7, true, "abcdefg");
102+
yield return (@"(abc)\w(?<=(?(1)d|e))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
103+
yield return (@"(abc)\w(?<!(?(1)e|d))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
104+
yield return (@"(abc)\w(?<=(?(cd)d|e))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
105+
yield return (@"(abc)\w(?<!(?(cd)e|d))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
106+
yield return (@"\w{3}(?<=(\w){6,8})", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "def");
107+
yield return (@"\w{3}(?<=(?:\d\w){4})", "a1b2c3d4e5", RegexOptions.None, 0, 10, true, "d4e");
108+
yield return (@"\w{3}(?<=(\w){6,8}?)", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "def");
109+
yield return (@"\w{3}(?<=(?:\d\w){3,4}?\d)", "a1b2c3d4e5", RegexOptions.None, 0, 10, true, "3d4");
110+
yield return (@"(\w{3})\w*(?<=\1)", "---abcdefababc123", RegexOptions.None, 0, 17, true, "abcdefababc");
111+
yield return (@"(?<=\w{3})\w{4}", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "defg");
112+
yield return (@"(?<=\w{3,8})\w{4}", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "defg");
113+
yield return (@"(?<=\w*)\w{4}", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "abcd");
114+
yield return (@"(?<=\w?)\w{4}", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "abcd");
115+
yield return (@"(?<=\d?)a{4}", "123aaaaaaaaa", RegexOptions.None, 0, 12, true, "aaaa");
116+
yield return (@"(?<=a{3,5}[ab]*)1234", "aaaaaaa1234", RegexOptions.None, 0, 11, true, "1234");
92117

93118
// Zero-width negative lookbehind assertion: Actual - "(\\w){6}(?<!XXX)def"
94119
yield return (@"(\w){6}(?<!XXX)def", "XXXabcdef", RegexOptions.None, 0, 9, true, "XXXabcdef");

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorParserTests.cs

Lines changed: 0 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -197,21 +197,6 @@ partial class C
197197
Assert.Equal("SYSLIB1044", Assert.Single(diagnostics).Id);
198198
}
199199

200-
[Fact]
201-
public async Task Diagnostic_RightToLeft_LimitedSupport()
202-
{
203-
IReadOnlyList<Diagnostic> diagnostics = await RegexGeneratorHelper.RunGenerator(@"
204-
using System.Text.RegularExpressions;
205-
partial class C
206-
{
207-
[RegexGenerator(""ab"", RegexOptions.RightToLeft)]
208-
private static partial Regex RightToLeftNotSupported();
209-
}
210-
");
211-
212-
Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id);
213-
}
214-
215200
[Fact]
216201
public async Task Diagnostic_NonBacktracking_LimitedSupport()
217202
{
@@ -227,36 +212,6 @@ partial class C
227212
Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id);
228213
}
229214

230-
[Fact]
231-
public async Task Diagnostic_PositiveLookbehind_LimitedSupport()
232-
{
233-
IReadOnlyList<Diagnostic> diagnostics = await RegexGeneratorHelper.RunGenerator(@"
234-
using System.Text.RegularExpressions;
235-
partial class C
236-
{
237-
[RegexGenerator(""(?<=\b20)\d{2}\b"")]
238-
private static partial Regex PositiveLookbehindNotSupported();
239-
}
240-
");
241-
242-
Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id);
243-
}
244-
245-
[Fact]
246-
public async Task Diagnostic_NegativeLookbehind_LimitedSupport()
247-
{
248-
IReadOnlyList<Diagnostic> diagnostics = await RegexGeneratorHelper.RunGenerator(@"
249-
using System.Text.RegularExpressions;
250-
partial class C
251-
{
252-
[RegexGenerator(""(?<!(Saturday|Sunday) )\b\w+ \d{1,2}, \d{4}\b"")]
253-
private static partial Regex NegativeLookbehindNotSupported();
254-
}
255-
");
256-
257-
Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id);
258-
}
259-
260215
[Fact]
261216
public async Task Valid_ClassWithoutNamespace()
262217
{

0 commit comments

Comments
 (0)