Skip to content

Commit 44d28bf

Browse files
authored
Extend RegexCharClass.Canonicalize range inversion optimization (#61562)
* Extend RegexCharClass.Canonicalize range inversion optimization There's a simple optimization in RegexCharClass.Canonicalize that was added in .NET 5, with the goal of taking a set that's made up of exactly two ranges and seeing whether those ranges were leaving out exactly one character. If they were, the set can instead be rewritten as that character negated, which is a normalized form used downstream and optimized. We can extend this normalization ever so slightly to apply to two ranges separated not just by a single character but by more than one character as well. * Update TODO comment * Add some more reduction tests
1 parent d1b3816 commit 44d28bf

File tree

2 files changed

+21
-8
lines changed

2 files changed

+21
-8
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1390,23 +1390,29 @@ private void Canonicalize(bool isNonBacktracking)
13901390
rangelist.RemoveRange(j, rangelist.Count - j);
13911391
}
13921392

1393-
// If the class now represents a single negated character, but does so by including every
1394-
// other character, invert it to produce a normalized form recognized by IsSingletonInverse.
1395-
if (!isNonBacktracking && // do not produce the IsSingletonInverse transformation in NonBacktracking mode
1393+
// If the class now represents a single negated range, but does so by including every
1394+
// other character, invert it to produce a normalized form with a single range. This
1395+
// is valuable for subsequent optimizations in most of the engines.
1396+
// TODO: https://github.com/dotnet/runtime/issues/61048. The special-casing for NonBacktracking
1397+
// can be deleted once this issue is addressed. The special-casing exists because NonBacktracking
1398+
// is on a different casing plan than the other engines and doesn't use ToLower on each input
1399+
// character at match time; this in turn can highlight differences between sets and their inverted
1400+
// versions of themselves, e.g. a difference between [0-AC-\uFFFF] and [^B].
1401+
if (!isNonBacktracking &&
13961402
!_negate &&
13971403
_subtractor is null &&
13981404
(_categories is null || _categories.Length == 0))
13991405
{
14001406
if (rangelist.Count == 2)
14011407
{
1402-
// There are two ranges in the list. See if there's one missing element between them.
1408+
// There are two ranges in the list. See if there's one missing range between them.
1409+
// Such a range might be as small as a single character.
14031410
if (rangelist[0].First == 0 &&
1404-
rangelist[0].Last == (char)(rangelist[1].First - 2) &&
1405-
rangelist[1].Last == LastChar)
1411+
rangelist[1].Last == LastChar &&
1412+
rangelist[0].Last < rangelist[1].First - 1)
14061413
{
1407-
char ch = (char)(rangelist[0].Last + 1);
1414+
rangelist[0] = new SingleRange((char)(rangelist[0].Last + 1), (char)(rangelist[1].First - 1));
14081415
rangelist.RemoveAt(1);
1409-
rangelist[0] = new SingleRange(ch, ch);
14101416
_negate = true;
14111417
}
14121418
}

src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,13 @@ private static int GetMinRequiredLength(Regex r)
324324
[InlineData("[^\n]*", ".*")]
325325
[InlineData("(?>[^\n]*)", "(?>.*)")]
326326
[InlineData("[^\n]*?", ".*?")]
327+
// Set reduction
328+
[InlineData("[\u0001-\uFFFF]", "[^\u0000]")]
329+
[InlineData("[\u0000-\uFFFE]", "[^\uFFFF]")]
330+
[InlineData("[\u0000-AB-\uFFFF]", "[\u0000-\uFFFF]")]
331+
[InlineData("[ABC-EG-J]", "[A-EG-J]")]
332+
[InlineData("[\u0000-AC-\uFFFF]", "[^B]")]
333+
[InlineData("[\u0000-AF-\uFFFF]", "[^B-E]")]
327334
// Large loop patterns
328335
[InlineData("a*a*a*a*a*a*a*b*b*?a+a*", "a*b*b*?a+")]
329336
[InlineData("a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "a{0,30}aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")]

0 commit comments

Comments
 (0)