Skip to content

Commit c208114

Browse files
committed
Fix double-negatives with MatchCharacterClass
Also add missing set description rendering for \d, \D
1 parent 7dcb289 commit c208114

File tree

2 files changed

+58
-54
lines changed

2 files changed

+58
-54
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 48 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ void EmitFixedSet()
551551
for (; setIndex < setsToUse; setIndex++)
552552
{
553553
string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? $" + {sets[setIndex].Distance}" : "")}]";
554-
string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, additionalDeclarations, ref requiredHelpers);
554+
string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, negate: false, additionalDeclarations, ref requiredHelpers);
555555

556556
if (setIndex == start)
557557
{
@@ -1898,7 +1898,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset
18981898

18991899
if (node.IsSetFamily)
19001900
{
1901-
expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers)}";
1901+
expr = $"{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: true, additionalDeclarations, ref requiredHelpers)}";
19021902
}
19031903
else
19041904
{
@@ -2696,7 +2696,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
26962696
string expr = $"{sliceSpan}[{iterationLocal}]";
26972697
if (node.IsSetFamily)
26982698
{
2699-
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
2699+
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: false, additionalDeclarations, ref requiredHelpers);
27002700
}
27012701
else
27022702
{
@@ -2750,7 +2750,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node)
27502750
string expr = $"{sliceSpan}[{sliceStaticPos}]";
27512751
if (node.IsSetFamily)
27522752
{
2753-
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
2753+
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: false, additionalDeclarations, ref requiredHelpers);
27542754
}
27552755
else
27562756
{
@@ -3104,7 +3104,7 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri
31043104

31053105
private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower) => toLower ? ToLower(hasTextInfo, options, expression) : expression;
31063106

3107-
private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, HashSet<string> additionalDeclarations, ref RequiredHelperFunctions requiredHelpers)
3107+
private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, bool negate, HashSet<string> additionalDeclarations, ref RequiredHelperFunctions requiredHelpers)
31083108
{
31093109
// We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass),
31103110
// but that call is relatively expensive. Before we fall back to it, we try to optimize
@@ -3118,27 +3118,23 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
31183118
{
31193119
case RegexCharClass.AnyClass:
31203120
// ideally this could just be "return true;", but we need to evaluate the expression for its side effects
3121-
return $"({chExpr} >= 0)"; // a char is unsigned and thus won't ever be negative, so this is equivalent to true
3121+
return $"({chExpr} {(negate ? "<" : ">=")} 0)"; // a char is unsigned and thus won't ever be negative
31223122

31233123
case RegexCharClass.DigitClass:
3124-
return $"char.IsDigit({chExpr})";
3125-
31263124
case RegexCharClass.NotDigitClass:
3127-
return $"!char.IsDigit({chExpr})";
3125+
negate ^= charClass == RegexCharClass.NotDigitClass;
3126+
return $"{(negate ? "!" : "")}char.IsDigit({chExpr})";
31283127

31293128
case RegexCharClass.SpaceClass:
3130-
return $"char.IsWhiteSpace({chExpr})";
3131-
31323129
case RegexCharClass.NotSpaceClass:
3133-
return $"!char.IsWhiteSpace({chExpr})";
3130+
negate ^= charClass == RegexCharClass.NotSpaceClass;
3131+
return $"{(negate ? "!" : "")}char.IsWhiteSpace({chExpr})";
31343132

31353133
case RegexCharClass.WordClass:
3136-
requiredHelpers |= RequiredHelperFunctions.IsWordChar;
3137-
return $"IsWordChar({chExpr})";
3138-
31393134
case RegexCharClass.NotWordClass:
31403135
requiredHelpers |= RequiredHelperFunctions.IsWordChar;
3141-
return $"!IsWordChar({chExpr})";
3136+
negate ^= charClass == RegexCharClass.NotWordClass;
3137+
return $"{(negate ? "!" : "")}IsWordChar({chExpr})";
31423138
}
31433139

31443140
// If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,
@@ -3160,18 +3156,19 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
31603156
// Next, handle simple sets of one range, e.g. [A-Z], [0-9], etc. This includes some built-in classes, like ECMADigitClass.
31613157
if (!invariant && RegexCharClass.TryGetSingleRange(charClass, out char lowInclusive, out char highInclusive))
31623158
{
3163-
bool invert = RegexCharClass.IsNegated(charClass);
3159+
negate ^= RegexCharClass.IsNegated(charClass);
31643160
return lowInclusive == highInclusive ?
3165-
$"({chExpr} {(invert ? "!=" : "==")} {Literal(lowInclusive)})" :
3166-
$"(((uint){chExpr}) - {Literal(lowInclusive)} {(invert ? ">" : "<=")} (uint)({Literal(highInclusive)} - {Literal(lowInclusive)}))";
3161+
$"({chExpr} {(negate ? "!=" : "==")} {Literal(lowInclusive)})" :
3162+
$"(((uint){chExpr}) - {Literal(lowInclusive)} {(negate ? ">" : "<=")} (uint)({Literal(highInclusive)} - {Literal(lowInclusive)}))";
31673163
}
31683164

31693165
// Next if the character class contains nothing but a single Unicode category, we can calle char.GetUnicodeCategory and
31703166
// compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus
31713167
// we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass.
31723168
if (!invariant && RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated))
31733169
{
3174-
return $"(char.GetUnicodeCategory({chExpr}) {(negated ? "!=" : "==")} global::System.Globalization.UnicodeCategory.{category})";
3170+
negate ^= negated;
3171+
return $"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} global::System.Globalization.UnicodeCategory.{category})";
31753172
}
31763173

31773174
// Next, if there's only 2, 3, or 4 chars in the set (fairly common due to the sets we create for prefixes),
@@ -3186,23 +3183,31 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
31863183
case 2:
31873184
if ((setChars[0] | 0x20) == setChars[1])
31883185
{
3189-
return $"(({chExpr} | 0x20) == {Literal(setChars[1])})";
3186+
return $"(({chExpr} | 0x20) {(negate ? "!=" : "==")} {Literal(setChars[1])})";
31903187
}
31913188
additionalDeclarations.Add("char ch;");
3192-
return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))";
3189+
return negate ?
3190+
$"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}))" :
3191+
$"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))";
31933192

31943193
case 3:
31953194
additionalDeclarations.Add("char ch;");
3196-
return (setChars[0] | 0x20) == setChars[1] ?
3197-
$"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))" :
3198-
$"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))";
3195+
return (negate, (setChars[0] | 0x20) == setChars[1]) switch
3196+
{
3197+
(false, false) => $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))",
3198+
(true, false) => $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))",
3199+
(false, true) => $"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))",
3200+
(true, true) => $"((((ch = {chExpr}) | 0x20) != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))",
3201+
};
31993202

32003203
case 4:
32013204
if (((setChars[0] | 0x20) == setChars[1]) &&
32023205
((setChars[2] | 0x20) == setChars[3]))
32033206
{
32043207
additionalDeclarations.Add("char ch;");
3205-
return $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))";
3208+
return negate ?
3209+
$"(((ch = ({chExpr} | 0x20)) != {Literal(setChars[1])}) & (ch != {Literal(setChars[3])}))" :
3210+
$"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))";
32063211
}
32073212
break;
32083213
}
@@ -3223,8 +3228,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
32233228
// the same as [\u0370-\u03FF\u1F00-1FFF]. (In the future, we could possibly
32243229
// extend the analysis to produce a known lower-bound and compare against
32253230
// that rather than always using 128 as the pivot point.)
3226-
return invariant ?
3227-
$"((ch = {chExpr}) >= 128 && global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))" :
3231+
return negate ?
3232+
$"((ch = {chExpr}) < 128 || !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" :
32283233
$"((ch = {chExpr}) >= 128 && global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
32293234
}
32303235

@@ -3233,8 +3238,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
32333238
// We determined that every ASCII character is in the class, for example
32343239
// if the class were the negated example from case 1 above:
32353240
// [^\p{IsGreek}\p{IsGreekExtended}].
3236-
return invariant ?
3237-
$"((ch = {chExpr}) < 128 || global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))" :
3241+
return negate ?
3242+
$"((ch = {chExpr}) >= 128 && !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" :
32383243
$"((ch = {chExpr}) < 128 || global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
32393244
}
32403245
}
@@ -3277,23 +3282,31 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
32773282
// We know that all inputs that could match are ASCII, for example if the
32783283
// character class were [A-Za-z0-9], so since the ch is now known to be >= 128, we
32793284
// can just fail the comparison.
3280-
return $"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
3285+
return negate ?
3286+
$"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" :
3287+
$"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
32813288
}
32823289

32833290
if (analysis.AllNonAsciiContained)
32843291
{
32853292
// We know that all non-ASCII inputs match, for example if the character
32863293
// class were [^\r\n], so since we just determined the ch to be >= 128, we can just
32873294
// give back success.
3288-
return $"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
3295+
return negate ?
3296+
$"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" :
3297+
$"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
32893298
}
32903299

32913300
// We know that the whole class wasn't ASCII, and we don't know anything about the non-ASCII
32923301
// characters other than that some might be included, for example if the character class
32933302
// were [\w\d], so since ch >= 128, we need to fall back to calling CharInClass.
3294-
return invariant ?
3295-
$"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))" :
3296-
$"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
3303+
return (negate, invariant) switch
3304+
{
3305+
(false, false) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))",
3306+
(true, false) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))",
3307+
(false, true) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))",
3308+
(true, true) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))",
3309+
};
32973310
}
32983311

32993312
/// <summary>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1924,25 +1924,16 @@ public static string CharDescription(char ch) =>
19241924
};
19251925

19261926
[ExcludeFromCodeCoverage]
1927-
private static string CategoryDescription(char ch)
1928-
{
1929-
if (ch == SpaceConst)
1930-
{
1931-
return "\\s";
1932-
}
1933-
1934-
if ((short)ch == NotSpaceConst)
1935-
{
1936-
return "\\S";
1937-
}
1938-
1939-
if ((short)ch < 0)
1940-
{
1941-
return "\\P{" + CategoryIdToName[(-((short)ch) - 1)] + "}";
1942-
}
1943-
1944-
return "\\p{" + CategoryIdToName[(ch - 1)] + "}";
1945-
}
1927+
private static string CategoryDescription(char ch) =>
1928+
(short)ch switch
1929+
{
1930+
SpaceConst => @"\s",
1931+
NotSpaceConst => @"\S",
1932+
(short)(UnicodeCategory.DecimalDigitNumber + 1) => @"\d",
1933+
-(short)(UnicodeCategory.DecimalDigitNumber + 1) => @"\D",
1934+
< 0 => $"\\P{{{CategoryIdToName[-(short)ch - 1]}}}",
1935+
_ => $"\\p{{{CategoryIdToName[ch - 1]}}}",
1936+
};
19461937

19471938
/// <summary>
19481939
/// A first/last pair representing a single range of characters.

0 commit comments

Comments
 (0)