@@ -551,7 +551,7 @@ void EmitFixedSet()
551
551
for ( ; setIndex < setsToUse ; setIndex ++ )
552
552
{
553
553
string spanIndex = $ "span[i{ ( sets [ setIndex ] . Distance > 0 ? $ " + { sets [ setIndex ] . Distance } " : "" ) } ]";
554
- string charInClassExpr = MatchCharacterClass ( hasTextInfo , options , spanIndex , sets [ setIndex ] . Set , sets [ setIndex ] . CaseInsensitive , additionalDeclarations , ref requiredHelpers ) ;
554
+ string charInClassExpr = MatchCharacterClass ( hasTextInfo , options , spanIndex , sets [ setIndex ] . Set , sets [ setIndex ] . CaseInsensitive , negate : false , additionalDeclarations , ref requiredHelpers ) ;
555
555
556
556
if ( setIndex == start )
557
557
{
@@ -1898,7 +1898,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset
1898
1898
1899
1899
if ( node . IsSetFamily )
1900
1900
{
1901
- expr = $ "! { MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , additionalDeclarations , ref requiredHelpers ) } ";
1901
+ expr = $ "{ MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , negate : true , additionalDeclarations , ref requiredHelpers ) } ";
1902
1902
}
1903
1903
else
1904
1904
{
@@ -2696,7 +2696,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
2696
2696
string expr = $ "{ sliceSpan } [{ iterationLocal } ]";
2697
2697
if ( node . IsSetFamily )
2698
2698
{
2699
- expr = MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , additionalDeclarations , ref requiredHelpers ) ;
2699
+ expr = MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , negate : false , additionalDeclarations , ref requiredHelpers ) ;
2700
2700
}
2701
2701
else
2702
2702
{
@@ -2750,7 +2750,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node)
2750
2750
string expr = $ "{ sliceSpan } [{ sliceStaticPos } ]";
2751
2751
if ( node . IsSetFamily )
2752
2752
{
2753
- expr = MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , additionalDeclarations , ref requiredHelpers ) ;
2753
+ expr = MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , negate : false , additionalDeclarations , ref requiredHelpers ) ;
2754
2754
}
2755
2755
else
2756
2756
{
@@ -3104,7 +3104,7 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri
3104
3104
3105
3105
/// <summary>
/// Returns <paramref name="expression"/> unchanged when <paramref name="toLower"/> is false;
/// otherwise returns the result of passing it through <c>ToLower</c> with the supplied
/// <paramref name="hasTextInfo"/> and <paramref name="options"/>.
/// </summary>
private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower)
{
    // Nothing to do when lowering was not requested: hand the expression back as-is.
    if (!toLower)
    {
        return expression;
    }

    return ToLower(hasTextInfo, options, expression);
}
3106
3106
3107
- private static string MatchCharacterClass ( bool hasTextInfo , RegexOptions options , string chExpr , string charClass , bool caseInsensitive , HashSet < string > additionalDeclarations , ref RequiredHelperFunctions requiredHelpers )
3107
+ private static string MatchCharacterClass ( bool hasTextInfo , RegexOptions options , string chExpr , string charClass , bool caseInsensitive , bool negate , HashSet < string > additionalDeclarations , ref RequiredHelperFunctions requiredHelpers )
3108
3108
{
3109
3109
// We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass),
3110
3110
// but that call is relatively expensive. Before we fall back to it, we try to optimize
@@ -3118,27 +3118,23 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
3118
3118
{
3119
3119
case RegexCharClass . AnyClass :
3120
3120
// ideally this could just be "return true;", but we need to evaluate the expression for its side effects
3121
- return $ "({ chExpr } >= 0)"; // a char is unsigned and thus won't ever be negative, so this is equivalent to true
3121
+ return $ "({ chExpr } { ( negate ? "<" : ">=" ) } 0)"; // a char is unsigned and thus won't ever be negative
3122
3122
3123
3123
case RegexCharClass . DigitClass :
3124
- return $ "char.IsDigit({ chExpr } )";
3125
-
3126
3124
case RegexCharClass . NotDigitClass :
3127
- return $ "!char.IsDigit({ chExpr } )";
3125
+ negate ^= charClass == RegexCharClass . NotDigitClass ;
3126
+ return $ "{ ( negate ? "!" : "" ) } char.IsDigit({ chExpr } )";
3128
3127
3129
3128
case RegexCharClass . SpaceClass :
3130
- return $ "char.IsWhiteSpace({ chExpr } )";
3131
-
3132
3129
case RegexCharClass . NotSpaceClass :
3133
- return $ "!char.IsWhiteSpace({ chExpr } )";
3130
+ negate ^= charClass == RegexCharClass . NotSpaceClass ;
3131
+ return $ "{ ( negate ? "!" : "" ) } char.IsWhiteSpace({ chExpr } )";
3134
3132
3135
3133
case RegexCharClass . WordClass :
3136
- requiredHelpers |= RequiredHelperFunctions . IsWordChar ;
3137
- return $ "IsWordChar({ chExpr } )";
3138
-
3139
3134
case RegexCharClass . NotWordClass :
3140
3135
requiredHelpers |= RequiredHelperFunctions . IsWordChar ;
3141
- return $ "!IsWordChar({ chExpr } )";
3136
+ negate ^= charClass == RegexCharClass . NotWordClass ;
3137
+ return $ "{ ( negate ? "!" : "" ) } IsWordChar({ chExpr } )";
3142
3138
}
3143
3139
3144
3140
// If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,
@@ -3160,18 +3156,19 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
3160
3156
// Next, handle simple sets of one range, e.g. [A-Z], [0-9], etc. This includes some built-in classes, like ECMADigitClass.
3161
3157
if ( ! invariant && RegexCharClass . TryGetSingleRange ( charClass , out char lowInclusive , out char highInclusive ) )
3162
3158
{
3163
- bool invert = RegexCharClass . IsNegated ( charClass ) ;
3159
+ negate ^ = RegexCharClass . IsNegated ( charClass ) ;
3164
3160
return lowInclusive == highInclusive ?
3165
- $ "({ chExpr } { ( invert ? "!=" : "==" ) } { Literal ( lowInclusive ) } )" :
3166
- $ "(((uint){ chExpr } ) - { Literal ( lowInclusive ) } { ( invert ? ">" : "<=" ) } (uint)({ Literal ( highInclusive ) } - { Literal ( lowInclusive ) } ))";
3161
+ $ "({ chExpr } { ( negate ? "!=" : "==" ) } { Literal ( lowInclusive ) } )" :
3162
+ $ "(((uint){ chExpr } ) - { Literal ( lowInclusive ) } { ( negate ? ">" : "<=" ) } (uint)({ Literal ( highInclusive ) } - { Literal ( lowInclusive ) } ))";
3167
3163
}
3168
3164
3169
3165
// Next, if the character class contains nothing but a single Unicode category, we can call char.GetUnicodeCategory and
3170
3166
// compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus
3171
3167
// we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass.
3172
3168
if ( ! invariant && RegexCharClass . TryGetSingleUnicodeCategory ( charClass , out UnicodeCategory category , out bool negated ) )
3173
3169
{
3174
- return $ "(char.GetUnicodeCategory({ chExpr } ) { ( negated ? "!=" : "==" ) } global::System.Globalization.UnicodeCategory.{ category } )";
3170
+ negate ^= negated ;
3171
+ return $ "(char.GetUnicodeCategory({ chExpr } ) { ( negate ? "!=" : "==" ) } global::System.Globalization.UnicodeCategory.{ category } )";
3175
3172
}
3176
3173
3177
3174
// Next, if there's only 2, 3, or 4 chars in the set (fairly common due to the sets we create for prefixes),
@@ -3186,23 +3183,31 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
3186
3183
case 2 :
3187
3184
if ( ( setChars [ 0 ] | 0x20 ) == setChars [ 1 ] )
3188
3185
{
3189
- return $ "(({ chExpr } | 0x20) == { Literal ( setChars [ 1 ] ) } )";
3186
+ return $ "(({ chExpr } | 0x20) { ( negate ? "!=" : "==" ) } { Literal ( setChars [ 1 ] ) } )";
3190
3187
}
3191
3188
additionalDeclarations . Add ( "char ch;" ) ;
3192
- return $ "(((ch = { chExpr } ) == { Literal ( setChars [ 0 ] ) } ) | (ch == { Literal ( setChars [ 1 ] ) } ))";
3189
+ return negate ?
3190
+ $ "(((ch = { chExpr } ) != { Literal ( setChars [ 0 ] ) } ) & (ch != { Literal ( setChars [ 1 ] ) } ))" :
3191
+ $ "(((ch = { chExpr } ) == { Literal ( setChars [ 0 ] ) } ) | (ch == { Literal ( setChars [ 1 ] ) } ))";
3193
3192
3194
3193
case 3 :
3195
3194
additionalDeclarations . Add ( "char ch;" ) ;
3196
- return ( setChars [ 0 ] | 0x20 ) == setChars [ 1 ] ?
3197
- $ "((((ch = { chExpr } ) | 0x20) == { Literal ( setChars [ 1 ] ) } ) | (ch == { Literal ( setChars [ 2 ] ) } ))" :
3198
- $ "(((ch = { chExpr } ) == { Literal ( setChars [ 0 ] ) } ) | (ch == { Literal ( setChars [ 1 ] ) } ) | (ch == { Literal ( setChars [ 2 ] ) } ))";
3195
+ return ( negate , ( setChars [ 0 ] | 0x20 ) == setChars [ 1 ] ) switch
3196
+ {
3197
+ ( false , false ) => $ "(((ch = { chExpr } ) == { Literal ( setChars [ 0 ] ) } ) | (ch == { Literal ( setChars [ 1 ] ) } ) | (ch == { Literal ( setChars [ 2 ] ) } ))",
3198
+ ( true , false ) => $ "(((ch = { chExpr } ) != { Literal ( setChars [ 0 ] ) } ) & (ch != { Literal ( setChars [ 1 ] ) } ) & (ch != { Literal ( setChars [ 2 ] ) } ))",
3199
+ ( false , true ) => $ "((((ch = { chExpr } ) | 0x20) == { Literal ( setChars [ 1 ] ) } ) | (ch == { Literal ( setChars [ 2 ] ) } ))",
3200
+ ( true , true ) => $ "((((ch = { chExpr } ) | 0x20) != { Literal ( setChars [ 1 ] ) } ) & (ch != { Literal ( setChars [ 2 ] ) } ))",
3201
+ } ;
3199
3202
3200
3203
case 4 :
3201
3204
if ( ( ( setChars [ 0 ] | 0x20 ) == setChars [ 1 ] ) &&
3202
3205
( ( setChars [ 2 ] | 0x20 ) == setChars [ 3 ] ) )
3203
3206
{
3204
3207
additionalDeclarations . Add ( "char ch;" ) ;
3205
- return $ "(((ch = ({ chExpr } | 0x20)) == { Literal ( setChars [ 1 ] ) } ) | (ch == { Literal ( setChars [ 3 ] ) } ))";
3208
+ return negate ?
3209
+ $ "(((ch = ({ chExpr } | 0x20)) != { Literal ( setChars [ 1 ] ) } ) & (ch != { Literal ( setChars [ 3 ] ) } ))" :
3210
+ $ "(((ch = ({ chExpr } | 0x20)) == { Literal ( setChars [ 1 ] ) } ) | (ch == { Literal ( setChars [ 3 ] ) } ))";
3206
3211
}
3207
3212
break ;
3208
3213
}
@@ -3223,8 +3228,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
3223
3228
// the same as [\u0370-\u03FF\u1F00-\u1FFF]. (In the future, we could possibly
3224
3229
// extend the analysis to produce a known lower-bound and compare against
3225
3230
// that rather than always using 128 as the pivot point.)
3226
- return invariant ?
3227
- $ "((ch = { chExpr } ) >= 128 && global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant(( char)ch) , { Literal ( charClass ) } ))" :
3231
+ return negate ?
3232
+ $ "((ch = { chExpr } ) < 128 || ! global::System.Text.RegularExpressions.RegexRunner.CharInClass(( char)ch, { Literal ( charClass ) } ))" :
3228
3233
$ "((ch = { chExpr } ) >= 128 && global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, { Literal ( charClass ) } ))";
3229
3234
}
3230
3235
@@ -3233,8 +3238,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
3233
3238
// We determined that every ASCII character is in the class, for example
3234
3239
// if the class were the negated example from case 1 above:
3235
3240
// [^\p{IsGreek}\p{IsGreekExtended}].
3236
- return invariant ?
3237
- $ "((ch = { chExpr } ) < 128 || global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant(( char)ch) , { Literal ( charClass ) } ))" :
3241
+ return negate ?
3242
+ $ "((ch = { chExpr } ) >= 128 && ! global::System.Text.RegularExpressions.RegexRunner.CharInClass(( char)ch, { Literal ( charClass ) } ))" :
3238
3243
$ "((ch = { chExpr } ) < 128 || global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, { Literal ( charClass ) } ))";
3239
3244
}
3240
3245
}
@@ -3277,23 +3282,31 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
3277
3282
// We know that all inputs that could match are ASCII, for example if the
3278
3283
// character class were [A-Za-z0-9], so since the ch is now known to be >= 128, we
3279
3284
// can just fail the comparison.
3280
- return $ "((ch = { chExpr } ) < 128 && ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) != 0)";
3285
+ return negate ?
3286
+ $ "((ch = { chExpr } ) >= 128 || ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) == 0)" :
3287
+ $ "((ch = { chExpr } ) < 128 && ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) != 0)";
3281
3288
}
3282
3289
3283
3290
if ( analysis . AllNonAsciiContained )
3284
3291
{
3285
3292
// We know that all non-ASCII inputs match, for example if the character
3286
3293
// class were [^\r\n], so since we just determined the ch to be >= 128, we can just
3287
3294
// give back success.
3288
- return $ "((ch = { chExpr } ) >= 128 || ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) != 0)";
3295
+ return negate ?
3296
+ $ "((ch = { chExpr } ) < 128 && ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) == 0)" :
3297
+ $ "((ch = { chExpr } ) >= 128 || ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) != 0)";
3289
3298
}
3290
3299
3291
3300
// We know that the whole class wasn't ASCII, and we don't know anything about the non-ASCII
3292
3301
// characters other than that some might be included, for example if the character class
3293
3302
// were [\w\d], so since ch >= 128, we need to fall back to calling CharInClass.
3294
- return invariant ?
3295
- $ "((ch = { chExpr } ) < 128 ? ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), { Literal ( charClass ) } ))" :
3296
- $ "((ch = { chExpr } ) < 128 ? ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, { Literal ( charClass ) } ))";
3303
+ return ( negate , invariant ) switch
3304
+ {
3305
+ ( false , false ) => $ "((ch = { chExpr } ) < 128 ? ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, { Literal ( charClass ) } ))",
3306
+ ( true , false ) => $ "((ch = { chExpr } ) < 128 ? ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) == 0 : !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, { Literal ( charClass ) } ))",
3307
+ ( false , true ) => $ "((ch = { chExpr } ) < 128 ? ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), { Literal ( charClass ) } ))",
3308
+ ( true , true ) => $ "((ch = { chExpr } ) < 128 ? ({ Literal ( bitVectorString ) } [ch >> 4] & (1 << (ch & 0xF))) == 0 : !global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), { Literal ( charClass ) } ))",
3309
+ } ;
3297
3310
}
3298
3311
3299
3312
/// <summary>
0 commit comments