Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLead
// We're now left-to-right only and looking for multiple prefixes and/or sets.

// If there are multiple leading strings, we can search for any of them.
string[]? caseSensitivePrefixes = null;
float leadingStringsFrequency = -1;
if (!interpreter) // this works in the interpreter, but we avoid it due to additional cost during construction
{
if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: true) is { Length: > 1 } caseInsensitivePrefixes)
Expand All @@ -183,18 +185,11 @@ private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLead
return;
}

// TODO: While some benchmarks benefit from this significantly, others regressed a bit (in particular those with few
// matches). Before enabling this, we need to investigate the performance impact on real-world scenarios,
// and see if there are ways to reduce the impact.
//if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: false) is { Length: > 1 } caseSensitivePrefixes)
//{
// LeadingPrefixes = caseSensitivePrefixes;
// FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight;
#if SYSTEM_TEXT_REGULAREXPRESSIONS
// LeadingStrings = SearchValues.Create(LeadingPrefixes, StringComparison.Ordinal);
#endif
// return;
//}
// Compute case-sensitive leading prefixes, but don't commit yet. We'll compare
// their starting-char frequency against the best FixedDistanceSet below to decide
// which strategy to use.
caseSensitivePrefixes = RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: false) is { Length: > 1 } csp ? csp : null;
leadingStringsFrequency = caseSensitivePrefixes is not null ? SumStartingCharFrequencies(caseSensitivePrefixes) : -1;
}

// Build up a list of all of the sets that are a fixed distance from the start of the expression.
Expand Down Expand Up @@ -227,6 +222,39 @@ private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLead
// In some searches, we may use multiple sets, so we want the subsequent ones to also be the efficiency runners-up.
RegexPrefixAnalyzer.SortFixedDistanceSetsByQuality(fixedDistanceSets);

// If we have case-sensitive leading prefixes, compare the frequency of their starting characters
// against the best fixed-distance set's characters. If the best set isn't more selective than the
// starting chars (i.e. its frequency is at least as high), prefer LeadingStrings (SearchValues)
// which can match full multi-character prefixes simultaneously. Also prefer LeadingStrings when
// the best set is negated or range-based (no Chars), since those are weak filters.
if (leadingStringsFrequency > 0)
{
bool preferLeadingStrings = true;
if (fixedDistanceSets[0].Chars is { } bestSetChars &&
!fixedDistanceSets[0].Negated)
{
ReadOnlySpan<float> frequency = RegexPrefixAnalyzer.Frequency;
Debug.Assert(frequency.Length == 128);
float bestSetFrequency = 0;
foreach (char c in bestSetChars)
{
bestSetFrequency += c < frequency.Length ? frequency[c] : 0;
}

preferLeadingStrings = bestSetFrequency >= leadingStringsFrequency;
}

if (preferLeadingStrings)
{
LeadingPrefixes = caseSensitivePrefixes!;
FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight;
#if SYSTEM_TEXT_REGULAREXPRESSIONS
LeadingStrings = SearchValues.Create(LeadingPrefixes, StringComparison.Ordinal);
#endif
return;
}
}

// If there is no literal after the loop, use whatever set we got.
// If there is a literal after the loop, consider it to be better than a negated set and better than a set with many characters.
if (literalAfterLoop is null ||
Expand Down Expand Up @@ -272,6 +300,17 @@ private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLead
_asciiLookups = new uint[1][];
return;
}

// If we have case-sensitive leading prefixes and nothing else was selected, use them.
if (leadingStringsFrequency > 0)
{
LeadingPrefixes = caseSensitivePrefixes!;
FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight;
#if SYSTEM_TEXT_REGULAREXPRESSIONS
LeadingStrings = SearchValues.Create(LeadingPrefixes, StringComparison.Ordinal);
#endif
return;
}
}

/// <summary>true iff <see cref="TryFindNextStartingPositionLeftToRight"/> might advance the position.</summary>
Expand Down Expand Up @@ -867,6 +906,45 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
}
}
#endif

/// <summary>
/// Adds together the <see cref="RegexPrefixAnalyzer.Frequency"/> entries for the first character
/// of every prefix, counting each distinct first character only once.
/// </summary>
/// <param name="prefixes">The prefixes whose first characters are to be weighed.</param>
/// <returns>
/// The sum of the per-character frequencies (each a percentage) over the distinct first characters,
/// or -1 if any prefix begins with a character that has no entry in the frequency table.
/// </returns>
private static float SumStartingCharFrequencies(string[] prefixes)
{
    ReadOnlySpan<float> table = RegexPrefixAnalyzer.Frequency;
    Debug.Assert(table.Length == 128);

    // 128 membership bits split across two 64-bit words: one bit per table entry.
    ulong lowSeen = 0, highSeen = 0;
    float sum = 0;

    for (int i = 0; i < prefixes.Length; i++)
    {
        char first = prefixes[i][0];
        if (first >= table.Length)
        {
            // No frequency data exists for this character, so no meaningful sum can be produced.
            return -1;
        }

        // Test-and-set the bit for this character in whichever word holds it.
        ulong bit = 1UL << (first % 64);
        bool alreadySeen;
        if (first < 64)
        {
            alreadySeen = (lowSeen & bit) != 0;
            lowSeen |= bit;
        }
        else
        {
            alreadySeen = (highSeen & bit) != 0;
            highSeen |= bit;
        }

        // Only the first occurrence of a starting character contributes to the total.
        if (!alreadySeen)
        {
            sum += table[first];
        }
    }

    return sum;
}
}

/// <summary>Mode to use for searching for the next location of a possible match.</summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1507,7 +1507,7 @@ static bool IsBestAnchor(RegexNodeKind anchor) =>
}

/// <summary>Percent occurrences in source text (100 * char count / total count).</summary>
private static ReadOnlySpan<float> Frequency =>
internal static ReadOnlySpan<float> Frequency =>
[
0.000f /* '\x00' */, 0.000f /* '\x01' */, 0.000f /* '\x02' */, 0.000f /* '\x03' */, 0.000f /* '\x04' */, 0.000f /* '\x05' */, 0.000f /* '\x06' */, 0.000f /* '\x07' */,
0.000f /* '\x08' */, 0.001f /* '\x09' */, 0.000f /* '\x0A' */, 0.000f /* '\x0B' */, 0.000f /* '\x0C' */, 0.000f /* '\x0D' */, 0.000f /* '\x0E' */, 0.000f /* '\x0F' */,
Expand Down