-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Move the IsLeft/IsRight decision out of the loop and use computed substring set #88516
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
677d59d
c8c8801
a168767
860ebb3
c1dd5d9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,9 +4,7 @@ | |
using System.Buffers; | ||
using System.Collections.Generic; | ||
using System.Diagnostics; | ||
#if !NET8_0_OR_GREATER | ||
using System.Runtime.CompilerServices; | ||
#endif | ||
|
||
namespace System.Collections.Frozen | ||
{ | ||
|
@@ -31,86 +29,72 @@ internal static class KeyAnalyzer | |
public static AnalysisResults Analyze( | ||
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength) | ||
{ | ||
Debug.Assert(!uniqueStrings.IsEmpty); | ||
Debug.Assert(uniqueStrings.Length > 0); | ||
|
||
// Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer. | ||
AnalysisResults results; | ||
if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results)) | ||
if (minLength > 0) | ||
{ | ||
results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan()); | ||
} | ||
|
||
return results; | ||
} | ||
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit...it's not worth the increase in algorithmic complexity to analyze longer substrings | ||
|
||
/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary> | ||
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results) | ||
{ | ||
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings | ||
// Sufficient uniqueness factor of 95% is good enough. | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad. | ||
int acceptableNonUniqueCount = uniqueStrings.Length / 20; | ||
|
||
SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer(); | ||
HashSet<string> set = new HashSet<string>( | ||
// Try to pick a substring comparer. | ||
SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer(); | ||
HashSet<string> set = new HashSet<string>( | ||
#if NET6_0_OR_GREATER | ||
uniqueStrings.Length, | ||
uniqueStrings.Length, | ||
#endif | ||
comparer); | ||
|
||
// For each substring length... | ||
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit); | ||
for (int count = 1; count <= maxSubstringLength; count++) | ||
{ | ||
comparer.IsLeft = true; | ||
comparer.Count = count; | ||
comparer); | ||
|
||
// For each index, get a uniqueness factor for the left-justified substrings. | ||
// If any is above our threshold, we're done. | ||
for (int index = 0; index <= minLength - count; index++) | ||
// For each substring length...preferring the shortest length that provides | ||
// enough uniqueness | ||
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit); | ||
for (int count = 1; count <= maxSubstringLength; count++) | ||
{ | ||
comparer.Index = index; | ||
comparer.Count = count; | ||
|
||
if (HasSufficientUniquenessFactor(set, uniqueStrings)) | ||
// For each index, get a uniqueness factor for the left-justified substrings. | ||
// If any is above our threshold, we're done. | ||
for (int index = 0; index <= minLength - count; index++) | ||
{ | ||
results = CreateAnalysisResults( | ||
uniqueStrings, ignoreCase, minLength, maxLength, index, count, | ||
static (string s, int index, int count) => s.AsSpan(index, count)); | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return true; | ||
} | ||
} | ||
comparer.Index = index; | ||
|
||
// There were no left-justified substrings of this length available. | ||
// If all of the strings are of the same length, then just checking left-justification is sufficient. | ||
// But if any strings are of different lengths, then we'll get different alignments for left- vs | ||
// right-justified substrings, and so we also check right-justification. | ||
if (minLength != maxLength) | ||
{ | ||
// toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it) | ||
comparer.IsLeft = false; | ||
if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount)) | ||
{ | ||
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, index, count); | ||
} | ||
} | ||
|
||
// For each index, get a uniqueness factor for the right-justified substrings. | ||
// If any is above our threshold, we're done. | ||
for (int index = 0; index <= minLength - count; index++) | ||
// There were no left-justified substrings of this length available. | ||
// If all of the strings are of the same length, then just checking left-justification is sufficient. | ||
// But if any strings are of different lengths, then we'll get different alignments for left- vs | ||
// right-justified substrings, and so we also check right-justification. | ||
if (minLength != maxLength) | ||
{ | ||
// Get a uniqueness factor for the right-justified substrings. | ||
// If it's above our threshold, we're done. | ||
comparer.Index = -index - count; | ||
if (HasSufficientUniquenessFactor(set, uniqueStrings)) | ||
// when Index is negative, we're offsetting from the right, ensure we're at least | ||
// far enough from the right that we have count characters available | ||
comparer.Index = -count; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here's a real improvement. We do the math for setting a negative index (e.g. from the right side) starting with |
||
|
||
// For each index, get a uniqueness factor for the right-justified substrings. | ||
// If any is above our threshold, we're done. | ||
for (int offset = 0; offset <= minLength - count; offset++, comparer.Index--) | ||
{ | ||
results = CreateAnalysisResults( | ||
uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count, | ||
static (string s, int index, int count) => s.AsSpan(s.Length + index, count)); | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return true; | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount)) | ||
{ | ||
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
// Could not find a substring index/length that was good enough. | ||
results = default; | ||
return false; | ||
// Could not find a substring index/length that was good enough, use the entire string. | ||
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0); | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
private static AnalysisResults CreateAnalysisResults( | ||
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getSubstringSpan) | ||
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count) | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
// Start off by assuming all strings are ASCII | ||
bool allAsciiIfIgnoreCase = true; | ||
|
@@ -129,7 +113,7 @@ private static AnalysisResults CreateAnalysisResults( | |
foreach (string s in uniqueStrings) | ||
{ | ||
// Get the span for the substring. | ||
ReadOnlySpan<char> substring = getSubstringSpan(s, index, count); | ||
ReadOnlySpan<char> substring = count == 0 ? s.AsSpan() : Slicer(s, index, count); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to re-slice the string again here. It would be awesome if we could have a |
||
|
||
// If the substring isn't ASCII, bail out to return the results. | ||
if (!IsAllAscii(substring)) | ||
|
@@ -158,8 +142,6 @@ private static AnalysisResults CreateAnalysisResults( | |
return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, index, count, minLength, maxLength); | ||
} | ||
|
||
private delegate ReadOnlySpan<char> GetSpan(string s, int index, int count); | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s) | ||
{ | ||
#if NET8_0_OR_GREATER | ||
|
@@ -221,17 +203,13 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s) | |
#endif | ||
} | ||
|
||
private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings) | ||
private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount) | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
set.Clear(); | ||
|
||
// Sufficient uniqueness factor of 95% is good enough. | ||
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad. | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
int acceptableNonUniqueCount = uniqueStrings.Length / 20; | ||
|
||
foreach (string s in uniqueStrings) | ||
{ | ||
if (!set.Add(s) && --acceptableNonUniqueCount < 0) | ||
if (!set.Add(s) && acceptableNonUniqueCount-- <= 0) | ||
{ | ||
return false; | ||
} | ||
|
@@ -263,25 +241,34 @@ public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex | |
public bool RightJustifiedSubstring => HashIndex < 0; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
public static ReadOnlySpan<char> Slicer(this string s, int index, int count) => s.AsSpan((index >= 0 ? index : s.Length + index), count); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This ternary is the sole remaining conditional jump in the hot loop, but there's no good way to avoid that simultaneous avoiding There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have tried running this as a jumpless method body, but it makes little difference and is harder to grok; passing public static ReadOnlySpan<char> Slicer(this string s, byte fromRight, int index, int count) => s.AsSpan((s.Length * fromRight) + index), count); Also, if we COULD swap the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did something like that in PR #89689 which is a huge win... since we can't swap the comparer, I ended up creating both a left and right HashSet with backing comparer that "hard codes" the left/right logic. HUGE WINS, see that PR. |
||
|
||
private abstract class SubstringComparer : IEqualityComparer<string> | ||
{ | ||
public int Index; | ||
public int Count; | ||
public bool IsLeft; | ||
public int Index; // offset from left side (if positive) or right side (if negative) of the string | ||
IDisposable marked this conversation as resolved.
Show resolved
Hide resolved
|
||
public int Count; // number of characters in the span | ||
|
||
public abstract bool Equals(string? x, string? y); | ||
public abstract int GetHashCode(string s); | ||
} | ||
|
||
private sealed class JustifiedSubstringComparer : SubstringComparer | ||
{ | ||
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count)); | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); | ||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
public override bool Equals(string? x, string? y) => x!.Slicer(Index, Count).SequenceEqual(y!.Slicer(Index, Count)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The Slicer does the left/right based on the sign of the I really wish that String.AsSpan understood the use of |
||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.Slicer(Index, Count)); | ||
} | ||
|
||
private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer | ||
{ | ||
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase); | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); | ||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
public override bool Equals(string? x, string? y) => x!.Slicer(Index, Count).Equals(y!.Slicer(Index, Count), StringComparison.OrdinalIgnoreCase); | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.Slicer(Index, Count)); | ||
} | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.