Skip to content

Speed up KeyAnalyzer for substring based frozen collections #89863

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ The System.Collections.Immutable library is built-in as part of the shared frame

<ItemGroup>
<Compile Include="Properties\InternalsVisibleTo.cs" />
<Compile Include="System\Collections\Frozen\String\SubstringEquality\SubstringEqualityComparers.cs" />
<Compile Include="System\Collections\Frozen\String\SubstringEquality\SubstringEqualityComparerBase.cs" />

<Compile Include="System\Polyfills.cs" />
<Compile Include="System\Collections\ThrowHelper.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers;
using System.Collections.Frozen.String.SubstringEquality;
using System.Collections.Generic;
using System.Diagnostics;
#if !NET8_0_OR_GREATER
using System.Runtime.CompilerServices;
#endif

namespace System.Collections.Frozen
{
Expand All @@ -33,84 +32,88 @@ public static AnalysisResults Analyze(
{
Debug.Assert(!uniqueStrings.IsEmpty);

// Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer.
AnalysisResults results;
if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results))
if (minLength > 0)
{
results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
}
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit...it's not worth the increase in algorithmic complexity to analyze longer substrings
int uniqueStringsLength = uniqueStrings.Length;

return results;
}

/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
{
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings
// Sufficient uniqueness factor of 95% is good enough.
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
int acceptableNonUniqueCount = uniqueStringsLength / 20;

SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
HashSet<string> set = new HashSet<string>(
#if NET6_0_OR_GREATER
uniqueStrings.Length,
#endif
comparer);
ISubstringEqualityComparer leftComparer = ignoreCase ? new LeftSubstringCaseInsensitiveComparer() : new LeftSubstringOrdinalComparer();
HashSet<string> leftSet = MakeHashSet(uniqueStringsLength, leftComparer);

// For each substring length...
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
comparer.IsLeft = true;
comparer.Count = count;
// The right-justified comparer and its set are created lazily, only when/if they are needed.
ISubstringEqualityComparer? rightComparer = null;
HashSet<string>? rightSet = null;

// For each index, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
// For each substring length...preferring the shortest length that provides
// enough uniqueness
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
comparer.Index = index;
int maxOffset = minLength - count;
leftComparer.Start(0, count);

if (HasSufficientUniquenessFactor(set, uniqueStrings))
// For each offset, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int offset = 0; offset <= maxOffset; offset++)
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, index, count,
static (string s, int index, int count) => s.AsSpan(index, count));
return true;
}
}
if (HasSufficientUniquenessFactor(leftSet, uniqueStrings, acceptableNonUniqueCount))
{
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, leftComparer);
}

// There were no left-justified substrings of this length available.
// If all of the strings are of the same length, then just checking left-justification is sufficient.
// But if any strings are of different lengths, then we'll get different alignments for left- vs
// right-justified substrings, and so we also check right-justification.
if (minLength != maxLength)
{
// toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it)
comparer.IsLeft = false;
leftComparer.GoRight();
}

// For each index, get a uniqueness factor for the right-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
// There were no left-justified substrings of this length available.
// If all of the strings are of the same length, then just checking left-justification is sufficient.
// But if any strings are of different lengths, then we'll get different alignments for left-justified
// vs right-justified substrings, and so we also check right-justification.
if (minLength != maxLength)
{
// Get a uniqueness factor for the right-justified substrings.
// If it's above our threshold, we're done.
comparer.Index = -index - count;
if (HasSufficientUniquenessFactor(set, uniqueStrings))
if (rightComparer is null)
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
return true;
rightComparer = ignoreCase ? new RightSubstringCaseInsensitiveComparer() : new RightSubstringOrdinalComparer();
rightSet = MakeHashSet(uniqueStringsLength, rightComparer);
}

// when we're offsetting from the right, ensure we're at least far enough
// from the right that we have count characters available
rightComparer.Start(-count, count);

// For each offset, get a uniqueness factor for the right-justified substrings.
// If any is above our threshold, we're done.
for (int offset = 0; offset <= maxOffset; offset++)
{
if (HasSufficientUniquenessFactor(rightSet!, uniqueStrings, acceptableNonUniqueCount))
{
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, rightComparer);
}

rightComparer.GoLeft();
}
}
}
}

// Could not find a substring index/length that was good enough.
results = default;
return false;
// Could not find a substring index/length that was good enough, use the entire string.
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, new FullStringEqualityComparer());
}

/// <summary>Creates an empty <see cref="HashSet{T}"/> of strings that uses <paramref name="comparer"/>.</summary>
/// <param name="length">Initial capacity hint; only honored when compiled for NET6_0_OR_GREATER.</param>
/// <param name="comparer">The equality comparer the set uses for its elements.</param>
/// <returns>A new, empty set.</returns>
private static HashSet<string> MakeHashSet(int length, IEqualityComparer<string> comparer)
{
#if NET6_0_OR_GREATER
    // Presize the set so that adding up to 'length' items does not trigger a resize.
    return new HashSet<string>(length, comparer);
#else
    // The capacity argument is not passed on this target, so the hint is ignored.
    return new HashSet<string>(comparer);
#endif
}

private static AnalysisResults CreateAnalysisResults(
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getSubstringSpan)
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, ISubstringEqualityComparer comparer)
{
// Start off by assuming all strings are ASCII
bool allAsciiIfIgnoreCase = true;
Expand All @@ -129,7 +132,7 @@ private static AnalysisResults CreateAnalysisResults(
foreach (string s in uniqueStrings)
{
// Get the span for the substring.
ReadOnlySpan<char> substring = getSubstringSpan(s, index, count);
ReadOnlySpan<char> substring = comparer.Slice(s);

// If the substring isn't ASCII, bail out to return the results.
if (!IsAllAscii(substring))
Expand All @@ -155,11 +158,9 @@ private static AnalysisResults CreateAnalysisResults(
}

// Return the analysis results.
return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, index, count, minLength, maxLength);
return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, comparer.Index, comparer.Count, minLength, maxLength);
}

private delegate ReadOnlySpan<char> GetSpan(string s, int index, int count);

internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
{
#if NET8_0_OR_GREATER
Expand Down Expand Up @@ -202,7 +203,7 @@ internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
#if NET8_0_OR_GREATER
private static readonly SearchValues<char> s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
#endif
private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
internal static bool ContainsAnyLetters(ReadOnlySpan<char> s)
{
Debug.Assert(IsAllAscii(s));

Expand All @@ -221,18 +222,13 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
#endif
}

private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
internal static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount)
{
set.Clear();

// Sufficient uniqueness factor of 95% is good enough.
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
int acceptableNonUniqueCount = uniqueStrings.Length / 20;

foreach (string s in uniqueStrings)
{
if (!set.Add(s) && --acceptableNonUniqueCount < 0)
{
set.Clear();
return false;
}
}
Expand Down Expand Up @@ -262,26 +258,5 @@ public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex
public bool SubstringHashing => HashCount != 0;
public bool RightJustifiedSubstring => HashIndex < 0;
}

/// <summary>
/// Base type for string comparers whose comparison window is configured through
/// the mutable <see cref="Index"/>, <see cref="Count"/> and <see cref="IsLeft"/> fields.
/// </summary>
private abstract class SubstringComparer : IEqualityComparer<string>
{
    /// <summary>Whether <see cref="Index"/> is used as-is (true) or added to the string's length (false) by the derived comparers.</summary>
    public bool IsLeft;

    /// <summary>Start of the compared substring; see <see cref="IsLeft"/> for how it is interpreted.</summary>
    public int Index;

    /// <summary>Number of characters in the compared substring.</summary>
    public int Count;

    /// <summary>Computes a hash code for <paramref name="s"/>.</summary>
    public abstract int GetHashCode(string s);

    /// <summary>Determines whether <paramref name="x"/> and <paramref name="y"/> are considered equal.</summary>
    public abstract bool Equals(string? x, string? y);
}

/// <summary>Ordinal comparer over the substring window described by the inherited Index/Count/IsLeft fields.</summary>
private sealed class JustifiedSubstringComparer : SubstringComparer
{
    /// <summary>Compares the configured window of <paramref name="x"/> and <paramref name="y"/> character by character.</summary>
    public override bool Equals(string? x, string? y)
    {
        ReadOnlySpan<char> left = Window(x);
        ReadOnlySpan<char> right = Window(y);
        return left.SequenceEqual(right);
    }

    /// <summary>Hashes the configured window of <paramref name="s"/> with <c>Hashing.GetHashCodeOrdinal</c>.</summary>
    public override int GetHashCode(string s)
    {
        return Hashing.GetHashCodeOrdinal(Window(s));
    }

    /// <summary>
    /// Slices the configured window out of <paramref name="s"/>: starting at Index when IsLeft is set,
    /// otherwise at the string's length plus Index.
    /// </summary>
    private ReadOnlySpan<char> Window(string? s) => s.AsSpan(IsLeft ? Index : (s!.Length + Index), Count);
}

/// <summary>Case-insensitive (ordinal) comparer over the substring window described by the inherited Index/Count/IsLeft fields.</summary>
private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer
{
    /// <summary>Compares the configured window of <paramref name="x"/> and <paramref name="y"/> using <see cref="StringComparison.OrdinalIgnoreCase"/>.</summary>
    public override bool Equals(string? x, string? y)
    {
        ReadOnlySpan<char> left = Window(x);
        ReadOnlySpan<char> right = Window(y);
        return left.Equals(right, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Hashes the configured window of <paramref name="s"/> with <c>Hashing.GetHashCodeOrdinalIgnoreCase</c>.</summary>
    public override int GetHashCode(string s)
    {
        return Hashing.GetHashCodeOrdinalIgnoreCase(Window(s));
    }

    /// <summary>
    /// Slices the configured window out of <paramref name="s"/>: starting at Index when IsLeft is set,
    /// otherwise at the string's length plus Index.
    /// </summary>
    private ReadOnlySpan<char> Window(string? s) => s.AsSpan(IsLeft ? Index : (s!.Length + Index), Count);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Runtime.CompilerServices;

namespace System.Collections.Frozen.String.SubstringEquality
{
/// <summary>
/// An equality comparer for strings that also exposes the slice (start index and character count)
/// it is configured with, and lets that slice be repositioned.
/// </summary>
internal interface ISubstringEqualityComparer : IEqualityComparer<string>
{
    /// <summary>Gets the index at which the slice begins.</summary>
    /// <remarks>
    /// A zero or positive value is an offset from the left side of the string;
    /// a negative value is an offset from the right side.
    /// </remarks>
    int Index { get; }

    /// <summary>Gets the number of characters to include in the slice.</summary>
    int Count { get; }

    /// <summary>Moves the starting index one position to the left (decrements it).</summary>
    /// <remarks>
    /// For left-justified slicing this moves the starting point closer to zero. For right-justified
    /// slicing it moves the index away from zero, i.e. toward the beginning of the string
    /// (as indexed from the right side).
    /// </remarks>
    void GoLeft();

    /// <summary>Moves the starting index one position to the right (increments it).</summary>
    /// <remarks>
    /// For left-justified slicing this moves the starting point away from zero. For right-justified
    /// slicing it moves the index toward zero, i.e. toward the end of the string
    /// (as indexed from the right side).
    /// </remarks>
    void GoRight();

    /// <summary>Sets up either left-justified or right-justified slicing of a string.</summary>
    /// <param name="index">
    /// The starting index for slicing. If zero or greater, this is a left-justified slice measured from the
    /// start of the input string. If less than zero, this is a right-justified slice measured from the end
    /// of the input string.
    /// </param>
    /// <param name="count">The number of characters to include in the slice.</param>
    /// <remarks>
    /// A typical left-justified slice passes zero for <paramref name="index"/>. A typical right-justified
    /// slice passes the negated <paramref name="count"/> for <paramref name="index"/>, so that the slice
    /// covers the last <paramref name="count"/> characters.
    /// </remarks>
    void Start(int index, int count);

    /// <summary>Creates a new read-only span over the configured portion of the target string.</summary>
    /// <param name="s">The target string.</param>
    /// <returns>The slice of <paramref name="s"/> described by <see cref="Index"/> and <see cref="Count"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// Thrown when the specified index or count is not in range.
    /// </exception>
    ReadOnlySpan<char> Slice(string s);
}

/// <summary>
/// Shared implementation of <see cref="ISubstringEqualityComparer"/>: it owns the slice position
/// (index and count) and forwards slicing, equality and hashing to a struct wrapper.
/// </summary>
/// <typeparam name="TThisWrapper">The struct wrapper type through which the forwarded calls are made.</typeparam>
internal abstract class SubstringEqualityComparerBase<TThisWrapper> : ISubstringEqualityComparer
    where TThisWrapper : struct, SubstringEqualityComparerBase<TThisWrapper>.IGenericSpecializedWrapper
{
    /// <summary>A wrapper around this instance that enables access to important members without making virtual calls.</summary>
    private readonly TThisWrapper _this;

    /// <summary>Backing field for <see cref="Index"/>.</summary>
    protected int _index;

    /// <summary>Backing field for <see cref="Count"/>.</summary>
    protected int _count;

    /// <summary>Initializes the wrapper and hands it a reference to this instance.</summary>
    protected SubstringEqualityComparerBase()
    {
        // Start from the default wrapper value, then store this instance in it.
        _this = default;
        _this.Store(this);
    }

    /// <inheritdoc />
    public int Index => _index;

    /// <inheritdoc />
    public int Count => _count;

    /// <inheritdoc />
    public void Start(int index, int count)
    {
        _count = count;
        _index = index;
    }

    /// <inheritdoc />
    public void GoLeft()
    {
        _index--;
    }

    /// <inheritdoc />
    public void GoRight()
    {
        _index++;
    }

    /// <inheritdoc />
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public ReadOnlySpan<char> Slice(string s)
    {
        return _this.Slice(s);
    }

    /// <inheritdoc />
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool Equals(string? x, string? y)
    {
        return _this.Equals(x, y);
    }

    /// <inheritdoc />
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public int GetHashCode(string s)
    {
        return _this.GetHashCode(s);
    }

    /// <summary>Used to enable generic specialization with reference types.</summary>
    /// <remarks>
    /// To avoid each forwarded call incurring virtual dispatch to the derived type, the derived
    /// type hands down a struct wrapper through which all calls are performed. This base
    /// class uses that generic struct wrapper to specialize and de-virtualize.
    /// </remarks>
    internal interface IGenericSpecializedWrapper
    {
        /// <summary>Receives the comparer instance that the wrapper forwards to.</summary>
        void Store(ISubstringEqualityComparer @this);

        /// <summary>Returns the configured slice of <paramref name="s"/>.</summary>
        ReadOnlySpan<char> Slice(string s);

        /// <summary>Determines whether <paramref name="x"/> and <paramref name="y"/> are considered equal.</summary>
        bool Equals(string? x, string? y);

        /// <summary>Computes a hash code for <paramref name="s"/>.</summary>
        int GetHashCode(string s);
    }
}
}
Loading