@@ -46,29 +46,35 @@ public static AnalysisResults Analyze(
46
46
/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
47
47
private static bool TryUseSubstring ( ReadOnlySpan < string > uniqueStrings , bool ignoreCase , int minLength , int maxLength , out AnalysisResults results )
48
48
{
49
- const int MaxSubstringLengthLimit = 8 ; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings
49
+ const int MaxSubstringLengthLimit = 8 ; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings
50
+ int uniqueStringsLength = uniqueStrings . Length ;
51
+
52
+ // Sufficient uniqueness factor of 95% is good enough.
53
+ // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
54
+ int acceptableNonUniqueCount = uniqueStringsLength / 20 ;
50
55
51
56
SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer ( ) : new JustifiedSubstringComparer ( ) ;
52
57
HashSet < string > set = new HashSet < string > (
53
58
#if NET6_0_OR_GREATER
54
- uniqueStrings . Length ,
59
+ uniqueStringsLength ,
55
60
#endif
56
61
comparer) ;
57
62
58
- // For each substring length...
63
+ // For each substring length... preferring the shortest length that provides
64
+ // enough uniqueness
59
65
int maxSubstringLength = Math . Min ( minLength , MaxSubstringLengthLimit ) ;
60
66
for ( int count = 1 ; count <= maxSubstringLength ; count ++ )
61
67
{
62
68
comparer . IsLeft = true ;
63
69
comparer . Count = count ;
64
70
65
- // For each index, get a uniqueness factor for the left-justified substrings.
71
+ // For each index, get a uniqueness factor for the left-justified substrings.
66
72
// If any is above our threshold, we're done.
67
73
for ( int index = 0 ; index <= minLength - count ; index ++ )
68
74
{
69
75
comparer . Index = index ;
70
76
71
- if ( HasSufficientUniquenessFactor ( set , uniqueStrings ) )
77
+ if ( HasSufficientUniquenessFactor ( set , uniqueStrings , acceptableNonUniqueCount ) )
72
78
{
73
79
results = CreateAnalysisResults (
74
80
uniqueStrings , ignoreCase , minLength , maxLength , index , count ,
@@ -90,10 +96,9 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
90
96
// If any is above our threshold, we're done.
91
97
for ( int index = 0 ; index <= minLength - count ; index ++ )
92
98
{
93
- // Get a uniqueness factor for the right-justified substrings.
94
- // If it's above our threshold, we're done.
95
99
comparer . Index = - index - count ;
96
- if ( HasSufficientUniquenessFactor ( set , uniqueStrings ) )
100
+
101
+ if ( HasSufficientUniquenessFactor ( set , uniqueStrings , acceptableNonUniqueCount ) )
97
102
{
98
103
results = CreateAnalysisResults (
99
104
uniqueStrings , ignoreCase , minLength , maxLength , comparer . Index , count ,
@@ -202,7 +207,7 @@ internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
202
207
#if NET8_0_OR_GREATER
203
208
private static readonly SearchValues < char > s_asciiLetters = SearchValues . Create ( "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" ) ;
204
209
#endif
205
- private static bool ContainsAnyLetters ( ReadOnlySpan < char > s )
210
+ internal static bool ContainsAnyLetters ( ReadOnlySpan < char > s )
206
211
{
207
212
Debug . Assert ( IsAllAscii ( s ) ) ;
208
213
@@ -221,14 +226,10 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
221
226
#endif
222
227
}
223
228
224
- private static bool HasSufficientUniquenessFactor ( HashSet < string > set , ReadOnlySpan < string > uniqueStrings )
229
+ internal static bool HasSufficientUniquenessFactor ( HashSet < string > set , ReadOnlySpan < string > uniqueStrings , int acceptableNonUniqueCount )
225
230
{
226
231
set . Clear ( ) ;
227
232
228
- // Sufficient uniqueness factor of 95% is good enough.
229
- // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
230
- int acceptableNonUniqueCount = uniqueStrings . Length / 20 ;
231
-
232
233
foreach ( string s in uniqueStrings )
233
234
{
234
235
if ( ! set . Add ( s ) && -- acceptableNonUniqueCount < 0 )
0 commit comments