5
5
using System . Collections . Generic ;
6
6
using System . Diagnostics ;
7
7
using System . Runtime . CompilerServices ;
8
+ using System . Collections . Frozen . String . SubstringComparers ;
8
9
9
10
namespace System . Collections . Frozen
10
11
{
@@ -29,40 +30,40 @@ internal static class KeyAnalyzer
29
30
public static AnalysisResults Analyze (
30
31
ReadOnlySpan < string > uniqueStrings , bool ignoreCase , int minLength , int maxLength )
31
32
{
32
- Debug . Assert ( uniqueStrings . Length > 0 ) ;
33
+ Debug . Assert ( ! uniqueStrings . IsEmpty ) ;
33
34
34
35
if ( minLength > 0 )
35
36
{
36
37
const int MaxSubstringLengthLimit = 8 ; // arbitrary small-ish limit...it's not worth the increase in algorithmic complexity to analyze longer substrings
38
+ int uniqueStringsLength = uniqueStrings . Length ;
37
39
38
40
// Sufficient uniqueness factor of 95% is good enough.
39
41
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
40
- int acceptableNonUniqueCount = uniqueStrings . Length / 20 ;
42
+ int acceptableNonUniqueCount = uniqueStringsLength / 20 ;
41
43
42
- // Try to pick a substring comparer.
43
- SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer ( ) : new JustifiedSubstringComparer ( ) ;
44
- HashSet < string > set = new HashSet < string > (
45
- #if NET6_0_OR_GREATER
46
- uniqueStrings . Length ,
47
- #endif
48
- comparer) ;
44
+ ISubstringComparer leftComparer = ignoreCase ? new LeftSubstringCaseInsensitiveComparer ( ) : new LeftSubstringOrdinalComparer ( ) ;
45
+ HashSet < string > leftSet = MakeHashSet ( uniqueStringsLength , leftComparer ) ;
46
+
47
+ // we lazily spin up the right comparators when/if needed
48
+ ISubstringComparer ? rightComparer = null ;
49
+ HashSet < string > ? rightSet = null ;
49
50
50
51
// For each substring length...preferring the shortest length that provides
51
52
// enough uniqueness
52
53
int maxSubstringLength = Math . Min ( minLength , MaxSubstringLengthLimit ) ;
53
54
for ( int count = 1 ; count <= maxSubstringLength ; count ++ )
54
55
{
55
- comparer . Count = count ;
56
+ leftComparer . Count = count ;
56
57
57
58
// For each index, get a uniqueness factor for the left-justified substrings.
58
59
// If any is above our threshold, we're done.
59
60
for ( int index = 0 ; index <= minLength - count ; index ++ )
60
61
{
61
- comparer . Index = index ;
62
+ leftComparer . Index = index ;
62
63
63
- if ( HasSufficientUniquenessFactor ( set , uniqueStrings , acceptableNonUniqueCount ) )
64
+ if ( HasSufficientUniquenessFactor ( leftSet , uniqueStrings , acceptableNonUniqueCount ) )
64
65
{
65
- return CreateAnalysisResults ( uniqueStrings , ignoreCase , minLength , maxLength , index , count ) ;
66
+ return CreateAnalysisResults ( uniqueStrings , ignoreCase , minLength , maxLength , leftComparer ) ;
66
67
}
67
68
}
68
69
@@ -72,29 +73,42 @@ public static AnalysisResults Analyze(
72
73
// right-justified substrings, and so we also check right-justification.
73
74
if ( minLength != maxLength )
74
75
{
76
+ rightComparer ??= ignoreCase ? new RightSubstringCaseInsensitiveComparer ( ) : new RightSubstringOrdinalComparer ( ) ;
77
+ rightSet ??= MakeHashSet ( uniqueStringsLength , rightComparer ) ;
78
+
75
79
// when Index is negative, we're offsetting from the right, ensure we're at least
76
80
// far enough from the right that we have count characters available
77
- comparer . Index = - count ;
81
+ rightComparer ! . Count = count ;
82
+ rightComparer ! . Index = - count ;
78
83
79
84
// For each index, get a uniqueness factor for the right-justified substrings.
80
85
// If any is above our threshold, we're done.
81
- for ( int offset = 0 ; offset <= minLength - count ; offset ++ , comparer . Index -- )
86
+ for ( int offset = 0 ; offset <= minLength - count ; offset ++ , rightComparer ! . Index -- )
82
87
{
83
- if ( HasSufficientUniquenessFactor ( set , uniqueStrings , acceptableNonUniqueCount ) )
88
+ if ( HasSufficientUniquenessFactor ( rightSet ! , uniqueStrings , acceptableNonUniqueCount ) )
84
89
{
85
- return CreateAnalysisResults ( uniqueStrings , ignoreCase , minLength , maxLength , comparer . Index , count ) ;
90
+ return CreateAnalysisResults ( uniqueStrings , ignoreCase , minLength , maxLength , rightComparer ) ;
86
91
}
87
92
}
88
93
}
89
94
}
90
95
}
91
96
92
97
// Could not find a substring index/length that was good enough, use the entire string.
93
- return CreateAnalysisResults ( uniqueStrings , ignoreCase , minLength , maxLength , 0 , 0 ) ;
98
+ return CreateAnalysisResults ( uniqueStrings , ignoreCase , minLength , maxLength , s_FullStringComparer ) ;
99
+ }
100
+
101
+ private static HashSet < string > MakeHashSet ( int length , IEqualityComparer < string > comparer )
102
+ {
103
+ return new HashSet < string > (
104
+ #if NET6_0_OR_GREATER
105
+ length ,
106
+ #endif
107
+ comparer) ;
94
108
}
95
109
96
110
private static AnalysisResults CreateAnalysisResults (
97
- ReadOnlySpan < string > uniqueStrings , bool ignoreCase , int minLength , int maxLength , int index , int count )
111
+ ReadOnlySpan < string > uniqueStrings , bool ignoreCase , int minLength , int maxLength , ISubstringComparer comparer )
98
112
{
99
113
// Start off by assuming all strings are ASCII
100
114
bool allAsciiIfIgnoreCase = true ;
@@ -113,7 +127,7 @@ private static AnalysisResults CreateAnalysisResults(
113
127
foreach ( string s in uniqueStrings )
114
128
{
115
129
// Get the span for the substring.
116
- ReadOnlySpan < char > substring = count == 0 ? s . AsSpan ( ) : Slicer ( s , index , count ) ;
130
+ ReadOnlySpan < char > substring = comparer . Slice ( s ) ;
117
131
118
132
// If the substring isn't ASCII, bail out to return the results.
119
133
if ( ! IsAllAscii ( substring ) )
@@ -139,7 +153,7 @@ private static AnalysisResults CreateAnalysisResults(
139
153
}
140
154
141
155
// Return the analysis results.
142
- return new AnalysisResults ( ignoreCase , allAsciiIfIgnoreCase , index , count , minLength , maxLength ) ;
156
+ return new AnalysisResults ( ignoreCase , allAsciiIfIgnoreCase , comparer . Index , comparer . Count , minLength , maxLength ) ;
143
157
}
144
158
145
159
internal static unsafe bool IsAllAscii ( ReadOnlySpan < char > s )
@@ -184,7 +198,7 @@ internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
184
198
#if NET8_0_OR_GREATER
185
199
private static readonly SearchValues < char > s_asciiLetters = SearchValues . Create ( "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" ) ;
186
200
#endif
187
- private static bool ContainsAnyLetters ( ReadOnlySpan < char > s )
201
+ internal static bool ContainsAnyLetters ( ReadOnlySpan < char > s )
188
202
{
189
203
Debug . Assert ( IsAllAscii ( s ) ) ;
190
204
@@ -203,14 +217,13 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
203
217
#endif
204
218
}
205
219
206
- private static bool HasSufficientUniquenessFactor ( HashSet < string > set , ReadOnlySpan < string > uniqueStrings , int acceptableNonUniqueCount )
220
+ internal static bool HasSufficientUniquenessFactor ( HashSet < string > set , ReadOnlySpan < string > uniqueStrings , int acceptableNonUniqueCount )
207
221
{
208
- set . Clear ( ) ;
209
-
210
222
foreach ( string s in uniqueStrings )
211
223
{
212
- if ( ! set . Add ( s ) && acceptableNonUniqueCount -- <= 0 )
224
+ if ( ! set . Add ( s ) && -- acceptableNonUniqueCount < 0 )
213
225
{
226
+ set . Clear ( ) ;
214
227
return false ;
215
228
}
216
229
}
@@ -241,34 +254,6 @@ public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex
241
254
public bool RightJustifiedSubstring => HashIndex < 0 ;
242
255
}
243
256
244
- [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
245
- public static ReadOnlySpan < char > Slicer ( this string s , int index , int count ) => s . AsSpan ( ( index >= 0 ? index : s . Length + index ) , count ) ;
246
-
247
- private abstract class SubstringComparer : IEqualityComparer < string >
248
- {
249
- public int Index ; // offset from left side (if positive) or right side (if negative) of the string
250
- public int Count ; // number of characters in the span
251
-
252
- public abstract bool Equals ( string ? x , string ? y ) ;
253
- public abstract int GetHashCode ( string s ) ;
254
- }
255
-
256
- private sealed class JustifiedSubstringComparer : SubstringComparer
257
- {
258
- [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
259
- public override bool Equals ( string ? x , string ? y ) => x ! . Slicer ( Index , Count ) . SequenceEqual ( y ! . Slicer ( Index , Count ) ) ;
260
-
261
- [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
262
- public override int GetHashCode ( string s ) => Hashing . GetHashCodeOrdinal ( s . Slicer ( Index , Count ) ) ;
263
- }
264
-
265
- private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer
266
- {
267
- [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
268
- public override bool Equals ( string ? x , string ? y ) => x ! . Slicer ( Index , Count ) . Equals ( y ! . Slicer ( Index , Count ) , StringComparison . OrdinalIgnoreCase ) ;
269
-
270
- [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
271
- public override int GetHashCode ( string s ) => Hashing . GetHashCodeOrdinalIgnoreCase ( s . Slicer ( Index , Count ) ) ;
272
- }
257
+ private static FullStringComparer s_FullStringComparer = new FullStringComparer ( ) ;
273
258
}
274
259
}
0 commit comments