|
| 1 | +const std = @import("std"); |
| 2 | +const builtin = @import("builtin"); |
| 3 | +const assert = std.debug.assert; |
| 4 | +const offsets = @import("offsets.zig"); |
| 5 | + |
| 6 | +const WorkspaceSymbolStore = @This(); |
| 7 | + |
/// A store that holds no entries.
pub const empty: WorkspaceSymbolStore = .{ .entries = .empty };

/// Entries owned by the store; each one is released by `deinit`.
entries: std.ArrayListUnmanaged(Entry),
| 13 | + |
/// Releases every entry held by the store, then the entry list itself.
///
/// `allocator` is passed to each entry's `deinit` and to the list's `deinit`.
pub fn deinit(store: *WorkspaceSymbolStore, allocator: std.mem.Allocator) void {
    var index: usize = 0;
    while (index < store.entries.items.len) : (index += 1) {
        store.entries.items[index].deinit(allocator);
    }
    store.entries.deinit(allocator);
}
| 20 | + |
/// Trigram index over a set of declaration names.
///
/// Names are added with `appendDeclarations`; `finalize` then sorts the
/// declarations, builds `trigram_to_declarations` and the cuckoo filter.
pub const Entry = struct {
    /// Three consecutive bytes of a declaration name.
    pub const Trigram = [3]u8;

    /// Half-open byte range `[start, end)` into `Entry.names`.
    pub const NameSlice = struct { start: u32, end: u32 };

    pub const Declaration = struct {
        /// Half-open index range `[start, end)` into the sorted `Entry.declarations`.
        pub const Slice = struct { start: u32, end: u32 };

        trigram: Trigram,
        name: NameSlice,
        range: offsets.Range,
    };

    /// An entry that owns no memory and has no filter.
    pub const empty: Entry = .{
        .has_filter = false,
        .filter_buckets = .empty,
        .trigram_to_declarations = .empty,
        .declarations = .empty,
        .names = .empty,
    };

    /// Whether `filter_buckets` currently holds a fully built filter.
    has_filter: bool,
    /// Bucket storage for the `CuckooFilter` built by `finalize`.
    filter_buckets: std.ArrayListUnmanaged(CuckooFilter.Bucket),
    /// Maps each distinct trigram to its run in the sorted `declarations`.
    /// Filled by `finalize`.
    trigram_to_declarations: std.AutoArrayHashMapUnmanaged(Trigram, Declaration.Slice),
    /// One element per trigram of every appended name.
    declarations: std.MultiArrayList(Declaration),
    /// Concatenated bytes of all appended names, addressed through `NameSlice`.
    names: std.ArrayListUnmanaged(u8),

    /// Frees all memory owned by the entry and leaves it undefined.
    pub fn deinit(entry: *Entry, allocator: std.mem.Allocator) void {
        entry.filter_buckets.deinit(allocator);
        entry.trigram_to_declarations.deinit(allocator);
        entry.declarations.deinit(allocator);
        entry.names.deinit(allocator);
        entry.* = undefined;
    }

    /// Empties the entry so it can be refilled.
    ///
    /// The map, declaration list and name buffer keep their allocations; the
    /// filter buckets are freed and `has_filter` is cleared.
    pub fn clearRetainingCapacity(entry: *Entry, allocator: std.mem.Allocator) void {
        entry.filter_buckets.clearAndFree(allocator);
        entry.has_filter = false;
        entry.trigram_to_declarations.clearRetainingCapacity();
        entry.declarations.clearRetainingCapacity();
        entry.names.clearRetainingCapacity();
    }

    /// Records `name` and one declaration per trigram (3-byte window) of it,
    /// all sharing the same name slice and `range`.
    ///
    /// Caller must not submit name.len < 3.
    ///
    /// `error.InvalidUtf8` stays in the error set for caller compatibility;
    /// this implementation works on raw bytes and does not return it.
    pub fn appendDeclarations(
        entry: *Entry,
        allocator: std.mem.Allocator,
        name: []const u8,
        range: offsets.Range,
    ) error{ OutOfMemory, InvalidUtf8 }!void {
        assert(name.len >= 3);

        const trigram_count = name.len - 2;

        // Reserve everything up front so a failed allocation leaves the entry untouched.
        try entry.names.ensureUnusedCapacity(allocator, name.len);
        try entry.declarations.ensureUnusedCapacity(allocator, trigram_count);

        const name_start = entry.names.items.len;
        entry.names.appendSliceAssumeCapacity(name);
        const name_slice: NameSlice = .{
            .start = @intCast(name_start),
            .end = @intCast(entry.names.items.len),
        };

        for (0..trigram_count) |index| {
            entry.declarations.appendAssumeCapacity(.{
                .trigram = name[index..][0..3].*,
                .name = name_slice,
                .range = range,
            });
        }
    }

    /// Sort context for `declarations`: orders by trigram (compared as a
    /// `u24`), then by `range.start`.
    pub const SortDeclarations = struct {
        trigrams: []const Trigram,
        range: []const offsets.Range,

        pub fn lessThan(ctx: SortDeclarations, a_index: usize, b_index: usize) bool {
            const a_trigram_numeric: u24 = @bitCast(ctx.trigrams[a_index]);
            const b_trigram_numeric: u24 = @bitCast(ctx.trigrams[b_index]);

            return a_trigram_numeric < b_trigram_numeric or
                (a_trigram_numeric == b_trigram_numeric and
                    ctx.range[a_index].start < ctx.range[b_index].start);
        }
    };

    /// Must be called before any queries are executed.
    ///
    /// Sorts the declarations, rebuilds `trigram_to_declarations` from the
    /// sorted runs, and builds the cuckoo filter over the distinct trigrams.
    /// If an insertion into the filter fails, `has_filter` is left `false`.
    pub fn finalize(entry: *Entry, allocator: std.mem.Allocator) error{OutOfMemory}!void {
        entry.declarations.sortUnstable(SortDeclarations{
            .trigrams = entry.declarations.items(.trigram),
            .range = entry.declarations.items(.range),
        });

        // The filter is considered unusable until it has been rebuilt below.
        entry.has_filter = false;

        // After sorting, equal trigrams are adjacent: record one slice per run.
        entry.trigram_to_declarations.clearRetainingCapacity();
        const trigrams = entry.declarations.items(.trigram);
        var run_start: usize = 0;
        while (run_start < trigrams.len) {
            const trigram = trigrams[run_start];
            var run_end = run_start + 1;
            while (run_end < trigrams.len and std.mem.eql(u8, &trigrams[run_end], &trigram)) {
                run_end += 1;
            }
            try entry.trigram_to_declarations.putNoClobber(allocator, trigram, .{
                .start = @intCast(run_start),
                .end = @intCast(run_end),
            });
            run_start = run_end;
        }

        // `CuckooFilter` requires a non-zero, power-of-two number of buckets.
        const bucket_count = std.math.ceilPowerOfTwo(
            usize,
            @max(1, entry.trigram_to_declarations.count()),
        ) catch return error.OutOfMemory;

        try entry.filter_buckets.ensureTotalCapacityPrecise(allocator, bucket_count);
        entry.filter_buckets.items.len = bucket_count;

        const filter: CuckooFilter = .{ .buckets = entry.filter_buckets.items };
        filter.reset();
        entry.has_filter = true;

        // Fixed seed, so the eviction choices are reproducible.
        var prng = std.Random.DefaultPrng.init(0);

        for (entry.trigram_to_declarations.keys()) |trigram| {
            filter.append(prng.random(), trigram) catch |err| switch (err) {
                error.EvictionFailed => {
                    // NOTE: This should generally be quite rare.
                    entry.has_filter = false;
                    break;
                },
            };
        }
    }
};
| 137 | + |
| 138 | +// TODO: The pow2 requirement is quite inefficient: explore ideas posted in |
| 139 | +// https://databasearchitects.blogspot.com/2019/07/cuckoo-filters-with-arbitrarily-sized.html |
| 140 | +// (rocksdb even-odd scheme from comments looks interesting). |
| 141 | +// TODO: Look more into FPR scaling. |
/// Approximate-membership filter over trigrams, stored as buckets of four
/// one-byte fingerprints.
///
/// The filter does not own `buckets`; the caller provides and frees the
/// storage. `contains` may report false positives. Once `append` has
/// returned `error.EvictionFailed` a previously stored fingerprint has been
/// dropped, so the filter must no longer be trusted.
pub const CuckooFilter = struct {
    /// len must be a power of 2.
    ///
    /// ### Pathological case with buckets.len power of 2
    ///
    /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_2`
    /// - `BucketIndex(alias_1)` -> `bucket_1`, `BucketIndex(alias_1).alternate()` -> `bucket_2`
    ///
    /// Our alternate mappings hold and `contains()` will not return false negatives.
    ///
    /// ### Pathological case with buckets.len NOT power of 2:
    ///
    /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_3`
    /// - `BucketIndex(alias_1)` -> `bucket_2`, `BucketIndex(alias_1).alternate()` -> `bucket_4`
    ///
    /// Our alternate mappings do not hold and `contains()` can return false negatives. This is not
    /// acceptable as the entire point of an AMQ datastructure is the presence of false positives
    /// but not false negatives.
    buckets: []Bucket,

    pub const Fingerprint = enum(u8) {
        /// Marks an unoccupied slot; never stored as a real fingerprint.
        none = std.math.maxInt(u8),
        _,

        /// Hash of the fingerprint byte, used to derive the alternate bucket index.
        pub fn hash(fingerprint: Fingerprint) u32 {
            return @truncate(std.hash.Murmur2_64.hash(&.{@intFromEnum(fingerprint)}));
        }
    };

    /// Four fingerprint slots; free slots hold `.none`.
    pub const Bucket = [4]Fingerprint;

    /// Unmasked bucket index; `bucketAt` reduces it modulo `buckets.len`.
    pub const BucketIndex = enum(u32) {
        _,

        /// Returns the other candidate index for `fingerprint`.
        /// XOR makes this an involution: applying it twice yields `index`.
        pub fn alternate(index: BucketIndex, fingerprint: Fingerprint) BucketIndex {
            assert(fingerprint != .none);
            return @enumFromInt(@intFromEnum(index) ^ fingerprint.hash());
        }
    };

    /// Fingerprint of a trigram together with its two candidate bucket indices.
    pub const Triplet = struct {
        fingerprint: Fingerprint,
        index_1: BucketIndex,
        index_2: BucketIndex,

        /// Derives the triplet from a single 64-bit hash of `trigram`:
        /// the low byte becomes the fingerprint, the high 32 bits `index_1`.
        pub fn initFromTrigram(trigram: Entry.Trigram) Triplet {
            const split: packed struct {
                fingerprint: Fingerprint,
                padding: u24,
                index_1: BucketIndex,
            } = @bitCast(std.hash.Murmur2_64.hash(&trigram));

            // `.none` is reserved for empty slots, so remap it to a valid value.
            const fingerprint: Fingerprint = if (split.fingerprint == .none)
                @enumFromInt(0)
            else
                split.fingerprint;

            const triplet: Triplet = .{
                .fingerprint = fingerprint,
                .index_1 = split.index_1,
                .index_2 = split.index_1.alternate(fingerprint),
            };
            assert(triplet.index_2.alternate(fingerprint) == triplet.index_1);

            return triplet;
        }
    };

    /// Marks every slot of every bucket as empty.
    pub fn reset(filter: CuckooFilter) void {
        @memset(filter.buckets, [1]Fingerprint{.none} ** 4);
    }

    // TODO: Dubious
    /// Returns the power-of-two bucket count used for `count` elements:
    /// `ceil(count / 0.95)` rounded up to the next power of two, at least 1.
    ///
    /// Uses integer arithmetic only, so it works both at comptime and at
    /// runtime. Returns `error.Overflow` if the result does not fit in `usize`.
    pub fn capacityForCount(count: usize) error{Overflow}!usize {
        // count / 0.95 == count * 20 / 19
        const scaled = try std.math.mul(usize, count, 20);
        const minimum = scaled / 19 + @intFromBool(scaled % 19 != 0);
        // `ceilPowerOfTwo` requires a non-zero argument.
        return try std.math.ceilPowerOfTwo(usize, @max(1, minimum));
    }

    /// Inserts the fingerprint of `trigram`.
    ///
    /// Tries both candidate buckets first. If both are full, it evicts random
    /// fingerprints to their alternate buckets for up to 500 rounds before
    /// giving up with `error.EvictionFailed`.
    pub fn append(filter: CuckooFilter, random: std.Random, trigram: Entry.Trigram) error{EvictionFailed}!void {
        const triplet: Triplet = .initFromTrigram(trigram);

        if (filter.appendToBucket(triplet.index_1, triplet.fingerprint) or
            filter.appendToBucket(triplet.index_2, triplet.fingerprint))
        {
            return;
        }

        var fingerprint = triplet.fingerprint;
        var index = if (random.boolean()) triplet.index_1 else triplet.index_2;
        for (0..500) |_| {
            // Store the homeless fingerprint here and carry the evicted one
            // over to its alternate bucket.
            fingerprint = filter.swapFromBucket(random, index, fingerprint);
            index = index.alternate(fingerprint);

            if (filter.appendToBucket(index, fingerprint)) {
                return;
            }
        }

        return error.EvictionFailed;
    }

    /// Maps an unmasked index to its bucket; relies on `buckets.len` being a power of two.
    fn bucketAt(filter: CuckooFilter, index: BucketIndex) *Bucket {
        assert(std.math.isPowerOfTwo(filter.buckets.len));
        return &filter.buckets[@intFromEnum(index) & (filter.buckets.len - 1)];
    }

    /// Stores `fingerprint` in the first free slot of the bucket.
    /// Returns `false` when the bucket is full.
    fn appendToBucket(filter: CuckooFilter, index: BucketIndex, fingerprint: Fingerprint) bool {
        assert(fingerprint != .none);

        const bucket = filter.bucketAt(index);
        for (bucket) |*slot| {
            if (slot.* == .none) {
                slot.* = fingerprint;
                return true;
            }
        }

        return false;
    }

    /// Replaces a randomly chosen slot of the bucket with `fingerprint` and
    /// returns the fingerprint that was stored there. The bucket must be full.
    fn swapFromBucket(
        filter: CuckooFilter,
        random: std.Random,
        index: BucketIndex,
        fingerprint: Fingerprint,
    ) Fingerprint {
        assert(fingerprint != .none);

        const target = &filter.bucketAt(index)[random.int(u2)];

        const old_fingerprint = target.*;
        assert(old_fingerprint != .none);

        target.* = fingerprint;

        return old_fingerprint;
    }

    /// Returns whether the fingerprint of `trigram` is present in either of
    /// its two candidate buckets.
    pub fn contains(filter: CuckooFilter, trigram: Entry.Trigram) bool {
        const triplet: Triplet = .initFromTrigram(trigram);

        return filter.containsInBucket(triplet.index_1, triplet.fingerprint) or
            filter.containsInBucket(triplet.index_2, triplet.fingerprint);
    }

    /// Returns whether any slot of the bucket equals `fingerprint`.
    fn containsInBucket(filter: CuckooFilter, index: BucketIndex, fingerprint: Fingerprint) bool {
        assert(fingerprint != .none);

        const bucket = filter.bucketAt(index);
        for (bucket) |*slot| {
            if (slot.* == fingerprint) {
                return true;
            }
        }

        return false;
    }
};
| 298 | + |
| 299 | +// TODO: More extensive (different capacities) testing. |
test CuckooFilter {
    const gpa = std.testing.allocator;

    const element_count = 486;
    const bucket_count = comptime CuckooFilter.capacityForCount(element_count) catch unreachable;
    try std.testing.expectEqual(512, bucket_count);

    // Ground truth: every trigram inserted into the filter in the current round.
    var inserted: std.AutoArrayHashMapUnmanaged(Entry.Trigram, void) = .empty;
    defer inserted.deinit(gpa);
    try inserted.ensureTotalCapacity(gpa, element_count);

    var bucket_storage: [bucket_count]CuckooFilter.Bucket = undefined;
    const filter: CuckooFilter = .{ .buckets = &bucket_storage };
    // Shared across rounds; drives the filter's eviction choices.
    var eviction_prng = std.Random.DefaultPrng.init(42);

    for (0..2_500) |round_seed| {
        inserted.clearRetainingCapacity();
        filter.reset();

        var member_prng = std.Random.DefaultPrng.init(round_seed);
        var remaining: usize = element_count;
        while (remaining > 0) : (remaining -= 1) {
            const member: Entry.Trigram = @bitCast(member_prng.random().int(u24));
            try inserted.put(gpa, member, {});
            try filter.append(eviction_prng.random(), member);
        }

        // Every inserted trigram must be reported as present.
        for (inserted.keys()) |member| {
            try std.testing.expect(filter.contains(member));
        }

        // Probe with trigrams that were never inserted and count the hits.
        const probe_count = 2_500;
        var false_positive_count: usize = 0;
        var probe_prng = std.Random.DefaultPrng.init(~round_seed);
        for (0..probe_count) |_| {
            var probe: Entry.Trigram = @bitCast(probe_prng.random().int(u24));
            while (inserted.contains(probe)) {
                probe = @bitCast(probe_prng.random().int(u24));
            }

            if (filter.contains(probe)) false_positive_count += 1;
        }

        const rate = @as(f32, @floatFromInt(false_positive_count)) / probe_count;
        std.testing.expect(rate < 0.035) catch |err| {
            std.log.err("fpr: {d}%", .{rate * 100});
            return err;
        };
    }
}
0 commit comments