Skip to content

Commit 8df4206

Browse files
committed
Lay foundation for workspace symbols again
1 parent bb6d647 commit 8df4206

File tree

2 files changed

+351
-0
lines changed

2 files changed

+351
-0
lines changed

src/WorkspaceSymbolStore.zig

Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
const std = @import("std");
2+
const builtin = @import("builtin");
3+
const assert = std.debug.assert;
4+
const offsets = @import("offsets.zig");
5+
6+
const WorkspaceSymbolStore = @This();
7+
8+
/// A store that holds no entries. Use this as the initial value.
pub const empty: WorkspaceSymbolStore = .{ .entries = .empty };

/// Entries held by the store. Each one is deinitialized by `deinit`.
entries: std.ArrayListUnmanaged(Entry),
13+
14+
/// Deinitializes every entry and then frees the entry list itself.
/// `allocator` is passed on to each `Entry.deinit` and to the list's `deinit`.
pub fn deinit(store: *WorkspaceSymbolStore, allocator: std.mem.Allocator) void {
    for (store.entries.items) |*entry| entry.deinit(allocator);
    store.entries.deinit(allocator);
}
20+
21+
/// Trigram index over declaration names, with an optional cuckoo filter over
/// the trigrams known to `trigram_to_declarations`.
///
/// Usage order visible in this file: `appendDeclarations` for each name, then
/// `finalize` once, then queries.
pub const Entry = struct {
    /// Three consecutive bytes of a declaration name.
    pub const Trigram = [3]u8;

    /// Half-open byte range `[start, end)` into `Entry.names`.
    pub const NameSlice = struct { start: u32, end: u32 };

    pub const Declaration = struct {
        /// NOTE(review): presumably a `[start, end)` index range into the
        /// sorted `declarations` list; nothing in this file constructs one
        /// yet, so confirm once the producer exists.
        pub const Slice = struct { start: u32, end: u32 };

        trigram: Trigram,
        name: NameSlice,
        range: offsets.Range,
    };

    pub const empty: Entry = .{
        .has_filter = false,
        .filter_buckets = .empty,
        .trigram_to_declarations = .empty,
        .declarations = .empty,
        .names = .empty,
    };

    /// True only after a `finalize` in which every trigram was inserted into
    /// the filter. When false, `filter_buckets` must not be consulted.
    has_filter: bool,
    /// Backing storage for the `CuckooFilter` built by `finalize`.
    filter_buckets: std.ArrayListUnmanaged(CuckooFilter.Bucket),
    /// NOTE(review): nothing visible in this file inserts into this map, so
    /// `finalize` currently iterates an empty key set unless a caller fills it.
    trigram_to_declarations: std.AutoArrayHashMapUnmanaged(Trigram, Declaration.Slice),
    /// One element per (name, trigram) pair; sorted by `finalize`.
    declarations: std.MultiArrayList(Declaration),
    /// Concatenated bytes of every appended name, addressed by `NameSlice`.
    names: std.ArrayListUnmanaged(u8),

    /// Frees all memory owned by the entry and invalidates it.
    pub fn deinit(entry: *Entry, allocator: std.mem.Allocator) void {
        // Every container must be released here: the entry is overwritten with
        // `undefined` below, so anything merely cleared would be leaked.
        entry.filter_buckets.deinit(allocator);
        entry.trigram_to_declarations.deinit(allocator);
        entry.declarations.deinit(allocator);
        entry.names.deinit(allocator);
        entry.* = undefined;
    }

    /// Empties the entry so it can be refilled, keeping the capacity of the
    /// map, the declaration list and the name buffer.
    ///
    /// The filter buckets are freed rather than retained; `finalize`
    /// allocates them again with a precise size.
    pub fn clearRetainingCapacity(entry: *Entry, allocator: std.mem.Allocator) void {
        entry.filter_buckets.clearAndFree(allocator);
        entry.has_filter = false;
        entry.trigram_to_declarations.clearRetainingCapacity();
        entry.declarations.clearRetainingCapacity();
        entry.names.clearRetainingCapacity();
    }

    /// Records `name` and appends one declaration per trigram of `name`
    /// (`name.len - 2` of them), all sharing the same name slice and `range`.
    ///
    /// Caller must not submit name.len < 3.
    ///
    /// `error.InvalidUtf8` is part of the signature but is not produced by
    /// this implementation.
    pub fn appendDeclarations(
        entry: *Entry,
        allocator: std.mem.Allocator,
        name: []const u8,
        range: offsets.Range,
    ) error{ OutOfMemory, InvalidUtf8 }!void {
        assert(name.len >= 3);

        const trigram_count = name.len - 2;

        // Reserve declaration slots first: if the name copy below fails, the
        // entry is left without any partially recorded state.
        try entry.declarations.ensureUnusedCapacity(allocator, trigram_count);

        const names_start = entry.names.items.len;
        try entry.names.appendSlice(allocator, name);
        const name_slice: NameSlice = .{
            .start = @intCast(names_start),
            .end = @intCast(entry.names.items.len),
        };

        for (0..trigram_count) |index| {
            const trigram = name[index..][0..3].*;
            entry.declarations.appendAssumeCapacity(.{
                .trigram = trigram,
                .name = name_slice,
                .range = range,
            });
        }
    }

    /// Sort context for `declarations`: orders by trigram (compared as a
    /// `u24`), breaking ties by the start of the declaration range.
    pub const SortDeclarations = struct {
        trigrams: []const Trigram,
        range: []const offsets.Range,

        // NOTE(review): relies on `offsets.Range.start` supporting `<`; that
        // type is declared outside this file.
        pub fn lessThan(ctx: SortDeclarations, a_index: usize, b_index: usize) bool {
            const a_trigram_numeric: u24 = @bitCast(ctx.trigrams[a_index]);
            const b_trigram_numeric: u24 = @bitCast(ctx.trigrams[b_index]);

            return a_trigram_numeric < b_trigram_numeric or
                (a_trigram_numeric == b_trigram_numeric and
                    ctx.range[a_index].start < ctx.range[b_index].start);
        }
    };

    /// Must be called before any queries are executed.
    ///
    /// Sorts `declarations` and rebuilds the cuckoo filter from the keys of
    /// `trigram_to_declarations`. If an insertion fails, `has_filter` is left
    /// false and the filter contents must be ignored.
    pub fn finalize(entry: *Entry, allocator: std.mem.Allocator) error{OutOfMemory}!void {
        entry.declarations.sortUnstable(SortDeclarations{
            .trigrams = entry.declarations.items(.trigram),
            .range = entry.declarations.items(.range),
        });

        // Fixed seed keeps eviction choices reproducible between runs.
        var prng = std.Random.DefaultPrng.init(0);

        // `CuckooFilter.bucketAt` asserts that the bucket count is a non-zero
        // power of two, so round the trigram count up instead of using it raw.
        const nonzero_count: usize = @max(1, entry.trigram_to_declarations.count());
        const bucket_count = std.math.ceilPowerOfTwo(usize, nonzero_count) catch
            return error.OutOfMemory;

        try entry.filter_buckets.ensureTotalCapacityPrecise(allocator, bucket_count);
        entry.filter_buckets.items.len = bucket_count;

        const filter: CuckooFilter = .{ .buckets = entry.filter_buckets.items };
        filter.reset();
        entry.has_filter = true;

        for (entry.trigram_to_declarations.keys()) |trigram| {
            filter.append(prng.random(), trigram) catch |err| switch (err) {
                error.EvictionFailed => {
                    // NOTE: This should generally be quite rare.
                    entry.has_filter = false;
                    break;
                },
            };
        }
    }
};
137+
138+
// TODO: The pow2 requirement is quite inefficient: explore ideas posted in
139+
// https://databasearchitects.blogspot.com/2019/07/cuckoo-filters-with-arbitrarily-sized.html
140+
// (rocksdb even-odd scheme from comments looks interesting).
141+
// TODO: Look more into FPR scaling.
142+
/// Approximate-membership filter over trigrams. `contains` may report false
/// positives, and is meant to never report false negatives for trigrams that
/// were appended successfully (see the note on `buckets`).
///
/// The filter only borrows its bucket storage; it has no `deinit`.
pub const CuckooFilter = struct {
    /// len must be a power of 2.
    ///
    /// ### Pathological case with buckets.len power of 2
    ///
    /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_2`
    /// - `BucketIndex(alias_1)` -> `bucket_1`, `BucketIndex(alias_1).alternate()` -> `bucket_2`
    ///
    /// Our alternate mappings hold and `contains()` will not return false negatives.
    ///
    /// ### Pathological case with buckets.len NOT power of 2:
    ///
    /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_3`
    /// - `BucketIndex(alias_1)` -> `bucket_2`, `BucketIndex(alias_1).alternate()` -> `bucket_4`
    ///
    /// Our alternate mappings do not hold and `contains()` can return false negatives. This is not
    /// acceptable as the entire point of an AMQ datastructure is the presence of false positives
    /// but not false negatives.
    buckets: []Bucket,

    /// One-byte tag stored in a bucket slot. `none` marks an empty slot and
    /// is never used as the fingerprint of a real trigram.
    pub const Fingerprint = enum(u8) {
        none = std.math.maxInt(u8),
        _,

        /// Murmur2-64 hash of the fingerprint byte, truncated to 32 bits.
        pub fn hash(fingerprint: Fingerprint) u32 {
            return @truncate(std.hash.Murmur2_64.hash(&.{@intFromEnum(fingerprint)}));
        }
    };

    /// Four fingerprint slots.
    pub const Bucket = [4]Fingerprint;

    /// Unmasked bucket index; `bucketAt` reduces it to the bucket count.
    pub const BucketIndex = enum(u32) {
        _,

        /// Returns the other candidate bucket for `fingerprint`. XOR-based,
        /// so applying it twice with the same fingerprint yields `index` again.
        pub fn alternate(index: BucketIndex, fingerprint: Fingerprint) BucketIndex {
            assert(fingerprint != .none);
            return @enumFromInt(@intFromEnum(index) ^ fingerprint.hash());
        }
    };

    /// Fingerprint of a trigram together with its two candidate buckets.
    pub const Triplet = struct {
        fingerprint: Fingerprint,
        index_1: BucketIndex,
        index_2: BucketIndex,

        /// Derives the triplet from the 64-bit Murmur2 hash of `trigram`:
        /// the lowest 8 bits become the fingerprint and the highest 32 bits
        /// the first bucket index.
        pub fn initFromTrigram(trigram: Entry.Trigram) Triplet {
            const split: packed struct {
                fingerprint: Fingerprint,
                padding: u24,
                index_1: BucketIndex,
            } = @bitCast(std.hash.Murmur2_64.hash(&trigram));

            // `.none` is reserved for empty slots, so remap it to fingerprint 0.
            const fingerprint: Fingerprint = if (split.fingerprint == .none)
                @enumFromInt(0)
            else
                split.fingerprint;

            const triplet: Triplet = .{
                .fingerprint = fingerprint,
                .index_1 = split.index_1,
                .index_2 = split.index_1.alternate(fingerprint),
            };
            assert(triplet.index_2.alternate(fingerprint) == triplet.index_1);

            return triplet;
        }
    };

    /// Marks every slot of every bucket as empty.
    pub fn reset(filter: CuckooFilter) void {
        @memset(filter.buckets, [1]Fingerprint{.none} ** 4);
    }

    // TODO: Dubious
    /// Returns the number of buckets to allocate for `count` elements:
    /// `ceil(count / 0.95)` rounded up to a power of two, and at least 1 so
    /// that the result always satisfies the `buckets` length requirement.
    ///
    /// Uses integer arithmetic so it can be called at runtime as well as at
    /// comptime. Returns `error.Overflow` when the result does not fit `usize`.
    pub fn capacityForCount(count: usize) error{Overflow}!usize {
        // Fill rate 0.95 written as the exact ratio 19/20.
        const fill_numerator = 19;
        const fill_denominator = 20;

        const scaled = try std.math.mul(usize, count, fill_denominator);
        // Ceiling division written so it cannot overflow.
        const wanted: usize = scaled / fill_numerator +
            @intFromBool(scaled % fill_numerator != 0);

        // `ceilPowerOfTwo` asserts a non-zero input.
        const nonzero_wanted: usize = @max(1, wanted);
        return std.math.ceilPowerOfTwo(usize, nonzero_wanted);
    }

    /// Inserts the fingerprint of `trigram`.
    ///
    /// Tries both candidate buckets first. If both are full, repeatedly
    /// evicts a random resident fingerprint and moves it to its alternate
    /// bucket, for at most 500 rounds.
    ///
    /// On `error.EvictionFailed` the fingerprint carried in the final round
    /// was not stored anywhere, so the filter may be missing a fingerprint
    /// and callers must stop relying on it.
    pub fn append(filter: CuckooFilter, random: std.Random, trigram: Entry.Trigram) error{EvictionFailed}!void {
        const triplet: Triplet = .initFromTrigram(trigram);

        if (filter.appendToBucket(triplet.index_1, triplet.fingerprint) or
            filter.appendToBucket(triplet.index_2, triplet.fingerprint))
        {
            return;
        }

        var fingerprint = triplet.fingerprint;
        var index = if (random.boolean()) triplet.index_1 else triplet.index_2;
        for (0..500) |_| {
            // Store the carried fingerprint in place of a random resident,
            // then try to re-home the evicted one in its alternate bucket.
            fingerprint = filter.swapFromBucket(random, index, fingerprint);
            index = index.alternate(fingerprint);

            if (filter.appendToBucket(index, fingerprint)) {
                return;
            }
        }

        return error.EvictionFailed;
    }

    /// Maps an unmasked index onto a bucket by masking with `len - 1`.
    fn bucketAt(filter: CuckooFilter, index: BucketIndex) *Bucket {
        assert(std.math.isPowerOfTwo(filter.buckets.len));
        return &filter.buckets[@intFromEnum(index) & (filter.buckets.len - 1)];
    }

    /// Stores `fingerprint` in the first empty slot of the bucket.
    /// Returns false if the bucket has no empty slot.
    fn appendToBucket(filter: CuckooFilter, index: BucketIndex, fingerprint: Fingerprint) bool {
        assert(fingerprint != .none);

        const bucket = filter.bucketAt(index);
        for (bucket) |*slot| {
            if (slot.* == .none) {
                slot.* = fingerprint;
                return true;
            }
        }

        return false;
    }

    /// Replaces a randomly chosen slot of the bucket with `fingerprint` and
    /// returns the fingerprint that was there. The chosen slot must be
    /// occupied, which holds when the bucket is full.
    fn swapFromBucket(
        filter: CuckooFilter,
        random: std.Random,
        index: BucketIndex,
        fingerprint: Fingerprint,
    ) Fingerprint {
        assert(fingerprint != .none);

        const target = &filter.bucketAt(index)[random.int(u2)];

        const old_fingerprint = target.*;
        assert(old_fingerprint != .none);

        target.* = fingerprint;

        return old_fingerprint;
    }

    /// Returns true if the fingerprint of `trigram` is present in either of
    /// its two candidate buckets.
    pub fn contains(filter: CuckooFilter, trigram: Entry.Trigram) bool {
        const triplet: Triplet = .initFromTrigram(trigram);

        return filter.containsInBucket(triplet.index_1, triplet.fingerprint) or
            filter.containsInBucket(triplet.index_2, triplet.fingerprint);
    }

    /// Returns true if any slot of the bucket equals `fingerprint`.
    fn containsInBucket(filter: CuckooFilter, index: BucketIndex, fingerprint: Fingerprint) bool {
        assert(fingerprint != .none);

        const bucket = filter.bucketAt(index);
        for (bucket) |*slot| {
            if (slot.* == fingerprint) {
                return true;
            }
        }

        return false;
    }
};
298+
299+
// TODO: More extensive (different capacities) testing.
300+
test CuckooFilter {
    const gpa = std.testing.allocator;

    const element_count = 486;
    const bucket_count = comptime CuckooFilter.capacityForCount(element_count) catch unreachable;
    try std.testing.expectEqual(512, bucket_count);

    // Ground truth for what has been inserted in the current round.
    var inserted: std.AutoArrayHashMapUnmanaged(Entry.Trigram, void) = .empty;
    defer inserted.deinit(gpa);
    try inserted.ensureTotalCapacity(gpa, element_count);

    var bucket_storage: [bucket_count]CuckooFilter.Bucket = undefined;
    const filter: CuckooFilter = .{ .buckets = &bucket_storage };
    // Shared across rounds: drives the eviction choices inside `append`.
    var eviction_prng = std.Random.DefaultPrng.init(42);

    for (0..2_500) |round_seed| {
        inserted.clearRetainingCapacity();
        filter.reset();

        var insert_prng = std.Random.DefaultPrng.init(round_seed);
        for (0..element_count) |_| {
            const trigram: Entry.Trigram = @bitCast(insert_prng.random().int(u24));
            try inserted.put(gpa, trigram, {});
            try filter.append(eviction_prng.random(), trigram);
        }

        // No false negatives
        for (inserted.keys()) |trigram| {
            try std.testing.expect(filter.contains(trigram));
        }

        // Reasonable false positive rate
        const probe_count = 2_500;
        var false_positives: usize = 0;
        var probe_prng = std.Random.DefaultPrng.init(~round_seed);
        for (0..probe_count) |_| {
            // Redraw until the probe is a trigram that was never inserted.
            var probe: Entry.Trigram = @bitCast(probe_prng.random().int(u24));
            while (inserted.contains(probe)) {
                probe = @bitCast(probe_prng.random().int(u24));
            }

            if (filter.contains(probe)) false_positives += 1;
        }

        const fpr = @as(f32, @floatFromInt(false_positives)) / probe_count;
        if (!(fpr < 0.035)) {
            std.log.err("fpr: {d}%", .{fpr * 100});
            return error.TestUnexpectedResult;
        }
    }
}

src/zls.zig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ pub const lsp = @import("lsp");
1414
pub const types = lsp.types;
1515
pub const URI = @import("uri.zig");
1616
pub const DocumentStore = @import("DocumentStore.zig");
17+
pub const WorkspaceSymbolStore = @import("WorkspaceSymbolStore.zig");
1718
pub const diff = @import("diff.zig");
1819
pub const analyser = @import("analyser/analyser.zig");
1920
pub const configuration = @import("configuration.zig");

0 commit comments

Comments
 (0)