Commit caf3ff4 (parent: a6be85c)

perf(indexing): speed up LSP enrichment throughput

File tree: 2 files changed, +284 −48 lines

crates/codegraph-mcp/src/analyzers/lsp.rs

Lines changed: 240 additions & 48 deletions
```diff
@@ -141,6 +141,7 @@ pub fn enrich_nodes_and_edges_with_lsp(
         std::collections::HashMap::new();
     let mut node_file_by_id: std::collections::HashMap<codegraph_core::NodeId, String> =
         std::collections::HashMap::new();
+    let mut files_with_nodes: std::collections::HashSet<String> = std::collections::HashSet::new();

     for (idx, node) in nodes.iter().enumerate() {
         let file = node.location.file_path.clone();
@@ -151,6 +152,7 @@ pub fn enrich_nodes_and_edges_with_lsp(
                 nodes_by_file_line
                     .entry((abs.clone(), line0))
                     .or_insert(idx);
+                files_with_nodes.insert(abs);
             }
         }
         let line0 = node.location.line.saturating_sub(1);
@@ -159,21 +161,42 @@ pub fn enrich_nodes_and_edges_with_lsp(
             .entry((file.clone(), line0))
             .or_insert(idx);
         node_file_by_id.insert(node.id, file);
+        if let Some(file) = node_file_by_id.get(&node.id) {
+            files_with_nodes.insert(file.clone());
+        }
     }

+    let def_edges_by_file = definition_edge_indices_by_file(&project_root, nodes, edges);
+
     let mut stats = LspEnrichmentStats::default();
-    let total_files = files.len().max(1);
+    let mut files_to_process: Vec<PathBuf> = Vec::new();
+    for file_path in files {
+        let abs_path = absolute_file_path(&project_root, file_path);
+        let file_str = file_path.to_string_lossy().to_string();
+        let abs_file_str = abs_path.to_string_lossy().to_string();
+        if !files_with_nodes.contains(&file_str)
+            && !files_with_nodes.contains(&abs_file_str)
+            && !def_edges_by_file.contains_key(&file_str)
+            && !def_edges_by_file.contains_key(&abs_file_str)
+        {
+            continue;
+        }
+        files_to_process.push(file_path.clone());
+    }
+
+    let total_files = files_to_process.len().max(1);
     let mut processed_files: usize = 0;
     let mut last_progress_log = Instant::now();

-    for file_path in files {
+    for file_path in &files_to_process {
         let abs_path = absolute_file_path(&project_root, file_path);
         let content = std::fs::read_to_string(&abs_path)?;
         let file_str = file_path.to_string_lossy().to_string();
         let uri = Url::from_file_path(&abs_path)
             .map_err(|_| anyhow::anyhow!("failed to create file URI for {}", abs_path.display()))?
             .to_string();
         let abs_file_str = abs_path.to_string_lossy().to_string();
+        let pos_index = LspPositionIndex::new(&content);

         proc.notify(
             "textDocument/didOpen",
@@ -216,55 +239,64 @@ pub fn enrich_nodes_and_edges_with_lsp(
             }
         }

-        for edge in edges.iter_mut() {
-            let Some(from_file) = node_file_by_id.get(&edge.from) else {
-                continue;
-            };
-            if *from_file != file_str && *from_file != abs_file_str {
-                continue;
-            }
-            let Some(span) = edge.span.as_ref() else {
-                continue;
-            };
-
-            let pos = byte_offset_to_utf16_position(&content, span.start_byte);
-            let def = proc.request(
-                "textDocument/definition",
-                serde_json::json!({
-                    "textDocument": { "uri": uri },
-                    "position": { "line": pos.line, "character": pos.character }
-                }),
-            )?;
-
-            let Some((target_file, target_line0)) = extract_first_definition_location(&def) else {
-                continue;
-            };
-
-            let target_idx = nodes_by_file_line
-                .get(&(target_file.clone(), target_line0))
-                .copied()
-                .or_else(|| {
-                    let rel_target = Path::new(&target_file);
-                    let rel_key = relative_file_key(&project_root, rel_target)?;
-                    nodes_by_file_line.get(&(rel_key, target_line0)).copied()
-                });
-            if let Some(target_idx) = target_idx {
-                let target = &nodes[target_idx];
-                let target_name = target
-                    .metadata
-                    .attributes
-                    .get("qualified_name")
-                    .cloned()
-                    .unwrap_or_else(|| target.name.to_string());
-                edge.to = target_name;
-                edge.metadata
-                    .insert("analyzer".to_string(), "lsp_definition".to_string());
-                edge.metadata
-                    .insert("analyzer_confidence".to_string(), "1.0".to_string());
-                stats.edges_resolved += 1;
+        if let Some(edge_indices) = def_edges_by_file
+            .get(&abs_file_str)
+            .or_else(|| def_edges_by_file.get(&file_str))
+        {
+            for &edge_idx in edge_indices {
+                let edge = &mut edges[edge_idx];
+                let Some(span) = edge.span.as_ref() else {
+                    continue;
+                };
+
+                let pos = pos_index.position_for_byte_offset(span.start_byte);
+                let def = proc.request(
+                    "textDocument/definition",
+                    serde_json::json!({
+                        "textDocument": { "uri": uri },
+                        "position": { "line": pos.line, "character": pos.character }
+                    }),
+                )?;
+
+                let Some((target_file, target_line0)) =
+                    extract_first_definition_location(&def)
+                else {
+                    continue;
+                };
+
+                let target_idx = nodes_by_file_line
+                    .get(&(target_file.clone(), target_line0))
+                    .copied()
+                    .or_else(|| {
+                        let rel_target = Path::new(&target_file);
+                        let rel_key = relative_file_key(&project_root, rel_target)?;
+                        nodes_by_file_line.get(&(rel_key, target_line0)).copied()
+                    });
+                if let Some(target_idx) = target_idx {
+                    let target = &nodes[target_idx];
+                    let target_name = target
+                        .metadata
+                        .attributes
+                        .get("qualified_name")
+                        .cloned()
+                        .unwrap_or_else(|| target.name.to_string());
+                    edge.to = target_name;
+                    edge.metadata
+                        .insert("analyzer".to_string(), "lsp_definition".to_string());
+                    edge.metadata
+                        .insert("analyzer_confidence".to_string(), "1.0".to_string());
+                    stats.edges_resolved += 1;
+                }
             }
         }

+        proc.notify(
+            "textDocument/didClose",
+            serde_json::json!({
+                "textDocument": { "uri": uri }
+            }),
+        )?;
+
         processed_files += 1;
         if last_progress_log.elapsed() >= Duration::from_secs(10) {
             info!(
@@ -362,6 +394,83 @@ pub fn byte_offset_to_utf16_position(text: &str, byte_offset: u32) -> LspPosition
     LspPosition { line, character }
 }

+#[derive(Debug, Clone)]
+pub struct LspPositionIndex<'a> {
+    text: &'a str,
+    line_starts: Vec<usize>,
+}
+
+impl<'a> LspPositionIndex<'a> {
+    pub fn new(text: &'a str) -> Self {
+        let mut line_starts = Vec::new();
+        line_starts.push(0);
+        for (idx, ch) in text.char_indices() {
+            if ch == '\n' {
+                let next = idx.saturating_add(1);
+                if next <= text.len() {
+                    line_starts.push(next);
+                }
+            }
+        }
+        Self { text, line_starts }
+    }
+
+    pub fn position_for_byte_offset(&self, byte_offset: u32) -> LspPosition {
+        let target = (byte_offset as usize).min(self.text.len());
+        let line_idx = match self.line_starts.binary_search(&target) {
+            Ok(i) => i,
+            Err(insert) => insert.saturating_sub(1),
+        };
+        let line_start = *self.line_starts.get(line_idx).unwrap_or(&0);
+
+        let mut character: u32 = 0;
+        for (idx, ch) in self.text[line_start..].char_indices() {
+            let abs = line_start.saturating_add(idx);
+            if abs >= target {
+                break;
+            }
+            character += ch.encode_utf16(&mut [0u16; 2]).len() as u32;
+        }
+
+        LspPosition {
+            line: line_idx as u32,
+            character,
+        }
+    }
+}
+
+fn definition_edge_indices_by_file(
+    project_root: &Path,
+    nodes: &[CodeNode],
+    edges: &[EdgeRelationship],
+) -> std::collections::HashMap<String, Vec<usize>> {
+    let mut file_by_id: std::collections::HashMap<codegraph_core::NodeId, String> =
+        std::collections::HashMap::with_capacity(nodes.len());
+
+    for node in nodes {
+        file_by_id.insert(node.id, node.location.file_path.clone());
+    }
+
+    let mut out: std::collections::HashMap<String, Vec<usize>> = std::collections::HashMap::new();
+    for (idx, edge) in edges.iter().enumerate() {
+        if edge.span.is_none() {
+            continue;
+        }
+        let Some(file_key) = file_by_id.get(&edge.from) else {
+            continue;
+        };
+
+        out.entry(file_key.clone()).or_default().push(idx);
+        if let Some(abs) = absolute_file_key(project_root, Path::new(file_key)) {
+            if abs != *file_key {
+                out.entry(abs).or_default().push(idx);
+            }
+        }
+    }
+
+    out
+}
+
 pub struct LspProcess {
     child: Child,
     stdin: ChildStdin,
@@ -773,4 +882,87 @@ mod tests {

         let _ = std::fs::remove_dir_all(&root);
     }
+
+    #[test]
+    fn utf16_line_index_matches_reference_mapping() {
+        let text = "a🙂b\nc";
+        let index = LspPositionIndex::new(text);
+
+        for offset in 0..=(text.len() as u32) {
+            let expected = byte_offset_to_utf16_position(text, offset);
+            let observed = index.position_for_byte_offset(offset);
+            assert_eq!(observed, expected, "mismatch at byte offset {offset}");
+        }
+    }
+
+    #[test]
+    fn groups_edge_indices_by_file_path() {
+        let project_root =
+            std::env::temp_dir().join(format!("codegraph_lsp_edges_{}", std::process::id()));
+        let _ = std::fs::create_dir_all(&project_root);
+
+        let a_path = project_root.join("a.rs");
+        let b_path = project_root.join("b.rs");
+        let _ = std::fs::write(&a_path, "fn a() {}");
+        let _ = std::fs::write(&b_path, "fn b() {}");
+
+        let node_a = CodeNode::new(
+            "a",
+            None,
+            None,
+            codegraph_core::Location {
+                file_path: "a.rs".to_string(),
+                line: 1,
+                column: 0,
+                end_line: Some(1),
+                end_column: Some(0),
+            },
+        );
+        let node_b = CodeNode::new(
+            "b",
+            None,
+            None,
+            codegraph_core::Location {
+                file_path: "b.rs".to_string(),
+                line: 1,
+                column: 0,
+                end_line: Some(1),
+                end_column: Some(0),
+            },
+        );
+
+        let edges = vec![
+            EdgeRelationship {
+                from: node_a.id,
+                to: "x".to_string(),
+                edge_type: codegraph_core::EdgeType::Uses,
+                metadata: std::collections::HashMap::new(),
+                span: Some(codegraph_core::Span {
+                    start_byte: 0,
+                    end_byte: 1,
+                }),
+            },
+            EdgeRelationship {
+                from: node_b.id,
+                to: "y".to_string(),
+                edge_type: codegraph_core::EdgeType::Uses,
+                metadata: std::collections::HashMap::new(),
+                span: Some(codegraph_core::Span {
+                    start_byte: 0,
+                    end_byte: 1,
+                }),
+            },
+        ];
+
+        let nodes = vec![node_a, node_b];
+        let grouped = definition_edge_indices_by_file(&project_root, &nodes, &edges);
+
+        assert_eq!(grouped.get("a.rs"), Some(&vec![0]));
+        assert_eq!(grouped.get("b.rs"), Some(&vec![1]));

+        let a_abs = a_path.to_string_lossy().to_string();
+        let b_abs = b_path.to_string_lossy().to_string();
+        assert_eq!(grouped.get(&a_abs), Some(&vec![0]));
+        assert_eq!(grouped.get(&b_abs), Some(&vec![1]));
+    }
 }
```
Lines changed: 44 additions & 0 deletions (new file)
ABOUTME: Specifies performance and behavioral contracts for the LSP enrichment phase.
ABOUTME: Focuses on throughput and bounded work per file to keep indexing responsive.

# Specification: LSP Enrichment Throughput

## Intent

The LSP enrichment phase should provide symbol/name enrichment and best-effort definition resolution without making indexing feel “stuck”.

In particular, LSP enrichment must avoid work that scales with `O(files * total_edges)` when it can be organized as `O(files + total_edges)` by pre-grouping edges per file, as sketched below.
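A minimal sketch of the pre-grouping idea, using simplified stand-in types (`Node`, `Edge`, plain `u64` ids) rather than the crate's `CodeNode`/`EdgeRelationship`: one pass over the edges builds a file-keyed index, after which files that contribute neither nodes nor grouped edges are skipped before any LSP traffic is issued.

```rust
use std::collections::{HashMap, HashSet};

// Simplified stand-ins for the crate's types; the real code keys on
// codegraph_core::NodeId and groups EdgeRelationship values.
struct Node {
    id: u64,
    file: String,
}

struct Edge {
    from: u64,
    has_span: bool,
}

/// One pass over all edges builds a file -> edge-index map, so the per-file
/// loop later touches only its own edges: O(files + edges) overall instead of
/// rescanning every edge for every file.
fn group_edges_by_file(nodes: &[Node], edges: &[Edge]) -> HashMap<String, Vec<usize>> {
    let file_by_id: HashMap<u64, &str> = nodes.iter().map(|n| (n.id, n.file.as_str())).collect();
    let mut out: HashMap<String, Vec<usize>> = HashMap::new();
    for (idx, edge) in edges.iter().enumerate() {
        // An edge without a span has no position to send to the server.
        if !edge.has_span {
            continue;
        }
        if let Some(file) = file_by_id.get(&edge.from) {
            out.entry((*file).to_string()).or_default().push(idx);
        }
    }
    out
}

fn main() {
    let nodes = vec![
        Node { id: 1, file: "a.rs".to_string() },
        Node { id: 2, file: "b.rs".to_string() },
    ];
    let edges = vec![
        Edge { from: 1, has_span: true },
        Edge { from: 2, has_span: true },
    ];
    let grouped = group_edges_by_file(&nodes, &edges);

    // Files that contribute neither nodes nor definition edges are skipped up
    // front, so no didOpen/definition traffic is spent on them.
    let files = ["a.rs", "b.rs", "c.rs"];
    let with_nodes: HashSet<&str> = nodes.iter().map(|n| n.file.as_str()).collect();
    let to_process: Vec<&str> = files
        .iter()
        .copied()
        .filter(|f| with_nodes.contains(f) || grouped.contains_key(*f))
        .collect();
    assert_eq!(to_process, ["a.rs", "b.rs"]); // "c.rs" never reaches the LSP
}
```

The committed `definition_edge_indices_by_file` additionally records each file under both its relative and absolute path keys, since LSP servers report locations as absolute paths.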
## Context Boundary

Inputs:
- `project_root`: filesystem root for URI generation
- `files`: paths passed to the LSP analyzer for a language
- `nodes`: extracted nodes to enrich (qualified names)
- `edges`: extracted edges to optionally retarget via definition resolution

Outputs:
- Mutated `nodes` metadata (qualified names + provenance)
- Mutated `edges` `to` + metadata (when a definition location resolves to a known node)
- `LspEnrichmentStats`
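Taken together, the boundary implies roughly the following shape. This is an illustrative sketch with hypothetical stub types, not the actual signature in `analyzers/lsp.rs`; per the diff above, the real function also drives an `LspProcess` (`proc.notify`/`proc.request`) and propagates I/O and protocol errors with `?`.

```rust
use std::path::{Path, PathBuf};

// Hypothetical stub types so the sketch compiles on its own; the real
// CodeNode / EdgeRelationship live in codegraph_core, and the real stats
// struct is LspEnrichmentStats in analyzers/lsp.rs.
struct CodeNode;
struct EdgeRelationship;

#[derive(Debug, Default)]
struct LspEnrichmentStats {
    edges_resolved: usize,
}

// Illustrative shape only: nodes and edges are mutated in place, and the
// returned stats summarize what the enrichment pass accomplished.
fn enrich_nodes_and_edges_with_lsp(
    _project_root: &Path,            // filesystem root for URI generation
    _files: &[PathBuf],              // files handed to this language's LSP server
    _nodes: &mut [CodeNode],         // metadata enriched in place
    _edges: &mut [EdgeRelationship], // `to` retargeted on definition hits
) -> LspEnrichmentStats {
    LspEnrichmentStats::default()
}

fn main() {
    let stats = enrich_nodes_and_edges_with_lsp(Path::new("."), &[], &mut [], &mut []);
    println!("edges resolved: {}", stats.edges_resolved);
}
```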
## Contracts

1. **Bounded per-file edge processing**
   - Only edges whose `from` node belongs to the currently processed file are considered for definition resolution.
   - The implementation MUST NOT scan all edges for every file when sufficient information exists to pre-group edges by file.
2. **Deterministic position mapping**
   - Mapping from byte offsets to LSP UTF-16 positions must be correct for Unicode input; see the sketch after this list.
   - The mapping should avoid repeatedly scanning the entire file for each lookup.
3. **Observability**
   - Progress logging must continue to report processed files and enrichment counts at least every 10 seconds.
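The sketch below illustrates contract 2 with simplified re-implementations (not the crate's `LspPositionIndex` or `byte_offset_to_utf16_position`): a line-start table computed once per file, combined with a scan of a single line, must agree with a from-scratch scan at every byte offset. `'🙂'` is four UTF-8 bytes but two UTF-16 code units, which is exactly where byte-based positions would go wrong.

```rust
// Reference mapping: scan the whole text from the start on every call. This
// mirrors what a byte_offset_to_utf16_position-style helper must compute.
fn reference_position(text: &str, byte_offset: usize) -> (u32, u32) {
    let (mut line, mut character) = (0u32, 0u32);
    for (idx, ch) in text.char_indices() {
        if idx >= byte_offset {
            break;
        }
        if ch == '\n' {
            line += 1;
            character = 0;
        } else {
            character += ch.len_utf16() as u32;
        }
    }
    (line, character)
}

// Indexed mapping: binary-search precomputed line starts, then scan only the
// one line containing the offset.
fn indexed_position(line_starts: &[usize], text: &str, byte_offset: usize) -> (u32, u32) {
    let target = byte_offset.min(text.len());
    let line = match line_starts.binary_search(&target) {
        Ok(i) => i,
        Err(i) => i - 1, // target falls inside the preceding line
    };
    let start = line_starts[line];
    let mut character = 0u32;
    for (idx, ch) in text[start..].char_indices() {
        if start + idx >= target {
            break;
        }
        character += ch.len_utf16() as u32;
    }
    (line as u32, character)
}

fn main() {
    let text = "a🙂b\nc";
    // Precompute once per file: the byte offset of each line's first byte.
    let mut line_starts = vec![0usize];
    line_starts.extend(
        text.char_indices()
            .filter(|&(_, ch)| ch == '\n')
            .map(|(idx, _)| idx + 1),
    );

    for offset in 0..=text.len() {
        assert_eq!(
            indexed_position(&line_starts, text, offset),
            reference_position(text, offset),
            "mismatch at byte offset {offset}"
        );
    }
}
```

Each lookup then costs one binary search plus at most one line of scanning, rather than a scan from the top of the file per edge.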
## Acceptance Criteria

1. Unit tests cover:
   - UTF-16 position mapping remains correct for representative Unicode cases.
   - Per-file edge grouping selects only edges for the matching file key(s).
2. `cargo test -p codegraph-mcp -q` passes with pristine output.
