Skip to content

Commit 6b105e3

Browse files
committed
fix(code-index): preserve identifiers when splitting large nodes (#10715)
When the code parser splits a node that exceeds MAX_BLOCK_CHARS, small children (export keyword, class/function name, etc.) fall below MIN_BLOCK_CHARS and are silently discarded. This causes function and class names to disappear from the search index. Add a split-local sibling accumulator that prepends small children to the next large sibling as a synthetic QueueItem, and appends trailing small children to the last large sibling. Only the split path is affected; main-loop discard behaviour for non-split nodes is unchanged. Closes #10715
1 parent dcb33c4 commit 6b105e3

File tree

2 files changed

+367
-19
lines changed

2 files changed

+367
-19
lines changed

src/services/code-index/processors/__tests__/parser.spec.ts

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { loadRequiredLanguageParsers } from "../../../tree-sitter/languageParser
55
import { parseMarkdown } from "../../../tree-sitter/markdownParser"
66
import { readFile } from "fs/promises"
77
import { Node } from "web-tree-sitter"
8+
import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../../constants"
89

910
// Mock TelemetryService
1011
vi.mock("../../../../../packages/telemetry/src/TelemetryService", () => ({
@@ -243,6 +244,225 @@ describe("CodeParser", () => {
243244
})
244245
})
245246

247+
describe("split-path signature accumulation (#10715)", () => {
248+
it("should preserve class/function identifier when splitting a large node", async () => {
249+
// Simulate a large class node whose children are:
250+
// - "export" (7 chars, < MIN_BLOCK_CHARS)
251+
// - "class" (5 chars, < MIN_BLOCK_CHARS)
252+
// - "TestParser" (10 chars, < MIN_BLOCK_CHARS)
253+
// - class body (large, > MIN_BLOCK_CHARS)
254+
// Total must exceed the split threshold to trigger the split path.
255+
const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
256+
const bodyText = "x".repeat(Math.ceil(splitThreshold) + 100)
257+
const mockCapture = {
258+
node: {
259+
text: `export class TestParser ${bodyText}`,
260+
startPosition: { row: 0 },
261+
endPosition: { row: 10 },
262+
type: "class_declaration",
263+
childForFieldName: vi.fn().mockReturnValue({ text: "TestParser" }),
264+
children: [
265+
{
266+
text: "export",
267+
startPosition: { row: 0 },
268+
endPosition: { row: 0 },
269+
type: "export",
270+
childForFieldName: vi.fn().mockReturnValue(null),
271+
children: [],
272+
},
273+
{
274+
text: "class",
275+
startPosition: { row: 0 },
276+
endPosition: { row: 0 },
277+
type: "class",
278+
childForFieldName: vi.fn().mockReturnValue(null),
279+
children: [],
280+
},
281+
{
282+
text: "TestParser",
283+
startPosition: { row: 0 },
284+
endPosition: { row: 0 },
285+
type: "identifier",
286+
childForFieldName: vi.fn().mockReturnValue(null),
287+
children: [],
288+
},
289+
{
290+
text: bodyText,
291+
startPosition: { row: 1 },
292+
endPosition: { row: 10 },
293+
type: "class_body",
294+
childForFieldName: vi.fn().mockReturnValue(null),
295+
children: [],
296+
},
297+
],
298+
},
299+
name: "definition.class",
300+
}
301+
302+
const totalText = mockCapture.node.text
303+
expect(totalText.length).toBeGreaterThan(splitThreshold)
304+
305+
mockLanguageParser.js.query.captures.mockReturnValue([mockCapture])
306+
const result = await parser["parseContent"]("test.js", totalText, "hash-sig")
307+
308+
// At least one emitted chunk must contain the class name "TestParser"
309+
const hasIdentifier = result.some((block) => block.content.includes("TestParser"))
310+
expect(hasIdentifier).toBe(true)
311+
// No chunks should be empty
312+
result.forEach((block) => {
313+
expect(block.content.length).toBeGreaterThan(0)
314+
})
315+
})
316+
317+
it("should not contaminate across unrelated parent nodes", async () => {
318+
// Two separate captures: a small standalone comment, then a large class.
319+
// The comment should NOT appear inside the class's chunks.
320+
const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
321+
const bodyText = "y".repeat(Math.ceil(splitThreshold) + 100)
322+
const commentText = "// standalone comment that is small"
323+
const classText = `export class Foo { ${bodyText} }`
324+
325+
const commentCapture = {
326+
node: {
327+
text: commentText,
328+
startPosition: { row: 0 },
329+
endPosition: { row: 0 },
330+
type: "comment",
331+
childForFieldName: vi.fn().mockReturnValue(null),
332+
children: [],
333+
},
334+
name: "definition.comment",
335+
}
336+
337+
const classCapture = {
338+
node: {
339+
text: classText,
340+
startPosition: { row: 2 },
341+
endPosition: { row: 20 },
342+
type: "class_declaration",
343+
childForFieldName: vi.fn().mockReturnValue({ text: "Foo" }),
344+
children: [
345+
{
346+
text: "export",
347+
startPosition: { row: 2 },
348+
endPosition: { row: 2 },
349+
type: "export",
350+
childForFieldName: vi.fn().mockReturnValue(null),
351+
children: [],
352+
},
353+
{
354+
text: "class",
355+
startPosition: { row: 2 },
356+
endPosition: { row: 2 },
357+
type: "class",
358+
childForFieldName: vi.fn().mockReturnValue(null),
359+
children: [],
360+
},
361+
{
362+
text: "Foo",
363+
startPosition: { row: 2 },
364+
endPosition: { row: 2 },
365+
type: "identifier",
366+
childForFieldName: vi.fn().mockReturnValue(null),
367+
children: [],
368+
},
369+
{
370+
text: `{ ${bodyText} }`,
371+
startPosition: { row: 2 },
372+
endPosition: { row: 20 },
373+
type: "class_body",
374+
childForFieldName: vi.fn().mockReturnValue(null),
375+
children: [],
376+
},
377+
],
378+
},
379+
name: "definition.class",
380+
}
381+
382+
mockLanguageParser.js.query.captures.mockReturnValue([commentCapture, classCapture])
383+
const result = await parser["parseContent"]("test.js", commentText + "\n\n" + classText, "hash-contam")
384+
385+
// The comment is < MIN_BLOCK_CHARS so it should be discarded by the main loop
386+
// (not accumulated -- accumulation only happens within split children).
387+
// No class chunk should contain "standalone comment".
388+
const classChunks = result.filter(
389+
(block) => block.content.includes("Foo") || block.content.includes(bodyText.slice(0, 20)),
390+
)
391+
classChunks.forEach((block) => {
392+
expect(block.content).not.toContain("standalone comment")
393+
})
394+
})
395+
396+
it("should append trailing small siblings to the last large sibling", async () => {
397+
// A split node with: large body, then a small unique trailing token.
398+
// Use a unique token that does NOT appear in the body text.
399+
const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
400+
const bodyText = "z".repeat(Math.ceil(splitThreshold) + 100)
401+
const tailToken = "/*TAIL*/"
402+
403+
const parentCapture = {
404+
node: {
405+
text: `function bar() { ${bodyText} ${tailToken}`,
406+
startPosition: { row: 0 },
407+
endPosition: { row: 10 },
408+
type: "function_declaration",
409+
childForFieldName: vi.fn().mockReturnValue({ text: "bar" }),
410+
children: [
411+
{
412+
text: "function",
413+
startPosition: { row: 0 },
414+
endPosition: { row: 0 },
415+
type: "function",
416+
childForFieldName: vi.fn().mockReturnValue(null),
417+
children: [],
418+
},
419+
{
420+
text: "bar",
421+
startPosition: { row: 0 },
422+
endPosition: { row: 0 },
423+
type: "identifier",
424+
childForFieldName: vi.fn().mockReturnValue(null),
425+
children: [],
426+
},
427+
{
428+
text: `{ ${bodyText} }`,
429+
startPosition: { row: 1 },
430+
endPosition: { row: 9 },
431+
type: "statement_block",
432+
childForFieldName: vi.fn().mockReturnValue(null),
433+
children: [],
434+
},
435+
{
436+
text: tailToken,
437+
startPosition: { row: 10 },
438+
endPosition: { row: 10 },
439+
type: "comment",
440+
childForFieldName: vi.fn().mockReturnValue(null),
441+
children: [],
442+
},
443+
],
444+
},
445+
name: "definition.function",
446+
}
447+
448+
const fullText = parentCapture.node.text
449+
expect(fullText.length).toBeGreaterThan(splitThreshold)
450+
// Verify the tail token is unique -- not in the body
451+
expect(bodyText).not.toContain(tailToken)
452+
453+
mockLanguageParser.js.query.captures.mockReturnValue([parentCapture])
454+
const result = await parser["parseContent"]("test.js", fullText, "hash-tail")
455+
456+
// The tail token should be present in some chunk (appended to the last large sibling)
457+
const allContent = result.map((block) => block.content).join("")
458+
expect(allContent).toContain(tailToken)
459+
460+
// There should be no chunk that is just the tail token by itself
461+
const tinyChunks = result.filter((block) => block.content === tailToken)
462+
expect(tinyChunks).toHaveLength(0)
463+
})
464+
})
465+
246466
describe("_chunkLeafNodeByLines", () => {
247467
it("should chunk leaf nodes by lines", async () => {
248468
const mockNode = {

0 commit comments

Comments
 (0)