@@ -5,6 +5,7 @@ import { loadRequiredLanguageParsers } from "../../../tree-sitter/languageParser
55import { parseMarkdown } from "../../../tree-sitter/markdownParser"
66import { readFile } from "fs/promises"
77import { Node } from "web-tree-sitter"
8+ import { MAX_BLOCK_CHARS , MIN_BLOCK_CHARS , MAX_CHARS_TOLERANCE_FACTOR } from "../../constants"
89
910// Mock TelemetryService
1011vi . mock ( "../../../../../packages/telemetry/src/TelemetryService" , ( ) => ( {
@@ -243,6 +244,225 @@ describe("CodeParser", () => {
243244 } )
244245 } )
245246
247+ describe ( "split-path signature accumulation (#10715)" , ( ) => {
248+ it ( "should preserve class/function identifier when splitting a large node" , async ( ) => {
249+ // Simulate a large class node whose children are:
250+ // - "export" (7 chars, < MIN_BLOCK_CHARS)
251+ // - "class" (5 chars, < MIN_BLOCK_CHARS)
252+ // - "TestParser" (10 chars, < MIN_BLOCK_CHARS)
253+ // - class body (large, > MIN_BLOCK_CHARS)
254+ // Total must exceed the split threshold to trigger the split path.
255+ const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
256+ const bodyText = "x" . repeat ( Math . ceil ( splitThreshold ) + 100 )
257+ const mockCapture = {
258+ node : {
259+ text : `export class TestParser ${ bodyText } ` ,
260+ startPosition : { row : 0 } ,
261+ endPosition : { row : 10 } ,
262+ type : "class_declaration" ,
263+ childForFieldName : vi . fn ( ) . mockReturnValue ( { text : "TestParser" } ) ,
264+ children : [
265+ {
266+ text : "export" ,
267+ startPosition : { row : 0 } ,
268+ endPosition : { row : 0 } ,
269+ type : "export" ,
270+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
271+ children : [ ] ,
272+ } ,
273+ {
274+ text : "class" ,
275+ startPosition : { row : 0 } ,
276+ endPosition : { row : 0 } ,
277+ type : "class" ,
278+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
279+ children : [ ] ,
280+ } ,
281+ {
282+ text : "TestParser" ,
283+ startPosition : { row : 0 } ,
284+ endPosition : { row : 0 } ,
285+ type : "identifier" ,
286+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
287+ children : [ ] ,
288+ } ,
289+ {
290+ text : bodyText ,
291+ startPosition : { row : 1 } ,
292+ endPosition : { row : 10 } ,
293+ type : "class_body" ,
294+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
295+ children : [ ] ,
296+ } ,
297+ ] ,
298+ } ,
299+ name : "definition.class" ,
300+ }
301+
302+ const totalText = mockCapture . node . text
303+ expect ( totalText . length ) . toBeGreaterThan ( splitThreshold )
304+
305+ mockLanguageParser . js . query . captures . mockReturnValue ( [ mockCapture ] )
306+ const result = await parser [ "parseContent" ] ( "test.js" , totalText , "hash-sig" )
307+
308+ // At least one emitted chunk must contain the class name "TestParser"
309+ const hasIdentifier = result . some ( ( block ) => block . content . includes ( "TestParser" ) )
310+ expect ( hasIdentifier ) . toBe ( true )
311+ // No chunks should be empty
312+ result . forEach ( ( block ) => {
313+ expect ( block . content . length ) . toBeGreaterThan ( 0 )
314+ } )
315+ } )
316+
317+ it ( "should not contaminate across unrelated parent nodes" , async ( ) => {
318+ // Two separate captures: a small standalone comment, then a large class.
319+ // The comment should NOT appear inside the class's chunks.
320+ const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
321+ const bodyText = "y" . repeat ( Math . ceil ( splitThreshold ) + 100 )
322+ const commentText = "// standalone comment that is small"
323+ const classText = `export class Foo { ${ bodyText } }`
324+
325+ const commentCapture = {
326+ node : {
327+ text : commentText ,
328+ startPosition : { row : 0 } ,
329+ endPosition : { row : 0 } ,
330+ type : "comment" ,
331+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
332+ children : [ ] ,
333+ } ,
334+ name : "definition.comment" ,
335+ }
336+
337+ const classCapture = {
338+ node : {
339+ text : classText ,
340+ startPosition : { row : 2 } ,
341+ endPosition : { row : 20 } ,
342+ type : "class_declaration" ,
343+ childForFieldName : vi . fn ( ) . mockReturnValue ( { text : "Foo" } ) ,
344+ children : [
345+ {
346+ text : "export" ,
347+ startPosition : { row : 2 } ,
348+ endPosition : { row : 2 } ,
349+ type : "export" ,
350+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
351+ children : [ ] ,
352+ } ,
353+ {
354+ text : "class" ,
355+ startPosition : { row : 2 } ,
356+ endPosition : { row : 2 } ,
357+ type : "class" ,
358+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
359+ children : [ ] ,
360+ } ,
361+ {
362+ text : "Foo" ,
363+ startPosition : { row : 2 } ,
364+ endPosition : { row : 2 } ,
365+ type : "identifier" ,
366+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
367+ children : [ ] ,
368+ } ,
369+ {
370+ text : `{ ${ bodyText } }` ,
371+ startPosition : { row : 2 } ,
372+ endPosition : { row : 20 } ,
373+ type : "class_body" ,
374+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
375+ children : [ ] ,
376+ } ,
377+ ] ,
378+ } ,
379+ name : "definition.class" ,
380+ }
381+
382+ mockLanguageParser . js . query . captures . mockReturnValue ( [ commentCapture , classCapture ] )
383+ const result = await parser [ "parseContent" ] ( "test.js" , commentText + "\n\n" + classText , "hash-contam" )
384+
385+ // The comment is < MIN_BLOCK_CHARS so it should be discarded by the main loop
386+ // (not accumulated -- accumulation only happens within split children).
387+ // No class chunk should contain "standalone comment".
388+ const classChunks = result . filter (
389+ ( block ) => block . content . includes ( "Foo" ) || block . content . includes ( bodyText . slice ( 0 , 20 ) ) ,
390+ )
391+ classChunks . forEach ( ( block ) => {
392+ expect ( block . content ) . not . toContain ( "standalone comment" )
393+ } )
394+ } )
395+
396+ it ( "should append trailing small siblings to the last large sibling" , async ( ) => {
397+ // A split node with: large body, then a small unique trailing token.
398+ // Use a unique token that does NOT appear in the body text.
399+ const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
400+ const bodyText = "z" . repeat ( Math . ceil ( splitThreshold ) + 100 )
401+ const tailToken = "/*TAIL*/"
402+
403+ const parentCapture = {
404+ node : {
405+ text : `function bar() { ${ bodyText } ${ tailToken } ` ,
406+ startPosition : { row : 0 } ,
407+ endPosition : { row : 10 } ,
408+ type : "function_declaration" ,
409+ childForFieldName : vi . fn ( ) . mockReturnValue ( { text : "bar" } ) ,
410+ children : [
411+ {
412+ text : "function" ,
413+ startPosition : { row : 0 } ,
414+ endPosition : { row : 0 } ,
415+ type : "function" ,
416+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
417+ children : [ ] ,
418+ } ,
419+ {
420+ text : "bar" ,
421+ startPosition : { row : 0 } ,
422+ endPosition : { row : 0 } ,
423+ type : "identifier" ,
424+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
425+ children : [ ] ,
426+ } ,
427+ {
428+ text : `{ ${ bodyText } }` ,
429+ startPosition : { row : 1 } ,
430+ endPosition : { row : 9 } ,
431+ type : "statement_block" ,
432+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
433+ children : [ ] ,
434+ } ,
435+ {
436+ text : tailToken ,
437+ startPosition : { row : 10 } ,
438+ endPosition : { row : 10 } ,
439+ type : "comment" ,
440+ childForFieldName : vi . fn ( ) . mockReturnValue ( null ) ,
441+ children : [ ] ,
442+ } ,
443+ ] ,
444+ } ,
445+ name : "definition.function" ,
446+ }
447+
448+ const fullText = parentCapture . node . text
449+ expect ( fullText . length ) . toBeGreaterThan ( splitThreshold )
450+ // Verify the tail token is unique -- not in the body
451+ expect ( bodyText ) . not . toContain ( tailToken )
452+
453+ mockLanguageParser . js . query . captures . mockReturnValue ( [ parentCapture ] )
454+ const result = await parser [ "parseContent" ] ( "test.js" , fullText , "hash-tail" )
455+
456+ // The tail token should be present in some chunk (appended to the last large sibling)
457+ const allContent = result . map ( ( block ) => block . content ) . join ( "" )
458+ expect ( allContent ) . toContain ( tailToken )
459+
460+ // There should be no chunk that is just the tail token by itself
461+ const tinyChunks = result . filter ( ( block ) => block . content === tailToken )
462+ expect ( tinyChunks ) . toHaveLength ( 0 )
463+ } )
464+ } )
465+
246466 describe ( "_chunkLeafNodeByLines" , ( ) => {
247467 it ( "should chunk leaf nodes by lines" , async ( ) => {
248468 const mockNode = {
0 commit comments