Skip to content

Commit e2f53b3

Browse files
committed
fix(text): correct positions on duplicate markdown chunks
Because - The markdown splitter in the text component returned wrong chunk positions when one of the sub-chunks (chunks before merge) had a content that occrred several times in the raw text. Additionally, after merging, that sub-chunk had to end up the last in the merged chunk, which caused the end position to be wrong. This commit - Uses the block positions as the starting point for the chunk location. - Adds a test with the document that signaled the error.
1 parent cd1bd55 commit e2f53b3

File tree

4 files changed

+1632
-2
lines changed

4 files changed

+1632
-2
lines changed

pkg/component/operator/text/v0/chunk_text_test.go

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,63 @@
11
package text
22

33
import (
4+
"encoding/json"
45
"os"
56
"testing"
67

78
"github.com/frankban/quicktest"
89
)
910

10-
func TestChunkText(t *testing.T) {
11+
func TestChunkText_LongMD(t *testing.T) {
12+
c := quicktest.New(t)
13+
14+
inputData, err := os.ReadFile("testdata/chunk-markdown-input.md")
15+
c.Assert(err, quicktest.IsNil)
16+
17+
// Expectations are complex and hence stored in a file
18+
expectedData, err := os.ReadFile("testdata/chunk-markdown-output.json")
19+
c.Assert(err, quicktest.IsNil)
20+
21+
var want ChunkTextOutput
22+
err = json.Unmarshal(expectedData, &want)
23+
c.Assert(err, quicktest.IsNil)
24+
25+
input := ChunkTextInput{
26+
Text: string(inputData),
27+
Strategy: Strategy{
28+
Setting: Setting{
29+
ChunkMethod: "Markdown",
30+
ModelName: "gpt-4",
31+
ChunkSize: 1024,
32+
ChunkOverlap: 200,
33+
},
34+
},
35+
}
1136

37+
// Run the algorithm
38+
got, err := chunkMarkdown(input)
39+
c.Assert(err, quicktest.IsNil)
40+
41+
// Compare the results
42+
c.Check(got.ChunkNum, quicktest.Equals, want.ChunkNum)
43+
c.Check(got.TokenCount, quicktest.Equals, want.TokenCount)
44+
c.Check(got.ChunksTokenCount, quicktest.Equals, want.ChunksTokenCount)
45+
c.Check(len(got.TextChunks), quicktest.Equals, len(want.TextChunks))
46+
47+
// Compare each chunk (ignoring the actual text content for performance)
48+
for i, chunk := range got.TextChunks {
49+
// NOTE: We don't compare the Text field as it can be large and the
50+
// positions/tokens are sufficient
51+
52+
wantChunk := want.TextChunks[i]
53+
c.Check(chunk.StartPosition, quicktest.Equals, wantChunk.StartPosition)
54+
c.Check(chunk.EndPosition, quicktest.Equals, wantChunk.EndPosition)
55+
c.Check(chunk.TokenCount, quicktest.Equals, wantChunk.TokenCount)
56+
57+
}
58+
}
59+
60+
func TestChunkText(t *testing.T) {
1261
c := quicktest.New(t)
1362

1463
testCases := []struct {

pkg/component/operator/text/v0/markdown_splitter.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,7 @@ func (sp MarkdownTextSplitter) chunkPlainText(content Content, headers []Header)
457457
}
458458

459459
rawRunes := []rune(sp.RawText)
460-
startScanPosition := 0
460+
startScanPosition := content.BlockStartPosition
461461

462462
contentChunks := []ContentChunk{}
463463
for _, chunk := range chunks {

0 commit comments

Comments
 (0)