instill-ai
diff --git a/pkg/component/operator/text/v0/chunk_text_test.go b/pkg/component/operator/text/v0/chunk_text_test.go
Lines changed: 50 additions & 1 deletion
diff --git a/pkg/component/operator/text/v0/markdown_splitter.go b/pkg/component/operator/text/v0/markdown_splitter.go
Lines changed: 1 addition & 1 deletion
@@ -1,14 +1,63 @@
 package text
 
 import (
+	"encoding/json"
 	"os"
 	"testing"
 
 	"github.com/frankban/quicktest"
 )
 
-func TestChunkText(t *testing.T) {
+// TestChunkText_LongMD runs chunkMarkdown on a long Markdown fixture and checks
+// the chunk counts, token counts and chunk positions against a stored JSON file.
+func TestChunkText_LongMD(t *testing.T) {
+	c := quicktest.New(t)
+
+	inputData, err := os.ReadFile("testdata/chunk-markdown-input.md")
+	c.Assert(err, quicktest.IsNil)
+
+	// Expectations are complex and hence stored in a file
+	expectedData, err := os.ReadFile("testdata/chunk-markdown-output.json")
+	c.Assert(err, quicktest.IsNil)
+
+	var want ChunkTextOutput
+	err = json.Unmarshal(expectedData, &want)
+	c.Assert(err, quicktest.IsNil)
+
+	input := ChunkTextInput{
+		Text: string(inputData),
+		Strategy: Strategy{
+			Setting: Setting{
+				ChunkMethod:  "Markdown",
+				ModelName:    "gpt-4",
+				ChunkSize:    1024,
+				ChunkOverlap: 200,
+			},
+		},
+	}
 
+	// Run the algorithm
+	got, err := chunkMarkdown(input)
+	c.Assert(err, quicktest.IsNil)
+
+	// Compare the results
+	c.Check(got.ChunkNum, quicktest.Equals, want.ChunkNum)
+	c.Check(got.TokenCount, quicktest.Equals, want.TokenCount)
+	c.Check(got.ChunksTokenCount, quicktest.Equals, want.ChunksTokenCount)
+	// Assert (not Check): the loop below indexes want.TextChunks by position.
+	c.Assert(len(got.TextChunks), quicktest.Equals, len(want.TextChunks))
+
+	// Compare each chunk by positions and token count only. The Text field is
+	// not compared: it can be large and the positions/tokens are sufficient.
+	for i, chunk := range got.TextChunks {
+		wantChunk := want.TextChunks[i]
+		c.Check(chunk.StartPosition, quicktest.Equals, wantChunk.StartPosition)
+		c.Check(chunk.EndPosition, quicktest.Equals, wantChunk.EndPosition)
+		c.Check(chunk.TokenCount, quicktest.Equals, wantChunk.TokenCount)
+	}
+}
+
+func TestChunkText(t *testing.T) {
 	c := quicktest.New(t)
 
 	testCases := []struct {
 
@@ -457,7 +457,7 @@ func (sp MarkdownTextSplitter) chunkPlainText(content Content, headers []Header)
 	}
 
 	rawRunes := []rune(sp.RawText)
-	startScanPosition := 0
+	startScanPosition := content.BlockStartPosition
 
 	contentChunks := []ContentChunk{}
 	for _, chunk := range chunks {
Original file line number	Diff line number	Diff line change
`@@ -457,7 +457,7 @@ func (sp MarkdownTextSplitter) chunkPlainText(content Content, headers []Header)`
`457`	`457`	`}`
`458`	`458`
`459`	`459`	`rawRunes := []rune(sp.RawText)`
`460`		`- startScanPosition := 0`
	`460`	`+ startScanPosition := content.BlockStartPosition`
`461`	`461`
`462`	`462`	`contentChunks := []ContentChunk{}`
`463`	`463`	`for _, chunk := range chunks {`