Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions pkg/component/operator/document/v0/benchmark_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package document

import (
"context"
"os"
"testing"

"github.com/instill-ai/pipeline-backend/pkg/component/base"
"github.com/instill-ai/pipeline-backend/pkg/component/internal/mock"
"github.com/instill-ai/pipeline-backend/pkg/data"
"github.com/instill-ai/pipeline-backend/pkg/data/format"
)

// BenchmarkFileReading measures two ways of loading the same PDF fixture:
// the "Uncached" sub-benchmark goes through readFileDirectly on every
// iteration, while the "Cached" sub-benchmark goes through
// getTestFileContent after the test-file cache has been cleared once.
func BenchmarkFileReading(b *testing.B) {
	const testFile = "testdata/test.pdf"

	b.Run("Uncached", func(b *testing.B) {
		for iter := 0; iter < b.N; iter++ {
			if _, err := readFileDirectly(testFile); err != nil {
				b.Fatal(err)
			}
		}
	})

	b.Run("Cached", func(b *testing.B) {
		// Start from an empty cache so the first lookup has to populate it.
		clearTestFileCache()

		for iter := 0; iter < b.N; iter++ {
			if _, err := getTestFileContent(testFile); err != nil {
				b.Fatal(err)
			}
		}
	})
}

// BenchmarkHTMLConversion benchmarks the fastest conversion (HTML).
//
// The benchmark is skipped when neither "python3" nor "python" is reported
// as available by checkExternalDependency. Each iteration builds a fresh mock
// job whose ReadData callback supplies the HTML fixture as the input
// document, then runs the TASK_CONVERT_TO_MARKDOWN execution on it.
func BenchmarkHTMLConversion(b *testing.B) {
	// b.Skip stops the benchmark by itself, so no explicit return is needed.
	if !checkExternalDependency("python3") && !checkExternalDependency("python") {
		b.Skip("Python not found, skipping benchmark")
	}

	fileContent, err := getTestFileContent("testdata/test.html")
	if err != nil {
		b.Fatal(err)
	}

	// Fail fast, outside the timed region, if the fixture cannot be turned
	// into a document; otherwise the loop below would be timing an error path.
	if _, err := data.NewDocumentFromBytes(fileContent, "text/html", ""); err != nil {
		b.Fatal(err)
	}

	component := Init(base.Component{})
	execution, err := component.CreateExecution(base.ComponentExecution{
		Component: component,
		Task:      "TASK_CONVERT_TO_MARKDOWN",
	})
	if err != nil {
		b.Fatal(err)
	}

	// Exclude the setup above from the measurement.
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		ctx := context.Background()

		ir, ow, eh, job := mock.GenerateMockJob(nil)
		ir.ReadDataMock.Set(func(ctx context.Context, input any) error {
			switch input := input.(type) {
			case *ConvertDocumentToMarkdownInput:
				doc, err := data.NewDocumentFromBytes(fileContent, "text/html", "")
				if err != nil {
					// Surface the failure instead of silently installing a
					// nil document.
					return err
				}
				*input = ConvertDocumentToMarkdownInput{
					Document:        format.Document(doc),
					DisplayImageTag: false,
				}
			}
			return nil
		})

		// The converted output is discarded; only the conversion is of interest.
		ow.WriteDataMock.Set(func(ctx context.Context, output any) error {
			return nil
		})
		eh.ErrorMock.Optional()

		if err := execution.Execute(ctx, []*base.Job{job}); err != nil {
			b.Fatal(err)
		}
	}
}

// readFileDirectly loads the named file straight from disk with os.ReadFile,
// bypassing the test-file cache. It exists so benchmarks have an uncached
// baseline to compare against.
func readFileDirectly(filepath string) ([]byte, error) {
	contents, err := os.ReadFile(filepath)
	return contents, err
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package document

import (
"context"
"os"
"testing"

qt "github.com/frankban/quicktest"
Expand All @@ -18,15 +17,18 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
c.Parallel()

tests := []struct {
name string
filepath string
converter string
expected ConvertDocumentToMarkdownOutput
name string
filepath string
converter string
expected ConvertDocumentToMarkdownOutput
requiresLibreOffice bool
skipIfMissingDeps bool
}{
{
name: "Convert PDF file - pdfplumber",
filepath: "testdata/test.pdf",
converter: "pdfplumber",
name: "Convert PDF file - pdfplumber",
filepath: "testdata/test.pdf",
converter: "pdfplumber",
skipIfMissingDeps: true, // Skip if Python dependencies missing
expected: ConvertDocumentToMarkdownOutput{
Body: "# This is test file for markdown\n",
Images: []format.Image{},
Expand All @@ -35,9 +37,10 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
},
},
{
name: "Convert PDF file - Docling",
filepath: "testdata/test.pdf",
converter: "docling",
name: "Convert PDF file - Docling",
filepath: "testdata/test.pdf",
converter: "docling",
skipIfMissingDeps: true, // Skip if Python dependencies missing
expected: ConvertDocumentToMarkdownOutput{
Body: "This is test file for markdown",
Images: []format.Image{},
Expand All @@ -46,8 +49,10 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
},
},
{
name: "Convert DOCX file",
filepath: "testdata/test.docx",
name: "Convert DOCX file",
filepath: "testdata/test.docx",
requiresLibreOffice: true,
skipIfMissingDeps: true,
expected: ConvertDocumentToMarkdownOutput{
Body: "# This is test file for markdown\n",
Images: []format.Image{},
Expand All @@ -56,8 +61,10 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
},
},
{
name: "Convert DOC file",
filepath: "testdata/test.doc",
name: "Convert DOC file",
filepath: "testdata/test.doc",
requiresLibreOffice: true,
skipIfMissingDeps: true,
expected: ConvertDocumentToMarkdownOutput{
Body: "# This is test file for markdown\n",
Images: []format.Image{},
Expand All @@ -75,8 +82,10 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
},
},
{
name: "Convert PPTX file",
filepath: "testdata/test.pptx",
name: "Convert PPTX file",
filepath: "testdata/test.pptx",
requiresLibreOffice: true,
skipIfMissingDeps: true,
expected: ConvertDocumentToMarkdownOutput{
Body: "# This is test file for markdown\n",
Images: []format.Image{},
Expand All @@ -85,8 +94,10 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
},
},
{
name: "Convert PPT file",
filepath: "testdata/test.ppt",
name: "Convert PPT file",
filepath: "testdata/test.ppt",
requiresLibreOffice: true,
skipIfMissingDeps: true,
expected: ConvertDocumentToMarkdownOutput{
Body: "# This is test file for markdown\n",
Images: []format.Image{},
Expand Down Expand Up @@ -127,6 +138,18 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
c.Run(test.name, func(c *qt.C) {
c.Parallel()

// Skip tests that require external dependencies if they're not available
if test.skipIfMissingDeps {
if test.requiresLibreOffice && !checkExternalDependency("libreoffice") {
c.Skip("LibreOffice not found, skipping test")
return
}
if !checkExternalDependency("python3") && !checkExternalDependency("python") {
c.Skip("Python not found, skipping test")
return
}
}

ctx := context.Background()
component := Init(base.Component{})
c.Assert(component, qt.IsNotNil)
Expand All @@ -138,7 +161,8 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
c.Assert(err, qt.IsNil)
c.Assert(execution, qt.IsNotNil)

fileContent, err := os.ReadFile(test.filepath)
// Use cached file content for better performance
fileContent, err := getTestFileContent(test.filepath)
c.Assert(err, qt.IsNil)

ir, ow, eh, job := mock.GenerateMockJob(c)
Expand Down
14 changes: 12 additions & 2 deletions pkg/component/operator/document/v0/convert_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package document

import (
"context"
"os"
"testing"

qt "github.com/frankban/quicktest"
Expand All @@ -15,6 +14,14 @@ import (

func TestConvertToText(t *testing.T) {
c := qt.New(t)
c.Parallel()

// Skip test if Python dependencies are not available
if !checkExternalDependency("python3") && !checkExternalDependency("python") {
c.Skip("Python not found, skipping test")
return
}

tests := []struct {
name string
filepath string
Expand Down Expand Up @@ -57,10 +64,13 @@ func TestConvertToText(t *testing.T) {
bc := base.Component{}
for _, test := range tests {
c.Run(test.name, func(c *qt.C) {
c.Parallel()

component := Init(bc)
ctx := context.Background()

fileContent, err := os.ReadFile(test.filepath)
// Use cached file content for better performance
fileContent, err := getTestFileContent(test.filepath)
c.Assert(err, qt.IsNil)

execution, err := component.CreateExecution(base.ComponentExecution{
Expand Down
10 changes: 8 additions & 2 deletions pkg/component/operator/document/v0/convert_to_images_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package document

import (
"context"
"os"
"testing"

qt "github.com/frankban/quicktest"
Expand All @@ -16,6 +15,12 @@ import (
func Test_ConvertDocumentToImages(t *testing.T) {
c := qt.New(t)

// Skip test if Python dependencies are not available
if !checkExternalDependency("python3") && !checkExternalDependency("python") {
c.Skip("Python not found, skipping test")
return
}

test := struct {
name string
filepath string
Expand All @@ -39,7 +44,8 @@ func Test_ConvertDocumentToImages(t *testing.T) {
c.Assert(err, qt.IsNil)
c.Assert(execution, qt.IsNotNil)

fileContent, err := os.ReadFile(test.filepath)
// Use cached file content for better performance
fileContent, err := getTestFileContent(test.filepath)
c.Assert(err, qt.IsNil)

ir, ow, eh, job := mock.GenerateMockJob(c)
Expand Down
Loading
Loading