Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 81 additions & 28 deletions pkg/component/ai/gemini/v0/task_chat.go
Original file line number Diff line number Diff line change
Expand Up @@ -490,14 +490,29 @@ func buildParts(ps []part) []genai.Part {
}

// buildReqParts constructs the user request parts from input, including prompt/contents, images, and documents.
// Following best practices: text content (from both Contents and Prompt) is placed after visual content (images/documents).
func buildReqParts(in TaskChatInput) ([]genai.Part, error) {
parts := []genai.Part{}
if in.Prompt != nil && *in.Prompt != "" {
parts = append(parts, genai.Part{Text: *in.Prompt})
} else if len(in.Contents) > 0 {

// Separate non-text and text parts from Contents for proper ordering
var nonTextParts []genai.Part
var textParts []genai.Part
if len(in.Contents) > 0 {
last := in.Contents[len(in.Contents)-1]
parts = append(parts, buildParts(last.Parts)...)
contentParts := buildParts(last.Parts)
for _, part := range contentParts {
if part.Text != "" {
textParts = append(textParts, part)
} else {
nonTextParts = append(nonTextParts, part)
}
}
}

// Add non-text parts from Contents first (images, files, etc.)
parts = append(parts, nonTextParts...)

// Add images before documents for optimal processing
for _, img := range in.Images {
imgBase64, err := img.Base64()
if err != nil {
Expand All @@ -507,41 +522,79 @@ func buildReqParts(in TaskChatInput) ([]genai.Part, error) {
parts = append(parts, *p)
}
}
// Process documents according to their capabilities:
// - PDFs: Full document vision support (charts, diagrams, formatting preserved)
// - Text-based: Extract as plain text (HTML tags, Markdown formatting, etc. lost)
// - Office documents: Recommend PDF conversion for visual understanding
for _, doc := range in.Documents {
// Validate document MIME type - only PDF is supported
contentType := doc.ContentType().String()
if contentType != data.PDF {
if isConvertibleToPDF(contentType) {
return nil, fmt.Errorf("unsupported document MIME type %s, only PDF documents are supported; use \":pdf\" syntax in your input variable to convert the document to PDF format", contentType)

if contentType == data.PDF {
// PDFs support full document vision capabilities
// The model can interpret visual elements like charts, diagrams, and formatting
docBase64, err := doc.Base64()
if err != nil {
return nil, err
}
return nil, fmt.Errorf("unsupported document MIME type: %s, only PDF documents are supported", contentType)
if p := newURIOrDataPart(docBase64.String(), detectMIMEFromPath(docBase64.String(), "application/pdf")); p != nil {
parts = append(parts, *p)
}
} else if isTextBasedDocument(contentType) {
// Text-based documents (TXT, Markdown, HTML, XML, etc.)
// These are processed as pure text content - visual formatting is lost
// The model won't see HTML tags, Markdown formatting, etc.
textContent, err := doc.Text()
if err != nil {
return nil, fmt.Errorf("failed to extract text from document: %w", err)
}
if textContent.String() != "" {
parts = append(parts, genai.Part{Text: textContent.String()})
}
} else if isConvertibleToPDF(contentType) {
// Office documents (DOC, DOCX, PPT, PPTX, XLS, XLSX)
// These can contain visual elements that would be lost in text extraction
return nil, fmt.Errorf("document type %s will be processed as text only, losing visual elements like charts and formatting; use \":pdf\" syntax in your input variable to convert to PDF for document vision capabilities", contentType)
} else {
return nil, fmt.Errorf("unsupported document type: %s", contentType)
}
}

docBase64, err := doc.Base64()
if err != nil {
return nil, err
}
if p := newURIOrDataPart(docBase64.String(), detectMIMEFromPath(docBase64.String(), "application/pdf")); p != nil {
parts = append(parts, *p)
}
// Add text parts after documents for best results (as per best practices)
// This includes both text parts from Contents and the Prompt field
parts = append(parts, textParts...)
if in.Prompt != nil && *in.Prompt != "" {
parts = append(parts, genai.Part{Text: *in.Prompt})
}

return parts, nil
}

// isConvertibleToPDF checks if a MIME type can be converted to PDF using the :pdf syntax
// isTextBasedDocument reports whether contentType identifies a document that
// should be handed to the model as extracted text rather than as an inline or
// URI part.
//
// A type qualifies when it carries the "text/" prefix (for example text/xml or
// text/javascript) or when it equals one of the explicitly listed data package
// constants.
// NOTE(review): how much markup (HTML tags, Markdown syntax) survives depends on
// the document's Text() extraction, which is not visible here — confirm before
// relying on either behaviour.
func isTextBasedDocument(contentType string) bool {
	// Every text/* subtype is accepted without needing its own entry.
	if strings.HasPrefix(contentType, "text/") {
		return true
	}

	// Explicit allow-list; it also covers identifiers that lack the "text/" prefix.
	explicitTextTypes := []string{
		data.HTML,     // text/html
		data.MARKDOWN, // text/markdown
		data.TEXT,     // text
		data.PLAIN,    // text/plain
		data.CSV,      // text/csv
	}
	return slices.Contains(explicitTextTypes, contentType)
}

// isConvertibleToPDF checks if a MIME type can be converted to PDF using the :pdf syntax.
// These document types often contain visual elements (charts, diagrams, formatting)
// that would be lost if processed as text only. PDF conversion preserves visual understanding.
func isConvertibleToPDF(contentType string) bool {
convertibleTypes := []string{
data.DOC, // application/msword
data.DOCX, // application/vnd.openxmlformats-officedocument.wordprocessingml.document
data.PPT, // application/vnd.ms-powerpoint
data.PPTX, // application/vnd.openxmlformats-officedocument.presentationml.presentation
data.XLS, // application/vnd.ms-excel
data.XLSX, // application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
data.HTML, // text/html
data.MARKDOWN, // text/markdown
data.TEXT, // text
data.PLAIN, // text/plain
data.CSV, // text/csv
data.DOC, // application/msword - may contain charts, images, formatting
data.DOCX, // application/vnd.openxmlformats-officedocument.wordprocessingml.document
data.PPT, // application/vnd.ms-powerpoint - presentations with slides, charts
data.PPTX, // application/vnd.openxmlformats-officedocument.presentationml.presentation
data.XLS, // application/vnd.ms-excel - spreadsheets with charts, formatting
data.XLSX, // application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
}

return slices.Contains(convertibleTypes, contentType)
Expand Down
197 changes: 185 additions & 12 deletions pkg/component/ai/gemini/v0/task_chat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,13 @@ func Test_buildReqParts_Prompt_Images_Documents(t *testing.T) {
}
got, err := buildReqParts(in)
c.Assert(err, qt.IsNil)
// Expect 1 text + 1 image + 1 doc = 3 parts
// Expect 1 image + 1 PDF doc + 1 text prompt = 3 parts (prompt now comes last)
c.Assert(got, qt.HasLen, 3)
c.Check(got[0].Text, qt.Equals, prompt)
c.Check(got[0].InlineData, qt.Not(qt.IsNil))
c.Check(got[0].InlineData.MIMEType, qt.Equals, "image/png")
c.Check(got[1].InlineData, qt.Not(qt.IsNil))
c.Check(got[1].InlineData.MIMEType, qt.Equals, "image/png")
c.Check(got[2].InlineData, qt.Not(qt.IsNil))
c.Check(got[2].InlineData.MIMEType, qt.Equals, "application/pdf")
c.Check(got[1].InlineData.MIMEType, qt.Equals, "application/pdf")
c.Check(got[2].Text, qt.Equals, prompt) // Prompt is now last
}

func Test_buildReqParts_UnsupportedDocumentMIME_Convertible(t *testing.T) {
Expand All @@ -128,17 +128,18 @@ func Test_buildReqParts_UnsupportedDocumentMIME_Convertible(t *testing.T) {

got, err := buildReqParts(in)
c.Assert(err, qt.Not(qt.IsNil))
c.Assert(err.Error(), qt.Contains, "unsupported document MIME type: application/msword")
c.Assert(err.Error(), qt.Contains, "Use \":pdf\" syntax")
c.Assert(err.Error(), qt.Contains, "document type application/msword will be processed as text only")
c.Assert(err.Error(), qt.Contains, "use \":pdf\" syntax")
c.Assert(got, qt.IsNil)
}

func Test_buildReqParts_UnsupportedDocumentMIME_ConvertibleText(t *testing.T) {
func Test_buildReqParts_TextBasedDocument_CSV(t *testing.T) {
c := qt.New(t)
prompt := "Summarize this."

// Create a document with convertible text MIME type (CSV)
docBytes := []byte("name,age\nJohn,30\nJane,25")
// Create a document with text-based MIME type (CSV)
csvContent := "name,age\nJohn,30\nJane,25"
docBytes := []byte(csvContent)
doc, err := data.NewDocumentFromBytes(docBytes, data.CSV, "test.csv")
if err != nil {
t.Fatal(err)
Expand All @@ -149,13 +150,185 @@ func Test_buildReqParts_UnsupportedDocumentMIME_ConvertibleText(t *testing.T) {
Documents: []format.Document{doc},
}

got, err := buildReqParts(in)
c.Assert(err, qt.IsNil)
// Expect 1 text part (CSV content) + 1 text part (prompt) = 2 parts
c.Assert(got, qt.HasLen, 2)
c.Check(got[0].Text, qt.Equals, csvContent) // CSV content as text
c.Check(got[1].Text, qt.Equals, prompt) // Prompt comes last
}

// Test_buildReqParts_TextBasedDocument_HTML checks that an HTML document is
// emitted as a text part and that the prompt is appended after it.
func Test_buildReqParts_TextBasedDocument_HTML(t *testing.T) {
	c := qt.New(t)

	question := "Extract the main content."
	rawHTML := "<html><body><h1>Title</h1><p>Content</p></body></html>"

	// Build the HTML document fixture straight from the raw markup.
	htmlDoc, err := data.NewDocumentFromBytes([]byte(rawHTML), data.HTML, "test.html")
	if err != nil {
		t.Fatal(err)
	}

	parts, err := buildReqParts(TaskChatInput{
		Prompt:    &question,
		Documents: []format.Document{htmlDoc},
	})
	c.Assert(err, qt.IsNil)

	// Expected order: document text first, prompt last.
	// The document text is compared against the raw markup, i.e. tags are expected unchanged.
	wantTexts := []string{rawHTML, question}
	c.Assert(parts, qt.HasLen, 2)
	for i, want := range wantTexts {
		c.Check(parts[i].Text, qt.Equals, want)
	}
}

// Test_buildReqParts_TextBasedDocument_Markdown checks that a Markdown document
// is emitted as a text part followed by the prompt.
func Test_buildReqParts_TextBasedDocument_Markdown(t *testing.T) {
	c := qt.New(t)

	instruction := "Convert to HTML."
	source := "# Title\n\nThis is **bold** text."

	// Markdown fixture created from the literal source text.
	mdDoc, err := data.NewDocumentFromBytes([]byte(source), data.MARKDOWN, "test.md")
	if err != nil {
		t.Fatal(err)
	}

	input := TaskChatInput{
		Documents: []format.Document{mdDoc},
		Prompt:    &instruction,
	}

	result, err := buildReqParts(input)
	c.Assert(err, qt.IsNil)

	// Two parts are expected: the Markdown content, then the prompt.
	c.Assert(result, qt.HasLen, 2)
	docPart, promptPart := result[0], result[1]
	c.Check(docPart.Text, qt.Equals, source)
	c.Check(promptPart.Text, qt.Equals, instruction)
}

// Test_buildReqParts_UnsupportedDocumentType verifies that buildReqParts
// rejects a document built from opaque bytes labelled application/octet-stream
// with an unrecognised file extension, returning an error and no parts.
func Test_buildReqParts_UnsupportedDocumentType(t *testing.T) {
	c := qt.New(t)
	prompt := "Process this."

	// Create a mock document that simulates an unsupported type: generic binary
	// MIME type plus a filename that does not map to any known format.
	docBytes := []byte("binary data")
	doc, err := data.NewDocumentFromBytes(docBytes, data.OCTETSTREAM, "test.unknown")
	if err != nil {
		t.Fatal(err)
	}

	in := TaskChatInput{
		Prompt:    &prompt,
		Documents: []format.Document{doc},
	}

	got, err := buildReqParts(in)
	c.Assert(err, qt.Not(qt.IsNil))
	// Since OCTETSTREAM with unknown extension gets converted to DOC by default,
	// it will be caught by our convertible check. Only that message is asserted:
	// a single error cannot also mention "text/csv", so the stale assertions for
	// the previous CSV-based error text were dropped.
	c.Assert(err.Error(), qt.Contains, "document type application/msword will be processed as text only")
	c.Assert(got, qt.IsNil)
}

// Test_buildReqParts_Contents_TextOrdering checks the ordering of the parts
// produced from mixed input: non-text parts from Contents, then Images, then
// Documents, and finally the text parts from Contents in their original order.
func Test_buildReqParts_Contents_TextOrdering(t *testing.T) {
	c := qt.New(t)

	// Fixtures: a 1x1 PNG and a minimal PDF header, both stored as base64.
	const pngB64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII="
	const pdfB64 = "JVBORi0xLjQK"

	pngRaw, err := base64.StdEncoding.DecodeString(pngB64)
	if err != nil {
		t.Fatal(err)
	}
	image, err := data.NewImageFromBytes(pngRaw, "image/png", "test.png", true)
	if err != nil {
		t.Fatal(err)
	}
	pdfRaw, err := base64.StdEncoding.DecodeString(pdfB64)
	if err != nil {
		t.Fatal(err)
	}
	pdfDoc, err := data.NewDocumentFromBytes(pdfRaw, "application/pdf", "test.pdf")
	if err != nil {
		t.Fatal(err)
	}

	// Contents interleaves text and inline image data on purpose.
	firstText := "First text from Contents"
	secondText := "Second text from Contents"
	inlineImage := base64.StdEncoding.EncodeToString(pngRaw)

	mixed := content{
		Parts: []part{
			{Text: &firstText},
			{InlineData: &blob{MIMEType: "image/png", Data: inlineImage}},
			{Text: &secondText},
		},
	}

	got, err := buildReqParts(TaskChatInput{
		Images:    []format.Image{image},
		Documents: []format.Document{pdfDoc},
		Contents:  []content{mixed},
	})
	c.Assert(err, qt.IsNil)

	// 1 image from Contents + 1 image from Images + 1 PDF + 2 texts = 5 parts.
	c.Assert(got, qt.HasLen, 5)

	// Leading parts are binary, in the order: Contents image, Images field, Documents.
	wantMIMETypes := []string{"image/png", "image/png", "application/pdf"}
	for i, mimeType := range wantMIMETypes {
		c.Check(got[i].InlineData, qt.Not(qt.IsNil))
		c.Check(got[i].InlineData.MIMEType, qt.Equals, mimeType)
	}

	// Trailing parts are the Contents texts, placed after the documents.
	wantTexts := []string{firstText, secondText}
	for offset, text := range wantTexts {
		c.Check(got[len(wantMIMETypes)+offset].Text, qt.Equals, text)
	}
}

// Test_isTextBasedDocument exercises isTextBasedDocument with a table of
// content types and the classification expected for each.
func Test_isTextBasedDocument(t *testing.T) {
	c := qt.New(t)

	cases := []struct {
		contentType string
		want        bool
	}{
		// Text-based document types.
		{data.HTML, true},
		{data.MARKDOWN, true},
		{data.PLAIN, true},
		{data.CSV, true},
		{"text/xml", true},
		{"text/javascript", true},

		// Non-text-based document types.
		{data.PDF, false},
		{data.DOC, false},
		{data.DOCX, false},
		{"application/octet-stream", false},
		{"image/png", false},
	}

	for _, tc := range cases {
		// The comment identifies the failing row, since all rows share one check line.
		c.Check(isTextBasedDocument(tc.contentType), qt.Equals, tc.want, qt.Commentf("contentType=%q", tc.contentType))
	}
}

// Test_isConvertibleToPDF exercises isConvertibleToPDF with content types that
// are expected to be reported as convertible and ones that are not.
func Test_isConvertibleToPDF(t *testing.T) {
	c := qt.New(t)

	// Office formats expected to be reported as convertible.
	convertible := []string{
		data.DOC,
		data.DOCX,
		data.PPT,
		data.PPTX,
		data.XLS,
		data.XLSX,
	}
	for _, contentType := range convertible {
		c.Check(isConvertibleToPDF(contentType), qt.Equals, true, qt.Commentf("contentType=%q", contentType))
	}

	// PDF itself, text formats and generic binary data are expected to be rejected.
	notConvertible := []string{
		data.PDF,
		data.HTML,
		data.MARKDOWN,
		data.PLAIN,
		"application/octet-stream",
	}
	for _, contentType := range notConvertible {
		c.Check(isConvertibleToPDF(contentType), qt.Equals, false, qt.Commentf("contentType=%q", contentType))
	}
}

func Test_renderFinal_Minimal(t *testing.T) {
c := qt.New(t)
// Build a minimal GenerateContentResponse with one candidate and usage
Expand Down
Loading