Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions pkg/component/ai/gemini/v0/io.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,17 @@ import (
// TaskChatInput is the input for the TASK_CHAT task.
type TaskChatInput struct {
// Flattened chat input properties
Stream *bool `instill:"stream"`
Prompt *string `instill:"prompt"`
Images []string `instill:"images"`
Documents []string `instill:"documents"`
SystemMessage *string `instill:"system-message"`
ChatHistory []content `instill:"chat-history"`
MaxOutputTokens *int32 `instill:"max-output-tokens"`
Temperature *float32 `instill:"temperature"`
TopP *float32 `instill:"top-p"`
TopK *int32 `instill:"top-k"`
Seed *int32 `instill:"seed"`
Stream *bool `instill:"stream"`
Prompt *string `instill:"prompt"`
Images []format.Image `instill:"images"`
Documents []format.Document `instill:"documents"`
SystemMessage *string `instill:"system-message"`
ChatHistory []content `instill:"chat-history"`
MaxOutputTokens *int32 `instill:"max-output-tokens"`
Temperature *float32 `instill:"temperature"`
TopP *float32 `instill:"top-p"`
TopK *int32 `instill:"top-k"`
Seed *int32 `instill:"seed"`

// Other properties
Model string `instill:"model"`
Expand Down
53 changes: 48 additions & 5 deletions pkg/component/ai/gemini/v0/task_chat.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@ package gemini
import (
"context"
"encoding/base64"
"fmt"
"mime"
"path"
"slices"
"strings"

"google.golang.org/genai"

"github.com/instill-ai/pipeline-backend/pkg/component/base"
"github.com/instill-ai/pipeline-backend/pkg/data"
)

func (e *execution) textGeneration(ctx context.Context, job *base.Job) error {
Expand Down Expand Up @@ -45,7 +48,11 @@ func (e *execution) textGeneration(ctx context.Context, job *base.Job) error {
cfg := buildGenerateContentConfig(in, systemMessage)

// Build user parts (prompt/contents + images + documents)
inParts := buildReqParts(in)
inParts, err := buildReqParts(in)
if err != nil {
job.Error.Error(ctx, err)
return nil
}
if len(inParts) == 0 {
return nil
}
Expand Down Expand Up @@ -483,7 +490,7 @@ func buildParts(ps []part) []genai.Part {
}

// buildReqParts constructs the user request parts from input, including prompt/contents, images, and documents.
func buildReqParts(in TaskChatInput) []genai.Part {
func buildReqParts(in TaskChatInput) ([]genai.Part, error) {
parts := []genai.Part{}
if in.Prompt != nil && *in.Prompt != "" {
parts = append(parts, genai.Part{Text: *in.Prompt})
Expand All @@ -492,16 +499,52 @@ func buildReqParts(in TaskChatInput) []genai.Part {
parts = append(parts, buildParts(last.Parts)...)
}
for _, img := range in.Images {
if p := newURIOrDataPart(img, detectMIMEFromPath(img, "image/png")); p != nil {
imgBase64, err := img.Base64()
if err != nil {
return nil, err
}
if p := newURIOrDataPart(imgBase64.String(), detectMIMEFromPath(imgBase64.String(), "image/png")); p != nil {
parts = append(parts, *p)
}
}
for _, doc := range in.Documents {
if p := newURIOrDataPart(doc, detectMIMEFromPath(doc, "application/pdf")); p != nil {
// Validate document MIME type - only PDF is supported
contentType := doc.ContentType().String()
if contentType != data.PDF {
if isConvertibleToPDF(contentType) {
return nil, fmt.Errorf("unsupported document MIME type %s, only PDF documents are supported; use \":pdf\" syntax in your input variable to convert the document to PDF format", contentType)
}
return nil, fmt.Errorf("unsupported document MIME type: %s, only PDF documents are supported", contentType)
}

docBase64, err := doc.Base64()
if err != nil {
return nil, err
}
if p := newURIOrDataPart(docBase64.String(), detectMIMEFromPath(docBase64.String(), "application/pdf")); p != nil {
parts = append(parts, *p)
}
}
return parts
return parts, nil
}

// isConvertibleToPDF reports whether contentType is one of the non-PDF MIME
// types enumerated below. buildReqParts uses the result to decide whether its
// "unsupported document" error should also suggest the ":pdf" conversion
// syntax.
func isConvertibleToPDF(contentType string) bool {
	return slices.Contains([]string{
		// Office documents.
		data.DOC,  // application/msword
		data.DOCX, // application/vnd.openxmlformats-officedocument.wordprocessingml.document
		data.PPT,  // application/vnd.ms-powerpoint
		data.PPTX, // application/vnd.openxmlformats-officedocument.presentationml.presentation
		data.XLS,  // application/vnd.ms-excel
		data.XLSX, // application/vnd.openxmlformats-officedocument.spreadsheetml.sheet

		// Text-based formats.
		data.HTML,     // text/html
		data.MARKDOWN, // text/markdown
		data.TEXT,     // text
		data.PLAIN,    // text/plain
		data.CSV,      // text/csv
	}, contentType)
}

// buildGenerateContentConfig creates a genai.GenerateContentConfig from the input parameters
Expand Down
77 changes: 72 additions & 5 deletions pkg/component/ai/gemini/v0/task_chat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ import (
"encoding/base64"
"testing"

"google.golang.org/genai"

qt "github.com/frankban/quicktest"

"google.golang.org/genai"
"github.com/instill-ai/pipeline-backend/pkg/data"
"github.com/instill-ai/pipeline-backend/pkg/data/format"
)

func Test_newURIOrDataPart_DataURI_ImagePNG(t *testing.T) {
Expand Down Expand Up @@ -72,14 +75,32 @@ func Test_buildParts_TextAndInlineData(t *testing.T) {
func Test_buildReqParts_Prompt_Images_Documents(t *testing.T) {
c := qt.New(t)
prompt := "Summarize this."
imgData := ""
imgData := "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII="
pdfHeader := "JVBERi0xLjQK" // raw base64 PDF header
imageBytes, err := base64.StdEncoding.DecodeString(imgData)
if err != nil {
t.Fatal(err)
}
img, err := data.NewImageFromBytes(imageBytes, "image/png", "test.png", true)
if err != nil {
t.Fatal(err)
}
pdfBytes, err := base64.StdEncoding.DecodeString(pdfHeader)
if err != nil {
t.Fatal(err)
}
doc, err := data.NewDocumentFromBytes(pdfBytes, "application/pdf", "test.pdf")
if err != nil {
t.Fatal(err)
}

in := TaskChatInput{
Prompt: &prompt,
Images: []string{imgData},
Documents: []string{pdfHeader},
Images: []format.Image{img},
Documents: []format.Document{doc},
}
got := buildReqParts(in)
got, err := buildReqParts(in)
c.Assert(err, qt.IsNil)
// Expect 1 text + 1 image + 1 doc = 3 parts
c.Assert(got, qt.HasLen, 3)
c.Check(got[0].Text, qt.Equals, prompt)
Expand All @@ -89,6 +110,52 @@ func Test_buildReqParts_Prompt_Images_Documents(t *testing.T) {
c.Check(got[2].InlineData.MIMEType, qt.Equals, "application/pdf")
}

// Test_buildReqParts_UnsupportedDocumentMIME_Convertible verifies that a
// non-PDF document whose MIME type is in the convertible list (here DOC) makes
// buildReqParts fail with an error that names the offending MIME type and
// points the user at the ":pdf" conversion syntax, and that no parts are
// returned alongside the error.
func Test_buildReqParts_UnsupportedDocumentMIME_Convertible(t *testing.T) {
	c := qt.New(t)
	prompt := "Summarize this."

	// Create a document with convertible MIME type (DOC)
	docBytes := []byte("This is a DOC document")
	doc, err := data.NewDocumentFromBytes(docBytes, data.DOC, "test.doc")
	if err != nil {
		t.Fatal(err)
	}

	in := TaskChatInput{
		Prompt:    &prompt,
		Documents: []format.Document{doc},
	}

	got, err := buildReqParts(in)
	c.Assert(err, qt.Not(qt.IsNil))
	// The convertible branch formats the message as
	// "unsupported document MIME type <type>, only PDF ...; use \":pdf\" syntax ..."
	// (no colon after "type", lowercase "use"), so the expectations must match
	// that exact wording.
	c.Assert(err.Error(), qt.Contains, "unsupported document MIME type application/msword")
	c.Assert(err.Error(), qt.Contains, "use \":pdf\" syntax")
	c.Assert(got, qt.IsNil)
}

// Test_buildReqParts_UnsupportedDocumentMIME_ConvertibleText verifies the same
// rejection path for a text-based convertible MIME type (CSV): buildReqParts
// must return an error naming the MIME type and suggesting the ":pdf" syntax,
// with a nil parts slice.
func Test_buildReqParts_UnsupportedDocumentMIME_ConvertibleText(t *testing.T) {
	c := qt.New(t)
	prompt := "Summarize this."

	// Create a document with convertible text MIME type (CSV)
	docBytes := []byte("name,age\nJohn,30\nJane,25")
	doc, err := data.NewDocumentFromBytes(docBytes, data.CSV, "test.csv")
	if err != nil {
		t.Fatal(err)
	}

	in := TaskChatInput{
		Prompt:    &prompt,
		Documents: []format.Document{doc},
	}

	got, err := buildReqParts(in)
	c.Assert(err, qt.Not(qt.IsNil))
	// Match the wording actually produced for convertible types: no colon after
	// "MIME type" and a lowercase "use" after the semicolon.
	c.Assert(err.Error(), qt.Contains, "unsupported document MIME type text/csv")
	c.Assert(err.Error(), qt.Contains, "use \":pdf\" syntax")
	c.Assert(got, qt.IsNil)
}

func Test_renderFinal_Minimal(t *testing.T) {
c := qt.New(t)
// Build a minimal GenerateContentResponse with one candidate and usage
Expand Down
Loading