Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions pkg/component/ai/gemini/v0/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
package gemini

import (
"fmt"
"slices"
"strings"

"github.com/instill-ai/pipeline-backend/pkg/data"
)

// formatSupport groups the MIME types recognised for one media category
// (image, audio, video or document) into two support tiers.
//
// gemini lists the content types treated as supported by the Gemini API;
// instill lists the content types known to Instill Core. validateFormat
// consults gemini first and falls back to instill.
type formatSupport struct {
	gemini, instill []string
}

// imageFormats enumerates the image MIME types per support tier.
var imageFormats = formatSupport{
	gemini: []string{
		data.PNG,
		data.JPEG,
		data.WEBP,
		"image/heic", // HEIC, spelled out as a literal here
		"image/heif", // HEIF, spelled out as a literal here
	},
	instill: []string{
		data.PNG,
		data.JPEG,
		data.WEBP,
		data.GIF,
		data.BMP,
		data.TIFF,
	},
}

// audioFormats enumerates the audio MIME types per support tier.
var audioFormats = formatSupport{
	gemini: []string{
		data.WAV,
		"audio/mp3", // MP3 under its short alias
		data.MP3,    // MP3 as audio/mpeg
		data.AIFF,
		data.AAC,
		data.OGG, // OGG Vorbis
		data.FLAC,
	},
	instill: []string{
		data.MP3, // audio/mpeg
		data.WAV,
		data.AAC,
		data.OGG,
		data.FLAC,
		data.M4A, // audio/mp4
		data.WMA, // audio/x-ms-wma
		data.AIFF,
	},
}

// videoFormats enumerates the video MIME types per support tier. Several
// containers appear twice in the gemini tier: once through the data package
// constant and once through a shorter literal alias.
var videoFormats = formatSupport{
	gemini: []string{
		data.MP4,
		data.MPEG,
		data.MOV,     // video/quicktime
		"video/mov",  // MOV alias
		data.AVI,     // video/x-msvideo
		"video/avi",  // AVI alias
		data.FLV,     // video/x-flv
		"video/mpg",  // MPG: supported by Gemini but not defined in video.go
		data.WEBM,
		data.WMV,     // video/x-ms-wmv
		"video/wmv",  // WMV alias
		"video/3gpp", // 3GPP: supported by Gemini but not defined in video.go
	},
	instill: []string{
		data.MP4,
		data.AVI, // video/x-msvideo
		data.MOV, // video/quicktime
		data.WEBM,
		data.MKV, // video/x-matroska
		data.FLV, // video/x-flv
		data.WMV, // video/x-ms-wmv
		data.MPEG,
	},
}

// documentFormats enumerates the document MIME types per support tier.
// PDF is the only entry in the gemini tier; the remaining office and
// text formats are listed for Instill Core only.
var documentFormats = formatSupport{
	gemini: []string{
		data.PDF, // the only visual document format supported by Gemini
	},
	instill: []string{
		data.PDF,
		data.DOC,
		data.DOCX,
		data.PPT,
		data.PPTX,
		data.XLS,
		data.XLSX,
		data.HTML,
		data.MARKDOWN,
		data.TEXT,  // plain text
		data.PLAIN, // plain text
		data.CSV,
	},
}

// validateFormat classifies contentType against the two tiers in formats and
// reports whether it can be used, returning a processing mode and an error.
//
// The mode is only meaningful for mediaType "document": "visual" for PDF,
// "text" for documents handled as plain text. For every other media type the
// mode is always "". A non-nil error means the content type cannot be used
// as-is; its message tells the user how to convert, using convertibleFormats,
// supportedTargets and examples for non-document media.
func validateFormat(contentType, mediaType string, formats formatSupport, convertibleFormats, supportedTargets, examples string) (string, error) {
	isDocument := mediaType == "document"

	// Tier 1: content types listed as supported by the Gemini API.
	if slices.Contains(formats.gemini, contentType) {
		switch {
		case !isDocument:
			// Only documents carry a processing mode.
			return "", nil
		case contentType == data.PDF:
			return "visual", nil
		default:
			// No such document type is listed today; kept for future additions.
			return "text", nil
		}
	}

	// Not in either tier: nothing can be done with this content type.
	if !slices.Contains(formats.instill, contentType) {
		return "", fmt.Errorf("%s format %s is not supported and cannot be processed", mediaType, contentType)
	}

	// Tier 2, non-document media: point the user at the conversion syntax.
	if !isDocument {
		return "", fmt.Errorf("%s format %s is not supported by Gemini API. Use \":\" syntax to convert %s to %s (supported by both Gemini and Instill Core), such as \"%s\"", mediaType, contentType, convertibleFormats, supportedTargets, examples)
	}

	// Tier 2, documents: text-like formats are accepted in "text" mode.
	plainTextKinds := []string{data.HTML, data.MARKDOWN, data.TEXT, data.PLAIN, data.CSV}
	if strings.HasPrefix(contentType, "text/") || slices.Contains(plainTextKinds, contentType) {
		return "text", nil
	}

	// Office formats are rejected with a hint to convert them to PDF first.
	officeKinds := []string{data.DOC, data.DOCX, data.PPT, data.PPTX, data.XLS, data.XLSX}
	if slices.Contains(officeKinds, contentType) {
		return "", fmt.Errorf("document format %s will be processed as text only, losing visual elements like charts and formatting. Use \":pdf\" syntax to convert to PDF for document vision capabilities", contentType)
	}

	// Any remaining document type known to Instill Core but not to Gemini.
	return "", fmt.Errorf("document format %s is not supported by Gemini API. Use \":pdf\" syntax to convert DOC, DOCX, PPT, PPTX, XLS, XLSX to PDF (supported by both Gemini and Instill Core), such as \":pdf\"", contentType)
}
42 changes: 28 additions & 14 deletions pkg/component/ai/gemini/v0/config/tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1513,8 +1513,22 @@ TASK_CHAT:
type: array
items:
type: string
documents:
audio:
uiOrder: 4
title: Audio
description: URI references or base64 content of input audio.
type: array
items:
type: string
videos:
uiOrder: 5
title: Videos
description: URI references or base64 content of input videos.
type: array
items:
type: string
documents:
uiOrder: 6
title: Documents
description: >-
URI references or base64 content of input documents. Different vendors might have different constraints on the document format. For example,
Expand All @@ -1523,32 +1537,32 @@ TASK_CHAT:
items:
type: string
system-message:
uiOrder: 5
uiOrder: 7
title: System Message
description: Instruction to set the assistant's behavior, tone, or persona. Different vendors might name this field differently.
type: string
chat-history:
uiOrder: 6
uiOrder: 8
title: Chat History
description: Conversation history, each message includes a role and content.
type: array
items:
$ref: "#/$defs/content"
max-output-tokens:
uiOrder: 7
uiOrder: 9
title: Max Output Token
description: The maximum number of tokens to generate in the model output.
type: integer
temperature:
uiOrder: 8
uiOrder: 10
title: Temperature
description: >-
A parameter that controls the randomness and creativity of a large language model's output by adjusting the probability of the next
word it chooses. A low temperature (e.g., near 0) produces more deterministic, focused, and consistent text, while a high temperature (e.g., near
1) leads to more creative, random, and varied output.
type: number
top-k:
uiOrder: 9
uiOrder: 11
title: Top-K
description: >-
A text generation parameter that limits the selection of the next token to the K most probable tokens, discarding the rest to control randomness
Expand All @@ -1568,44 +1582,44 @@ TASK_CHAT:
choice to a smaller, more focused set of highly probable words, resulting in more factual and conservative output.
type: number
seed:
uiOrder: 11
uiOrder: 12
title: Seed
description: A random seed used to control the stochasticity of text generation to produce repeatable outputs
type: integer
contents:
uiOrder: 12
uiOrder: 13
title: Contents
description: The input contents to the model. Each item represents a user or model turn composed of parts (text or images).
type: array
items:
$ref: "#/$defs/content"
tools:
uiOrder: 13
uiOrder: 14
title: Tools
description: Tools available to the model, e.g., function declarations.
type: array
items:
$ref: "#/$defs/tool"
tool-config:
uiOrder: 14
uiOrder: 15
$ref: "#/$defs/tool-config"
safety-settings:
uiOrder: 15
uiOrder: 16
title: Safety Settings
description: Safety settings for content filtering.
type: array
items:
$ref: "#/$defs/safety-setting"
system-instruction:
uiOrder: 16
uiOrder: 17
title: System Instruction
description: A system instruction to guide the model behavior.
$ref: "#/$defs/content"
generation-config:
uiOrder: 17
uiOrder: 18
$ref: "#/$defs/generation-config"
cached-content:
uiOrder: 18
uiOrder: 19
title: Cached Content
description: "The name of a cached content to use as context. Format: cachedContents/{cachedContent}."
type: string
Expand Down
2 changes: 2 additions & 0 deletions pkg/component/ai/gemini/v0/io.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ type TaskChatInput struct {
Stream *bool `instill:"stream"`
Prompt *string `instill:"prompt"`
Images []format.Image `instill:"images"`
Audio []format.Audio `instill:"audio"`
Videos []format.Video `instill:"videos"`
Documents []format.Document `instill:"documents"`
SystemMessage *string `instill:"system-message"`
ChatHistory []*genai.Content `instill:"chat-history"`
Expand Down
Loading
Loading