feat(component,ai,gemini): add text embeddings task support (#1129)

pinglin · pinglin · commit d7ca6cf8fb00 · 2025-09-27T22:30:01.000+01:00
Because

- The Gemini component needed support for text embeddings functionality
to enable use cases like semantic search, classification, and clustering
- Users required the ability to generate embeddings using the
`gemini-embedding-001` model with various task types for optimal
performance

This commit

- Adds `TASK_TEXT_EMBEDDINGS` task with comprehensive input/output
schema supporting multiple task types
- Implements TaskTextEmbeddingsInput and TaskTextEmbeddingsOutput
structs with proper validation and embedding generation
- Adds comprehensive test coverage for the text embeddings functionality
diff --git a/pkg/component/ai/gemini/v0/.compogen/bottom.mdx b/pkg/component/ai/gemini/v0/.compogen/bottom.mdx
@@ -1,5 +1,7 @@
 ## Example Recipes
 
+### Chat with multimodality inputs
+
 ```yaml
 version: v1beta
 component:
@@ -58,3 +60,55 @@ output:
     title: response-id
     value: ${gemini.output.response-id}
 ```
+
+### Cache a document
+
+```yaml
+version: v1beta
+component:
+  gemini:
+    type: gemini
+    task: TASK_CACHE
+    input:
+      model: gemini-2.5-flash
+      operation: create
+      ttl: 60s
+      documents:
+        - ${variable.document}
+      system-message: You are a helpful assistant.
+variable:
+  document:
+    title: Document
+    description: Document to cache
+    type: document
+  stream:
+    title: Enable Stream
+    description: whether to enable streaming
+    type: boolean
+output:
+  cached-content:
+    title: cached content
+    value: ${gemini.output.cached-content}
+```
+
+### Embed a text input
+
+```yaml
+version: v1beta
+component:
+  gemini:
+    type: gemini
+    task: TASK_TEXT_EMBEDDINGS
+    input:
+      model: gemini-embedding-001
+      text: ${variable.text}
+      task-type: SEMANTIC_SIMILARITY
+variable:
+  text:
+    title: Text
+    description: Text to embed
+    type: string
+output:
+  embedding:
+    title: Embedding result
+    value: ${gemini.output.embedding}
+```
diff --git a/pkg/component/ai/gemini/v0/README.mdx b/pkg/component/ai/gemini/v0/README.mdx
diff --git a/pkg/component/ai/gemini/v0/config/definition.yaml b/pkg/component/ai/gemini/v0/config/definition.yaml
@@ -1,6 +1,7 @@
 availableTasks:
   - TASK_CHAT
   - TASK_CACHE
+  - TASK_TEXT_EMBEDDINGS
 custom: false
 icon: assets/gemini.svg
 iconUrl: ""
diff --git a/pkg/component/ai/gemini/v0/config/tasks.yaml b/pkg/component/ai/gemini/v0/config/tasks.yaml
@@ -1968,3 +1968,96 @@ TASK_CACHE:
         description: >-
           [**LIST**] Token for retrieving the next page of results for list operations.
         type: string
+TASK_TEXT_EMBEDDINGS:
+  shortDescription: Turn text into numbers, unlocking use cases like search.
+  input:
+    uiOrder: 0
+    title: Input
+    description: Input schema of the text embeddings task.
+    type: object
+    required:
+      - text
+      - model
+      - task-type
+    properties:
+      model:
+        uiOrder: 0
+        title: Model
+        shortDescription: ID of the model to use
+        description: >
+          The Gemini embedding model, gemini-embedding-001, is trained using the Matryoshka Representation Learning (MRL) technique, which teaches a model
+          to learn high-dimensional embeddings whose initial segments (or prefixes) are also useful, simpler versions of the same data.
+        type: string
+        enum:
+          - gemini-embedding-001
+        instillCredentialMap:
+          values:
+            - gemini-embedding-001
+          targets:
+            - setup.api-key
+      text:
+        uiOrder: 1
+        title: Text
+        description: The text to generate embeddings for.
+        type: string
+      task-type:
+        uiOrder: 2
+        title: Task Type
+        description: >-
+          The type of task to perform for optimal embedding generation.
+          The value is one of the following:
+            `SEMANTIC_SIMILARITY`: (Default) Embeddings optimized to assess text similarity. Examples: Recommendation systems, duplicate detection.
+            `CLASSIFICATION`: Embeddings optimized to classify texts according to preset labels. Examples: Sentiment analysis, spam detection.
+            `CLUSTERING`: Embeddings optimized to group similar texts together. Examples: Document organization, market research, anomaly detection.
+            `RETRIEVAL_DOCUMENT`: Embeddings optimized for document search. Examples: Indexing articles, books, or web pages for search.
+            `RETRIEVAL_QUERY`: Embeddings optimized for general search queries. Use `RETRIEVAL_QUERY` for queries; `RETRIEVAL_DOCUMENT` for documents to
+          be retrieved. Examples: General search queries for custom search applications.
+            `CODE_RETRIEVAL_QUERY`: Embeddings optimized for retrieval of code blocks based on natural language queries. Use `CODE_RETRIEVAL_QUERY` for
+          queries; `RETRIEVAL_DOCUMENT` for code blocks to be retrieved. Examples: Natural language queries about code for code suggestions and search.
+            `QUESTION_ANSWERING`: Embeddings for questions in a question-answering system, optimized for finding documents that answer the question. Use
+          `QUESTION_ANSWERING` for questions; `RETRIEVAL_DOCUMENT` for documents to be retrieved. Examples: Questions in Q&A systems, chatbots, knowledge
+          bases.
+            `FACT_VERIFICATION`: Embeddings for statements that need to be verified, optimized for retrieving documents that contain evidence supporting
+          or refuting the statement. Use `FACT_VERIFICATION` for the target text; `RETRIEVAL_DOCUMENT` for documents to be retrieved. Examples: Statements
+          that need verification for automated fact-checking systems.
+        type: string
+        enum:
+          - SEMANTIC_SIMILARITY
+          - CLASSIFICATION
+          - CLUSTERING
+          - RETRIEVAL_DOCUMENT
+          - RETRIEVAL_QUERY
+          - CODE_RETRIEVAL_QUERY
+          - QUESTION_ANSWERING
+          - FACT_VERIFICATION
+        default: SEMANTIC_SIMILARITY
+      title:
+        uiOrder: 3
+        title: Title
+        description: >-
+          Title for the text. Only applicable when `task-type` is `RETRIEVAL_DOCUMENT`.
+        type: string
+      output-dimensionality:
+        uiOrder: 4
+        title: Output Dimensionality
+        description: >-
+          Use this parameter to control the size of the output embedding vector. Selecting a smaller output dimensionality can save storage space and increase
+          computational efficiency for downstream applications, while sacrificing little in terms of quality. By default, it outputs a 3072-dimensional
+          embedding, but you can truncate it to a smaller size without losing quality to save storage space. We recommend using 768, 1536, or 3072 output
+          dimensions.
+        type: integer
+        minimum: 1
+        maximum: 3072
+  output:
+    uiOrder: 0
+    title: Output
+    description: Output schema of the text embeddings task.
+    type: object
+    required:
+      - embedding
+    properties:
+      embedding:
+        uiOrder: 0
+        title: Embedding
+        description: The embedding of the text.
+        $ref: schema.yaml#/$defs/instill-types/embedding
diff --git a/pkg/component/ai/gemini/v0/io.go b/pkg/component/ai/gemini/v0/io.go
@@ -127,3 +127,17 @@ type TaskCacheOutput struct {
 	CachedContents []*genai.CachedContent `instill:"cached-contents"`
 	NextPageToken  *string                `instill:"next-page-token"`
 }
+
+// TaskTextEmbeddingsInput is the input for the TASK_TEXT_EMBEDDINGS task.
+//
+// The `instill` tags use the same key names as the task's input schema:
+//   - Model is the name of the embedding model handed to the Gemini API.
+//   - Text is the text to embed; it is sent as a single user-role content.
+//   - TaskType selects the embedding task type; when empty, the task
+//     falls back to "SEMANTIC_SIMILARITY".
+//   - Title is forwarded as-is in the embedding request configuration.
+//   - OutputDimensionality is forwarded as-is in the embedding request
+//     configuration; it is a pointer, so it may be left nil.
+type TaskTextEmbeddingsInput struct {
+	Model                string `instill:"model"`
+	Text                 string `instill:"text"`
+	TaskType             string `instill:"task-type"`
+	Title                string `instill:"title"`
+	OutputDimensionality *int32 `instill:"output-dimensionality"`
+}
+
+// TaskTextEmbeddingsOutput is the output for the TASK_TEXT_EMBEDDINGS task.
+//
+// Embedding holds the values of the first embedding returned by the API,
+// converted from float32 to float64 by the task.
+type TaskTextEmbeddingsOutput struct {
+	Embedding []float64 `instill:"embedding"`
+}
diff --git a/pkg/component/ai/gemini/v0/main.go b/pkg/component/ai/gemini/v0/main.go
@@ -13,11 +13,13 @@ import (
 	"google.golang.org/protobuf/types/known/structpb"
 
 	"github.com/instill-ai/pipeline-backend/pkg/component/base"
+	"github.com/instill-ai/pipeline-backend/pkg/component/resources/schemas"
 )
 
 const (
-	ChatTask  = "TASK_CHAT"
-	CacheTask = "TASK_CACHE"
+	ChatTask           = "TASK_CHAT"
+	CacheTask          = "TASK_CACHE"
+	TextEmbeddingsTask = "TASK_TEXT_EMBEDDINGS"
 
 	cfgAPIKey = "api-key"
 )
@@ -45,7 +47,10 @@ type component struct {
 func Init(bc base.Component) *component {
 	once.Do(func() {
 		comp = &component{Component: bc}
-		err := comp.LoadDefinition(definitionYAML, setupYAML, tasksYAML, nil, nil)
+		additionalYAMLBytes := map[string][]byte{
+			"schema.yaml": schemas.SchemaYAML,
+		}
+		err := comp.LoadDefinition(definitionYAML, setupYAML, tasksYAML, nil, additionalYAMLBytes)
 		if err != nil {
 			panic(err)
 		}
@@ -82,6 +87,8 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution,
 		e.execute = e.chat
 	case CacheTask:
 		e.execute = e.cache
+	case TextEmbeddingsTask:
+		e.execute = e.textEmbeddings
 	default:
 		return nil, fmt.Errorf("unsupported task")
 	}
diff --git a/pkg/component/ai/gemini/v0/task_text_embeddings.go b/pkg/component/ai/gemini/v0/task_text_embeddings.go
@@ -0,0 +1,78 @@
+package gemini
+
+import (
+	"context"
+	"fmt"
+
+	"google.golang.org/genai"
+
+	"github.com/instill-ai/pipeline-backend/pkg/component/base"
+)
+
+// textEmbeddings runs the TASK_TEXT_EMBEDDINGS task: it reads the job input,
+// asks the Gemini API to embed the text and writes the first returned
+// embedding to the job output.
+//
+// Every failure is reported through job.Error; the method itself always
+// returns nil.
+func (e *execution) textEmbeddings(ctx context.Context, job *base.Job) error {
+	// Read input
+	in := TaskTextEmbeddingsInput{}
+	if err := job.Input.ReadData(ctx, &in); err != nil {
+		job.Error.Error(ctx, err)
+		return nil
+	}
+
+	// Create Gemini client
+	client, err := e.createGeminiClient(ctx)
+	if err != nil {
+		job.Error.Error(ctx, err)
+		return nil
+	}
+
+	// Create content from input text
+	contents := []*genai.Content{
+		genai.NewContentFromText(in.Text, genai.RoleUser),
+	}
+
+	// Use the task type from input, defaulting to SEMANTIC_SIMILARITY if empty
+	taskType := in.TaskType
+	if taskType == "" {
+		taskType = "SEMANTIC_SIMILARITY"
+	}
+
+	// Generate embeddings using the Gemini API
+	result, err := client.Models.EmbedContent(ctx, in.Model, contents, &genai.EmbedContentConfig{
+		TaskType:             taskType,
+		OutputDimensionality: in.OutputDimensionality,
+		Title:                in.Title,
+	})
+	if err != nil {
+		job.Error.Error(ctx, err)
+		return nil
+	}
+
+	// The API call succeeded, so err is nil from here on: an empty response
+	// must be reported with an explicit error rather than by forwarding err.
+	if result == nil || len(result.Embeddings) == 0 {
+		job.Error.Error(ctx, fmt.Errorf("embedding text with model %q: no embeddings returned", in.Model))
+		return nil
+	}
+
+	embedding := result.Embeddings[0]
+	if embedding == nil || len(embedding.Values) == 0 {
+		job.Error.Error(ctx, fmt.Errorf("embedding text with model %q: returned embedding has no values", in.Model))
+		return nil
+	}
+
+	// Convert from []float32 to []float64 for consistency with other components
+	embeddingFloat64 := make([]float64, len(embedding.Values))
+	for i, v := range embedding.Values {
+		embeddingFloat64[i] = float64(v)
+	}
+
+	// Prepare output
+	output := TaskTextEmbeddingsOutput{
+		Embedding: embeddingFloat64,
+	}
+
+	// Write output
+	if err := job.Output.WriteData(ctx, output); err != nil {
+		job.Error.Error(ctx, err)
+		return nil
+	}
+
+	return nil
+}
diff --git a/pkg/component/ai/gemini/v0/task_text_embeddings_test.go b/pkg/component/ai/gemini/v0/task_text_embeddings_test.go
diff --git a/pkg/component/operator/image/v0/config/schema.yaml b/pkg/component/operator/image/v0/config/schema.yaml