-
Notifications
You must be signed in to change notification settings - Fork 150
feat: add support for automatic embeddings for the insert many tool MCP-236 #688
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9b31d88
b8a37ef
00a2b6b
c22f2c5
6ab9231
aa6dd8a
dc2f207
fcfdd32
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,3 +13,5 @@ tests/tmp | |
| coverage | ||
| # Generated assets by accuracy runs | ||
| .accuracy | ||
|
|
||
| .DS_Store | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,17 @@ import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; | |
| import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; | ||
| import { type ToolArgs, type OperationType, formatUntrustedData } from "../../tool.js"; | ||
| import { zEJSON } from "../../args.js"; | ||
| import { type Document } from "bson"; | ||
| import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js"; | ||
| import { ErrorCodes, MongoDBError } from "../../../common/errors.js"; | ||
|
|
||
| // Schema for the tool's embedding parameters: the shared embedding parameters extended with an | ||
| // "input" array of pass-through objects, one entry per document, keyed by vector-indexed field path. | ||
| const zSupportedEmbeddingParametersWithInput = zSupportedEmbeddingParameters.extend({ | ||
| input: z | ||
| .array(z.object({}).passthrough()) | ||
| .describe( | ||
| "Array of objects with vector search index fields as keys (in dot notation) and the raw text values to generate embeddings for as values. The index of each object corresponds to the index of the document in the documents array." | ||
| ), | ||
| }); | ||
|
|
||
| export class InsertManyTool extends MongoDBToolBase { | ||
| public name = "insert-many"; | ||
|
|
@@ -12,46 +23,44 @@ export class InsertManyTool extends MongoDBToolBase { | |
| documents: z | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Okay, so I did my best to prompt engineer it to generate embeddings by supplying plain text to the vector search indexed fields as strings inside Open to suggestions if I may be overlooking something here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this should work fine; we have the accuracy tests if something goes awry in the future, and we can safely tweak the prompt. |
||
| .array(zEJSON().describe("An individual MongoDB document")) | ||
| .describe( | ||
| "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()" | ||
| "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()." | ||
| ), | ||
| ...(this.isFeatureEnabled("vectorSearch") | ||
| ? { | ||
| embeddingParameters: zSupportedEmbeddingParametersWithInput | ||
| .optional() | ||
| .describe( | ||
| "The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one." | ||
| ), | ||
| } | ||
| : {}), | ||
| }; | ||
| public operationType: OperationType = "create"; | ||
|
|
||
| protected async execute({ | ||
| database, | ||
| collection, | ||
| documents, | ||
| embeddingParameters: providedEmbeddingParameters, | ||
| }: ToolArgs<typeof this.argsShape>): Promise<CallToolResult> { | ||
| const provider = await this.ensureConnected(); | ||
|
|
||
| const embeddingValidations = new Set( | ||
| ...(await Promise.all( | ||
| documents.flatMap((document) => | ||
| this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings( | ||
| { database, collection }, | ||
| document | ||
| ) | ||
| ) | ||
| )) | ||
| ); | ||
| const embeddingParameters = this.isFeatureEnabled("vectorSearch") | ||
| ? (providedEmbeddingParameters as z.infer<typeof zSupportedEmbeddingParametersWithInput>) | ||
| : undefined; | ||
|
|
||
| if (embeddingValidations.size > 0) { | ||
| // tell the LLM what happened | ||
| const embeddingValidationMessages = [...embeddingValidations].map( | ||
| (validation) => | ||
| `- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` + | ||
| ` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` + | ||
| `actual quantization: ${validation.actualQuantization}. Error: ${validation.error}` | ||
| ); | ||
|
|
||
| return { | ||
| content: formatUntrustedData( | ||
| "There were errors when inserting documents. No document was inserted.", | ||
| ...embeddingValidationMessages | ||
| ), | ||
| isError: true, | ||
| }; | ||
| } | ||
| // Process documents to replace raw string values with generated embeddings | ||
| documents = await this.replaceRawValuesWithEmbeddingsIfNecessary({ | ||
| database, | ||
| collection, | ||
| documents, | ||
| embeddingParameters, | ||
| }); | ||
|
|
||
| await this.session.vectorSearchEmbeddingsManager.assertFieldsHaveCorrectEmbeddings( | ||
| { database, collection }, | ||
| documents | ||
|
Comment on lines
+60
to
+62
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [q] do we need to run this if embeddings are not enabled? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This was existing behavior, so I kept it, as we would want to prevent the model from adding arbitrary data to a vector search indexed field. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So @gagik is completely right. It's to prevent the agent adding a raw field in a place where the server expects an embedding. Currently, the server does not reject these invalid values (unless the user specifies a JSON Schema), and it would break the VS Index, and the behaviour can be pretty inconsistent depending on how it breaks. |
||
| ); | ||
|
Comment on lines
+60
to
+63
|
||
|
|
||
| const result = await provider.insertMany(database, collection, documents); | ||
| const content = formatUntrustedData( | ||
|
|
@@ -63,4 +72,84 @@ export class InsertManyTool extends MongoDBToolBase { | |
| content, | ||
| }; | ||
| } | ||
|
|
||
| // Generates embeddings for the raw values listed in embeddingParameters.input and stores them on the | ||
| // corresponding documents (input[i] applies to documents[i]). | ||
| // Returns the documents argument untouched when no embedding input is provided; otherwise returns a | ||
| // shallow copy of the array whose document objects have been modified in place. | ||
| // Throws MongoDBError when an input field path has no vector search index in the namespace, or when an | ||
| // input entry has no document at the same index. | ||
| private async replaceRawValuesWithEmbeddingsIfNecessary({ | ||
| database, | ||
| collection, | ||
| documents, | ||
| embeddingParameters, | ||
| }: { | ||
| database: string; | ||
| collection: string; | ||
| documents: Document[]; | ||
| embeddingParameters?: z.infer<typeof zSupportedEmbeddingParametersWithInput>; | ||
| }): Promise<Document[]> { | ||
| // If no embedding parameters or no input specified, return documents as-is | ||
| if (!embeddingParameters?.input || embeddingParameters.input.length === 0) { | ||
| return documents; | ||
| } | ||
|
|
||
| // Get vector search indexes for the collection | ||
| const vectorIndexes = await this.session.vectorSearchEmbeddingsManager.embeddingsForNamespace({ | ||
| database, | ||
| collection, | ||
| }); | ||
|
|
||
| // Ensure for inputted fields, the vector search index exists. | ||
| for (const input of embeddingParameters.input) { | ||
| for (const fieldPath of Object.keys(input)) { | ||
| if (!vectorIndexes.some((index) => index.path === fieldPath)) { | ||
| throw new MongoDBError( | ||
| ErrorCodes.AtlasVectorSearchInvalidQuery, | ||
| `Field '${fieldPath}' does not have a vector search index in collection ${database}.${collection}. Only fields with vector search indexes can have embeddings generated.` | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // We make one call to generate embeddings for all documents at once to avoid making too many API calls. | ||
| const flattenedEmbeddingsInput = embeddingParameters.input.flatMap((documentInput, index) => | ||
| Object.entries(documentInput).map(([fieldPath, rawTextValue]) => ({ | ||
| fieldPath, | ||
| rawTextValue, | ||
| documentIndex: index, | ||
| })) | ||
| ); | ||
|
|
||
| // NOTE(review): the values are cast to string[] without a runtime check, while the input schema | ||
| // accepts pass-through objects with arbitrary value types — confirm non-string values are rejected elsewhere. | ||
| const generatedEmbeddings = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({ | ||
| rawValues: flattenedEmbeddingsInput.map(({ rawTextValue }) => rawTextValue) as string[], | ||
| embeddingParameters, | ||
| inputType: "document", | ||
| }); | ||
|
|
||
| // Copies the array only; the document objects are shared with the caller's array. | ||
| const processedDocuments: Document[] = [...documents]; | ||
|
|
||
| // Assumes generateEmbeddings returns one result per raw value, in the same order — TODO confirm. | ||
| for (const [index, { fieldPath, documentIndex }] of flattenedEmbeddingsInput.entries()) { | ||
| if (!processedDocuments[documentIndex]) { | ||
| throw new MongoDBError(ErrorCodes.Unexpected, `Document at index ${documentIndex} does not exist.`); | ||
| } | ||
| // Ensure no nested fields are present in the field path. | ||
| this.deleteFieldPath(processedDocuments[documentIndex], fieldPath); | ||
| // NOTE(review): the embedding is stored under the literal dot-notation key, not a nested path — confirm this is intended. | ||
| processedDocuments[documentIndex][fieldPath] = generatedEmbeddings[index]; | ||
| } | ||
|
|
||
| return processedDocuments; | ||
| } | ||
|
|
||
| // Deletes the value at a dot-notation field path from a document, if present. | ||
| // Segments are matched as own properties rather than by truthiness, so a leaf holding a falsy | ||
| // value (0, "", false, null) is deleted too, and inherited keys are never traversed. | ||
| // Does nothing when a segment is missing or an intermediate value is not an object. | ||
| private deleteFieldPath(document: Record<string, unknown>, fieldPath: string): void { | ||
| const parts = fieldPath.split("."); | ||
| let current: Record<string, unknown> = document; | ||
| for (const [i, part] of parts.entries()) { | ||
| if (!Object.prototype.hasOwnProperty.call(current, part)) { | ||
| return; | ||
| } | ||
| if (i === parts.length - 1) { | ||
| delete current[part]; | ||
| return; | ||
| } | ||
| // Only descend into objects; primitives and null cannot contain the rest of the path. | ||
| const next = current[part]; | ||
| if (typeof next !== "object" || next === null) { | ||
| return; | ||
| } | ||
| current = next as Record<string, unknown>; | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know if this is considered niche but I find colima to be a nice simple way to run docker on Mac and if this existed I'd not have lost many painful moments of figuring this bit out