Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ tests/tmp
coverage
# Generated assets by accuracy runs
.accuracy

.DS_Store
4 changes: 4 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ npm test -- path/to/test/file.test.ts
npm test -- path/to/directory
```

#### Accuracy Tests and colima
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if this is considered niche but I find colima to be a nice simple way to run docker on Mac and if this existed I'd not have lost many painful moments of figuring this bit out


If you use [colima](https://github.com/abiosoft/colima) to run Docker on Mac, you will need to apply [additional configuration](https://node.testcontainers.org/supported-container-runtimes/#colima) to ensure the accuracy tests run correctly.

## Troubleshooting

### Restart Server
Expand Down
1 change: 1 addition & 0 deletions src/common/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export enum ErrorCodes {
NoEmbeddingsProviderConfigured = 1_000_005,
AtlasVectorSearchIndexNotFound = 1_000_006,
AtlasVectorSearchInvalidQuery = 1_000_007,
Unexpected = 1_000_008,
}

export class MongoDBError<ErrorCode extends ErrorCodes = ErrorCodes> extends Error {
Expand Down
2 changes: 1 addition & 1 deletion src/common/search/embeddingsProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { createFetch } from "@mongodb-js/devtools-proxy-support";
import { z } from "zod";

type EmbeddingsInput = string;
type Embeddings = number[];
type Embeddings = number[] | unknown[];
export type EmbeddingParameters = {
inputType: "query" | "document";
};
Expand Down
62 changes: 47 additions & 15 deletions src/common/search/vectorSearchEmbeddingsManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import z from "zod";
import { ErrorCodes, MongoDBError } from "../errors.js";
import { getEmbeddingsProvider } from "./embeddingsProvider.js";
import type { EmbeddingParameters, SupportedEmbeddingParameters } from "./embeddingsProvider.js";
import { formatUntrustedData } from "../../tools/tool.js";

export const similarityEnum = z.enum(["cosine", "euclidean", "dotProduct"]);
export type Similarity = z.infer<typeof similarityEnum>;
Expand Down Expand Up @@ -103,7 +104,34 @@ export class VectorSearchEmbeddingsManager {
return definition;
}

async findFieldsWithWrongEmbeddings(
/**
 * Runs `findFieldsWithWrongEmbeddings` over every document and throws if any
 * of them reports a field whose value is not compatible with its embedding.
 *
 * @param namespace - The database and collection the documents target.
 * @param documents - The documents to check.
 * @throws MongoDBError with `ErrorCodes.AtlasVectorSearchInvalidQuery` listing
 *   one line per offending field. Resolves without a value when nothing is reported.
 */
async assertFieldsHaveCorrectEmbeddings(
    { database, collection }: { database: string; collection: string },
    documents: Document[]
): Promise<void> {
    // All documents are checked concurrently; each check yields a list of findings.
    const findingsPerDocument = await Promise.all(
        documents.map((document) => this.findFieldsWithWrongEmbeddings({ database, collection }, document))
    );
    const mismatches = findingsPerDocument.flat();

    if (mismatches.length === 0) {
        return;
    }

    const bulletPoints = mismatches.map((mismatch) =>
        [
            `- Field ${mismatch.path} is an embedding with ${mismatch.expectedNumDimensions} dimensions and ${mismatch.expectedQuantization}`,
            ` quantization, and the provided value is not compatible. Actual dimensions: ${mismatch.actualNumDimensions}, `,
            `actual quantization: ${mismatch.actualQuantization}. Error: ${mismatch.error}`,
        ].join("")
    );

    // The findings echo document content, so they go through formatUntrustedData
    // before being flattened into the error message.
    const report = formatUntrustedData("", ...bulletPoints)
        .map((part) => part.text)
        .join("\n");

    throw new MongoDBError(ErrorCodes.AtlasVectorSearchInvalidQuery, report);
}

public async findFieldsWithWrongEmbeddings(
{
database,
collection,
Expand Down Expand Up @@ -239,21 +267,34 @@ export class VectorSearchEmbeddingsManager {
return undefined;
}

public async generateEmbeddings({
public async assertVectorSearchIndexExists({
database,
collection,
path,
rawValues,
embeddingParameters,
inputType,
}: {
database: string;
collection: string;
path: string;
}): Promise<void> {
const embeddingInfoForCollection = await this.embeddingsForNamespace({ database, collection });
const embeddingInfoForPath = embeddingInfoForCollection.find((definition) => definition.path === path);
if (!embeddingInfoForPath) {
throw new MongoDBError(
ErrorCodes.AtlasVectorSearchIndexNotFound,
`No Vector Search index found for path "${path}" in namespace "${database}.${collection}"`
);
}
}

public async generateEmbeddings({
rawValues,
embeddingParameters,
inputType,
}: {
rawValues: string[];
embeddingParameters: SupportedEmbeddingParameters;
inputType: EmbeddingParameters["inputType"];
}): Promise<unknown[]> {
}): Promise<unknown[][]> {
const provider = await this.atlasSearchEnabledProvider();
if (!provider) {
throw new MongoDBError(
Expand All @@ -275,15 +316,6 @@ export class VectorSearchEmbeddingsManager {
});
}

const embeddingInfoForCollection = await this.embeddingsForNamespace({ database, collection });
const embeddingInfoForPath = embeddingInfoForCollection.find((definition) => definition.path === path);
if (!embeddingInfoForPath) {
throw new MongoDBError(
ErrorCodes.AtlasVectorSearchIndexNotFound,
`No Vector Search index found for path "${path}" in namespace "${database}.${collection}"`
);
}

return await embeddingsProvider.embed(embeddingParameters.model, rawValues, {
inputType,
...embeddingParameters,
Expand Down
145 changes: 117 additions & 28 deletions src/tools/mongodb/create/insertMany.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@ import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js";
import { type ToolArgs, type OperationType, formatUntrustedData } from "../../tool.js";
import { zEJSON } from "../../args.js";
import { type Document } from "bson";
import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js";
import { ErrorCodes, MongoDBError } from "../../../common/errors.js";

// One loosely-typed object per document; keys are not constrained by the schema.
const zEmbeddingInputPerDocument = z
    .array(z.object({}).passthrough())
    .describe(
        "Array of objects with vector search index fields as keys (in dot notation) and the raw text values to generate embeddings for as values. The index of each object corresponds to the index of the document in the documents array."
    );

// The supported embedding parameters, extended with the per-document `input` array.
const zSupportedEmbeddingParametersWithInput = zSupportedEmbeddingParameters.extend({
    input: zEmbeddingInputPerDocument,
});

export class InsertManyTool extends MongoDBToolBase {
public name = "insert-many";
Expand All @@ -12,46 +23,44 @@ export class InsertManyTool extends MongoDBToolBase {
documents: z
Copy link
Collaborator Author

@gagik gagik Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay so I did my best to prompt engineer it to generate embeddings by supplying plain text to the vector search indexed fields as strings inside documents.
However, in my experience models get confused about this idea of "the schema says it's a vector but I need to ignore this and supply a string" and I also think relying on the description of the documents field for this is less scalable.
So, the best way I found to force the model to provide these vector index raw strings is to put them as a required input as part of embeddingParameters. It seems to do a good job at mapping the 2 arrays and their respective indexes. It does sometimes provide a string in place of the original document's field in the input, but this gets overridden, so it seems quite fine.

Open to suggestions if I may be overlooking something here.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should work fine; we have the accuracy tests if something goes awry in the future, and we can safely tweak the prompt.

.array(zEJSON().describe("An individual MongoDB document"))
.describe(
"The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()"
"The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()."
),
...(this.isFeatureEnabled("vectorSearch")
? {
embeddingParameters: zSupportedEmbeddingParametersWithInput
.optional()
.describe(
"The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one."
),
}
: {}),
};
public operationType: OperationType = "create";

protected async execute({
database,
collection,
documents,
embeddingParameters: providedEmbeddingParameters,
}: ToolArgs<typeof this.argsShape>): Promise<CallToolResult> {
const provider = await this.ensureConnected();

const embeddingValidations = new Set(
...(await Promise.all(
documents.flatMap((document) =>
this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings(
{ database, collection },
document
)
)
))
);
const embeddingParameters = this.isFeatureEnabled("vectorSearch")
? (providedEmbeddingParameters as z.infer<typeof zSupportedEmbeddingParametersWithInput>)
: undefined;

if (embeddingValidations.size > 0) {
// tell the LLM what happened
const embeddingValidationMessages = [...embeddingValidations].map(
(validation) =>
`- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` +
` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` +
`actual quantization: ${validation.actualQuantization}. Error: ${validation.error}`
);

return {
content: formatUntrustedData(
"There were errors when inserting documents. No document was inserted.",
...embeddingValidationMessages
),
isError: true,
};
}
// Process documents to replace raw string values with generated embeddings
documents = await this.replaceRawValuesWithEmbeddingsIfNecessary({
database,
collection,
documents,
embeddingParameters,
});

await this.session.vectorSearchEmbeddingsManager.assertFieldsHaveCorrectEmbeddings(
{ database, collection },
documents
Comment on lines +60 to +62
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[q] do we need to run this if embeddings are not enabled?

Copy link
Collaborator Author

@gagik gagik Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was existing behavior, so I kept it, as we would want to prevent the model from adding arbitrary data to a vector search indexed field.
that said, I'm wondering if the insert many call would have failed at DB-level anyhow; not sure if that's redundant then.
cc: @kmruiz

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So @gagik is completely right. It's to prevent the agent adding a raw field in a place where the server expects an embedding. Currently, the server does not reject these invalid values (unless the user specifies a JSON Schema) and it would break the VS Index and the behaviour can be pretty inconsistent depending on how it breaks.

);
Comment on lines +60 to +63
Copy link

Copilot AI Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This validation is performed after potentially expensive embedding generation. Consider moving this validation before the embedding generation step to fail fast and avoid unnecessary API calls when documents have invalid embeddings.

Copilot uses AI. Check for mistakes.
Copy link
Collaborator Author

@gagik gagik Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well the embeddings generation (when it happens) is relevant for asserting if the embeddings are correct, mr robot


const result = await provider.insertMany(database, collection, documents);
const content = formatUntrustedData(
Expand All @@ -63,4 +72,84 @@ export class InsertManyTool extends MongoDBToolBase {
content,
};
}

/**
 * Generates embeddings for the raw text supplied in `embeddingParameters.input`
 * and writes them into the corresponding documents.
 *
 * `input[i]` maps field paths (dot notation) to the text to embed for `documents[i]`.
 * Everything is validated before the embeddings request is made, so invalid
 * input never triggers a provider call.
 *
 * Note: the returned array is a new array, but the document objects inside it
 * are the same objects that were passed in and are modified in place.
 *
 * @returns The documents, with each listed field path replaced by its embedding.
 *   Returned unchanged when there is nothing to embed.
 * @throws MongoDBError `AtlasVectorSearchInvalidQuery` when a field has no vector
 *   search index, when a raw value is not a string, or when the field path
 *   crosses a value that is not a plain object.
 * @throws MongoDBError `Unexpected` when an input entry has no matching document
 *   or no embedding was returned for an input value.
 */
private async replaceRawValuesWithEmbeddingsIfNecessary({
    database,
    collection,
    documents,
    embeddingParameters,
}: {
    database: string;
    collection: string;
    documents: Document[];
    embeddingParameters?: z.infer<typeof zSupportedEmbeddingParametersWithInput>;
}): Promise<Document[]> {
    // If no embedding parameters or no input specified, return documents as-is
    if (!embeddingParameters?.input || embeddingParameters.input.length === 0) {
        return documents;
    }

    // Get vector search indexes for the collection
    const vectorIndexes = await this.session.vectorSearchEmbeddingsManager.embeddingsForNamespace({
        database,
        collection,
    });

    // Ensure for inputted fields, the vector search index exists.
    for (const input of embeddingParameters.input) {
        for (const fieldPath of Object.keys(input)) {
            if (!vectorIndexes.some((index) => index.path === fieldPath)) {
                throw new MongoDBError(
                    ErrorCodes.AtlasVectorSearchInvalidQuery,
                    `Field '${fieldPath}' does not have a vector search index in collection ${database}.${collection}. Only fields with vector search indexes can have embeddings generated.`
                );
            }
        }
    }

    // Flatten the per-document input into a single list so that one request covers
    // every value. Validation happens here, before any embeddings are requested.
    const flattenedEmbeddingsInput: { fieldPath: string; rawTextValue: string; documentIndex: number }[] = [];
    for (const [documentIndex, documentInput] of embeddingParameters.input.entries()) {
        for (const [fieldPath, rawTextValue] of Object.entries(documentInput)) {
            if (!documents[documentIndex]) {
                throw new MongoDBError(ErrorCodes.Unexpected, `Document at index ${documentIndex} does not exist.`);
            }
            // The input schema accepts arbitrary values, so the type is checked at runtime.
            if (typeof rawTextValue !== "string") {
                throw new MongoDBError(
                    ErrorCodes.AtlasVectorSearchInvalidQuery,
                    `The embedding input for field '${fieldPath}' of the document at index ${documentIndex} must be a string.`
                );
            }
            flattenedEmbeddingsInput.push({ fieldPath, rawTextValue, documentIndex });
        }
    }

    // Input made only of empty objects: nothing to embed, so skip the provider call.
    if (flattenedEmbeddingsInput.length === 0) {
        return documents;
    }

    // We make one call to generate embeddings for all documents at once to avoid making too many API calls.
    const generatedEmbeddings = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({
        rawValues: flattenedEmbeddingsInput.map(({ rawTextValue }) => rawTextValue),
        embeddingParameters,
        inputType: "document",
    });

    const processedDocuments: Document[] = [...documents];

    for (const [index, { fieldPath, documentIndex }] of flattenedEmbeddingsInput.entries()) {
        const targetDocument = processedDocuments[documentIndex];
        if (!targetDocument) {
            throw new MongoDBError(ErrorCodes.Unexpected, `Document at index ${documentIndex} does not exist.`);
        }

        const embedding = generatedEmbeddings[index];
        if (embedding === undefined) {
            throw new MongoDBError(
                ErrorCodes.Unexpected,
                `No embedding was generated for field '${fieldPath}' of the document at index ${documentIndex}.`
            );
        }

        // Remove whatever the caller placed at the nested location, then write the
        // embedding at that same nested location.
        this.deleteFieldPath(targetDocument, fieldPath);
        this.setEmbeddingAtFieldPath(targetDocument, fieldPath, embedding);
    }

    return processedDocuments;
}

// Writes `value` at `fieldPath` (dot notation), creating missing intermediate objects.
private setEmbeddingAtFieldPath(document: Record<string, unknown>, fieldPath: string, value: unknown): void {
    const parents = fieldPath.split(".");
    const leaf = parents.pop();
    if (leaf === undefined) {
        return;
    }

    if (parents.length > 0) {
        // A literal top-level key such as "a.b" is a different field from the nested
        // path a -> b; drop it so raw text supplied under that key is not inserted.
        delete document[fieldPath];
    }

    let current = document;
    for (const part of parents) {
        const next = current[part];
        if (next === undefined || next === null) {
            const created: Record<string, unknown> = {};
            current[part] = created;
            current = created;
        } else if (typeof next === "object" && !Array.isArray(next)) {
            current = next as Record<string, unknown>;
        } else {
            // Refuse to overwrite an existing scalar or array with an object.
            throw new MongoDBError(
                ErrorCodes.AtlasVectorSearchInvalidQuery,
                `Cannot write the embedding for field '${fieldPath}': '${part}' is not an object in the provided document.`
            );
        }
    }

    current[leaf] = value;
}

/**
 * Delete a specified field path from a document using dot notation.
 *
 * Does nothing when any intermediate segment is missing or is not an object.
 * The final segment is removed whatever its value is, including falsy values
 * such as `""`, `0`, `false` or `null`.
 *
 * @param document - The document to modify in place.
 * @param fieldPath - Dot-separated path of the field to remove, e.g. `"a.b.c"`.
 */
private deleteFieldPath(document: Record<string, unknown>, fieldPath: string): void {
    const parents = fieldPath.split(".");
    const leaf = parents.pop();
    if (leaf === undefined) {
        return;
    }

    let current: Record<string, unknown> = document;
    for (const part of parents) {
        const next = current[part];
        // Only objects can be descended into; anything else means the path is absent.
        if (typeof next !== "object" || next === null) {
            return;
        }
        current = next as Record<string, unknown>;
    }

    // No truthiness check here: an empty string or 0 at the leaf must still be removed.
    delete current[leaf];
}
}
19 changes: 17 additions & 2 deletions src/tools/mongodb/read/aggregate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -276,22 +276,37 @@ export class AggregateTool extends MongoDBToolBase {
const embeddingParameters = vectorSearchStage.embeddingParameters;
delete vectorSearchStage.embeddingParameters;

const [embeddings] = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({
await this.session.vectorSearchEmbeddingsManager.assertVectorSearchIndexExists({
database,
collection,
path: vectorSearchStage.path,
});

const [embeddings] = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({
rawValues: [vectorSearchStage.queryVector],
embeddingParameters,
inputType: "query",
});

if (!embeddings) {
throw new MongoDBError(
ErrorCodes.AtlasVectorSearchInvalidQuery,
"Failed to generate embeddings for the query vector."
);
}

// $vectorSearch.queryVector can be a BSON.Binary, i.e. a value that is neither a number nor an array.
// It's not exactly valid from the LLM perspective (they can't provide binaries).
// That's why we overwrite the stage in an untyped way, as what we expose and what LLMs can use is different.
vectorSearchStage.queryVector = embeddings as number[];
vectorSearchStage.queryVector = embeddings as string | number[];
}
}

await this.session.vectorSearchEmbeddingsManager.assertFieldsHaveCorrectEmbeddings(
{ database, collection },
pipeline
);

return pipeline;
}

Expand Down
Loading
Loading