- 
                Notifications
    You must be signed in to change notification settings 
- Fork 151
feat: add ability to create atlas search indexes MCP-275 #692
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -6,61 +6,121 @@ import type { IndexDirection } from "mongodb"; | |
| import { quantizationEnum, similarityEnum } from "../../../common/search/vectorSearchEmbeddingsManager.js"; | ||
|  | ||
| export class CreateIndexTool extends MongoDBToolBase { | ||
| private vectorSearchIndexDefinition = z.object({ | ||
| type: z.literal("vectorSearch"), | ||
| fields: z | ||
| .array( | ||
| z.discriminatedUnion("type", [ | ||
| z | ||
| .object({ | ||
| type: z.literal("filter"), | ||
| path: z | ||
| .string() | ||
| .describe( | ||
| "Name of the field to index. For nested fields, use dot notation to specify path to embedded fields" | ||
| ), | ||
| }) | ||
| .strict() | ||
| .describe("Definition for a field that will be used for pre-filtering results."), | ||
| z | ||
| .object({ | ||
| type: z.literal("vector"), | ||
| path: z | ||
| .string() | ||
| .describe( | ||
| "Name of the field to index. For nested fields, use dot notation to specify path to embedded fields" | ||
| ), | ||
| numDimensions: z | ||
| .number() | ||
| .min(1) | ||
| .max(8192) | ||
| .default(this.config.vectorSearchDimensions) | ||
| .describe( | ||
| "Number of vector dimensions that MongoDB Vector Search enforces at index-time and query-time" | ||
| ), | ||
| similarity: similarityEnum | ||
| .default(this.config.vectorSearchSimilarityFunction) | ||
| .describe( | ||
| "Vector similarity function to use to search for top K-nearest neighbors. You can set this field only for vector-type fields." | ||
| ), | ||
| quantization: quantizationEnum | ||
| .default("none") | ||
| private vectorSearchIndexDefinition = z | ||
| .object({ | ||
| type: z.literal("vectorSearch"), | ||
| fields: z | ||
| .array( | ||
| z.discriminatedUnion("type", [ | ||
| z | ||
| .object({ | ||
| type: z.literal("filter"), | ||
| path: z | ||
| .string() | ||
| .describe( | ||
| "Name of the field to index. For nested fields, use dot notation to specify path to embedded fields" | ||
| ), | ||
| }) | ||
| .strict() | ||
| .describe("Definition for a field that will be used for pre-filtering results."), | ||
| z | ||
| .object({ | ||
| type: z.literal("vector"), | ||
| path: z | ||
| .string() | ||
| .describe( | ||
| "Name of the field to index. For nested fields, use dot notation to specify path to embedded fields" | ||
| ), | ||
| numDimensions: z | ||
| .number() | ||
| .min(1) | ||
| .max(8192) | ||
| .default(this.config.vectorSearchDimensions) | ||
| .describe( | ||
| "Number of vector dimensions that MongoDB Vector Search enforces at index-time and query-time" | ||
| ), | ||
| similarity: similarityEnum | ||
| .default(this.config.vectorSearchSimilarityFunction) | ||
| .describe( | ||
| "Vector similarity function to use to search for top K-nearest neighbors. You can set this field only for vector-type fields." | ||
| ), | ||
| quantization: quantizationEnum | ||
| .default("none") | ||
| .describe( | ||
| "Type of automatic vector quantization for your vectors. Use this setting only if your embeddings are float or double vectors." | ||
| ), | ||
| }) | ||
| .strict() | ||
| .describe("Definition for a field that contains vector embeddings."), | ||
| ]) | ||
| ) | ||
| .nonempty() | ||
| .refine((fields) => fields.some((f) => f.type === "vector"), { | ||
| message: "At least one vector field must be defined", | ||
| }) | ||
| .describe( | ||
| "Definitions for the vector and filter fields to index, one definition per document. You must specify `vector` for fields that contain vector embeddings and `filter` for additional fields to filter on. At least one vector-type field definition is required." | ||
| ), | ||
| }) | ||
| .describe("Definition for a Vector Search index."); | ||
|  | ||
| private atlasSearchIndexDefinition = z | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why aren't we supporting custom analyzers? https://www.mongodb.com/docs/atlas/atlas-search/analyzers/custom/ | ||
| .object({ | ||
| type: z.literal("search"), | ||
| analyzer: z | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably this should be an enum of the analyzers. | ||
| .string() | ||
| .optional() | ||
| .default("lucene.standard") | ||
| .describe( | ||
| "The analyzer to use for the index. Can be one of the built-in lucene analyzers (`lucene.standard`, `lucene.simple`, `lucene.whitespace`, `lucene.keyword`), a language-specific analyzer, such as `lucene.cjk` or `lucene.czech`, or a custom analyzer defined in the Atlas UI." | ||
| ), | ||
| mappings: z | ||
| .object({ | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lack support of: 
 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could say that custom analyzers are not that important, but storedSources is actually relevant most of the time. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The args shape is based on the POC the search team did for index support, and I was going off of the assumption that they've selected the fields that they see the most value in exposing to LLMs. I realize there's a lot more configuration that's possible, I'm just not sure how much of that is stuff we expect agents to configure vs an actual human who wants to fine-tune the index. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A POC to assess the feasibility of creating search indexes and production code are likely to have different requirements. | ||
| dynamic: z | ||
| .boolean() | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dynamic can be an object of typeSets. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is in preview though, so I don't expect there are sufficient docs or training data for general-purpose models to accurately choose which one to use. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The preview is for vector search though; FTS is explicitly out of scope of the vector search project. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The  | ||
| .optional() | ||
| .default(false) | ||
| .describe( | ||
| "Enables or disables dynamic mapping of fields for this index. If set to true, Atlas Search recursively indexes all dynamically indexable fields. If set to false, you must specify individual fields to index using mappings.fields." | ||
| ), | ||
| fields: z | ||
| .record( | ||
| z.string().describe("The field name"), | ||
| z | ||
| .object({ | ||
| type: z | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Objects will require additional fields depending on the type. I know passthrough will keep them, but we should document them so the agent knows which ones to use and how. For example, autocomplete supports defining a custom analyzer, how to tokenize (which is really important) and similarity functions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The exact shape is extremely complex to represent in a JSON schema. I'm worried that being overly specific will result in this being more harmful than helpful, especially if we expect the majority of the use cases to revolve around just specifying the type. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, the schema is complicated; it has a lot of options that are not even compatible with each other. We should have proper documentation of which ones we want to expose and which ones not, something that we haven't discussed yet because supporting the most used bits of Atlas Search is already a substantial effort. | ||
| .enum([ | ||
| "autocomplete", | ||
| "boolean", | ||
| "date", | ||
| "document", | ||
| "embeddedDocuments", | ||
| "geo", | ||
| "number", | ||
| "objectId", | ||
| "string", | ||
| "token", | ||
| "uuid", | ||
| ]) | ||
| .describe("The field type"), | ||
| }) | ||
| .passthrough() | ||
| .describe( | ||
| "Type of automatic vector quantization for your vectors. Use this setting only if your embeddings are float or double vectors." | ||
| ), | ||
| }) | ||
| .strict() | ||
| .describe("Definition for a field that contains vector embeddings."), | ||
| ]) | ||
| ) | ||
| .nonempty() | ||
| .refine((fields) => fields.some((f) => f.type === "vector"), { | ||
| message: "At least one vector field must be defined", | ||
| }) | ||
| .describe( | ||
| "Definitions for the vector and filter fields to index, one definition per document. You must specify `vector` for fields that contain vector embeddings and `filter` for additional fields to filter on. At least one vector-type field definition is required." | ||
| ), | ||
| }); | ||
| "The field index definition. It must contain the field type, as well as any additional options for that field type." | ||
| ) | ||
| ) | ||
| .optional() | ||
| .describe("The field mapping definitions. If `dynamic` is set to `false`, this is required."), | ||
| }) | ||
| .refine((data) => data.dynamic !== !!(data.fields && Object.keys(data.fields).length > 0), { | ||
|         
                  nirinchev marked this conversation as resolved.
              Show resolved
            Hide resolved         
                  nirinchev marked this conversation as resolved.
              Show resolved
            Hide resolved | ||
| message: | ||
| "Either `dynamic` must be `true` and `fields` empty or `dynamic` must be `false` and at least one field must be defined in `fields`", | ||
| }) | ||
| .describe( | ||
| "Document describing the index to create. Either `dynamic` must be `true` and `fields` empty or `dynamic` must be `false` and at least one field must be defined in the `fields` document." | ||
| ), | ||
| }) | ||
| .describe("Definition for an Atlas Search (lexical) index."); | ||
|  | ||
| public name = "create-index"; | ||
| protected description = "Create an index for a collection"; | ||
|  | @@ -70,15 +130,19 @@ export class CreateIndexTool extends MongoDBToolBase { | |
| definition: z | ||
| .array( | ||
| z.discriminatedUnion("type", [ | ||
| z.object({ | ||
| type: z.literal("classic"), | ||
| keys: z.object({}).catchall(z.custom<IndexDirection>()).describe("The index definition"), | ||
| }), | ||
| ...(this.isFeatureEnabled("vectorSearch") ? [this.vectorSearchIndexDefinition] : []), | ||
| z | ||
| .object({ | ||
| type: z.literal("classic"), | ||
| keys: z.object({}).catchall(z.custom<IndexDirection>()).describe("The index definition"), | ||
| }) | ||
| .describe("Definition for a MongoDB index (e.g. ascending/descending/geospatial)."), | ||
| ...(this.isFeatureEnabled("vectorSearch") | ||
| ? [this.vectorSearchIndexDefinition, this.atlasSearchIndexDefinition] | ||
| : []), | ||
| ]) | ||
| ) | ||
| .describe( | ||
| "The index definition. Use 'classic' for standard indexes and 'vectorSearch' for vector search indexes" | ||
| `The index definition.${this.isFeatureEnabled("vectorSearch") ? " Use 'classic' for standard indexes, 'vectorSearch' for vector search indexes, and 'search' for Atlas Search (lexical) indexes." : ""}` | ||
| ), | ||
| }; | ||
|  | ||
|  | @@ -128,6 +192,25 @@ export class CreateIndexTool extends MongoDBToolBase { | |
| this.session.vectorSearchEmbeddingsManager.cleanupEmbeddingsForNamespace({ database, collection }); | ||
| } | ||
|  | ||
| break; | ||
| case "search": | ||
| { | ||
| await this.ensureSearchIsSupported(); | ||
| indexes = await provider.createSearchIndexes(database, collection, [ | ||
| { | ||
| name, | ||
| definition: { | ||
| mappings: definition.mappings, | ||
| analyzer: definition.analyzer, | ||
| }, | ||
| type: "search", | ||
| }, | ||
| ]); | ||
|  | ||
| responseClarification = | ||
| " Since this is a search index, it may take a while for the index to build. Use the `list-indexes` tool to check the index status."; | ||
| } | ||
|  | ||
| break; | ||
| } | ||
|  | ||
|  | ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -91,7 +91,7 @@ export class AccuracyTestingClient { | |
| return [`--${key}`, value]; | ||
| }); | ||
|  | ||
| const args = [MCP_SERVER_CLI_SCRIPT, "--connectionString", mdbConnectionString, ...additionalArgs]; | ||
| const args = [MCP_SERVER_CLI_SCRIPT, mdbConnectionString, ...additionalArgs]; | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 
 | ||
|  | ||
| const clientTransport = new StdioClientTransport({ | ||
| command: process.execPath, | ||
|  | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is best viewed with the "Hide whitespace" option - it's just prettier reformatting the indents.