
Commit 534d860

Python: Introducing AzureCosmosDBforMongoDB store and collection (#10609)
### Motivation and Context

Adds Azure Cosmos DB for MongoDB support using the new vector store model. It builds on the MongoDB Atlas integration, replacing the index creation and settings to make it work.

Closes #6836

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
1 parent 4c91cfd commit 534d860

25 files changed: +903 / -360 lines
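For readers who want to try the new connector in isolation, here is a hypothetical end-to-end sketch, not part of this commit. It mirrors the pattern used in `complex_memory.py` below; the `Snippet` data model, the `"snippets"` collection name, and the small vector dimensions are illustrative, and the connection string / database name are assumed to come from the connector's environment-based settings.

```python
import asyncio
from dataclasses import dataclass, field
from typing import Annotated
from uuid import uuid4

from semantic_kernel.connectors.memory.azure_cosmos_db import AzureCosmosDBforMongoDBCollection
from semantic_kernel.data import (
    VectorStoreRecordDataField,
    VectorStoreRecordKeyField,
    VectorStoreRecordVectorField,
    vectorstoremodel,
)


# Minimal record definition; field names and dimensions are illustrative only.
@vectorstoremodel
@dataclass
class Snippet:
    id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
    content: Annotated[str, VectorStoreRecordDataField(is_full_text_searchable=True)] = ""
    vector: Annotated[list[float] | None, VectorStoreRecordVectorField(dimensions=4)] = None


async def main() -> None:
    # Used as an async context manager, as in the sample below, so the underlying
    # client is cleaned up when the block exits. Connection settings are assumed
    # to be resolved from the environment.
    async with AzureCosmosDBforMongoDBCollection(
        data_model_type=Snippet,
        collection_name="snippets",
    ) as collection:
        await collection.create_collection_if_not_exists()
        keys = await collection.upsert_batch(
            [Snippet(content="Semantic Kernel is available in Python.", vector=[0.1, 0.2, 0.3, 0.4])]
        )
        print(keys)


if __name__ == "__main__":
    asyncio.run(main())
```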

python/samples/concepts/README.md

Lines changed: 5 additions & 4 deletions
@@ -85,12 +85,13 @@
 
 - [Setup Logging](./logging/setup_logging.py)
 
-### Memory - Using [`Memory`](https://github.com/microsoft/semantic-kernel/tree/main/dotnet/src/SemanticKernel.Abstractions/Memory) AI concepts
+### Memory - Using [`Memory`](https://learn.microsoft.com/en-us/semantic-kernel/concepts/vector-store-connectors/?pivots=programming-language-python) AI concepts
 
-- [Azure Cognitive Search Memory](./memory/azure_cognitive_search_memory.py)
+- [Simple Memory](./memory/simple_memory.py)
 - [Memory Data Models](./memory/data_models.py)
-- [New Memory](./memory/new_memory.py)
-- [Pandas Memory](./memory/pandas_memory.py)
+- [Memory with Pandas Dataframes](./memory/memory_with_pandas.py)
+- [Complex memory](./memory/complex_memory.py)
+- [Full sample with Azure AI Search including function calling](./memory/azure_ai_search_hotel_samples/README.md)
 
 ### Model-as-a-Service - Using models deployed as [`serverless APIs on Azure AI Studio`](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-serverless?tabs=azure-ai-studio) to benchmark model performance against open-source datasets
 

python/samples/concepts/memory/azure_cognitive_search_memory.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

python/samples/concepts/memory/new_memory.py renamed to python/samples/concepts/memory/complex_memory.py

Lines changed: 86 additions & 68 deletions
@@ -4,32 +4,37 @@
 import asyncio
 from collections.abc import Callable
 from dataclasses import dataclass, field
-from typing import Annotated
+from typing import Annotated, Literal
 from uuid import uuid4
 
 import numpy as np
 
+from samples.concepts.memory.utils import print_record
+from samples.concepts.resources.utils import Colors, print_with_color
 from semantic_kernel import Kernel
 from semantic_kernel.connectors.ai.open_ai import (
     AzureTextEmbedding,
     OpenAIEmbeddingPromptExecutionSettings,
     OpenAITextEmbedding,
 )
 from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
-from semantic_kernel.connectors.memory.azure_cosmos_db import AzureCosmosDBNoSQLCollection
+from semantic_kernel.connectors.memory.azure_cosmos_db import (
+    AzureCosmosDBforMongoDBCollection,
+    AzureCosmosDBNoSQLCollection,
+)
 from semantic_kernel.connectors.memory.in_memory import InMemoryVectorCollection
 from semantic_kernel.connectors.memory.postgres import PostgresCollection
 from semantic_kernel.connectors.memory.qdrant import QdrantCollection
 from semantic_kernel.connectors.memory.redis import RedisHashsetCollection, RedisJsonCollection
 from semantic_kernel.connectors.memory.weaviate import WeaviateCollection
 from semantic_kernel.data import (
+    DISTANCE_FUNCTION_DIRECTION_HELPER,
     DistanceFunction,
     IndexKind,
     VectorizableTextSearchMixin,
     VectorizedSearchMixin,
     VectorSearchFilter,
     VectorSearchOptions,
-    VectorSearchResult,
     VectorStoreRecordCollection,
     VectorStoreRecordDataField,
     VectorStoreRecordKeyField,
@@ -39,40 +44,48 @@
     vectorstoremodel,
 )
 
+# This is a rather complex sample, showing how to use the vector store
+# with a number of different collections.
+# It also shows how to use the vector store with a number of different data models.
+# It also uses all the types of search available in the vector store.
+# For a simpler example, see "simple_memory.py"
 
-def get_data_model_array(index_kind: IndexKind, distance_function: DistanceFunction) -> type:
-    @vectorstoremodel
-    @dataclass
-    class DataModelArray:
-        vector: Annotated[
-            np.ndarray | None,
-            VectorStoreRecordVectorField(
-                embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)},
-                index_kind=index_kind,
-                dimensions=1536,
-                distance_function=distance_function,
-                property_type="float",
-                serialize_function=np.ndarray.tolist,
-                deserialize_function=np.array,
-            ),
-        ] = None
-        id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
-        content: Annotated[
-            str,
-            VectorStoreRecordDataField(
-                has_embedding=True,
-                embedding_property_name="vector",
-                property_type="str",
-                is_full_text_searchable=True,
-            ),
-        ] = "content1"
-        title: Annotated[str, VectorStoreRecordDataField(property_type="str", is_full_text_searchable=True)] = "title"
-        tag: Annotated[str, VectorStoreRecordDataField(property_type="str", is_filterable=True)] = "tag"
 
-    return DataModelArray
+def get_data_model(type: Literal["array", "list"], index_kind: IndexKind, distance_function: DistanceFunction) -> type:
+    if type == "array":
 
+        @vectorstoremodel
+        @dataclass
+        class DataModelArray:
+            vector: Annotated[
+                np.ndarray | None,
+                VectorStoreRecordVectorField(
+                    embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)},
+                    index_kind=index_kind,
+                    dimensions=1536,
+                    distance_function=distance_function,
+                    property_type="float",
+                    serialize_function=np.ndarray.tolist,
+                    deserialize_function=np.array,
+                ),
+            ] = None
+            id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
+            content: Annotated[
+                str,
+                VectorStoreRecordDataField(
+                    has_embedding=True,
+                    embedding_property_name="vector",
+                    property_type="str",
+                    is_full_text_searchable=True,
+                ),
+            ] = "content1"
+            title: Annotated[str, VectorStoreRecordDataField(property_type="str", is_full_text_searchable=True)] = (
+                "title"
+            )
+            tag: Annotated[str, VectorStoreRecordDataField(property_type="str", is_filterable=True)] = "tag"
+
+        return DataModelArray
 
-def get_data_model_list(index_kind: IndexKind, distance_function: DistanceFunction) -> type:
     @vectorstoremodel
     @dataclass
     class DataModelList:
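Since the factory now takes a `Literal["array", "list"]` selector, the list-backed model (whose fields fall outside this hunk) can presumably be requested the same way; illustrative call only:

```python
# Illustrative only; the "list" branch of get_data_model is not shown in this hunk.
ListModel = get_data_model("list", IndexKind.HNSW, DistanceFunction.COSINE_SIMILARITY)
```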
@@ -103,9 +116,10 @@ class DataModelList:
 
 
 collection_name = "test"
+distance_function = DistanceFunction.COSINE_SIMILARITY
 # Depending on the vector database, the index kind and distance function may need to be adjusted,
 # since not all combinations are supported by all databases.
-DataModel = get_data_model_array(IndexKind.HNSW, DistanceFunction.COSINE_SIMILARITY)
+DataModel = get_data_model("array", IndexKind.IVF_FLAT, distance_function)
 
 # A list of VectorStoreRecordCollection that can be used.
 # Available collections are:
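As the comment says, this single factory call is the only place that needs changing when a store does not support the chosen index kind / distance function pair; for example, the Azure Cosmos DB NoSQL note in the next hunk suggests something like the following (illustrative only):

```python
# Illustrative adjustment for Azure Cosmos DB NoSQL, per the QUANTIZED_FLAT note below.
DataModel = get_data_model("array", IndexKind.QUANTIZED_FLAT, distance_function)
```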
@@ -124,6 +138,8 @@ class DataModelList:
 # https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-develop-emulator?tabs=windows%2Cpython&pivots=api-nosql
 # Please see the link above to learn how to set up the Azure Cosmos NoSQL emulator on your machine.
 # For this sample to work with Azure Cosmos NoSQL, please adjust the index_kind of the data model to QUANTIZED_FLAT.
+# - azure_cosmos_mongodb: Azure Cosmos MongoDB
+#   https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/introduction
 # This is represented as a mapping from the collection name to a
 # function which returns the collection.
 # Using a function allows for lazy initialization of the collection,
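The comment block above describes the registry pattern: a dict from collection name to a zero-argument callable, so no client is constructed until the selected entry is invoked. A trimmed, hypothetical version showing only the new entry (constructor arguments taken from the next hunk; `DataModel` and `collection_name` are the sample's module-level names):

```python
from collections.abc import Callable

from semantic_kernel.data import VectorStoreRecordCollection

collections: dict[str, Callable[[], VectorStoreRecordCollection]] = {
    # Lazy: the MongoDB client is only built when the lambda is called.
    "azure_cosmos_mongodb": lambda: AzureCosmosDBforMongoDBCollection(
        data_model_type=DataModel,
        collection_name=collection_name,
    ),
}

record_collection = collections["azure_cosmos_mongodb"]()  # client created here
```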
@@ -162,29 +178,22 @@ class DataModelList:
         collection_name=collection_name,
         create_database=True,
     ),
+    "azure_cosmos_mongodb": lambda: AzureCosmosDBforMongoDBCollection(
+        data_model_type=DataModel,
+        collection_name=collection_name,
+    ),
 }
 
 
-def print_record(result: VectorSearchResult | None = None, record: DataModel | None = None):
-    if result:
-        record = result.record
-    print(f" Found id: {record.id}")
-    print(f" Content: {record.content}")
-    if record.vector is not None:
-        print(f" Vector (first five): {record.vector[:5]}")
-
-
-async def main(collection: str, use_azure_openai: bool, embedding_model: str):
+async def main(collection: str, use_azure_openai: bool):
     print("-" * 30)
     kernel = Kernel()
-    service_id = "embedding"
-    if use_azure_openai:
-        embedder = AzureTextEmbedding(service_id=service_id, deployment_name=embedding_model)
-    else:
-        embedder = OpenAITextEmbedding(service_id=service_id, ai_model_id=embedding_model)
+    embedder = (
+        AzureTextEmbedding(service_id="embedding") if use_azure_openai else OpenAITextEmbedding(service_id="embedding")
+    )
     kernel.add_service(embedder)
     async with collections[collection]() as record_collection:
-        print(f"Creating {collection} collection!")
+        print_with_color(f"Creating {collection} collection!", Colors.CGREY)
         await record_collection.delete_collection()
         await record_collection.create_collection_if_not_exists()
 
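With the `--model` flag removed (see the final hunk), the embedding connectors are now expected to pick up their model or deployment from their own settings. If explicit configuration is wanted, the argument names from the removed lines above could presumably still be used:

```python
# Illustrative only; deployment_name / ai_model_id are taken from the removed lines
# above, and "text-embedding-3-small" from the removed --model default.
embedder = (
    AzureTextEmbedding(service_id="embedding", deployment_name="text-embedding-3-small")
    if use_azure_openai
    else OpenAITextEmbedding(service_id="embedding", ai_model_id="text-embedding-3-small")
)
```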
@@ -200,16 +209,22 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
             title="Semantic Kernel Languages",
             tag="general",
         )
+        record3 = DataModel(
+            content="```python\nfrom semantic_kernel import Kernel\nkernel = Kernel()\n```",
+            id="d5c9913a-e015-4944-b960-5d4a84bca002",
+            title="Code sample",
+            tag="code",
+        )
 
-        print("Adding records!")
+        print_with_color("Adding records!", Colors.CBLUE)
         records = await VectorStoreRecordUtils(kernel).add_vector_to_records(
-            [record1, record2], data_model_type=DataModel
+            [record1, record2, record3], data_model_type=DataModel
         )
 
         keys = await record_collection.upsert_batch(records)
         print(f" Upserted {keys=}")
-        print("Getting records!")
-        results = await record_collection.get_batch([record1.id, record2.id])
+        print_with_color("Getting records!", Colors.CBLUE)
+        results = await record_collection.get_batch([record1.id, record2.id, record3.id])
         if results:
             [print_record(record=result) for result in results]
         else:
@@ -219,9 +234,11 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
             include_vectors=True,
             filter=VectorSearchFilter.equal_to("tag", "general"),
         )
+        print("-" * 30)
+        print_with_color("Searching for 'python', with filter 'tag == general'", Colors.CBLUE)
         if isinstance(record_collection, VectorTextSearchMixin):
             print("-" * 30)
-            print("Using text search")
+            print_with_color("Using text search", Colors.CBLUE)
             try:
                 search_results = await record_collection.text_search("python", options)
                 if search_results.total_count == 0:
@@ -232,14 +249,16 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
                 print("Text search could not execute.")
         if isinstance(record_collection, VectorizedSearchMixin):
             print("-" * 30)
-            print(
-                "Using vectorized search, depending on the distance function, "
-                "the better score might be higher or lower."
+            print_with_color(
+                f"Using vectorized search, for {distance_function.value}, "
+                f"the {'higher' if DISTANCE_FUNCTION_DIRECTION_HELPER[distance_function](1, 0) else 'lower'} the score the better"  # noqa: E501
+                f"",
+                Colors.CBLUE,
             )
             try:
                 search_results = await record_collection.vectorized_search(
                     vector=(await embedder.generate_raw_embeddings(["python"]))[0],
-                    options=VectorSearchOptions(vector_field_name="vector", include_vectors=True),
+                    options=options,
                 )
                 if search_results.total_count == 0:
                     print("\nNothing found...\n")
@@ -249,7 +268,11 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
                 print("Vectorized search could not execute.")
         if isinstance(record_collection, VectorizableTextSearchMixin):
             print("-" * 30)
-            print("Using vectorizable text search")
+            print_with_color(
+                f"Using vectorized search, for {distance_function.value}, "
+                f"the {'higher' if DISTANCE_FUNCTION_DIRECTION_HELPER[distance_function](1, 0) else 'lower'} the score the better",  # noqa: E501
+                Colors.CBLUE,
+            )
             try:
                 search_results = await record_collection.vectorizable_text_search("python", options)
                 if search_results.total_count == 0:
@@ -259,9 +282,9 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
             except Exception:
                 print("Vectorizable text search could not execute.")
         print("-" * 30)
-        print("Deleting collection!")
+        print_with_color("Deleting collection!", Colors.CBLUE)
         await record_collection.delete_collection()
-        print("Done!")
+        print_with_color("Done!", Colors.CGREY)
 
 
 if __name__ == "__main__":
@@ -271,10 +294,5 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
     parser.add_argument("--collection", default="in_memory", choices=collections.keys(), help="What collection to use.")
     # Option of whether to use OpenAI or Azure OpenAI.
     parser.add_argument("--use-azure-openai", action="store_true", help="Use Azure OpenAI instead of OpenAI.")
-    # Model
-    parser.add_argument(
-        "--model", default="text-embedding-3-small", help="The model or deployment to use for embeddings."
-    )
     args = parser.parse_args()
-
-    asyncio.run(main(collection=args.collection, use_azure_openai=args.use_azure_openai, embedding_model=args.model))
+    asyncio.run(main(collection=args.collection, use_azure_openai=args.use_azure_openai))
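Based on the argparse choices above, the new store should be selectable by running the sample with `--collection azure_cosmos_mongodb`, optionally adding `--use-azure-openai` to embed via Azure OpenAI.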
