Python: Introducing Vector Search to Weaviate and adding the ability …

…to have unnamed vectors (#9684) ### Motivation and Context  This PR adds vector search to Weaviate. All three types of search are supported; however, `vectorizable_text_search` depends on a setup outside SK. It also adds a parameter to the Weaviate Collection called 'named_vectors', which defaults to True. When set to False, it uses unnamed vectors instead of named ones. Because of this, there is a slight difference in how vectors are represented to Weaviate, which might be breaking. ### Description  The breaking change is that vectors were previously keyed by the name of the content field they vectorized. For instance, with a data model like this (some of the class names are shortened): ```python class DataModel: content: Annotated[str, DataField(embedding_property_name="vector")] vector: Annotated[list[float], VectorField()] ``` This was previously set as `DataObject(vector: {"content": [vector content]})`, while it is now set as `DataObject(vector: {"vector": [vector content]})`, where DataObject is the Weaviate object used to insert items. Closes #6839 ### Contribution Checklist  - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [ ] I didn't break anyone 😄
microsoft · Nov 15, 2024 · a83abb9 · a83abb9
1 parent 5d8d738
commit a83abb9
Show file tree

Hide file tree

Showing 12 changed files with 892 additions and 490 deletions.
diff --git a/.github/workflows/python-integration-tests.yml b/.github/workflows/python-integration-tests.yml
@@ -173,7 +173,7 @@ jobs:
           subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
       - name: Run Integration Tests - Completions
         id: run_tests_completions
-        timeout-minutes: 10
+        timeout-minutes: 15
         shell: bash
         run: |
           uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/completions -v --junitxml=pytest-completions.xml
@@ -185,7 +185,7 @@ jobs:
           uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/embeddings -v --junitxml=pytest-embeddings.xml
       - name: Run Integration Tests - Memory
         id: run_tests_memory
-        timeout-minutes: 5
+        timeout-minutes: 10
         shell: bash
         run: |
           uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/memory -v --junitxml=pytest-memory.xml

diff --git a/python/samples/concepts/memory/new_memory.py b/python/samples/concepts/memory/new_memory.py
@@ -30,7 +30,11 @@
     vectorstoremodel,
 )
 from semantic_kernel.data.const import DistanceFunction, IndexKind
+from semantic_kernel.data.vector_search.vector_search_filter import VectorSearchFilter
 from semantic_kernel.data.vector_search.vector_search_options import VectorSearchOptions
+from semantic_kernel.data.vector_search.vector_search_result import VectorSearchResult
+from semantic_kernel.data.vector_search.vector_text_search import VectorTextSearchMixin
+from semantic_kernel.data.vector_search.vectorizable_text_search import VectorizableTextSearchMixin
 from semantic_kernel.data.vector_search.vectorized_search import VectorizedSearchMixin
 
 
@@ -50,11 +54,18 @@ class DataModelArray:
                 deserialize_function=np.array,
             ),
         ] = None
-        other: str | None = None
         id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
         content: Annotated[
-            str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
+            str,
+            VectorStoreRecordDataField(
+                has_embedding=True,
+                embedding_property_name="vector",
+                property_type="str",
+                is_full_text_searchable=True,
+            ),
         ] = "content1"
+        title: Annotated[str, VectorStoreRecordDataField(property_type="str", is_full_text_searchable=True)] = "title1"
+        tag: Annotated[str, VectorStoreRecordDataField(property_type="str", is_filterable=True)] = "tag1"
 
     return DataModelArray
 
@@ -73,19 +84,26 @@ class DataModelList:
                 property_type="float",
             ),
         ] = None
-        other: str | None = None
         id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
         content: Annotated[
-            str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
+            str,
+            VectorStoreRecordDataField(
+                has_embedding=True,
+                embedding_property_name="vector",
+                property_type="str",
+                is_full_text_searchable=True,
+            ),
         ] = "content1"
+        title: Annotated[str, VectorStoreRecordDataField(property_type="str", is_full_text_searchable=True)] = "title1"
+        tag: Annotated[str, VectorStoreRecordDataField(property_type="str", is_filterable=True)] = "tag1"
 
     return DataModelList
 
 
 collection_name = "test"
 # Depending on the vector database, the index kind and distance function may need to be adjusted,
 # since not all combinations are supported by all databases.
-DataModel = get_data_model_array(IndexKind.HNSW, DistanceFunction.COSINE_SIMILARITY)
+DataModel = get_data_model_array(IndexKind.HNSW, DistanceFunction.COSINE_DISTANCE)
 
 # A list of VectorStoreRecordCollection that can be used.
 # Available collections are:
@@ -133,7 +151,7 @@ class DataModelList:
         data_model_type=DataModel,
         collection_name=collection_name,
     ),
-    "weaviate": lambda: WeaviateCollection[DataModel](
+    "weaviate": lambda: WeaviateCollection[str, DataModel](
         data_model_type=DataModel,
         collection_name=collection_name,
     ),
@@ -146,6 +164,18 @@ class DataModelList:
 }
 
 
+def print_record(result: VectorSearchResult | None = None, record: DataModel | None = None):
+    if result:
+        record = result.record
+    print(f"  Found id: {record.id}")
+    print(f"    Content: {record.content}")
+    if record.vector is not None:
+        print(f"    Vector (first five): {record.vector[:5]}")
+    if result:
+        print(f"  Score: {result.score:.4f}")
+    print()
+
+
 async def main(collection: str, use_azure_openai: bool, embedding_model: str):
     print("-" * 30)
     kernel = Kernel()
@@ -157,12 +187,20 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
     kernel.add_service(embedder)
     async with collections[collection]() as record_collection:
         print(f"Creating {collection} collection!")
+        await record_collection.delete_collection()
         await record_collection.create_collection_if_not_exists()
 
-        record1 = DataModel(content="Semantic Kernel is awesome", id="e6103c03-487f-4d7d-9c23-4723651c17f4")
+        record1 = DataModel(
+            content="Semantic Kernel is awesome",
+            id="e6103c03-487f-4d7d-9c23-4723651c17f4",
+            title="Overview",
+            tag="general",
+        )
         record2 = DataModel(
             content="Semantic Kernel is available in dotnet, python and Java.",
             id="09caec77-f7e1-466a-bcec-f1d51c5b15be",
+            title="Semantic Kernel Languages",
+            tag="general",
         )
 
         print("Adding records!")
@@ -174,29 +212,53 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
         print("Getting records!")
         results = await record_collection.get_batch([record1.id, record2.id])
         if results:
-            for result in results:
-                print(f"  Found id: {result.id}")
-                print(f"    Content: {result.content}")
-                if result.vector is not None:
-                    print(f"    Vector (first five): {result.vector[:5]}")
+            [print_record(record=result) for result in results]
         else:
             print("Nothing found...")
+        options = VectorSearchOptions(
+            vector_field_name="vector",
+            include_vectors=True,
+            filter=VectorSearchFilter.equal_to("tag", "general"),
+        )
+        if isinstance(record_collection, VectorTextSearchMixin):
+            print("-" * 30)
+            print("Using text search")
+            try:
+                search_results = await record_collection.text_search("python", options)
+                if search_results.total_count == 0:
+                    print("\nNothing found...\n")
+                else:
+                    [print_record(result) async for result in search_results.results]
+            except Exception:
+                print("Text search could not execute.")
         if isinstance(record_collection, VectorizedSearchMixin):
             print("-" * 30)
-            print("Using vectorized search, the distance function is set to cosine_similarity.")
-            print("This means that the higher the score the more similar.")
-            search_results = await record_collection.vectorized_search(
-                vector=(await embedder.generate_raw_embeddings(["python"]))[0],
-                options=VectorSearchOptions(vector_field_name="vector", include_vectors=True),
+            print(
+                "Using vectorized search, depending on the distance function, "
+                "the better score might be higher or lower."
             )
-            results = [record async for record in search_results.results]
-            for result in results:
-                print(f"  Found id: {result.record.id}")
-                print(f"    Content: {result.record.content}")
-                if result.record.vector is not None:
-                    print(f"    Vector (first five): {result.record.vector[:5]}")
-                print(f"  Score: {result.score:.4f}")
-                print("")
+            try:
+                search_results = await record_collection.vectorized_search(
+                    vector=(await embedder.generate_raw_embeddings(["python"]))[0],
+                    options=VectorSearchOptions(vector_field_name="vector", include_vectors=True),
+                )
+                if search_results.total_count == 0:
+                    print("\nNothing found...\n")
+                else:
+                    [print_record(result) async for result in search_results.results]
+            except Exception:
+                print("Vectorized search could not execute.")
+        if isinstance(record_collection, VectorizableTextSearchMixin):
+            print("-" * 30)
+            print("Using vectorizable text search")
+            try:
+                search_results = await record_collection.vectorizable_text_search("python", options)
+                if search_results.total_count == 0:
+                    print("\nNothing found...\n")
+                else:
+                    [print_record(result) async for result in search_results.results]
+            except Exception:
+                print("Vectorizable text search could not execute.")
         print("-" * 30)
         print("Deleting collection!")
         await record_collection.delete_collection()

diff --git a/python/semantic_kernel/connectors/memory/azure_ai_search/azure_ai_search_collection.py b/python/semantic_kernel/connectors/memory/azure_ai_search/azure_ai_search_collection.py
@@ -6,8 +6,6 @@
 from collections.abc import Sequence
 from typing import Any, ClassVar, Generic, TypeVar
 
-from semantic_kernel.data.vector_search.vector_search_result import VectorSearchResult
-
 if sys.version_info >= (3, 12):
     from typing import override  # pragma: no cover
 else:
@@ -33,6 +31,7 @@
     VectorSearchOptions,
 )
 from semantic_kernel.data.vector_search.vector_search import VectorSearchBase
+from semantic_kernel.data.vector_search.vector_search_result import VectorSearchResult
 from semantic_kernel.data.vector_search.vector_text_search import VectorTextSearchMixin
 from semantic_kernel.data.vector_search.vectorized_search import VectorizedSearchMixin
 from semantic_kernel.exceptions import MemoryConnectorException, MemoryConnectorInitializationError