Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

+mdb atlas vectordb [clean_final] #3000

Merged
merged 51 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
28a009b
+mdb atlas
Jun 22, 2024
383168e
Update test/agentchat/contrib/vectordb/test_mongodb.py
ranfysvalle02 Jun 22, 2024
f531568
update test_mongodb.py; we dont need to do the assert .collection_nam…
Jun 22, 2024
a1c385c
Try fix mongodb service
thinkall Jun 22, 2024
bb1a183
Try fix mongodb service
thinkall Jun 22, 2024
1e30425
Update username and password
thinkall Jun 22, 2024
1d6dbaf
Merge branch 'main' into main
thinkall Jun 22, 2024
e0f3c59
Update autogen/agentchat/contrib/vectordb/mongodb.py
thinkall Jun 22, 2024
d6a1162
closer --- but im not super thrilled about the solution...
Jun 23, 2024
334cc25
Merge branch 'main' into main
thinkall Jun 24, 2024
de48057
PYTHON-4506 Expanded tests and simplified vector search pipelines
caseyclements Jun 24, 2024
f5e5fdf
Merge branch 'main' into pull/3000
ranfysvalle02 Jun 24, 2024
6245c30
Merge pull request #1 from caseyclements/pull/3000
ranfysvalle02 Jun 24, 2024
a367426
Update mongodb.py
ranfysvalle02 Jun 24, 2024
ffa3e38
Update mongodb.py - Casey
ranfysvalle02 Jun 24, 2024
d741ef6
Merge branch 'main' into main
ranfysvalle02 Jun 27, 2024
d2fbd02
Merge branch 'main' into main
ranfysvalle02 Jun 30, 2024
3646d1e
search_index_magic
Jun 30, 2024
3e0ac8e
Fix format
thinkall Jun 30, 2024
95e2f79
Fix tests
thinkall Jun 30, 2024
64a157c
hacking trying to figure this out
Jul 1, 2024
17d02d1
Merge branch 'main' of https://github.com/ranfysvalle02/autogen
Jul 1, 2024
6cfb689
Merge branch 'main' into main
thinkall Jul 2, 2024
66e46e8
Merge branch 'main' into main
thinkall Jul 2, 2024
7405463
Streamline checks for indexes in construction and restructure tests
Jibola Jul 18, 2024
7d778fe
Add tests for score_threshold, embedding inclusion, and multiple quer…
Jibola Jul 18, 2024
0fcf320
Merge branch 'main' into main
Jibola Jul 19, 2024
0921c53
refactored create_collection to meet base object requirements
Jibola Jul 19, 2024
01f96c7
lint
Jibola Jul 19, 2024
311259e
change the localhost port to 27017
Jibola Jul 19, 2024
cf97466
add test to check that no embedding is there unless explicitly provided
Jibola Jul 19, 2024
6df51df
Merge branch 'main' into main
ranfysvalle02 Jul 20, 2024
e003d1f
Merge branch 'main' into main
thinkall Jul 21, 2024
8491d5a
Update logger
thinkall Jul 21, 2024
1b41e18
Add test get docs with ids=None
thinkall Jul 21, 2024
14776e4
Rename and update notebook
thinkall Jul 21, 2024
de12cd1
have index management include waiting behaviors
Jibola Jul 23, 2024
5e00b2d
Adds further optional waits for users and tests. Cleans up upsert.
caseyclements Jul 23, 2024
f3a2a0c
Merge branch 'microsoft:main' into main
cozypet Jul 24, 2024
347fd0e
ensure the embedding size for multiple embedding inputs is equal to d…
Jibola Jul 24, 2024
5790e48
fix up tests and add configuration to ensure documents and indexes ar…
Jibola Jul 25, 2024
cdc6b6d
Merge branch 'main' into main
Jibola Jul 25, 2024
8804087
fix import failure
Jibola Jul 25, 2024
1f41bbd
Merge branch 'main' of https://github.com/ranfysvalle02/autogen into …
Jibola Jul 25, 2024
ead65ca
adjust typing for 3.9
Jibola Jul 25, 2024
892b81a
fix up the notebook output
Jibola Jul 25, 2024
2cca0c0
changed language to communicate time taken on first init_chat call
Jibola Jul 25, 2024
2f1bb68
Merge branch 'main' into main
Jibola Jul 25, 2024
7a44641
replace environment variable usage
Jibola Jul 25, 2024
bb9d57a
Merge branch 'main' of https://github.com/ranfysvalle02/autogen into …
Jibola Jul 25, 2024
2c788bd
Merge branch 'main' into main
thinkall Jul 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Adds further optional waits for users and tests. Cleans up upsert.
  • Loading branch information
caseyclements committed Jul 23, 2024
commit 5e00b2dee0fd9784abe751b028259d6ba863a435
42 changes: 22 additions & 20 deletions autogen/agentchat/contrib/vectordb/mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@ def __init__(
connection_string: str | The MongoDB connection string to connect to. Default is ''.
database_name: str | The name of the database. Default is 'vector_db'.
embedding_function: The embedding function used to generate the vector representation.
overwrite: bool | Overwrite existing collection with new information from this object
overwrite: bool | Overwrite existing collection with new information from this object.
defaults to False
wait_until_ready: bool | Blocking call to wait until the database indexes are READY
will timeout after 20 seconds. Defaults to False
wait_until_ready: float | None | Blocking call to wait until the
database indexes are ready. None, the default, means no wait.
"""
self.embedding_function = embedding_function
self.index_name = index_name
Expand Down Expand Up @@ -89,14 +89,21 @@ def _is_index_ready(self, collection: Collection, index_name: str):
return True
return False

def _wait_for_index(self, collection: Collection, index_name: str):
"""Waits for the index to be created to be ready, otherwise
throws a TimeoutError. Timeout set on instantiation"""
def _wait_for_index(self, collection: Collection, index_name: str, action: str = "create"):
"""Waits for the index action to be completed. Otherwise throws a TimeoutError.

Timeout set on instantiation.
action: "create" or "delete"
"""
assert action in ["create", "delete"], f"{action=} must be create or delete."
start = monotonic()
while monotonic() - start < self._wait_until_ready:
if self._is_index_ready(collection, index_name):
if action == "create" and self._is_index_ready(collection, index_name):
return
elif action == "delete" and len(list(collection.list_search_indexes())) == 0:
return
sleep(_DELAY)

raise TimeoutError(f"Index {self.index_name} is not ready!")

def _get_embedding_size(self):
Expand Down Expand Up @@ -126,7 +133,7 @@ def create_collection(
get_or_create: bool | Whether to get or create the collection. Default is True
"""
if overwrite:
self.db.drop_collection(collection_name)
self.delete_collection(collection_name)

if collection_name not in self.db.list_collection_names():
# Create a new collection
Expand Down Expand Up @@ -186,6 +193,8 @@ def delete_collection(self, collection_name: str) -> None:
"""
for index in self.db[collection_name].list_search_indexes():
self.db[collection_name].drop_search_index(index["name"])
if self._wait_until_ready:
self._wait_for_index(self.db[collection_name], index["name"], "delete")
return self.db[collection_name].drop()

def create_vector_search_index(
Expand Down Expand Up @@ -223,7 +232,7 @@ def create_vector_search_index(
try:
collection.create_search_index(model=search_index_model)
if self._wait_until_ready:
self._wait_for_index(collection, index_name)
self._wait_for_index(collection, index_name, "create")
logger.debug(f"Search index {index_name} created successfully.")
except Exception as e:
logger.error(
Expand All @@ -234,13 +243,6 @@ def create_vector_search_index(
)
raise e

def upsert_docs(self, docs, collection):
for doc in docs:
query = {"id": doc["id"]}
doc["embedding"] = np.array(self.embedding_function([doc["content"]])).tolist()[0]
new_values = {"$set": doc}
collection.update_one(query, new_values, upsert=True)

def insert_docs(
self,
docs: List[Document],
Expand All @@ -265,7 +267,7 @@ def insert_docs(

collection = self.get_collection(collection_name)
if upsert:
self.upsert_docs(docs, collection)
self.update_docs(docs, collection.name, upsert=True)
else:
# Sanity checking the first document
if docs[0].get("content") is None:
Expand Down Expand Up @@ -341,7 +343,7 @@ def _insert_batch(
]
# insert the documents in MongoDB Atlas
insert_result = collection.insert_many(to_insert) # type: ignore
return insert_result.inserted_ids
return insert_result.inserted_ids # TODO Remove this. Replace by log like update_docs

def update_docs(self, docs: List[Document], collection_name: str = None, **kwargs: Any) -> None:
"""Update documents, including their embeddings, in the Collection.
Expand Down Expand Up @@ -457,11 +459,11 @@ def retrieve_docs(
# Check status of index!
if self._wait_until_ready:
self._wait_for_index(collection, self.index_name)
logger.info(f"Using index: {self.index_name}")
logger.debug(f"Using index: {self.index_name}")
results = []
for query_text in queries:
# Compute embedding vector from semantic query
logger.info(f"Query: {query_text}")
logger.debug(f"Query: {query_text}")
query_vector = np.array(self.embedding_function([query_text])).tolist()[0]
# Find documents with similar vectors using the specified index
query_result = _vector_search(
Jibola marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
8 changes: 4 additions & 4 deletions test/agentchat/contrib/vectordb/test_mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

RETRIES = 10
DELAY = 2
TIMEOUT = 60.0
TIMEOUT = 120.0


def _wait_for_predicate(predicate, err, timeout=TIMEOUT, interval=DELAY):
Expand All @@ -55,7 +55,7 @@ def _wait_for_predicate(predicate, err, timeout=TIMEOUT, interval=DELAY):
sleep(DELAY)


def _delete_search_indexes(collection: Collection, wait=False):
def _delete_search_indexes(collection: Collection, wait=True):
"""Deletes all indexes in a collection

Args:
Expand All @@ -71,15 +71,15 @@ def _delete_search_indexes(collection: Collection, wait=False):
_wait_for_predicate(lambda: not list(collection.list_search_indexes()), "Not all collections deleted")


def _empty_collections_and_delete_indexes(database, collections=None, wait=False):
def _empty_collections_and_delete_indexes(database, collections=None, wait=True):
"""Empty all collections within the database and remove indexes

Args:
database (pymongo.Database): MongoDB Database Abstraction
"""
for collection_name in collections or database.list_collection_names():
_delete_search_indexes(database[collection_name], wait)
database[collection_name].delete_many({})
database[collection_name].drop()


@pytest.fixture
Expand Down