diff --git a/docs/extras/integrations/providers/bageldb.mdx b/docs/extras/integrations/providers/bageldb.mdx new file mode 100644 index 0000000000000..ec05493169d09 --- /dev/null +++ b/docs/extras/integrations/providers/bageldb.mdx @@ -0,0 +1,21 @@ +# BagelDB + +> [BagelDB](https://www.bageldb.ai/) (`Open Vector Database for AI`), is like GitHub for AI data. +It is a collaborative platform where users can create, +share, and manage vector datasets. It can support private projects for independent developers, +internal collaborations for enterprises, and public contributions for data DAOs. + +## Installation and Setup + +```bash +pip install betabageldb +``` + + +## VectorStore + +See a [usage example](/docs/integrations/vectorstores/bageldb). + +```python +from langchain.vectorstores import Bagel +``` diff --git a/docs/extras/integrations/vectorstores/bageldb.ipynb b/docs/extras/integrations/vectorstores/bageldb.ipynb new file mode 100644 index 0000000000000..7f65486569f29 --- /dev/null +++ b/docs/extras/integrations/vectorstores/bageldb.ipynb @@ -0,0 +1,300 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BagelDB\n", + "\n", + "> [BagelDB](https://www.bageldb.ai/) (`Open Vector Database for AI`), is like GitHub for AI data.\n", + "It is a collaborative platform where users can create,\n", + "share, and manage vector datasets. It can support private projects for independent developers,\n", + "internal collaborations for enterprises, and public contributions for data DAOs.\n", + "\n", + "### Installation and Setup\n", + "\n", + "```bash\n", + "pip install betabageldb\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create VectorStore from texts" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.vectorstores import Bagel\n", + "\n", + "texts = [\"hello bagel\", \"hello langchain\", \"I love salad\", \"my car\", \"a dog\"]\n", + "# create cluster and add texts\n", + "cluster = Bagel.from_texts(cluster_name=\"testing\", texts=texts)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='hello bagel', metadata={}),\n", + " Document(page_content='my car', metadata={}),\n", + " Document(page_content='I love salad', metadata={})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# similarity search\n", + "cluster.similarity_search(\"bagel\", k=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(Document(page_content='hello bagel', metadata={}), 0.27392977476119995),\n", + " (Document(page_content='my car', metadata={}), 1.4783176183700562),\n", + " (Document(page_content='I love salad', metadata={}), 1.5342965126037598)]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the score is a distance metric, so lower is better\n", + "cluster.similarity_search_with_score(\"bagel\", k=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# delete the cluster\n", + "cluster.delete_cluster()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create VectorStore from docs" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "\n", + "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# create cluster with docs\n", + "cluster = Bagel.from_documents(cluster_name=\"testing_with_docs\", documents=docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the \n" + ] + } + ], + "source": [ + "# similarity search\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = cluster.similarity_search(query)\n", + "print(docs[0].page_content[:102])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get all text/doc from Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "texts = [\"hello bagel\", \"this is langchain\"]\n", + "cluster = Bagel.from_texts(cluster_name=\"testing\", texts=texts)\n", + "cluster_data = cluster.get()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['ids', 'embeddings', 'metadatas', 'documents'])" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# all keys\n", + "cluster_data.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ids': ['578c6d24-3763-11ee-a8ab-b7b7b34f99ba',\n", + " '578c6d25-3763-11ee-a8ab-b7b7b34f99ba',\n", + " 'fb2fc7d8-3762-11ee-a8ab-b7b7b34f99ba',\n", + " 'fb2fc7d9-3762-11ee-a8ab-b7b7b34f99ba',\n", + " '6b40881a-3762-11ee-a8ab-b7b7b34f99ba',\n", + " '6b40881b-3762-11ee-a8ab-b7b7b34f99ba',\n", + " '581e691e-3762-11ee-a8ab-b7b7b34f99ba',\n", + " '581e691f-3762-11ee-a8ab-b7b7b34f99ba'],\n", + " 'embeddings': None,\n", + " 'metadatas': [{}, {}, {}, {}, {}, {}, {}, {}],\n", + " 'documents': ['hello bagel',\n", + " 'this is langchain',\n", + " 'hello bagel',\n", + " 'this is langchain',\n", + " 'hello bagel',\n", + " 'this is langchain',\n", + " 'hello bagel',\n", + " 'this is langchain']}" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# all values and keys\n", + "cluster_data" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.delete_cluster()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create cluster with metadata & filter using metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(Document(page_content='hello bagel', metadata={'source': 'notion'}), 0.0)]" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "texts = [\"hello bagel\", \"this is langchain\"]\n", + "metadatas = [{\"source\": \"notion\"}, {\"source\": \"google\"}]\n", + "\n", + "cluster = Bagel.from_texts(cluster_name=\"testing\", texts=texts, metadatas=metadatas)\n", + "cluster.similarity_search_with_score(\"hello bagel\", where={\"source\": \"notion\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "# delete the cluster\n", + "cluster.delete_cluster()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock index 043113826ece7..45b415503b05a 100644 --- a/libs/langchain/poetry.lock +++ b/libs/langchain/poetry.lock @@ -13637,7 +13637,7 @@ clarifai = ["clarifai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "betabageldb", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xinference", "xmltodict", "zep-python"] +extended-testing = ["amazon-textract-caller", "anthropic", "atlassian-python-api", "beautifulsoup4", "betabageldb", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xinference", "xmltodict", "zep-python"] javascript = ["esprima"] llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"] openai = ["openai", "tiktoken"] @@ -13647,4 +13647,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "defec69059c5925ac902fa02a03712a4db6ea85a01f316da1f99b10510cc9864" +content-hash = "8cb267636a8782cedcbce75338f5759323353e1cc84cee6051e82e1506ac4d45" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index 1860680d8638b..1739e1213e44b 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -377,6 +377,11 @@ extended_testing = [ "xata", "xmltodict", "betabageldb", + "anthropic", +] + +scheduled_testing = [ + "openai", ] [tool.ruff] diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_bagel.py b/libs/langchain/tests/integration_tests/vectorstores/test_bagel.py index e9400b3b8e5d8..c04bb0f25184a 100644 --- a/libs/langchain/tests/integration_tests/vectorstores/test_bagel.py +++ b/libs/langchain/tests/integration_tests/vectorstores/test_bagel.py @@ -167,21 +167,3 @@ def test_bagel_update_document() -> None: docsearch.update_document(document_id=document_id, document=updated_doc) output = docsearch.similarity_search(updated_content, k=1) assert output == [Document(page_content=updated_content, metadata={"page": "0"})] - - -def main() -> None: - """Bagel intigaration test""" - test_similarity_search() - test_bagel() - test_with_metadatas() - test_with_metadatas_with_scores() - test_with_metadatas_with_scores_using_vector() - test_search_filter() - test_search_filter_with_scores() - test_with_include_parameter() - test_bagel_update_document() - test_with_metadatas_with_scores_using_vector_embe() - - -if __name__ == "__main__": - main()