From 696114e145f97ae7b824c6ab9d3a197a8adca003 Mon Sep 17 00:00:00 2001 From: Abhi Agarwal Date: Thu, 26 Sep 2024 13:37:10 -0400 Subject: [PATCH] community: add sqlite-vec vectorstore (#25003) **Description**: Adds a vector store integration with [sqlite-vec](https://alexgarcia.xyz/sqlite-vec/), the successor to sqlite-vss that is a single C file with no external dependencies. Pretty straightforward, just copy-pasted the sqlite-vss integration and made a few tweaks and added integration tests. Only question is whether all documentation should be directed away from sqlite-vss if it is de facto deprecated (cc @asg017). --------- Co-authored-by: Erick Friis Co-authored-by: philippe-oger --- docs/docs/integrations/providers/sqlite.mdx | 5 +- .../integrations/vectorstores/sqlitevec.ipynb | 323 ++++++++++++++++++ libs/community/extended_testing_deps.txt | 1 + .../vectorstores/__init__.py | 5 + .../vectorstores/sqlitevec.py | 242 +++++++++++++ .../vectorstores/test_sqlitevec.py | 58 ++++ .../unit_tests/vectorstores/test_imports.py | 1 + 7 files changed, 633 insertions(+), 2 deletions(-) create mode 100644 docs/docs/integrations/vectorstores/sqlitevec.ipynb create mode 100644 libs/community/langchain_community/vectorstores/sqlitevec.py create mode 100644 libs/community/tests/integration_tests/vectorstores/test_sqlitevec.py diff --git a/docs/docs/integrations/providers/sqlite.mdx b/docs/docs/integrations/providers/sqlite.mdx index e45a47f11372c..3bba662f0d888 100644 --- a/docs/docs/integrations/providers/sqlite.mdx +++ b/docs/docs/integrations/providers/sqlite.mdx @@ -16,10 +16,11 @@ pip install SQLAlchemy ## Vector Store -See a [usage example](/docs/integrations/vectorstores/sqlitevss). +See a [usage example](/docs/integrations/vectorstores/sqlitevec). 
```python -from langchain_community.vectorstores import SQLiteVSS +from langchain_community.vectorstores import SQLiteVec +from langchain_community.vectorstores import SQLiteVSS # legacy ``` ## Memory diff --git a/docs/docs/integrations/vectorstores/sqlitevec.ipynb b/docs/docs/integrations/vectorstores/sqlitevec.ipynb new file mode 100644 index 0000000000000..33eb5b854d502 --- /dev/null +++ b/docs/docs/integrations/vectorstores/sqlitevec.ipynb @@ -0,0 +1,323 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "---\n", + "sidebar_label: SQLiteVec\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "# SQLite as a Vector Store with SQLiteVec\n", + "\n", + "This notebook covers how to get started with the SQLiteVec vector store.\n", + "\n", + ">[SQLite-Vec](https://alexgarcia.xyz/sqlite-vec/) is an `SQLite` extension designed for vector search, emphasizing local-first operations and easy integration into applications without external servers. It is the successor to [SQLite-VSS](https://alexgarcia.xyz/sqlite-vss/) by the same author. It is written in zero-dependency C and designed to be easy to build and use.\n", + "\n", + "This notebook shows how to use the `SQLiteVec` vector database." 
+ ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Setup\n", + "You'll need to install `langchain-community` with `pip install -qU langchain-community` to use this integration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# You need to install sqlite-vec as a dependency.\n", + "%pip install --upgrade --quiet sqlite-vec" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Credentials\n", + "SQLiteVec does not require any credentials to use as the vector store is a simple SQLite file." + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## Initialization" + }, + { + "metadata": { + "jupyter": { + "is_executing": true + } + }, + "cell_type": "code", + "source": [ + "from langchain_community.embeddings.sentence_transformer import (\n", + " SentenceTransformerEmbeddings,\n", + ")\n", + "from langchain_community.vectorstores import SQLiteVec\n", + "\n", + "embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + "vector_store = SQLiteVec(\n", + " table=\"state_union\", db_file=\"/tmp/vec.db\", embedding=embedding_function\n", + ")" + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## Manage vector store" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Add items to vector store" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "vector_store.add_texts(texts=[\"Ketanji Brown Jackson is awesome\", \"foo\", \"bar\"])" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Update items in vector store\n", + "Not supported yet" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Delete items from vector store\n", + "Not supported yet" + ] + }, + { 
+ "metadata": {}, + "cell_type": "markdown", + "source": "## Query vector store" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Query directly" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "data = vector_store.similarity_search(\"Ketanji Brown Jackson\", k=4)" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Query by turning into retriever\n", + "Not supported yet" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Usage for retrieval-augmented generation\n", + "Refer to the documentation on sqlite-vec at https://alexgarcia.xyz/sqlite-vec/ for more information on how to use it for retrieval-augmented generation." + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## API reference\n", + "For detailed documentation of all SQLiteVec features and configurations head to the API reference: https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.sqlitevec.SQLiteVec.html" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Other examples" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-06T14:55:55.370351Z", + "start_time": "2023-09-06T14:55:53.547755Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 
\\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_community.document_loaders import TextLoader\n", + "from langchain_community.embeddings.sentence_transformer import (\n", + " SentenceTransformerEmbeddings,\n", + ")\n", + "from langchain_community.vectorstores import SQLiteVec\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + "# load the document and split it into chunks\n", + "loader = TextLoader(\"../../how_to/state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "\n", + "# split it into chunks\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "texts = [doc.page_content for doc in docs]\n", + "\n", + "\n", + "# create the open-source embedding function\n", + "embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + "\n", + "\n", + "# load it in sqlite-vec in a table named state_union.\n", + "# the db_file parameter is the name of the file you want\n", + "# as your sqlite database.\n", + "db = SQLiteVec.from_texts(\n", + " texts=texts,\n", + " embedding=embedding_function,\n", + " table=\"state_union\",\n", + " db_file=\"/tmp/vec.db\",\n", + ")\n", + "\n", + "# query it\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "data = db.similarity_search(query)\n", + "\n", + "# print results\n", + "data[0].page_content" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": "### 
Example using existing SQLite connection" + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-06T14:59:22.086252Z", + "start_time": "2023-09-06T14:59:21.693237Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ketanji Brown Jackson is awesome'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_community.document_loaders import TextLoader\n", + "from langchain_community.embeddings.sentence_transformer import (\n", + " SentenceTransformerEmbeddings,\n", + ")\n", + "from langchain_community.vectorstores import SQLiteVec\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + "# load the document and split it into chunks\n", + "loader = TextLoader(\"../../how_to/state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "\n", + "# split it into chunks\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "texts = [doc.page_content for doc in docs]\n", + "\n", + "\n", + "# create the open-source embedding function\n", + "embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + "connection = SQLiteVec.create_connection(db_file=\"/tmp/vec.db\")\n", + "\n", + "db1 = SQLiteVec(\n", + " table=\"state_union\", embedding=embedding_function, connection=connection\n", + ")\n", + "\n", + "db1.add_texts([\"Ketanji Brown Jackson is awesome\"])\n", + "# query it again\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "data = db1.similarity_search(query)\n", + "\n", + "# print results\n", + "data[0].page_content" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { 
+ "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt index d9879fd6aa07c..e39716707dc3c 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_testing_deps.txt @@ -75,6 +75,7 @@ rspace_client>=2.5.0,<3 scikit-learn>=1.2.2,<2 simsimd>=5.0.0,<6 sqlite-vss>=0.1.2,<0.2 +sqlite-vec>=0.1.0,<0.2 sseclient-py>=1.8.0,<2 streamlit>=1.18.0,<2 sympy>=1.12,<2 diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index faa2e35b3103f..c38beea0ed6d2 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -230,6 +230,9 @@ from langchain_community.vectorstores.sklearn import ( SKLearnVectorStore, ) + from langchain_community.vectorstores.sqlitevec import ( + SQLiteVec, + ) from langchain_community.vectorstores.sqlitevss import ( SQLiteVSS, ) @@ -380,6 +383,7 @@ "Relyt", "Rockset", "SKLearnVectorStore", + "SQLiteVec", "SQLiteVSS", "ScaNN", "SemaDB", @@ -483,6 +487,7 @@ "Relyt": "langchain_community.vectorstores.relyt", "Rockset": "langchain_community.vectorstores.rocksetdb", "SKLearnVectorStore": "langchain_community.vectorstores.sklearn", + "SQLiteVec": "langchain_community.vectorstores.sqlitevec", "SQLiteVSS": "langchain_community.vectorstores.sqlitevss", "ScaNN": "langchain_community.vectorstores.scann", "SemaDB": "langchain_community.vectorstores.semadb", diff --git a/libs/community/langchain_community/vectorstores/sqlitevec.py b/libs/community/langchain_community/vectorstores/sqlitevec.py new file mode 100644 index 0000000000000..13a2d5ee9208c --- /dev/null +++ 
b/libs/community/langchain_community/vectorstores/sqlitevec.py
@@ -0,0 +1,257 @@
+from __future__ import annotations
+
+import json
+import logging
+import struct
+import warnings
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Type,
+)
+
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from langchain_core.vectorstores import VectorStore
+
+if TYPE_CHECKING:
+    import sqlite3
+
+logger = logging.getLogger(__name__)
+
+
+def serialize_f32(vector: List[float]) -> bytes:
+    """Serializes a list of floats into a compact "raw bytes" format
+
+    Source: https://github.com/asg017/sqlite-vec/blob/21c5a14fc71c83f135f5b00c84115139fd12c492/examples/simple-python/demo.py#L8-L10
+    """
+    return struct.pack("%sf" % len(vector), *vector)
+
+
+class SQLiteVec(VectorStore):
+    """SQLite with Vec extension as a vector database.
+
+    To use, you should have the ``sqlite-vec`` python package installed.
+    Example:
+        .. code-block:: python
+            from langchain_community.vectorstores import SQLiteVec
+            from langchain_community.embeddings.openai import OpenAIEmbeddings
+            ...
+    """
+
+    def __init__(
+        self,
+        table: str,
+        connection: Optional[sqlite3.Connection],
+        embedding: Embeddings,
+        db_file: str = "vec.db",
+    ):
+        """Initialize with sqlite client with sqlite-vec extension.
+
+        If ``connection`` is falsy, a new connection to ``db_file`` is created
+        with ``create_connection``; otherwise ``db_file`` is ignored.
+        """
+        try:
+            import sqlite_vec  # noqa  # pylint: disable=unused-import
+        except ImportError:
+            raise ImportError(
+                "Could not import sqlite-vec python package. "
+                "Please install it with `pip install sqlite-vec`."
+            )
+
+        if not connection:
+            connection = self.create_connection(db_file)
+
+        if not isinstance(embedding, Embeddings):
+            warnings.warn("embeddings input must be Embeddings object.")
+
+        self._connection = connection
+        self._table = table
+        self._embedding = embedding
+
+        self.create_table_if_not_exists()
+
+    def create_table_if_not_exists(self) -> None:
+        """Create the text table, the vec0 virtual table and the sync trigger."""
+        self._connection.execute(
+            f"""
+                CREATE TABLE IF NOT EXISTS {self._table}
+                (
+                  rowid INTEGER PRIMARY KEY AUTOINCREMENT,
+                  text TEXT,
+                  metadata BLOB,
+                  text_embedding BLOB
+                )
+                ;
+                """
+        )
+        self._connection.execute(
+            f"""
+                CREATE VIRTUAL TABLE IF NOT EXISTS {self._table}_vec USING vec0(
+                  rowid INTEGER PRIMARY KEY,
+                  text_embedding float[{self.get_dimensionality()}]
+                )
+                ;
+                """
+        )
+        # The trigger name is prefixed with the table name: trigger names are
+        # unique per database, so a shared name would be skipped by IF NOT EXISTS
+        # for every table after the first and those tables would never be indexed.
+        self._connection.execute(
+            f"""
+                CREATE TRIGGER IF NOT EXISTS {self._table}_embed_text
+                AFTER INSERT ON {self._table}
+                BEGIN
+                    INSERT INTO {self._table}_vec(rowid, text_embedding)
+                    VALUES (new.rowid, new.text_embedding)
+                    ;
+                END;
+                """
+        )
+        self._connection.commit()
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Add more texts to the vectorstore index.
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+            kwargs: vectorstore specific parameters
+        """
+        # Materialize once: ``texts`` may be a one-shot iterator, and it is
+        # consumed several times below (embedding, default metadatas, zip).
+        texts = list(texts)
+
+        max_id = self._connection.execute(
+            f"SELECT max(rowid) as rowid FROM {self._table}"
+        ).fetchone()["rowid"]
+        if max_id is None:  # no text added yet
+            max_id = 0
+
+        embeds = self._embedding.embed_documents(texts)
+        if not metadatas:
+            metadatas = [{} for _ in texts]
+        data_input = [
+            (text, json.dumps(metadata), serialize_f32(embed))
+            for text, metadata, embed in zip(texts, metadatas, embeds)
+        ]
+        self._connection.executemany(
+            f"INSERT INTO {self._table}(text, metadata, text_embedding) "
+            f"VALUES (?,?,?)",
+            data_input,
+        )
+        self._connection.commit()
+        # pulling every id we just inserted
+        results = self._connection.execute(
+            f"SELECT rowid FROM {self._table} WHERE rowid > {max_id}"
+        )
+        return [row["rowid"] for row in results]
+
+    def similarity_search_with_score_by_vector(
+        self, embedding: List[float], k: int = 4, **kwargs: Any
+    ) -> List[Tuple[Document, float]]:
+        """Return the k nearest docs to the embedding with their distances."""
+        sql_query = f"""
+            SELECT
+                text,
+                metadata,
+                distance
+            FROM {self._table} AS e
+            INNER JOIN {self._table}_vec AS v on v.rowid = e.rowid
+            WHERE
+                v.text_embedding MATCH ?
+                AND k = ?
+            ORDER BY distance
+        """
+        cursor = self._connection.cursor()
+        cursor.execute(
+            sql_query,
+            [serialize_f32(embedding), k],
+        )
+        results = cursor.fetchall()
+
+        documents = []
+        for row in results:
+            metadata = json.loads(row["metadata"]) or {}
+            doc = Document(page_content=row["text"], metadata=metadata)
+            documents.append((doc, row["distance"]))
+
+        return documents
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to query."""
+        embedding = self._embedding.embed_query(query)
+        documents = self.similarity_search_with_score_by_vector(
+            embedding=embedding, k=k
+        )
+        return [doc for doc, _ in documents]
+
+    def similarity_search_with_score(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to query, each paired with its distance."""
+        embedding = self._embedding.embed_query(query)
+        documents = self.similarity_search_with_score_by_vector(
+            embedding=embedding, k=k
+        )
+        return documents
+
+    def similarity_search_by_vector(
+        self, embedding: List[float], k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to the given embedding vector."""
+        documents = self.similarity_search_with_score_by_vector(
+            embedding=embedding, k=k
+        )
+        return [doc for doc, _ in documents]
+
+    @classmethod
+    def from_texts(
+        cls: Type[SQLiteVec],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        table: str = "langchain",
+        db_file: str = "vec.db",
+        **kwargs: Any,
+    ) -> SQLiteVec:
+        """Return VectorStore initialized from texts and embeddings."""
+        connection = cls.create_connection(db_file)
+        vec = cls(
+            table=table, connection=connection, db_file=db_file, embedding=embedding
+        )
+        vec.add_texts(texts=texts, metadatas=metadatas)
+        return vec
+
+    @staticmethod
+    def create_connection(db_file: str) -> sqlite3.Connection:
+        """Open ``db_file`` with the sqlite-vec extension loaded."""
+        import sqlite3
+
+        import sqlite_vec
+
+        connection = sqlite3.connect(db_file)
+        connection.row_factory = sqlite3.Row
+        connection.enable_load_extension(True)
+        sqlite_vec.load(connection)
+        connection.enable_load_extension(False)
+        return connection
+
+    def get_dimensionality(self) -> int:
+        """
+        Function that does a dummy embedding to figure out how many dimensions
+        this embedding function returns. Needed for the virtual table DDL.
+        """
+        dummy_text = "This is a dummy text"
+        dummy_embedding = self._embedding.embed_query(dummy_text)
+        return len(dummy_embedding)
diff --git a/libs/community/tests/integration_tests/vectorstores/test_sqlitevec.py b/libs/community/tests/integration_tests/vectorstores/test_sqlitevec.py
new file mode 100644
index 0000000000000..f7c67ba529902
--- /dev/null
+++ b/libs/community/tests/integration_tests/vectorstores/test_sqlitevec.py
@@ -0,0 +1,58 @@
+from typing import List, Optional
+
+import pytest
+from langchain_core.documents import Document
+
+from langchain_community.vectorstores import SQLiteVec
+from tests.integration_tests.vectorstores.fake_embeddings import (
+    FakeEmbeddings,
+    fake_texts,
+)
+
+
+def _sqlite_vec_from_texts(
+    metadatas: Optional[List[dict]] = None, drop: bool = True
+) -> SQLiteVec:
+    return SQLiteVec.from_texts(
+        fake_texts,
+        FakeEmbeddings(),
+        metadatas=metadatas,
+        table="test",
+        db_file=":memory:",
+    )
+
+
+@pytest.mark.requires("sqlite-vec")
+def test_sqlitevec() -> None:
+    """Test end to end construction and search."""
+    docsearch = _sqlite_vec_from_texts()
+    output = docsearch.similarity_search("foo", k=1)
+    assert output == [Document(page_content="foo", metadata={})]
+
+
+@pytest.mark.requires("sqlite-vec")
+def test_sqlitevec_with_score() -> None:
+    """Test end to end construction and search with scores and IDs."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = _sqlite_vec_from_texts(metadatas=metadatas)
+    output = docsearch.similarity_search_with_score("foo", k=3)
+    docs = [o[0] for o in output]
+    distances = [o[1] for o in output]
+    assert docs == [
+        Document(page_content="foo", metadata={"page": 0}),
+        Document(page_content="bar", metadata={"page": 1}),
+        Document(page_content="baz", metadata={"page": 2}),
+    ]
+    assert distances[0] < distances[1] < distances[2]
+
+
+@pytest.mark.requires("sqlite-vec")
+def test_sqlitevec_add_extra() -> None:
+    """Test adding more texts to an existing store."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = _sqlite_vec_from_texts(metadatas=metadatas)
+    docsearch.add_texts(texts, metadatas)
+    output = docsearch.similarity_search("foo", k=10)
+    assert len(output) == 6
diff --git a/libs/community/tests/unit_tests/vectorstores/test_imports.py b/libs/community/tests/unit_tests/vectorstores/test_imports.py
index 2a59b0ebc7c3f..5ac0ca72b49c5 100644
--- a/libs/community/tests/unit_tests/vectorstores/test_imports.py
+++ b/libs/community/tests/unit_tests/vectorstores/test_imports.py
@@ -76,6 +76,7 @@
     "Relyt",
     "Rockset",
     "SKLearnVectorStore",
+    "SQLiteVec",
     "SQLiteVSS",
     "ScaNN",
     "SemaDB",