feat: Pinecone Vector Database #86

Open · wants to merge 2 commits into base: main
15 changes: 14 additions & 1 deletion README.md
@@ -83,6 +83,7 @@ The following environment variables are required to run the application:
- `AWS_DEFAULT_REGION`: (Optional) defaults to `us-east-1`
- `AWS_ACCESS_KEY_ID`: (Optional) needed for bedrock embeddings
- `AWS_SECRET_ACCESS_KEY`: (Optional) needed for bedrock embeddings
- `PINECONE_API_KEY`: (Optional) needed for the Pinecone vector database

Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables.

@@ -97,7 +98,7 @@ COLLECTION_NAME=<vector collection>
ATLAS_SEARCH_INDEX=<vector search index>
```

The `ATLAS_MONGO_DB_URI` could be the same as or different from the one used by LibreChat. Even if it is the same, the `$COLLECTION_NAME` collection needs to be a completely new one, separate from all collections used by LibreChat. In addition, create a vector search index for the collection above (remember to assign `$ATLAS_SEARCH_INDEX`) with the following JSON:

```json
{
@@ -118,6 +119,18 @@ The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by Lib

Follow one of the [four documented methods](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure) to create the vector index.

### Use Pinecone as Vector Database

As another vector database option, you can use [Pinecone](https://www.pinecone.io/). To do so, create a Pinecone account to obtain an API key, then set the following environment variables:

```env
VECTOR_DB_TYPE=pinecone
COLLECTION_NAME=<index name>
PINECONE_API_KEY=<api key>
AWS_DEFAULT_REGION=<region, defaults to 'us-east-1'>
```

A new index named `COLLECTION_NAME` will be created automatically if one does not already exist in your Pinecone account.
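
For reference, the startup check mirrors the standard Pinecone client flow. Below is a minimal sketch, assuming the `pinecone` Python client; the dimension and region values are placeholders and must match your embedding model and `AWS_DEFAULT_REGION`:

```python
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="<api key>")  # PINECONE_API_KEY
index_name = "<index name>"         # COLLECTION_NAME

# Create the index only if it does not already exist.
if index_name not in [info["name"] for info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1536,  # must match the embedding model's output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # AWS_DEFAULT_REGION
    )
```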

### Cloud Installation Settings:

13 changes: 13 additions & 0 deletions config.py
@@ -6,6 +6,8 @@
from enum import Enum
from datetime import datetime
from dotenv import find_dotenv, load_dotenv
from starlette.middleware.base import BaseHTTPMiddleware
from store_factory import get_vector_store

@@ -15,6 +17,7 @@
class VectorDBType(Enum):
    PGVECTOR = "pgvector"
    ATLAS_MONGO = "atlas-mongo"
    PINECONE = "pinecone"


class EmbeddingsProvider(Enum):
@@ -169,6 +172,7 @@ async def dispatch(self, request, call_next):
OLLAMA_BASE_URL = get_env_variable("OLLAMA_BASE_URL", "http://ollama:11434")
AWS_ACCESS_KEY_ID = get_env_variable("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = get_env_variable("AWS_SECRET_ACCESS_KEY", "")
PINECONE_API_KEY = get_env_variable("PINECONE_API_KEY", "")

## Embeddings

@@ -276,6 +280,15 @@ def init_embeddings(provider, model):
mode="atlas-mongo",
search_index=ATLAS_SEARCH_INDEX,
)
elif VECTOR_DB_TYPE == VectorDBType.PINECONE:
AWS_DEFAULT_REGION = get_env_variable("AWS_DEFAULT_REGION", "us-east-1")
vector_store = get_vector_store(
connection_string=AWS_DEFAULT_REGION,
embeddings=embeddings,
collection_name=COLLECTION_NAME,
mode="pinecone",
api_key=PINECONE_API_KEY,
)
else:
raise ValueError(f"Unsupported vector store type: {VECTOR_DB_TYPE}")

1 change: 1 addition & 0 deletions requirements.lite.txt
@@ -24,6 +24,7 @@ rapidocr-onnxruntime==1.3.24
opencv-python-headless==4.9.0.80
pymongo==4.6.3
langchain-mongodb==0.2.0
langchain-pinecone==0.2.0
cryptography==42.0.7
python-magic==0.4.27
python-pptx==0.6.23
1 change: 1 addition & 0 deletions requirements.txt
@@ -30,6 +30,7 @@ langchain-mongodb==0.2.0
langchain-ollama==0.2.0
langchain-openai==0.2.0
langchain-huggingface==0.1.0
langchain-pinecone==0.2.0
cryptography==42.0.7
python-magic==0.4.27
python-pptx==0.6.23
86 changes: 85 additions & 1 deletion store.py
@@ -4,7 +4,7 @@
from langchain_core.documents import Document
from langchain_core.runnables.config import run_in_executor
from sqlalchemy.orm import Session

from langchain_pinecone import PineconeVectorStore
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_core.embeddings import Embeddings
from typing import (
@@ -150,3 +150,87 @@ def delete(self, ids: Optional[list[str]] = None) -> None:
        # implement the deletion of documents by file_id in self._collection
        if ids is not None:
            self._collection.delete_many({"file_id": {"$in": ids}})


class ExtendedPCVector(PineconeVectorStore):
    @property
    def embedding_function(self) -> Embeddings:
        return self.embeddings

    def add_documents(self, docs: list[Document], ids: list[str]):
        # Vector IDs are stored as "{file_id}_{idx}" so that all chunks of a file
        # can later be listed and deleted by prefix.
        file_id = docs[0].metadata["file_id"]
        f_ids = [f"{file_id}_{idx}" for idx in range(len(docs))]
        return super().add_documents(docs, ids=f_ids)

    def get_ids_prefix(self, file_id: str) -> list[str]:
        # List every vector ID that belongs to the given file.
        prefix = file_id + "_"
        ids = []
        for results in self._index.list(prefix=prefix, namespace=self._namespace):
            ids.extend(results)
        return ids

    def get_all_ids(self) -> List[str]:
        # Recover the distinct file_ids by stripping the "_{idx}" suffix.
        base_ids = set()
        for ids in self._index.list(namespace=self._namespace):
            for vector_id in ids:
                split_index = vector_id.rfind("_")
                base_ids.add(vector_id[:split_index])
        return list(base_ids)

    def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
        id_list = []
        for file_id in ids:
            id_list.extend(self.get_ids_prefix(file_id))
        results = self._index.fetch(id_list, namespace=self._namespace)
        documents = []
        for vector_id in results["vectors"]:
            metadata = results["vectors"][vector_id]["metadata"]
            filtered_metadata = {
                "file_id": metadata["file_id"],
                "digest": metadata["digest"],
                "source": metadata["source"],
                "user_id": metadata["user_id"],
            }
            documents.append(
                Document(page_content=metadata["text"], metadata=filtered_metadata)
            )
        return documents

    def delete(self, ids: Optional[List[str]] = None) -> None:
        # Deletes by prefix: every vector whose ID starts with "{file_id}_" is removed.
        # Note that an overlapping file_id prefix would also match and be removed.
        if ids is None:
            return
        for file_id in ids:
            id_list = self.get_ids_prefix(file_id)
            self._index.delete(ids=id_list, namespace=self._namespace)

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Perform a similarity search with scores using an embedding vector."""
        query_results = self._index.query(
            vector=embedding,
            top_k=k,
            include_metadata=True,
            filter=filter,
            namespace=self._namespace,
            **kwargs,
        )
        processed_documents = []
        for match in query_results["matches"]:
            metadata = match["metadata"]
            if "metadata" in metadata and "_id" in metadata["metadata"]:
                del metadata["metadata"]["_id"]
            text = metadata["text"]
            del metadata["text"]
            processed_documents.append(
                (Document(page_content=text, metadata=metadata), match["score"])
            )
        return processed_documents
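
As a usage sketch (not part of the diff; `store` is assumed to be an `ExtendedPCVector` wired to a Pinecone index, and the file ID and metadata values are hypothetical), the `{file_id}_{idx}` ID scheme ties `add_documents`, `get_ids_prefix`, and `delete` together like this:

```python
from langchain_core.documents import Document

docs = [
    Document(page_content="chunk one",
             metadata={"file_id": "abc123", "digest": "d1", "source": "report.pdf", "user_id": "u1"}),
    Document(page_content="chunk two",
             metadata={"file_id": "abc123", "digest": "d1", "source": "report.pdf", "user_id": "u1"}),
]

# Vectors are stored as "abc123_0", "abc123_1", so every chunk shares the file_id prefix.
store.add_documents(docs, ids=["abc123"] * len(docs))

# Prefix listing recovers all chunk IDs for the file ...
chunk_ids = store.get_ids_prefix("abc123")

# ... and delete() removes them file by file using the same prefix.
store.delete(ids=["abc123"])
```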
31 changes: 27 additions & 4 deletions store_factory.py
@@ -1,16 +1,19 @@
from typing import Optional
from langchain_core.embeddings import Embeddings
from store import AsyncPgVector, ExtendedPgVector
from store import AsyncPgVector, ExtendedPCVector, ExtendedPgVector
from store import AtlasMongoVector
from pymongo import MongoClient

from pinecone import Pinecone
from pinecone import ServerlessSpec
import time

def get_vector_store(
    connection_string: str,
    embeddings: Embeddings,
    collection_name: str,
    mode: str = "sync",
    search_index: Optional[str] = None,
    api_key: Optional[str] = None,
):
    if mode == "sync":
        return ExtendedPgVector(
@@ -30,7 +33,24 @@ def get_vector_store(
        return AtlasMongoVector(
            collection=mong_collection, embedding=embeddings, index_name=search_index
        )

elif mode == "pinecone":
region = connection_string
index_name = collection_name
pc = Pinecone(api_key)
spec = ServerlessSpec(cloud="aws", region=region)
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
if index_name not in existing_indexes:
pc.create_index(
name=index_name,
dimension=get_dimension_size(embeddings),
metric="cosine",
spec=spec,
)
while not pc.describe_index(index_name).status["ready"]:
time.sleep(1)
host = pc.describe_index(index_name).host
index = pc.Index(host=host)
return ExtendedPCVector(index=index, embedding=embeddings)
    else:
        raise ValueError(
            "Invalid mode specified. Choose 'sync', 'async', 'atlas-mongo', or 'pinecone'."
        )
@@ -61,3 +81,6 @@ async def create_index_if_not_exists(conn, table_name: str, column_name: str):
print(f"Index {index_name} created on {table_name}.{column_name}")
else:
print(f"Index {index_name} already exists on {table_name}.{column_name}")

def get_dimension_size(embeddings: Embeddings) -> int:
    # Embed a short probe string once to discover the embedding dimension,
    # which is needed when creating the Pinecone index.
    return len(embeddings.embed_query("Dimensions"))
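
For illustration, a minimal sketch of how `get_dimension_size` feeds `create_index`; the `OpenAIEmbeddings` model here is only an assumed example provider:

```python
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embeds a single probe string and measures the vector length,
# e.g. 1536 for text-embedding-3-small.
dimension = get_dimension_size(embeddings)

# The Pinecone index is then created with exactly this dimension
# (see the "pinecone" branch of get_vector_store above).
```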