feat: Pinecone Vector Database #86

Open · wants to merge 2 commits into base: main
15 changes: 14 additions & 1 deletion README.md
@@ -83,6 +83,7 @@ The following environment variables are required to run the application:
- `AWS_DEFAULT_REGION`: (Optional) defaults to `us-east-1`
- `AWS_ACCESS_KEY_ID`: (Optional) needed for bedrock embeddings
- `AWS_SECRET_ACCESS_KEY`: (Optional) needed for bedrock embeddings
- `PINECONE_API_KEY`: (Optional) needed for the Pinecone vector database

Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables.

@@ -97,7 +98,7 @@ COLLECTION_NAME=<vector collection>
ATLAS_SEARCH_INDEX=<vector search index>
```

The `ATLAS_MONGO_DB_URI` could be the same as or different from the one used by LibreChat. Even if it is the same, the `$COLLECTION_NAME` collection needs to be a completely new one, separate from all collections used by LibreChat. In addition, create a vector search index for the collection above (remember to assign `$ATLAS_SEARCH_INDEX`) with the following JSON:

```json
{
@@ -118,6 +119,18 @@ The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by Lib

Follow one of the [four documented methods](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure) to create the vector index.

### Use Pinecone as Vector Database

As another vector database option, you can use [Pinecone](https://www.pinecone.io/). To do so, create a Pinecone account to obtain an API key, then set the following environment variables:

```env
VECTOR_DB_TYPE=pinecone
COLLECTION_NAME=<index name>
PINECONE_API_KEY=<api key>
AWS_DEFAULT_REGION=<region, defaults to 'us-east-1'>
```

A new index named `COLLECTION_NAME` will be created automatically if one does not already exist in your Pinecone account.
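
For reference, the startup check mirrors the standard Pinecone client flow. Below is a minimal sketch, assuming the `pinecone` Python client; the dimension and region values are placeholders and must match your embedding model and `AWS_DEFAULT_REGION`:

```python
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="<api key>")  # PINECONE_API_KEY
index_name = "<index name>"         # COLLECTION_NAME

# Create the index only if it does not already exist.
if index_name not in [info["name"] for info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1536,  # must match the embedding model's output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # AWS_DEFAULT_REGION
    )
```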

### Cloud Installation Settings:

13 changes: 13 additions & 0 deletions config.py
@@ -6,6 +6,8 @@
from enum import Enum
from datetime import datetime
from dotenv import find_dotenv, load_dotenv
from starlette.middleware.base import BaseHTTPMiddleware
from store_factory import get_vector_store

@@ -15,6 +17,7 @@
class VectorDBType(Enum):
    PGVECTOR = "pgvector"
    ATLAS_MONGO = "atlas-mongo"
    PINECONE = "pinecone"


class EmbeddingsProvider(Enum):
@@ -169,6 +172,7 @@ async def dispatch(self, request, call_next):
OLLAMA_BASE_URL = get_env_variable("OLLAMA_BASE_URL", "http://ollama:11434")
AWS_ACCESS_KEY_ID = get_env_variable("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = get_env_variable("AWS_SECRET_ACCESS_KEY", "")
PINECONE_API_KEY = get_env_variable("PINECONE_API_KEY", "")

## Embeddings

@@ -276,6 +280,15 @@ def init_embeddings(provider, model):
mode="atlas-mongo",
search_index=ATLAS_SEARCH_INDEX,
)
elif VECTOR_DB_TYPE == VectorDBType.PINECONE:
AWS_DEFAULT_REGION = get_env_variable("AWS_DEFAULT_REGION", "us-east-1")
vector_store = get_vector_store(
connection_string=AWS_DEFAULT_REGION,
embeddings=embeddings,
collection_name=COLLECTION_NAME,
mode="pinecone",
api_key=PINECONE_API_KEY,
)
else:
raise ValueError(f"Unsupported vector store type: {VECTOR_DB_TYPE}")

1 change: 1 addition & 0 deletions requirements.lite.txt
@@ -24,6 +24,7 @@ rapidocr-onnxruntime==1.3.24
opencv-python-headless==4.9.0.80
pymongo==4.6.3
langchain-mongodb==0.2.0
langchain-pinecone==0.2.0
cryptography==42.0.7
python-magic==0.4.27
python-pptx==0.6.23
1 change: 1 addition & 0 deletions requirements.txt
@@ -30,6 +30,7 @@ langchain-mongodb==0.2.0
langchain-ollama==0.2.0
langchain-openai==0.2.0
langchain-huggingface==0.1.0
langchain-pinecone==0.2.0
cryptography==42.0.7
python-magic==0.4.27
python-pptx==0.6.23
86 changes: 85 additions & 1 deletion store.py
@@ -4,7 +4,7 @@
from langchain_core.documents import Document
from langchain_core.runnables.config import run_in_executor
from sqlalchemy.orm import Session

from langchain_pinecone import PineconeVectorStore
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_core.embeddings import Embeddings
from typing import (
@@ -150,3 +150,87 @@ def delete(self, ids: Optional[list[str]] = None) -> None:
        # implement the deletion of documents by file_id in self._collection
        if ids is not None:
            self._collection.delete_many({"file_id": {"$in": ids}})


class ExtendedPCVector(PineconeVectorStore):
    @property
    def embedding_function(self) -> Embeddings:
        return self.embeddings

    def add_documents(self, docs: list[Document], ids: list[str]):
        # Vector IDs are stored as "{file_id}_{idx}" so that all chunks of a file
        # can later be listed and deleted by prefix.
        file_id = docs[0].metadata["file_id"]
        f_ids = [f"{file_id}_{idx}" for idx in range(len(docs))]
        return super().add_documents(docs, ids=f_ids)

    def get_ids_prefix(self, file_id: str) -> list[str]:
        # List every vector ID that belongs to the given file.
        prefix = file_id + "_"
        ids = []
        for results in self._index.list(prefix=prefix, namespace=self._namespace):
            ids.extend(results)
        return ids

    def get_all_ids(self) -> List[str]:
        # Recover the distinct file_ids by stripping the "_{idx}" suffix.
        base_ids = set()
        for ids in self._index.list(namespace=self._namespace):
            for vector_id in ids:
                split_index = vector_id.rfind("_")
                base_ids.add(vector_id[:split_index])
        return list(base_ids)

    def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
        id_list = []
        for file_id in ids:
            id_list.extend(self.get_ids_prefix(file_id))
        results = self._index.fetch(id_list, namespace=self._namespace)
        documents = []
        for vector_id in results["vectors"]:
            metadata = results["vectors"][vector_id]["metadata"]
            filtered_metadata = {
                "file_id": metadata["file_id"],
                "digest": metadata["digest"],
                "source": metadata["source"],
                "user_id": metadata["user_id"],
            }
            documents.append(
                Document(page_content=metadata["text"], metadata=filtered_metadata)
            )
        return documents

    def delete(self, ids: Optional[List[str]] = None) -> None:
        # Deletes by prefix: every vector whose ID starts with "{file_id}_" is removed.
        # Note that an overlapping file_id prefix would also match and be removed.
        if ids is None:
            return
        for file_id in ids:
            id_list = self.get_ids_prefix(file_id)
            self._index.delete(ids=id_list, namespace=self._namespace)

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Perform a similarity search with scores using an embedding vector."""
        query_results = self._index.query(
            vector=embedding,
            top_k=k,
            include_metadata=True,
            filter=filter,
            namespace=self._namespace,
            **kwargs,
        )
        processed_documents = []
        for match in query_results["matches"]:
            metadata = match["metadata"]
            if "metadata" in metadata and "_id" in metadata["metadata"]:
                del metadata["metadata"]["_id"]
            text = metadata["text"]
            del metadata["text"]
            processed_documents.append(
                (Document(page_content=text, metadata=metadata), match["score"])
            )
        return processed_documents
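
As a usage sketch (not part of the diff; `store` is assumed to be an `ExtendedPCVector` wired to a Pinecone index, and the file ID and metadata values are hypothetical), the `{file_id}_{idx}` ID scheme ties `add_documents`, `get_ids_prefix`, and `delete` together like this:

```python
from langchain_core.documents import Document

docs = [
    Document(page_content="chunk one",
             metadata={"file_id": "abc123", "digest": "d1", "source": "report.pdf", "user_id": "u1"}),
    Document(page_content="chunk two",
             metadata={"file_id": "abc123", "digest": "d1", "source": "report.pdf", "user_id": "u1"}),
]

# Vectors are stored as "abc123_0", "abc123_1", so every chunk shares the file_id prefix.
store.add_documents(docs, ids=["abc123"] * len(docs))

# Prefix listing recovers all chunk IDs for the file ...
chunk_ids = store.get_ids_prefix("abc123")

# ... and delete() removes them file by file using the same prefix.
store.delete(ids=["abc123"])
```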
31 changes: 27 additions & 4 deletions store_factory.py
@@ -1,16 +1,19 @@
from typing import Optional
from langchain_core.embeddings import Embeddings
from store import AsyncPgVector, ExtendedPgVector
from store import AsyncPgVector, ExtendedPCVector, ExtendedPgVector
from store import AtlasMongoVector
from pymongo import MongoClient

from pinecone import Pinecone
from pinecone import ServerlessSpec
import time

def get_vector_store(
    connection_string: str,
    embeddings: Embeddings,
    collection_name: str,
    mode: str = "sync",
    search_index: Optional[str] = None,
    api_key: Optional[str] = None,
):
    if mode == "sync":
        return ExtendedPgVector(
@@ -30,7 +33,24 @@ def get_vector_store(
        return AtlasMongoVector(
            collection=mong_collection, embedding=embeddings, index_name=search_index
        )

elif mode == "pinecone":
region = connection_string
index_name = collection_name
pc = Pinecone(api_key)
spec = ServerlessSpec(cloud="aws", region=region)
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
if index_name not in existing_indexes:
pc.create_index(
name=index_name,
dimension=get_dimension_size(embeddings),
metric="cosine",
spec=spec,
)
while not pc.describe_index(index_name).status["ready"]:
time.sleep(1)
host = pc.describe_index(index_name).host
index = pc.Index(host=host)
return ExtendedPCVector(index=index, embedding=embeddings)
    else:
        raise ValueError(
            "Invalid mode specified. Choose 'sync', 'async', 'atlas-mongo', or 'pinecone'."
        )
@@ -61,3 +81,6 @@ async def create_index_if_not_exists(conn, table_name: str, column_name: str):
print(f"Index {index_name} created on {table_name}.{column_name}")
else:
print(f"Index {index_name} already exists on {table_name}.{column_name}")

def get_dimension_size(embeddings: Embeddings) -> int:
    # Embed a short probe string once to discover the embedding dimension,
    # which is needed when creating the Pinecone index.
    return len(embeddings.embed_query("Dimensions"))
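
For illustration, a minimal sketch of how `get_dimension_size` feeds `create_index`; the `OpenAIEmbeddings` model here is only an assumed example provider:

```python
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embeds a single probe string and measures the vector length,
# e.g. 1536 for text-embedding-3-small.
dimension = get_dimension_size(embeddings)

# The Pinecone index is then created with exactly this dimension
# (see the "pinecone" branch of get_vector_store above).
```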