-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5e930b2
commit 3488725
Showing
7 changed files
with
88 additions
and
149 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# ========================= | ||
# Vector DB Build | ||
# Author: Kenneth Leung | ||
# ========================= | ||
import box | ||
import yaml | ||
from langchain.vectorstores import Chroma, FAISS | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from langchain.document_loaders import PyPDFLoader, DirectoryLoader | ||
# from langchain.embeddings import HuggingFaceInstructEmbeddings | ||
from langchain.embeddings import HuggingFaceEmbeddings | ||
|
||
# Read the project settings from YAML and expose them on `cfg`
# (the rest of this script reads them as attributes, e.g. cfg.DATA_PATH).
with open('config/config.yml', mode='r', encoding='utf8') as ymlfile:
    _raw_cfg = yaml.safe_load(ymlfile)
cfg = box.Box(_raw_cfg)
|
||
# See for more info: https://huggingface.co/hkunlp/instructor-xl | ||
# EMBED_MODEL = 'hkunlp/instructor-large' # or 'hkunlp/instructor-xl' | ||
|
||
# Build vector database | ||
def build_db(vectorstore='FAISS'):
    """Build and persist a vector database from the PDFs under ``cfg.DATA_PATH``.

    The PDFs are loaded, split into chunks of ``cfg.CHUNK_SIZE`` characters
    with ``cfg.CHUNK_OVERLAP`` overlap, embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model on CPU, and written to
    the selected backend.

    Args:
        vectorstore: Backend to build. ``'FAISS'`` saves the index to
            ``cfg.DB_FAISS_PATH``; ``'Chroma'`` persists the collection to
            ``cfg.DB_CHROMA_PATH``.

    Raises:
        ValueError: If ``vectorstore`` is not ``'Chroma'`` or ``'FAISS'``.
    """
    # Validate the selection first so that a typo fails immediately instead
    # of after the PDFs have been loaded and the embedding model initialised.
    if vectorstore not in ('Chroma', 'FAISS'):
        raise ValueError(
            f"Error in DB selection: {vectorstore!r} "
            "(expected 'Chroma' or 'FAISS')"
        )

    loader = DirectoryLoader(cfg.DATA_PATH,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=cfg.CHUNK_SIZE,
        chunk_overlap=cfg.CHUNK_OVERLAP,
    )
    texts = text_splitter.split_documents(documents)

    # Alternatives that were tried here previously:
    #   - "sentence-transformers/all-mpnet-base-v2"
    #   - HuggingFaceInstructEmbeddings with an 'hkunlp/instructor-*' model
    #     and model_kwargs={"device": "cuda"}
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    model_kwargs = {'device': 'cpu'}
    embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                       model_kwargs=model_kwargs)

    # Build the selected DB. The result is bound to its own local name so the
    # `vectorstore` argument is not overwritten by the index object.
    if vectorstore == 'Chroma':
        vectordb = Chroma.from_documents(documents=texts,
                                         embedding=embeddings,
                                         persist_directory=cfg.DB_CHROMA_PATH)
        vectordb.persist()
        print('Chroma Vectorstore - Build Complete')
    else:  # 'FAISS' -- guaranteed by the validation above
        vectordb = FAISS.from_documents(texts, embeddings)
        vectordb.save_local(cfg.DB_FAISS_PATH)
        print('FAISS Vectorstore - Build Complete')
|
||
# Script entry point: build the vector DB with the default backend.
if __name__ == '__main__':
    build_db()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters