ingest_data.py
"""Ingest local data files into a FAISS vector database.

Walks a directory of .pdf, .html, .txt, and .json files, splits them into
chunks, embeds the chunks with a sentence-transformers model, and saves the
resulting FAISS index to disk.
"""

import os

from langchain.document_loaders import (
    BSHTMLLoader,
    JSONLoader,
    PyPDFLoader,
    TextLoader,
)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

print("Please make sure you have a HUGGINGFACEHUB_API_TOKEN environment variable set in your system\n")
DATA_PATH = input("Please provide the directory containing the data files to embed and store in a vector database: \n")
DB_FAISS_PATH = input("Please provide the directory where the FAISS vector database should be saved: \n")

# Read the token up front so the script fails fast (KeyError) if it is missing.
huggingfacehub_api_token = os.environ["HUGGINGFACEHUB_API_TOKEN"]


def create_vector_db():
    documents = []
    processed_htmls = 0
    processed_pdfs = 0
    processed_txts = 0
    processed_jsons = 0

    for f in os.listdir(DATA_PATH):
        # os.path.join handles a missing trailing slash on DATA_PATH.
        file_path = os.path.join(DATA_PATH, f)
        try:
            if f.endswith(".pdf"):
                loader = PyPDFLoader(file_path)
                documents.extend(loader.load())
                processed_pdfs += 1
            elif f.endswith(".html"):
                loader = BSHTMLLoader(file_path)
                documents.extend(loader.load())
                processed_htmls += 1
            elif f.endswith(".txt"):
                loader = TextLoader(file_path)
                documents.extend(loader.load())
                processed_txts += 1
            elif f.endswith(".json"):
                # JSONLoader requires a jq schema (and the jq package installed);
                # "." ingests each file as a whole document.
                loader = JSONLoader(file_path, jq_schema=".", text_content=False)
                documents.extend(loader.load())
                processed_jsons += 1
        except Exception as e:
            print("Issue with", f, ":", e)

    print("Processed", processed_txts, "text files,", processed_jsons,
          "json files,", processed_htmls, "html files, and",
          processed_pdfs, "pdf files")

    # Split the documents into overlapping chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    # Embed each chunk locally on CPU with a small sentence-transformers model.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
    )

    # Build the FAISS index from the embedded chunks and persist it to disk.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)


if __name__ == "__main__":
    create_vector_db()
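
# A minimal follow-up sketch: loading the saved index back for a similarity
# search. It assumes the same embedding model used above; the query string is
# a made-up example, and newer LangChain releases may also require passing
# allow_dangerous_deserialization=True to FAISS.load_local.
#
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS
#
# embeddings = HuggingFaceEmbeddings(
#     model_name="sentence-transformers/all-MiniLM-L6-v2",
#     model_kwargs={"device": "cpu"},
# )
# db = FAISS.load_local(DB_FAISS_PATH, embeddings)
# for doc in db.similarity_search("example query about the ingested data", k=3):
#     print(doc.page_content[:200])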