15 changes: 14 additions & 1 deletion backend/Dockerfile
@@ -6,20 +6,33 @@ EXPOSE 8000
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libmagic1 \
        libgl1-mesa-glx \
        libgl1 \
        libglx-mesa0 \
        libreoffice \
        cmake \
        poppler-utils \
        tesseract-ocr && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Set LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
# Copy requirements file and install Python dependencies
COPY requirements.txt constraints.txt /code/
# --no-cache-dir --upgrade
RUN pip install --upgrade pip
RUN pip install -r requirements.txt -c constraints.txt

RUN python -c "from transformers import AutoTokenizer, AutoModel; \
    name='sentence-transformers/all-MiniLM-L6-v2'; \
    tok=AutoTokenizer.from_pretrained(name); \
    mod=AutoModel.from_pretrained(name); \
    tok.save_pretrained('./local_model'); \
    mod.save_pretrained('./local_model')"

RUN python -m nltk.downloader -d /usr/local/nltk_data punkt
RUN python -m nltk.downloader -d /usr/local/nltk_data averaged_perceptron_tagger

# Copy application code
COPY . /code
# Set command
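Reviewer note: these build steps bake the MiniLM weights into ./local_model and the punkt/tagger data into /usr/local/nltk_data at image build time, so containers can start without fetching anything from the network. A minimal smoke test of that assumption (TRANSFORMERS_OFFLINE is a standard Hugging Face switch; the paths match the Dockerfile above):

# Run inside the built image: confirms the baked-in model loads offline.
import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"  # fail fast instead of falling back to the Hub

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./local_model")
model = AutoModel.from_pretrained("./local_model")
print("loaded", model.config.model_type, "hidden size", model.config.hidden_size)  # bert, 384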
4 changes: 2 additions & 2 deletions backend/requirements.txt
@@ -53,12 +53,12 @@ wrapt==1.17.2
yarl==1.20.1
youtube-transcript-api==1.1.0
zipp==3.23.0
sentence-transformers==4.1.0
sentence-transformers==5.0.0
google-cloud-logging==3.12.1
pypandoc==1.15
graphdatascience==1.15.1
Secweb==1.18.1
ragas==0.2.15
ragas==0.3.1
rouge_score==0.1.2
langchain-neo4j==0.4.0
pypandoc-binary==1.15
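Reviewer note: sentence-transformers 4.x to 5.x and ragas 0.2.x to 0.3.x both cross version boundaries that can change APIs, so a stale environment tends to fail in confusing ways at runtime. An illustrative startup guard (not part of this PR):

# Fail early if the installed packages do not match the new pins.
from importlib.metadata import version

for package, pinned in {"sentence-transformers": "5.0.0", "ragas": "0.3.1"}.items():
    installed = version(package)
    assert installed == pinned, f"{package}: expected {pinned}, got {installed}"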
4 changes: 2 additions & 2 deletions backend/src/QA_integration.py
@@ -38,7 +38,6 @@
load_dotenv()

EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
EMBEDDING_FUNCTION , _ = load_embedding_model(EMBEDDING_MODEL)

class SessionChatHistory:
history_dict = {}
@@ -304,6 +303,7 @@ def create_document_retriever_chain(llm, retriever):
    output_parser = StrOutputParser()

    splitter = TokenTextSplitter(chunk_size=CHAT_DOC_SPLIT_SIZE, chunk_overlap=0)
    EMBEDDING_FUNCTION , _ = load_embedding_model(EMBEDDING_MODEL)
    embeddings_filter = EmbeddingsFilter(
        embeddings=EMBEDDING_FUNCTION,
        similarity_threshold=CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD
@@ -344,7 +344,7 @@ def initialize_neo4j_vector(graph, chat_mode_settings):

    if not retrieval_query or not index_name:
        raise ValueError("Required settings 'retrieval_query' or 'index_name' are missing.")

    EMBEDDING_FUNCTION , _ = load_embedding_model(EMBEDDING_MODEL)
    if keyword_index:
        neo_db = Neo4jVector.from_existing_graph(
            embedding=EMBEDDING_FUNCTION,
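Reviewer note: moving load_embedding_model out of module scope means importing QA_integration no longer initializes an embedding model; each function now loads it on demand. If repeated calls become a concern, the same laziness can be kept while paying the load cost only once, as in this sketch (hypothetical helper; the import path assumes the backend's usual layout):

from functools import lru_cache

from src.shared.common_fn import load_embedding_model

@lru_cache(maxsize=1)
def get_embedding_function(model_name):
    # Loaded on first call, cached for every later call in the process.
    embedding_function, _ = load_embedding_model(model_name)
    return embedding_function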
84 changes: 48 additions & 36 deletions backend/src/document_sources/gcs_bucket.py
@@ -46,46 +46,58 @@ def gcs_loader_func(file_path):
    return loader

def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    if gcs_bucket_folder is not None and gcs_bucket_folder.strip()!="":
        if gcs_bucket_folder.endswith('/'):
            blob_name = gcs_bucket_folder+gcs_blob_filename

    nltk.data.path.append("/usr/local/nltk_data")
    nltk.data.path.append(os.path.expanduser("~/.nltk_data"))
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        for resource in ["punkt", "averaged_perceptron_tagger"]:
            try:
                nltk.data.find(f"tokenizers/{resource}" if resource == "punkt" else f"taggers/{resource}")
            except LookupError:
                logging.info(f"Downloading NLTK resource: {resource}")
                nltk.download(resource, download_dir=os.path.expanduser("~/.nltk_data"))

    logging.info("NLTK resources downloaded successfully.")
    if gcs_bucket_folder is not None and gcs_bucket_folder.strip()!="":
        if gcs_bucket_folder.endswith('/'):
            blob_name = gcs_bucket_folder+gcs_blob_filename
        else:
            blob_name = gcs_bucket_folder+'/'+gcs_blob_filename
        else:
            blob_name = gcs_bucket_folder+'/'+gcs_blob_filename
    else:
        blob_name = gcs_blob_filename

    logging.info(f"GCS project_id : {gcs_project_id}")

    if access_token is None:
        storage_client = storage.Client(project=gcs_project_id)
        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        blob_name = gcs_blob_filename

        if blob.exists():
            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
            pages = loader.load()
        else :
            raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
    else:
        creds= Credentials(access_token)
        storage_client = storage.Client(project=gcs_project_id, credentials=creds)
    logging.info(f"GCS project_id : {gcs_project_id}")

        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        if blob.exists():
            content = blob.download_as_bytes()
            pdf_file = io.BytesIO(content)
            pdf_reader = PdfReader(pdf_file)
            # Extract text from all pages
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
            pages = [Document(page_content = text)]
    if access_token is None:
        storage_client = storage.Client(project=gcs_project_id)
        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)

        if blob.exists():
            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
            pages = loader.load()
        else :
            raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
    else:
        raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}')
    return gcs_blob_filename, pages
        creds= Credentials(access_token)
        storage_client = storage.Client(project=gcs_project_id, credentials=creds)

        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        if blob.exists():
            content = blob.download_as_bytes()
            pdf_file = io.BytesIO(content)
            pdf_reader = PdfReader(pdf_file)
            # Extract text from all pages
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
            pages = [Document(page_content = text)]
        else:
            raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}')
    return gcs_blob_filename, pages

def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name, folder_name_sha1_hashed):
    try:
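Reviewer note: the new logic checks nltk.data.find() before downloading, so warm containers skip the network entirely. The same pattern factored into a reusable helper, as a sketch (the PR inlines this logic instead):

import logging
import os

import nltk

def ensure_nltk_resource(resource, category):
    # Look the resource up on nltk.data.path; download only on a miss.
    try:
        nltk.data.find(f"{category}/{resource}")
    except LookupError:
        logging.info("Downloading NLTK resource: %s", resource)
        nltk.download(resource, download_dir=os.path.expanduser("~/.nltk_data"))

ensure_nltk_resource("punkt", "tokenizers")
ensure_nltk_resource("averaged_perceptron_tagger", "taggers")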
4 changes: 2 additions & 2 deletions backend/src/make_relationships.py
@@ -12,7 +12,6 @@
logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')

EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
EMBEDDING_FUNCTION , EMBEDDING_DIMENSION = load_embedding_model(EMBEDDING_MODEL)

def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
    batch_data = []
@@ -41,7 +40,7 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name):
    isEmbedding = os.getenv('IS_EMBEDDING')

    embeddings, dimension = EMBEDDING_FUNCTION , EMBEDDING_DIMENSION
    embeddings, dimension = load_embedding_model(EMBEDDING_MODEL)
    logging.info(f'embedding model:{embeddings} and dimension:{dimension}')
    data_for_query = []
    logging.info(f"update embedding and vector index for chunks")
@@ -161,6 +160,7 @@ def create_chunk_vector_index(graph):
    vector_index_query = "SHOW INDEXES YIELD name, type, labelsOrTypes, properties WHERE name = 'vector' AND type = 'VECTOR' AND 'Chunk' IN labelsOrTypes AND 'embedding' IN properties RETURN name"
    vector_index = execute_graph_query(graph,vector_index_query)
    if not vector_index:
        EMBEDDING_FUNCTION , EMBEDDING_DIMENSION = load_embedding_model(EMBEDDING_MODEL)
        vector_store = Neo4jVector(embedding=EMBEDDING_FUNCTION,
                                   graph=graph,
                                   node_label="Chunk",
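Reviewer note: load_embedding_model is now called only on the miss path, so the model loads just when the index actually has to be created. For reference, roughly the index that Neo4jVector sets up, expressed as a direct query in a sketch (the exact options are chosen by langchain-neo4j; 384 matches the local MiniLM model's dimension):

# Hypothetical equivalent of the index creation performed by Neo4jVector.
create_index_query = """
CREATE VECTOR INDEX vector IF NOT EXISTS
FOR (c:Chunk) ON (c.embedding)
OPTIONS {indexConfig: {
    `vector.dimensions`: 384,
    `vector.similarity_function`: 'cosine'
}}
"""
# execute_graph_query(graph, create_index_query)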
8 changes: 7 additions & 1 deletion backend/src/ragas_eval.py
@@ -13,7 +13,13 @@
from ragas.embeddings import LangchainEmbeddingsWrapper
import nltk

nltk.download('punkt')
nltk.data.path.append("/usr/local/nltk_data")
nltk.data.path.append(os.path.expanduser("~/.nltk_data"))
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", download_dir=os.path.expanduser("~/.nltk_data"))

load_dotenv()

EMBEDDING_MODEL = os.getenv("RAGAS_EMBEDDING_MODEL")
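Reviewer note: ragas needs punkt for sentence splitting, and the guard above only downloads it when the lookup misses. A quick check that the tokenizer resolves from the configured paths without re-downloading (sketch):

import nltk
from nltk.tokenize import sent_tokenize

nltk.data.path.append("/usr/local/nltk_data")  # same path the Dockerfile pre-populates
print(sent_tokenize("Ragas splits answers into sentences. That lookup needs punkt."))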
44 changes: 40 additions & 4 deletions backend/src/shared/common_fn.py
@@ -1,7 +1,10 @@
import hashlib
import os
from transformers import AutoTokenizer, AutoModel
from langchain_huggingface import HuggingFaceEmbeddings
from threading import Lock
import logging
from src.document_sources.youtube import create_youtube_url
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_neo4j import Neo4jGraph
@@ -16,6 +19,40 @@
import boto3
from langchain_community.embeddings import BedrockEmbeddings

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
MODEL_PATH = "./local_model"
_lock = Lock()
_embedding_instance = None

def ensure_sentence_transformer_model_downloaded():
    if os.path.isdir(MODEL_PATH):
        print("Model already downloaded at:", MODEL_PATH)
        return
    else:
        print("Downloading model to:", MODEL_PATH)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME)
        tokenizer.save_pretrained(MODEL_PATH)
        model.save_pretrained(MODEL_PATH)
        print("Model downloaded and saved.")

def get_local_sentence_transformer_embedding():
    """
    Lazy, threadsafe singleton. Caller does not need to worry about
    import-time initialization or download race.
    """
    global _embedding_instance
    if _embedding_instance is not None:
        return _embedding_instance
    with _lock:
        if _embedding_instance is not None:
            return _embedding_instance
        # Ensure model is present before instantiating
        ensure_sentence_transformer_model_downloaded()
        _embedding_instance = HuggingFaceEmbeddings(model_name=MODEL_PATH)
        print("Embedding model initialized.")
        return _embedding_instance

def check_url_source(source_type, yt_url:str=None, wiki_query:str=None):
    language=''
    try:
@@ -85,9 +122,8 @@ def load_embedding_model(embedding_model_name: str):
        dimension = 1536
        logging.info(f"Embedding: Using bedrock titan Embeddings , Dimension:{dimension}")
    else:
        embeddings = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2"#, cache_folder="/embedding_model"
        )
        # embeddings = HuggingFaceEmbeddings(model_name="./local_model")
        embeddings = get_local_sentence_transformer_embedding()
        dimension = 384
        logging.info(f"Embedding: Using Langchain HuggingFaceEmbeddings , Dimension:{dimension}")
    return embeddings, dimension
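Reviewer note: the double-checked lock in get_local_sentence_transformer_embedding means the model is loaded at most once per process, even when several threads hit the cold path together. A usage sketch (assuming the module path used elsewhere in the backend):

from concurrent.futures import ThreadPoolExecutor

from src.shared.common_fn import get_local_sentence_transformer_embedding

with ThreadPoolExecutor(max_workers=4) as pool:
    instances = list(pool.map(lambda _: get_local_sentence_transformer_embedding(), range(4)))

assert all(instance is instances[0] for instance in instances)  # one shared singleton
print(instances[0].embed_query("hello")[:3])  # first values of a 384-dim vector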
8 changes: 2 additions & 6 deletions docker-compose.yml
@@ -7,13 +7,9 @@ services:
      dockerfile: Dockerfile
    volumes:
      - ./backend:/code
    env_file:
      - ./backend/.env
    environment:
      - NEO4J_URI=${NEO4J_URI-neo4j://database:7687}
      - NEO4J_PASSWORD=${NEO4J_PASSWORD-password}
      - NEO4J_USERNAME=${NEO4J_USERNAME-neo4j}
      - OPENAI_API_KEY=${OPENAI_API_KEY-}
      - DIFFBOT_API_KEY=${DIFFBOT_API_KEY-}
      - EMBEDDING_MODEL=${EMBEDDING_MODEL-all-MiniLM-L6-v2}
      - LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-}
      - LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-}
      - LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT-}
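Reviewer note: with the Neo4j, OpenAI, and Diffbot settings moved out of the compose file, they must now come from backend/.env. A hedged startup check under that assumption (variable names taken from the old environment list; python-dotenv is already used by the backend):

import os
from dotenv import load_dotenv

load_dotenv()  # reads backend/.env when run from the backend directory

required = ["NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD", "EMBEDDING_MODEL"]
missing = [name for name in required if not os.getenv(name)]
if missing:
    raise RuntimeError(f"missing environment variables: {missing}")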