Staging #1245

Merged: 22 commits, Apr 21, 2025

Changes from all commits
3 changes: 2 additions & 1 deletion .gitignore
@@ -172,4 +172,5 @@ google-cloud-cli-linux-x86_64.tar.gz
.vennv
newenv
files

startupbackend.sh
startupfrontend.sh
317 changes: 168 additions & 149 deletions README.md

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions backend/example.env
@@ -31,16 +31,19 @@ DEFAULT_DIFFBOT_CHAT_MODEL="openai_gpt_4o" #whichever model specified here , ne
LLM_MODEL_CONFIG_openai_gpt_3.5="gpt-3.5-turbo-0125,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4o_mini="gpt-4o-mini-2024-07-18,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4o="gpt-4o-2024-11-20,openai_api_key"
LLM_MODEL_CONFIG_openai-gpt-o3-mini="o3-mini-2025-01-31,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4.1_mini="gpt-4.1-mini,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_o3_mini="o3-mini-2025-01-31,openai_api_key"
LLM_MODEL_CONFIG_gemini_1.5_pro="gemini-1.5-pro-002"
LLM_MODEL_CONFIG_gemini_1.5_flash="gemini-1.5-flash-002"
LLM_MODEL_CONFIG_gemini_2.0_flash="gemini-2.0-flash-001"
LLM_MODEL_CONFIG_gemini_2.5_pro="gemini-2.5-pro-exp-03-25"
LLM_MODEL_CONFIG_diffbot="diffbot,diffbot_api_key"
LLM_MODEL_CONFIG_azure_ai_gpt_35="azure_deployment_name,azure_endpoint or base_url,azure_api_key,api_version"
LLM_MODEL_CONFIG_azure_ai_gpt_4o="gpt-4o,https://YOUR-ENDPOINT.openai.azure.com/,azure_api_key,api_version"
LLM_MODEL_CONFIG_groq_llama3_70b="model_name,base_url,groq_api_key"
LLM_MODEL_CONFIG_anthropic_claude_3_5_sonnet="model_name,anthropic_api_key"
LLM_MODEL_CONFIG_fireworks_llama_v3_70b="model_name,fireworks_api_key"
LLM_MODEL_CONFIG_fireworks_llama4_maverick="model_name,fireworks_api_key"
LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet="model_name,aws_access_key_id,aws_secret__access_key,region_name"
LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
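Each LLM_MODEL_CONFIG_* value packs a model name and its credentials into a single comma-separated string. A minimal sketch of how such a value can be unpacked (the helper name is hypothetical, not code from this repository):

    import os

    def parse_llm_model_config(env_key: str):
        """Hypothetical illustration: unpack 'model_name,credential[,more,...]'."""
        raw = os.environ.get(env_key, "")
        parts = [p.strip() for p in raw.split(",")]
        if not parts or not parts[0]:
            raise ValueError(f"{env_key} is unset or empty")
        model_name, *credentials = parts
        return model_name, credentials

    # LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai_api_key"
    # -> ("gpt-4.1", ["openai_api_key"])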
85 changes: 43 additions & 42 deletions backend/requirements.txt
@@ -1,63 +1,64 @@
accelerate==1.6.0
asyncio==3.4.3
boto3==1.36.2
botocore==1.36.2
certifi==2024.8.30
fastapi==0.115.6
boto3==1.37.29
botocore==1.37.29
certifi==2025.1.31
fastapi==0.115.11
fastapi-health==0.4.0
google-api-core==2.24.0
google-auth==2.37.0
google-api-core==2.24.2
google-auth==2.38.0
google_auth_oauthlib==1.2.1
google-cloud-core==2.4.1
json-repair==0.30.2
google-cloud-core==2.4.3
json-repair==0.39.1
pip-install==1.3.5
langchain==0.3.15
langchain-aws==0.2.11
langchain-anthropic==0.3.3
langchain-fireworks==0.2.6
langchain-community==0.3.15
langchain-core==0.3.31
langchain==0.3.23
langchain-aws==0.2.18
langchain-anthropic==0.3.9
langchain-fireworks==0.2.9
langchain-community==0.3.19
langchain-core==0.3.51
langchain-experimental==0.3.4
langchain-google-vertexai==2.0.11
langchain-groq==0.2.3
langchain-openai==0.3.1
langchain-text-splitters==0.3.5
langchain-google-vertexai==2.0.19
langchain-groq==0.2.5
langchain-openai==0.3.12
langchain-text-splitters==0.3.8
langchain-huggingface==0.1.2
langdetect==1.0.9
langsmith==0.2.11
langsmith==0.3.26
langserve==0.3.1
neo4j-rust-ext
nltk==3.9.1
openai==1.59.9
opencv-python==4.10.0.84
psutil==6.1.0
pydantic==2.9.2
openai==1.71.0
opencv-python==4.11.0.86
psutil==7.0.0
pydantic==2.10.6
python-dotenv==1.0.1
python-magic==0.4.27
PyPDF2==3.0.1
PyMuPDF==1.24.14
starlette==0.41.3
sse-starlette==2.1.3
PyMuPDF==1.25.5
starlette==0.46.1
sse-starlette==2.2.1
starlette-session==0.4.3
tqdm==4.67.1
unstructured[all-docs]
unstructured==0.16.11
unstructured-client==0.28.1
unstructured-inference==0.8.1
urllib3==2.2.2
uvicorn==0.32.1
unstructured==0.17.2
unstructured-client==0.32.3
unstructured-inference==0.8.10
urllib3==2.3.0
uvicorn==0.34.0
gunicorn==23.0.0
wikipedia==1.4.0
wrapt==1.16.0
yarl==1.9.4
youtube-transcript-api==0.6.3
zipp==3.17.0
sentence-transformers==3.3.1
google-cloud-logging==3.11.3
pypandoc==1.13
graphdatascience==1.12
Secweb==1.11.0
ragas==0.2.11
wrapt==1.17.2
yarl==1.18.3
youtube-transcript-api==1.0.3
zipp==3.21.0
sentence-transformers==4.0.2
google-cloud-logging==3.11.4
pypandoc==1.15
graphdatascience==1.14
Secweb==1.18.1
ragas==0.2.14
rouge_score==0.1.2
langchain-neo4j==0.3.0
langchain-neo4j==0.4.0
pypandoc-binary==1.15
chardet==5.2.0
21 changes: 12 additions & 9 deletions backend/score.py
@@ -576,16 +576,20 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
        result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, uri, CHUNK_DIR, MERGED_DIR)
        end = time.time()
        elapsed_time = end - start
        json_obj = {'api_name':'upload','db_url':uri,'userName':userName, 'database':database, 'chunkNumber':chunkNumber,'totalChunks':totalChunks,
                    'original_file_name':originalname,'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
        logger.log_struct(json_obj, "INFO")
        if int(chunkNumber) == int(totalChunks):
            return create_api_response('Success',data=result, message='Source Node Created Successfully')
        else:
            return create_api_response('Success', message=result)
    except Exception as e:
        message="Unable to upload large file into chunks. "
        message="Unable to upload file in chunks. "
        error_message = str(e)
        graph = create_graph_database_connection(uri, userName, password, database)
        graphDb_data_Access = graphDBdataAccess(graph)
        graphDb_data_Access.update_exception_db(originalname,error_message)
        logging.info(message)
        logging.exception(f'Exception:{error_message}')
        return create_api_response('Failed', message=message + error_message[:100], error=error_message, file_name = originalname)
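For orientation, a hedged sketch of how a client might drive this chunked endpoint. The /upload route is inferred from the api_name field above; the requests dependency and the helper name are assumptions for illustration, not code from this PR:

    import math
    import os
    import requests  # assumption: a client-side dependency, not from this repo

    def upload_in_chunks(path, api_base, chunk_size=5 * 1024 * 1024, **form_fields):
        """Hypothetical client: POST a file piece by piece to the handler above."""
        total = math.ceil(os.path.getsize(path) / chunk_size)
        with open(path, "rb") as f:
            for number in range(1, total + 1):
                resp = requests.post(
                    f"{api_base}/upload",  # route inferred from api_name 'upload'
                    files={"file": (os.path.basename(path), f.read(chunk_size))},
                    data={"chunkNumber": number, "totalChunks": total, **form_fields},
                )
                resp.raise_for_status()
        # Only the final chunk triggers 'Source Node Created Successfully'.
        return resp.json()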
@@ -596,8 +600,7 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
async def get_structured_schema(uri=Form(None), userName=Form(None), password=Form(None), database=Form(None),email=Form(None)):
    try:
        start = time.time()
        graph = create_graph_database_connection(uri, userName, password, database)
        result = await asyncio.to_thread(get_labels_and_relationtypes, graph)
        result = await asyncio.to_thread(get_labels_and_relationtypes, uri, userName, password, database)
        end = time.time()
        elapsed_time = end - start
        logging.info(f'Schema result from DB: {result}')
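The handler now passes raw credentials instead of a pre-built graph object, implying the helper opens its own connection per request. A minimal sketch of that shape, under the assumption that the real get_labels_and_relationtypes differs in detail:

    from langchain_neo4j import Neo4jGraph

    def get_labels_and_relationtypes(uri, userName, password, database):
        # Sketch only: create a dedicated connection instead of sharing the caller's.
        graph = Neo4jGraph(url=uri, username=userName, password=password, database=database)
        labels = graph.query("CALL db.labels() YIELD label RETURN collect(label) AS labels")
        rels = graph.query("CALL db.relationshipTypes() YIELD relationshipType "
                           "RETURN collect(relationshipType) AS types")
        return {"labels": labels[0]["labels"], "relationshipTypes": rels[0]["types"]}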
@@ -764,10 +767,10 @@ async def cancelled_job(uri=Form(None), userName=Form(None), password=Form(None)
gc.collect()

@app.post("/populate_graph_schema")
async def populate_graph_schema(input_text=Form(None), model=Form(None), is_schema_description_checked=Form(None),email=Form(None)):
async def populate_graph_schema(input_text=Form(None), model=Form(None), is_schema_description_checked=Form(None),is_local_storage=Form(None),email=Form(None)):
    try:
        start = time.time()
        result = populate_graph_schema_from_text(input_text, model, is_schema_description_checked)
        result = populate_graph_schema_from_text(input_text, model, is_schema_description_checked, is_local_storage)
        end = time.time()
        elapsed_time = end - start
        json_obj = {'api_name':'populate_graph_schema', 'model':model, 'is_schema_description_checked':is_schema_description_checked, 'input_text':input_text, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
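A hedged client-side sketch of the updated endpoint call; host, port, and field values are placeholders, and the requests dependency is an assumption for illustration:

    import requests  # assumption: not part of the backend requirements

    # Field names mirror the updated handler signature above.
    resp = requests.post(
        "http://localhost:8000/populate_graph_schema",  # placeholder host/port
        data={
            "input_text": "Employees work for companies in cities.",
            "model": "openai_gpt_4o",
            "is_schema_description_checked": "true",
            "is_local_storage": "false",  # the new form field in this PR
            "email": "user@example.com",
        },
    )
    print(resp.json())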
@@ -894,7 +897,7 @@ async def retry_processing(uri=Form(None), userName=Form(No
    try:
        start = time.time()
        graph = create_graph_database_connection(uri, userName, password, database)
        chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name})
        chunks = execute_graph_query(graph,QUERY_TO_GET_CHUNKS,params={"filename":file_name})
        end = time.time()
        elapsed_time = end - start
        json_obj = {'api_name':'retry_processing', 'db_url':uri, 'userName':userName, 'database':database, 'file_name':file_name,'retry_condition':retry_condition,
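execute_graph_query itself is not part of this diff; judging from the deadlock-retry logic added to graphDB_dataAccess.py below, a plausible sketch of such a shared helper looks like this (an assumption, not the PR's actual code):

    import logging
    import time

    from neo4j.exceptions import TransientError

    def execute_graph_query(graph, query, params=None, max_retries=3, delay=2):
        """Hypothetical helper mirroring graphDBdataAccess.execute_query below."""
        for attempt in range(1, max_retries + 1):
            try:
                return graph.query(query, params)
            except TransientError as e:
                # Neo4j reports deadlocks as transient errors; retry only those.
                if "DeadlockDetected" not in str(e):
                    raise
                logging.info(f"Deadlock detected. Retry {attempt}/{max_retries} in {delay}s...")
                time.sleep(delay)
        raise RuntimeError("Query execution failed after multiple retries due to deadlock.")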
4 changes: 2 additions & 2 deletions backend/src/QA_integration.py
@@ -380,7 +380,7 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
        retriever = neo_db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                'k': search_k,
                'top_k': search_k,
                'effective_search_ratio': ef_ratio,
                'score_threshold': score_threshold,
                'filter': {'fileName': {'$in': document_names}}
@@ -390,7 +390,7 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
    else:
        retriever = neo_db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={'k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
            search_kwargs={'top_k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
        )
    logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
    return retriever
2 changes: 1 addition & 1 deletion backend/src/document_sources/wikipedia.py
@@ -11,6 +11,6 @@ def get_documents_from_Wikipedia(wiki_query:str, language:str):
    except Exception as e:
        message="Failed To Process Wikipedia Query"
        error_message = str(e)
        logging.exception(f'Failed To Process Wikipedia Query: {file_name}, Exception Stack trace: {error_message}')
        logging.exception(f'Failed To Process Wikipedia Query, Exception Stack trace: {error_message}')
        raise LLMGraphBuilderException(error_message+' '+message)

7 changes: 5 additions & 2 deletions backend/src/document_sources/youtube.py
@@ -1,6 +1,7 @@
from langchain.docstore.document import Document
from src.shared.llm_graph_builder_exception import LLMGraphBuilderException
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.proxies import GenericProxyConfig
import logging
from urllib.parse import urlparse,parse_qs
from difflib import SequenceMatcher
@@ -12,8 +13,10 @@
def get_youtube_transcript(youtube_id):
    try:
        proxy = os.environ.get("YOUTUBE_TRANSCRIPT_PROXY")
        proxies = { 'https': proxy }
        transcript_pieces = YouTubeTranscriptApi.get_transcript(youtube_id, proxies = proxies)
        proxy_config = GenericProxyConfig(http_url=proxy, https_url=proxy) if proxy else None
        youtube_api = YouTubeTranscriptApi(proxy_config=proxy_config)
        transcript_pieces = youtube_api.fetch(youtube_id, preserve_formatting=True)
        transcript_pieces = transcript_pieces.to_raw_data()
        return transcript_pieces
    except Exception as e:
        message = f"Youtube transcript is not available for youtube Id: {youtube_id}"
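For context: this tracks the youtube-transcript-api 1.x interface (pinned to 1.0.3 in requirements.txt above), which replaced the module-level get_transcript helper with an instantiable client. Proxies are now supplied once via GenericProxyConfig, and fetch() returns a transcript object, hence the to_raw_data() call to recover the old list-of-dicts shape.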
22 changes: 18 additions & 4 deletions backend/src/graphDB_dataAccess.py
@@ -1,5 +1,7 @@
import logging
import os
import time
from neo4j.exceptions import TransientError
from langchain_neo4j import Neo4jGraph
from src.shared.common_fn import create_gcs_bucket_folder_name_hashed, delete_uploaded_local_file, load_embedding_model
from src.document_sources.gcs_bucket import delete_file_from_gcs
@@ -16,7 +18,7 @@ class graphDBdataAccess:
    def __init__(self, graph: Neo4jGraph):
        self.graph = graph

    def update_exception_db(self, file_name, exp_msg, retry_condition):
    def update_exception_db(self, file_name, exp_msg, retry_condition=None):
        try:
            job_status = "Failed"
            result = self.get_current_status_document_node(file_name)
@@ -39,7 +41,7 @@ def update_exception_db(self, file_name, exp_msg, retry_condition):
    def create_source_node(self, obj_source_node:sourceNode):
        try:
            job_status = "New"
            logging.info("creating source node if does not exist")
            logging.info(f"creating source node if does not exist in database {self.graph._database}")
            self.graph.query("""MERGE(d:Document {fileName :$fn}) SET d.fileSize = $fs, d.fileType = $ft ,
                            d.status = $st, d.url = $url, d.awsAccessKeyId = $awsacc_key_id,
                            d.fileSource = $f_source, d.createdAt = $c_at, d.updatedAt = $u_at,
@@ -254,8 +256,20 @@ def connection_check_and_get_vector_dimensions(self,database):
        else:
            return {'message':"Connection Successful","gds_status": gds_status,"write_access":write_access}

    def execute_query(self, query, param=None):
        return self.graph.query(query, param)
    def execute_query(self, query, param=None, max_retries=3, delay=2):
        retries = 0
        while retries < max_retries:
            try:
                return self.graph.query(query, param)
            except TransientError as e:
                if "DeadlockDetected" in str(e):
                    retries += 1
                    logging.info(f"Deadlock detected. Retrying {retries}/{max_retries} in {delay} seconds...")
                    time.sleep(delay)  # Wait before retrying
                else:
                    raise
        logging.error("Failed to execute query after maximum retries due to persistent deadlocks.")
        raise RuntimeError("Query execution failed after multiple retries due to deadlock.")

    def get_current_status_document_node(self, file_name):
        query = """
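For illustration, a hedged usage of the new retry wrapper; the Cypher statement and parameters are placeholders. Note the fixed 2-second delay keeps colliding writers in lockstep, so jittered or exponential backoff (e.g. delay * 2 ** retries) is a common refinement if deadlocks persist; the fixed wait is this PR's choice.

    # Placeholder query; any write that can deadlock benefits from the retry loop.
    access = graphDBdataAccess(graph)
    rows = access.execute_query(
        "MATCH (d:Document {fileName: $fn}) SET d.status = $st RETURN d.fileName AS name",
        {"fn": "report.pdf", "st": "Processing"},
    )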
15 changes: 7 additions & 8 deletions backend/src/llm.py
@@ -15,6 +15,7 @@
import google.auth
from src.shared.constants import ADDITIONAL_INSTRUCTIONS
import re
import json

def get_llm(model: str):
    """Retrieve the specified language model based on the model name."""
@@ -200,15 +201,13 @@ async def get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowed
    llm, model_name = get_llm(model)
    combined_chunk_document_list = get_combined_chunks(chunkId_chunkDoc_list, chunks_to_combine)

    if allowedNodes is None or allowedNodes=="":
        allowedNodes =[]
    else:
        allowedNodes = allowedNodes.split(',')
    if allowedRelationship is None or allowedRelationship=="":
        allowedRelationship=[]
    allowedNodes = allowedNodes.split(',') if allowedNodes else []

    if not allowedRelationship:
        allowedRelationship = []
    else:
        allowedRelationship = allowedRelationship.split(',')
        items = allowedRelationship.split(',')
        allowedRelationship = [tuple(items[i:i+3]) for i in range(0, len(items), 3)]
    graph_document_list = await get_graph_document_list(
        llm, combined_chunk_document_list, allowedNodes, allowedRelationship, additional_instructions
    )
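The relationship filter changes shape here: instead of bare relationship names, the form field now carries flat comma-separated triples of source label, relationship type, and target label, grouped three at a time. A small illustration with made-up labels:

    # Input as it would arrive from the frontend form field:
    allowed_relationship = "Person,WORKS_AT,Company,Person,KNOWS,Person"
    items = allowed_relationship.split(',')
    triples = [tuple(items[i:i + 3]) for i in range(0, len(items), 3)]
    # -> [('Person', 'WORKS_AT', 'Company'), ('Person', 'KNOWS', 'Person')]
    # The flat list's length should be a multiple of 3; a trailing partial
    # group would yield a short tuple.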