Staging #1245

Merged: 22 commits, Apr 21, 2025

Changes from all commits
3 changes: 2 additions & 1 deletion .gitignore
@@ -172,4 +172,5 @@ google-cloud-cli-linux-x86_64.tar.gz
.vennv
newenv
files

startupbackend.sh
startupfrontend.sh
317 changes: 168 additions & 149 deletions README.md

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions backend/example.env
@@ -31,16 +31,19 @@ DEFAULT_DIFFBOT_CHAT_MODEL="openai_gpt_4o" #whichever model specified here , ne
LLM_MODEL_CONFIG_openai_gpt_3.5="gpt-3.5-turbo-0125,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4o_mini="gpt-4o-mini-2024-07-18,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4o="gpt-4o-2024-11-20,openai_api_key"
LLM_MODEL_CONFIG_openai-gpt-o3-mini="o3-mini-2025-01-31,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4.1_mini="gpt-4.1-mini,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_o3_mini="o3-mini-2025-01-31,openai_api_key"
LLM_MODEL_CONFIG_gemini_1.5_pro="gemini-1.5-pro-002"
LLM_MODEL_CONFIG_gemini_1.5_flash="gemini-1.5-flash-002"
LLM_MODEL_CONFIG_gemini_2.0_flash="gemini-2.0-flash-001"
LLM_MODEL_CONFIG_gemini_2.5_pro="gemini-2.5-pro-exp-03-25"
LLM_MODEL_CONFIG_diffbot="diffbot,diffbot_api_key"
LLM_MODEL_CONFIG_azure_ai_gpt_35="azure_deployment_name,azure_endpoint or base_url,azure_api_key,api_version"
LLM_MODEL_CONFIG_azure_ai_gpt_4o="gpt-4o,https://YOUR-ENDPOINT.openai.azure.com/,azure_api_key,api_version"
LLM_MODEL_CONFIG_groq_llama3_70b="model_name,base_url,groq_api_key"
LLM_MODEL_CONFIG_anthropic_claude_3_5_sonnet="model_name,anthropic_api_key"
LLM_MODEL_CONFIG_fireworks_llama_v3_70b="model_name,fireworks_api_key"
LLM_MODEL_CONFIG_fireworks_llama4_maverick="model_name,fireworks_api_key"
LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet="model_name,aws_access_key_id,aws_secret__access_key,region_name"
LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
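Each LLM_MODEL_CONFIG_* value packs a model name and its credentials into a single comma-separated string. A minimal sketch of how such a value can be unpacked (the helper name is hypothetical, not code from this repository):

    import os

    def parse_llm_model_config(env_key: str):
        """Hypothetical illustration: unpack 'model_name,credential[,more,...]'."""
        raw = os.environ.get(env_key, "")
        parts = [p.strip() for p in raw.split(",")]
        if not parts or not parts[0]:
            raise ValueError(f"{env_key} is unset or empty")
        model_name, *credentials = parts
        return model_name, credentials

    # LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai_api_key"
    # -> ("gpt-4.1", ["openai_api_key"])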
85 changes: 43 additions & 42 deletions backend/requirements.txt
@@ -1,63 +1,64 @@
accelerate==1.6.0
asyncio==3.4.3
boto3==1.36.2
botocore==1.36.2
certifi==2024.8.30
fastapi==0.115.6
boto3==1.37.29
botocore==1.37.29
certifi==2025.1.31
fastapi==0.115.11
fastapi-health==0.4.0
google-api-core==2.24.0
google-auth==2.37.0
google-api-core==2.24.2
google-auth==2.38.0
google_auth_oauthlib==1.2.1
google-cloud-core==2.4.1
json-repair==0.30.2
google-cloud-core==2.4.3
json-repair==0.39.1
pip-install==1.3.5
langchain==0.3.15
langchain-aws==0.2.11
langchain-anthropic==0.3.3
langchain-fireworks==0.2.6
langchain-community==0.3.15
langchain-core==0.3.31
langchain==0.3.23
langchain-aws==0.2.18
langchain-anthropic==0.3.9
langchain-fireworks==0.2.9
langchain-community==0.3.19
langchain-core==0.3.51
langchain-experimental==0.3.4
langchain-google-vertexai==2.0.11
langchain-groq==0.2.3
langchain-openai==0.3.1
langchain-text-splitters==0.3.5
langchain-google-vertexai==2.0.19
langchain-groq==0.2.5
langchain-openai==0.3.12
langchain-text-splitters==0.3.8
langchain-huggingface==0.1.2
langdetect==1.0.9
langsmith==0.2.11
langsmith==0.3.26
langserve==0.3.1
neo4j-rust-ext
nltk==3.9.1
openai==1.59.9
opencv-python==4.10.0.84
psutil==6.1.0
pydantic==2.9.2
openai==1.71.0
opencv-python==4.11.0.86
psutil==7.0.0
pydantic==2.10.6
python-dotenv==1.0.1
python-magic==0.4.27
PyPDF2==3.0.1
PyMuPDF==1.24.14
starlette==0.41.3
sse-starlette==2.1.3
PyMuPDF==1.25.5
starlette==0.46.1
sse-starlette==2.2.1
starlette-session==0.4.3
tqdm==4.67.1
unstructured[all-docs]
unstructured==0.16.11
unstructured-client==0.28.1
unstructured-inference==0.8.1
urllib3==2.2.2
uvicorn==0.32.1
unstructured==0.17.2
unstructured-client==0.32.3
unstructured-inference==0.8.10
urllib3==2.3.0
uvicorn==0.34.0
gunicorn==23.0.0
wikipedia==1.4.0
wrapt==1.16.0
yarl==1.9.4
youtube-transcript-api==0.6.3
zipp==3.17.0
sentence-transformers==3.3.1
google-cloud-logging==3.11.3
pypandoc==1.13
graphdatascience==1.12
Secweb==1.11.0
ragas==0.2.11
wrapt==1.17.2
yarl==1.18.3
youtube-transcript-api==1.0.3
zipp==3.21.0
sentence-transformers==4.0.2
google-cloud-logging==3.11.4
pypandoc==1.15
graphdatascience==1.14
Secweb==1.18.1
ragas==0.2.14
rouge_score==0.1.2
langchain-neo4j==0.3.0
langchain-neo4j==0.4.0
pypandoc-binary==1.15
chardet==5.2.0
21 changes: 12 additions & 9 deletions backend/score.py
@@ -576,16 +576,20 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
        result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, uri, CHUNK_DIR, MERGED_DIR)
        end = time.time()
        elapsed_time = end - start
        json_obj = {'api_name':'upload','db_url':uri,'userName':userName, 'database':database, 'chunkNumber':chunkNumber,'totalChunks':totalChunks,
                    'original_file_name':originalname,'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
        logger.log_struct(json_obj, "INFO")
        if int(chunkNumber) == int(totalChunks):
            return create_api_response('Success',data=result, message='Source Node Created Successfully')
        else:
            return create_api_response('Success', message=result)
    except Exception as e:
        message="Unable to upload large file into chunks. "
        message="Unable to upload file in chunks. "
        error_message = str(e)
        graph = create_graph_database_connection(uri, userName, password, database)
        graphDb_data_Access = graphDBdataAccess(graph)
        graphDb_data_Access.update_exception_db(originalname,error_message)
        logging.info(message)
        logging.exception(f'Exception:{error_message}')
        return create_api_response('Failed', message=message + error_message[:100], error=error_message, file_name = originalname)
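For orientation, a hedged sketch of how a client might drive this chunked endpoint. The /upload route is inferred from the api_name field above; the requests dependency and the helper name are assumptions for illustration, not code from this PR:

    import math
    import os
    import requests  # assumption: a client-side dependency, not from this repo

    def upload_in_chunks(path, api_base, chunk_size=5 * 1024 * 1024, **form_fields):
        """Hypothetical client: POST a file piece by piece to the handler above."""
        total = math.ceil(os.path.getsize(path) / chunk_size)
        with open(path, "rb") as f:
            for number in range(1, total + 1):
                resp = requests.post(
                    f"{api_base}/upload",  # route inferred from api_name 'upload'
                    files={"file": (os.path.basename(path), f.read(chunk_size))},
                    data={"chunkNumber": number, "totalChunks": total, **form_fields},
                )
                resp.raise_for_status()
        # Only the final chunk triggers 'Source Node Created Successfully'.
        return resp.json()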
@@ -596,8 +600,7 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
async def get_structured_schema(uri=Form(None), userName=Form(None), password=Form(None), database=Form(None),email=Form(None)):
    try:
        start = time.time()
        graph = create_graph_database_connection(uri, userName, password, database)
        result = await asyncio.to_thread(get_labels_and_relationtypes, graph)
        result = await asyncio.to_thread(get_labels_and_relationtypes, uri, userName, password, database)
        end = time.time()
        elapsed_time = end - start
        logging.info(f'Schema result from DB: {result}')
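The handler now passes raw credentials instead of a pre-built graph object, implying the helper opens its own connection per request. A minimal sketch of that shape, under the assumption that the real get_labels_and_relationtypes differs in detail:

    from langchain_neo4j import Neo4jGraph

    def get_labels_and_relationtypes(uri, userName, password, database):
        # Sketch only: create a dedicated connection instead of sharing the caller's.
        graph = Neo4jGraph(url=uri, username=userName, password=password, database=database)
        labels = graph.query("CALL db.labels() YIELD label RETURN collect(label) AS labels")
        rels = graph.query("CALL db.relationshipTypes() YIELD relationshipType "
                           "RETURN collect(relationshipType) AS types")
        return {"labels": labels[0]["labels"], "relationshipTypes": rels[0]["types"]}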
@@ -764,10 +767,10 @@ async def cancelled_job(uri=Form(None), userName=Form(None), password=Form(None)
gc.collect()

@app.post("/populate_graph_schema")
async def populate_graph_schema(input_text=Form(None), model=Form(None), is_schema_description_checked=Form(None),email=Form(None)):
async def populate_graph_schema(input_text=Form(None), model=Form(None), is_schema_description_checked=Form(None),is_local_storage=Form(None),email=Form(None)):
    try:
        start = time.time()
        result = populate_graph_schema_from_text(input_text, model, is_schema_description_checked)
        result = populate_graph_schema_from_text(input_text, model, is_schema_description_checked, is_local_storage)
        end = time.time()
        elapsed_time = end - start
        json_obj = {'api_name':'populate_graph_schema', 'model':model, 'is_schema_description_checked':is_schema_description_checked, 'input_text':input_text, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
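A hedged client-side sketch of the updated endpoint call; host, port, and field values are placeholders, and the requests dependency is an assumption for illustration:

    import requests  # assumption: not part of the backend requirements

    # Field names mirror the updated handler signature above.
    resp = requests.post(
        "http://localhost:8000/populate_graph_schema",  # placeholder host/port
        data={
            "input_text": "Employees work for companies in cities.",
            "model": "openai_gpt_4o",
            "is_schema_description_checked": "true",
            "is_local_storage": "false",  # the new form field in this PR
            "email": "user@example.com",
        },
    )
    print(resp.json())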
@@ -894,7 +897,7 @@ async def retry_processing(uri=Form(None), userName=Form(No
    try:
        start = time.time()
        graph = create_graph_database_connection(uri, userName, password, database)
        chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name})
        chunks = execute_graph_query(graph,QUERY_TO_GET_CHUNKS,params={"filename":file_name})
        end = time.time()
        elapsed_time = end - start
        json_obj = {'api_name':'retry_processing', 'db_url':uri, 'userName':userName, 'database':database, 'file_name':file_name,'retry_condition':retry_condition,
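execute_graph_query itself is not part of this diff; judging from the deadlock-retry logic added to graphDB_dataAccess.py below, a plausible sketch of such a shared helper looks like this (an assumption, not the PR's actual code):

    import logging
    import time

    from neo4j.exceptions import TransientError

    def execute_graph_query(graph, query, params=None, max_retries=3, delay=2):
        """Hypothetical helper mirroring graphDBdataAccess.execute_query below."""
        for attempt in range(1, max_retries + 1):
            try:
                return graph.query(query, params)
            except TransientError as e:
                # Neo4j reports deadlocks as transient errors; retry only those.
                if "DeadlockDetected" not in str(e):
                    raise
                logging.info(f"Deadlock detected. Retry {attempt}/{max_retries} in {delay}s...")
                time.sleep(delay)
        raise RuntimeError("Query execution failed after multiple retries due to deadlock.")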
4 changes: 2 additions & 2 deletions backend/src/QA_integration.py
@@ -380,7 +380,7 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
        retriever = neo_db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                'k': search_k,
                'top_k': search_k,
                'effective_search_ratio': ef_ratio,
                'score_threshold': score_threshold,
                'filter': {'fileName': {'$in': document_names}}
@@ -390,7 +390,7 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
    else:
        retriever = neo_db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={'k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
            search_kwargs={'top_k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
        )
    logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
    return retriever
2 changes: 1 addition & 1 deletion backend/src/document_sources/wikipedia.py
@@ -11,6 +11,6 @@ def get_documents_from_Wikipedia(wiki_query:str, language:str):
    except Exception as e:
        message="Failed To Process Wikipedia Query"
        error_message = str(e)
        logging.exception(f'Failed To Process Wikipedia Query: {file_name}, Exception Stack trace: {error_message}')
        logging.exception(f'Failed To Process Wikipedia Query, Exception Stack trace: {error_message}')
        raise LLMGraphBuilderException(error_message+' '+message)

7 changes: 5 additions & 2 deletions backend/src/document_sources/youtube.py
@@ -1,6 +1,7 @@
from langchain.docstore.document import Document
from src.shared.llm_graph_builder_exception import LLMGraphBuilderException
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.proxies import GenericProxyConfig
import logging
from urllib.parse import urlparse,parse_qs
from difflib import SequenceMatcher
@@ -12,8 +13,10 @@
def get_youtube_transcript(youtube_id):
    try:
        proxy = os.environ.get("YOUTUBE_TRANSCRIPT_PROXY")
        proxies = { 'https': proxy }
        transcript_pieces = YouTubeTranscriptApi.get_transcript(youtube_id, proxies = proxies)
        proxy_config = GenericProxyConfig(http_url=proxy, https_url=proxy) if proxy else None
        youtube_api = YouTubeTranscriptApi(proxy_config=proxy_config)
        transcript_pieces = youtube_api.fetch(youtube_id, preserve_formatting=True)
        transcript_pieces = transcript_pieces.to_raw_data()
        return transcript_pieces
    except Exception as e:
        message = f"Youtube transcript is not available for youtube Id: {youtube_id}"
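For context: this tracks the youtube-transcript-api 1.x interface (pinned to 1.0.3 in requirements.txt above), which replaced the module-level get_transcript helper with an instantiable client. Proxies are now supplied once via GenericProxyConfig, and fetch() returns a transcript object, hence the to_raw_data() call to recover the old list-of-dicts shape.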
22 changes: 18 additions & 4 deletions backend/src/graphDB_dataAccess.py
@@ -1,5 +1,7 @@
import logging
import os
import time
from neo4j.exceptions import TransientError
from langchain_neo4j import Neo4jGraph
from src.shared.common_fn import create_gcs_bucket_folder_name_hashed, delete_uploaded_local_file, load_embedding_model
from src.document_sources.gcs_bucket import delete_file_from_gcs
@@ -16,7 +18,7 @@ class graphDBdataAccess:
    def __init__(self, graph: Neo4jGraph):
        self.graph = graph

    def update_exception_db(self, file_name, exp_msg, retry_condition):
    def update_exception_db(self, file_name, exp_msg, retry_condition=None):
        try:
            job_status = "Failed"
            result = self.get_current_status_document_node(file_name)
@@ -39,7 +41,7 @@ def update_exception_db(self, file_name, exp_msg, retry_condition):
    def create_source_node(self, obj_source_node:sourceNode):
        try:
            job_status = "New"
            logging.info("creating source node if does not exist")
            logging.info(f"creating source node if does not exist in database {self.graph._database}")
            self.graph.query("""MERGE(d:Document {fileName :$fn}) SET d.fileSize = $fs, d.fileType = $ft ,
                            d.status = $st, d.url = $url, d.awsAccessKeyId = $awsacc_key_id,
                            d.fileSource = $f_source, d.createdAt = $c_at, d.updatedAt = $u_at,
@@ -254,8 +256,20 @@ def connection_check_and_get_vector_dimensions(self,database):
        else:
            return {'message':"Connection Successful","gds_status": gds_status,"write_access":write_access}

    def execute_query(self, query, param=None):
        return self.graph.query(query, param)
    def execute_query(self, query, param=None, max_retries=3, delay=2):
        retries = 0
        while retries < max_retries:
            try:
                return self.graph.query(query, param)
            except TransientError as e:
                if "DeadlockDetected" in str(e):
                    retries += 1
                    logging.info(f"Deadlock detected. Retrying {retries}/{max_retries} in {delay} seconds...")
                    time.sleep(delay)  # Wait before retrying
                else:
                    raise
        logging.error("Failed to execute query after maximum retries due to persistent deadlocks.")
        raise RuntimeError("Query execution failed after multiple retries due to deadlock.")

    def get_current_status_document_node(self, file_name):
        query = """
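For illustration, a hedged usage of the new retry wrapper; the Cypher statement and parameters are placeholders. Note the fixed 2-second delay keeps colliding writers in lockstep, so jittered or exponential backoff (e.g. delay * 2 ** retries) is a common refinement if deadlocks persist; the fixed wait is this PR's choice.

    # Placeholder query; any write that can deadlock benefits from the retry loop.
    access = graphDBdataAccess(graph)
    rows = access.execute_query(
        "MATCH (d:Document {fileName: $fn}) SET d.status = $st RETURN d.fileName AS name",
        {"fn": "report.pdf", "st": "Processing"},
    )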
15 changes: 7 additions & 8 deletions backend/src/llm.py
@@ -15,6 +15,7 @@
import google.auth
from src.shared.constants import ADDITIONAL_INSTRUCTIONS
import re
import json

def get_llm(model: str):
    """Retrieve the specified language model based on the model name."""
@@ -200,15 +201,13 @@ async def get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowed
    llm, model_name = get_llm(model)
    combined_chunk_document_list = get_combined_chunks(chunkId_chunkDoc_list, chunks_to_combine)

    if allowedNodes is None or allowedNodes=="":
        allowedNodes =[]
    else:
        allowedNodes = allowedNodes.split(',')
    if allowedRelationship is None or allowedRelationship=="":
        allowedRelationship=[]
    allowedNodes = allowedNodes.split(',') if allowedNodes else []

    if not allowedRelationship:
        allowedRelationship = []
    else:
        allowedRelationship = allowedRelationship.split(',')
        items = allowedRelationship.split(',')
        allowedRelationship = [tuple(items[i:i+3]) for i in range(0, len(items), 3)]
    graph_document_list = await get_graph_document_list(
        llm, combined_chunk_document_list, allowedNodes, allowedRelationship, additional_instructions
    )
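The relationship filter changes shape here: instead of bare relationship names, the form field now carries flat comma-separated triples of source label, relationship type, and target label, grouped three at a time. A small illustration with made-up labels:

    # Input as it would arrive from the frontend form field:
    allowed_relationship = "Person,WORKS_AT,Company,Person,KNOWS,Person"
    items = allowed_relationship.split(',')
    triples = [tuple(items[i:i + 3]) for i in range(0, len(items), 3)]
    # -> [('Person', 'WORKS_AT', 'Company'), ('Person', 'KNOWS', 'Person')]
    # The flat list's length should be a multiple of 3; a trailing partial
    # group would yield a short tuple.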