Skip to content

Make chunk_size, chunk_overlap and chunks_to_combine Configurable #1012

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ If you are using Neo4j Desktop, you will not be able to use the docker-compose b
#### Running through docker-compose
By default only OpenAI and Diffbot are enabled since Gemini requires extra GCP configurations.
Depending on the environment, we configure the models, which is indicated by the VITE_LLM_MODELS_PROD variable; we can configure the models based on our needs.

EX:
```env
VITE_LLM_MODELS_PROD="openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash"
```

You can then run Docker Compose to build and start all components:
```bash
docker-compose up --build
Expand Down Expand Up @@ -67,6 +69,7 @@ VITE_CHAT_MODES=""
If however you want to specify the only vector mode or only graph mode you can do that by specifying the mode in the env:
```env
VITE_CHAT_MODES="vector,graph"
VITE_CHAT_MODES="vector,graph"
```

#### Running Backend and Frontend separately (dev environment)
Expand Down
38 changes: 19 additions & 19 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
asyncio==3.4.3
boto3==1.35.90
botocore==1.35.90
boto3==1.36.2
botocore==1.36.2
certifi==2024.8.30
fastapi==0.115.6
fastapi-health==0.4.0
google-api-core==2.23.0
google-auth==2.36.0
google-api-core==2.24.0
google-auth==2.37.0
google_auth_oauthlib==1.2.1
google-cloud-core==2.4.1
json-repair==0.30.2
pip-install==1.3.5
langchain==0.3.13
langchain-aws==0.2.10
langchain-anthropic==0.3.0
langchain-fireworks==0.2.5
langchain-community==0.3.13
langchain-core==0.3.28
langchain==0.3.15
langchain-aws==0.2.11
langchain-anthropic==0.3.3
langchain-fireworks==0.2.6
langchain-community==0.3.15
langchain-core==0.3.31
langchain-experimental==0.3.4
langchain-google-vertexai==2.0.7
langchain-groq==0.2.1
langchain-openai==0.2.14
langchain-text-splitters==0.3.4
langchain-google-vertexai==2.0.11
langchain-groq==0.2.3
langchain-openai==0.3.1
langchain-text-splitters==0.3.5
langchain-huggingface==0.1.2
langdetect==1.0.9
langsmith==0.2.4
langserve==0.3.0
langsmith==0.2.11
langserve==0.3.1
neo4j-rust-ext
nltk==3.9.1
openai==1.58.1
openai==1.59.9
opencv-python==4.10.0.84
psutil==6.1.0
pydantic==2.9.2
Expand Down Expand Up @@ -56,7 +56,7 @@ google-cloud-logging==3.11.3
pypandoc==1.13
graphdatascience==1.12
Secweb==1.11.0
ragas==0.2.6
ragas==0.2.11
rouge_score==0.1.2
langchain-neo4j==0.2.0
langchain-neo4j==0.3.0

17 changes: 10 additions & 7 deletions backend/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from src.communities import create_communities
from src.neighbours import get_neighbour_nodes
import json
from typing import List
from typing import List, Optional
from google.oauth2.credentials import Credentials
import os
from src.logger import CustomLogger
Expand Down Expand Up @@ -189,6 +189,9 @@ async def extract_knowledge_graph_from_file(
file_name=Form(None),
allowedNodes=Form(None),
allowedRelationship=Form(None),
chunk_size: Optional[int] = Form(None),
chunk_overlap: Optional[int] = Form(None),
chunks_to_combine: Optional[int] = Form(None),
language=Form(None),
access_token=Form(None),
retry_condition=Form(None),
Expand All @@ -215,22 +218,22 @@ async def extract_knowledge_graph_from_file(
graphDb_data_Access = graphDBdataAccess(graph)
merged_file_path = os.path.join(MERGED_DIR,file_name)
if source_type == 'local file':
uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 's3 bucket' and source_url:
uri_latency, result = await extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 'web-url':
uri_latency, result = await extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 'youtube' and source_url:
uri_latency, result = await extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 'Wikipedia' and wiki_query:
uri_latency, result = await extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 'gcs bucket' and gcs_bucket_name:
uri_latency, result = await extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)
else:
return create_api_response('Failed',message='source_type is other than accepted source')
extract_api_time = time.time() - start_time
Expand Down
4 changes: 2 additions & 2 deletions backend/src/create_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def __init__(self, pages: list[Document], graph: Neo4jGraph):
self.pages = pages
self.graph = graph

def split_file_into_chunks(self):
def split_file_into_chunks(self,chunk_size, chunk_overlap):
"""
Split a list of documents(file pages) into chunks of fixed size.

Expand All @@ -25,7 +25,7 @@ def split_file_into_chunks(self):
A list of chunks each of which is a langchain Document.
"""
logging.info("Split file into smaller chunks")
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunk_to_be_created = int(os.environ.get('CHUNKS_TO_BE_CREATED', '50'))
if 'page' in self.pages[0].metadata:
chunks = []
Expand Down
7 changes: 4 additions & 3 deletions backend/src/graphDB_dataAccess.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,9 @@ def get_duplicate_nodes_list(self):
AND
(
// either contains each other as substrings or has a text edit distinct of less than 3
(size(toString(other.id)) > 2 AND toLower(n.id) CONTAINS toLower(other.id)) OR
(size(toString(n.id)) > 2 AND toLower(other.id) CONTAINS toLower(n.id))
OR (size(toString(n.id))>5 AND apoc.text.distance(toLower(n.id), toLower(other.id)) < $duplicate_text_distance)
(size(toString(other.id)) > 2 AND toLower(toString(n.id)) CONTAINS toLower(toString(other.id))) OR
(size(toString(n.id)) > 2 AND toLower(toString(other.id)) CONTAINS toLower(toString(n.id)))
OR (size(toString(n.id))>5 AND apoc.text.distance(toLower(toString(n.id)), toLower(toString(other.id))) < $duplicate_text_distance)
OR
vector.similarity.cosine(other.embedding, n.embedding) > $duplicate_score_value
)] as similar
Expand Down Expand Up @@ -535,6 +535,7 @@ def update_node_relationship_count(self,document_name):
"nodeCount" : nodeCount,
"relationshipCount" : relationshipCount
}

return response

def get_nodelabels_relationships(self):
Expand Down
4 changes: 1 addition & 3 deletions backend/src/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def get_llm(model: str):
return llm, model_name


def get_combined_chunks(chunkId_chunkDoc_list):
def get_combined_chunks(chunkId_chunkDoc_list,chunks_to_combine):
chunks_to_combine = int(os.environ.get("NUMBER_OF_CHUNKS_TO_COMBINE"))
logging.info(f"Combining {chunks_to_combine} chunks before sending request to LLM")
combined_chunk_document_list = []
Expand Down Expand Up @@ -190,8 +190,6 @@ async def get_graph_document_list(
graph_document_list = await llm_transformer.aconvert_to_graph_documents(combined_chunk_document_list)
return graph_document_list



async def get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowedRelationship, additional_instructions=None):
try:
llm, model_name = get_llm(model)
Expand Down
Loading