Skip to content

Make chunk_size, chunk_overlap and chunks_to_combine Configurable #1012

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ If you are using Neo4j Desktop, you will not be able to use the docker-compose b
#### Running through docker-compose
By default only OpenAI and Diffbot are enabled since Gemini requires extra GCP configurations.
Depending on the environment, we configure the models, which is indicated by the VITE_LLM_MODELS_PROD variable; we can configure the models based on our needs.

EX:
```env
VITE_LLM_MODELS_PROD="openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash"
```

You can then run Docker Compose to build and start all components:
```bash
docker-compose up --build
Expand Down Expand Up @@ -67,6 +69,7 @@ VITE_CHAT_MODES=""
If however you want to specify the only vector mode or only graph mode you can do that by specifying the mode in the env:
```env
VITE_CHAT_MODES="vector,graph"
VITE_CHAT_MODES="vector,graph"
```

#### Running Backend and Frontend separately (dev environment)
Expand Down
38 changes: 19 additions & 19 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
asyncio==3.4.3
boto3==1.35.90
botocore==1.35.90
boto3==1.36.2
botocore==1.36.2
certifi==2024.8.30
fastapi==0.115.6
fastapi-health==0.4.0
google-api-core==2.23.0
google-auth==2.36.0
google-api-core==2.24.0
google-auth==2.37.0
google_auth_oauthlib==1.2.1
google-cloud-core==2.4.1
json-repair==0.30.2
pip-install==1.3.5
langchain==0.3.13
langchain-aws==0.2.10
langchain-anthropic==0.3.0
langchain-fireworks==0.2.5
langchain-community==0.3.13
langchain-core==0.3.28
langchain==0.3.15
langchain-aws==0.2.11
langchain-anthropic==0.3.3
langchain-fireworks==0.2.6
langchain-community==0.3.15
langchain-core==0.3.31
langchain-experimental==0.3.4
langchain-google-vertexai==2.0.7
langchain-groq==0.2.1
langchain-openai==0.2.14
langchain-text-splitters==0.3.4
langchain-google-vertexai==2.0.11
langchain-groq==0.2.3
langchain-openai==0.3.1
langchain-text-splitters==0.3.5
langchain-huggingface==0.1.2
langdetect==1.0.9
langsmith==0.2.4
langserve==0.3.0
langsmith==0.2.11
langserve==0.3.1
neo4j-rust-ext
nltk==3.9.1
openai==1.58.1
openai==1.59.9
opencv-python==4.10.0.84
psutil==6.1.0
pydantic==2.9.2
Expand Down Expand Up @@ -56,7 +56,7 @@ google-cloud-logging==3.11.3
pypandoc==1.13
graphdatascience==1.12
Secweb==1.11.0
ragas==0.2.6
ragas==0.2.11
rouge_score==0.1.2
langchain-neo4j==0.2.0
langchain-neo4j==0.3.0

17 changes: 10 additions & 7 deletions backend/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from src.communities import create_communities
from src.neighbours import get_neighbour_nodes
import json
from typing import List
from typing import List, Optional
from google.oauth2.credentials import Credentials
import os
from src.logger import CustomLogger
Expand Down Expand Up @@ -189,6 +189,9 @@ async def extract_knowledge_graph_from_file(
file_name=Form(None),
allowedNodes=Form(None),
allowedRelationship=Form(None),
chunk_size: Optional[int] = Form(None),
chunk_overlap: Optional[int] = Form(None),
chunks_to_combine: Optional[int] = Form(None),
language=Form(None),
access_token=Form(None),
retry_condition=Form(None),
Expand All @@ -215,22 +218,22 @@ async def extract_knowledge_graph_from_file(
graphDb_data_Access = graphDBdataAccess(graph)
merged_file_path = os.path.join(MERGED_DIR,file_name)
if source_type == 'local file':
uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 's3 bucket' and source_url:
uri_latency, result = await extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 'web-url':
uri_latency, result = await extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 'youtube' and source_url:
uri_latency, result = await extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 'Wikipedia' and wiki_query:
uri_latency, result = await extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

elif source_type == 'gcs bucket' and gcs_bucket_name:
uri_latency, result = await extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)
else:
return create_api_response('Failed',message='source_type is other than accepted source')
extract_api_time = time.time() - start_time
Expand Down
4 changes: 2 additions & 2 deletions backend/src/create_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def __init__(self, pages: list[Document], graph: Neo4jGraph):
self.pages = pages
self.graph = graph

def split_file_into_chunks(self):
def split_file_into_chunks(self,chunk_size, chunk_overlap):
"""
Split a list of documents(file pages) into chunks of fixed size.

Expand All @@ -25,7 +25,7 @@ def split_file_into_chunks(self):
A list of chunks each of which is a langchain Document.
"""
logging.info("Split file into smaller chunks")
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunk_to_be_created = int(os.environ.get('CHUNKS_TO_BE_CREATED', '50'))
if 'page' in self.pages[0].metadata:
chunks = []
Expand Down
7 changes: 4 additions & 3 deletions backend/src/graphDB_dataAccess.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,9 @@ def get_duplicate_nodes_list(self):
AND
(
// either contains each other as substrings or has a text edit distinct of less than 3
(size(toString(other.id)) > 2 AND toLower(n.id) CONTAINS toLower(other.id)) OR
(size(toString(n.id)) > 2 AND toLower(other.id) CONTAINS toLower(n.id))
OR (size(toString(n.id))>5 AND apoc.text.distance(toLower(n.id), toLower(other.id)) < $duplicate_text_distance)
(size(toString(other.id)) > 2 AND toLower(toString(n.id)) CONTAINS toLower(toString(other.id))) OR
(size(toString(n.id)) > 2 AND toLower(toString(other.id)) CONTAINS toLower(toString(n.id)))
OR (size(toString(n.id))>5 AND apoc.text.distance(toLower(toString(n.id)), toLower(toString(other.id))) < $duplicate_text_distance)
OR
vector.similarity.cosine(other.embedding, n.embedding) > $duplicate_score_value
)] as similar
Expand Down Expand Up @@ -535,6 +535,7 @@ def update_node_relationship_count(self,document_name):
"nodeCount" : nodeCount,
"relationshipCount" : relationshipCount
}

return response

def get_nodelabels_relationships(self):
Expand Down
4 changes: 1 addition & 3 deletions backend/src/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def get_llm(model: str):
return llm, model_name


def get_combined_chunks(chunkId_chunkDoc_list):
def get_combined_chunks(chunkId_chunkDoc_list,chunks_to_combine):
chunks_to_combine = int(os.environ.get("NUMBER_OF_CHUNKS_TO_COMBINE"))
logging.info(f"Combining {chunks_to_combine} chunks before sending request to LLM")
combined_chunk_document_list = []
Expand Down Expand Up @@ -190,8 +190,6 @@ async def get_graph_document_list(
graph_document_list = await llm_transformer.aconvert_to_graph_documents(combined_chunk_document_list)
return graph_document_list



async def get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowedRelationship, additional_instructions=None):
try:
llm, model_name = get_llm(model)
Expand Down
Loading