
Commit 8158d29

praveshkumar1988, aashipandya, abhishekkumar-27, kartikpersistent, and vasanthasaikalluri committed
Dev (#433) (#448)
* Integration_qa test (#375)
  * Test IntegrationQA added
  * update test cases
  * update test
  * update node count assertions
  * test changes
  * update changes
  * modification test
  * Code refactor for test cases
  * Handle allowed-list issue in test
  * test changes
  * update test
  * test case execution
  * test chatbot updates
  * test case update file
  * added file
* recent merges
* pdf deletion due to running out of disk space
* fixed status blank issue
* Rendering the file name instead of the link for gcs and s3 sources in the info modal
* Convert is_cancelled value from string to bool
* added the default page size
* Fixed issue: processed chunks shown as 0 when a file is re-processed
* Youtube timestamps (#386)
  * Wikipedia source to accept all valid urls
  * wikipedia url to support multiple languages
  * integrated wiki language param for extract api
  * Youtube video timestamps
* groq llm integration backend (#286)
  * groq and description in node properties
  * added groq in options
* offset in chunks (#389)
* page number in gcs loader (#393)
* added youtube timestamps (#392)
* chat pop-up button (#387)
  * expand
  * minimize icon
  * css changes
  * chat history
  * chatbot wider side nav
  * expand icon
  * chatbot UI
  * delete
  * merge fixes
  * code suggestions
* chunks created before extraction using is_pre_process variable (#383)
  * Return total pages for model
  * update requirement.txt
  * total pages on upload API
  * added the Confirmation Dialog
  * added the selected files into the confirmation modal
  * format and lint fixes
  * added the stopwatch image
  * file selection on alert dialog
  * Add timeout in docker for gunicorn workers
  * Add cancel icon to info popup (#384)
    * Info Modal changes
    * css changes
  * Save total pages in DB
  * Added total pages
  * file selection when nothing is selected from the main table
  * added the danger icon only for large files
  * added the overflow for more files and file selection for all new files
  * moved the interface to types
  * added the icon according to the source
  * set total pages for wiki and youtube
  * h3 heading
  * merge
  * updated the alert on the basis of total pages
  * deleted chunks
  * polling based on total pages
  * isNaN check
  * large file based on file size for s3 and gcs
  * file source in server-side event
  * time calculation based on chunks for gcs and s3
* fixed the layout issue
* Populate graph schema (#399)
  * create new endpoint populate_graph_schema and update the query for getting labels from DB
  * Added main.py changes
* conditionally including the gcs login flow in gcs as source (#396)
  * added the condition
  * removed llms
* Fixed issue: remove extra unused param
* get emb only if used (#278)
* Chatbot chunks (#402)
  * Added file name to the content sent to LLM
  * added chunk text in the response
  * increased the doc parts sent to llm
  * Modified graph query
  * markdown rendering
  * youtube start time
  * icons
  * offset changes
  * removed the files due to codespace space issue
* Settings modal to support generating the labels from the llm by using text given by user (#405)
  * added the json
  * added schema-from-text dialog
  * integrated the schema API
  * added the alert
  * resize fixes
  * fixed css issue
* fixed status blank issue
* Modified response when no docs are retrieved (#413)
* Fixed env/docker-compose for local deployments + README doc (#410)
  * wrong place for ENV in README
  * removed langsmith by default + fixed knn score string to float
  * Fixed strings in docker-compose env
  * Added requirements (neo4j 5.15 or later, APOC, and instructions for Neo4j Desktop)
  * Missed the TIME_PER_PAGE env, which was causing a NaN issue in the approximate-processing-time notification; fixed that
* Support for all unstructured files (#401)
  * all unstructured files
  * responsiveness
  * added file type
  * added the extensions
  * spelling mistake
  * ppt file changes
* Settings modal to support generating the labels from the llm by using text given by user, with checkbox (#415)
  * Extract schema using direct ChatOpenAI API and Chain
  * integrated the checkbox for the schema-to-text dialog
  * Update SettingModal.tsx
* gcs file content read via storage client (#417)
  * added the access token to the file state
* pypdf2 to read files from gcs (#420)
* 407 remove driver from frontend (#416)
  * removed driver
  * removed API
  * connecting to database on page refresh
* Css handling of info modal and Tooltips (#418)
  * css change
  * tooltips
  * sidebar tooltips
  * copy to clipboard
  * css change
  * added image types
  * added gcs
  * type fix
  * docker changes
  * speech
  * added the tooltip for dropzone sources
* Fixed retrieval bugs (#421)
* yarn format fixes
* changed the delete message
* added the cancel button
* changed the message on tooltip
* added space
* UI fixes
* tooltip for settings
* updated req
* wikipedia URL input (#424)
  * accept only wikipedia links
  * added wikipedia link
  * added wiki-link regex
  * wikipedia single url only
  * changed the alert message
  * wording change
  * pushed validation state persist error
* speech and copy (#422)
  * startTime
  * added chunk properties
  * tooltips
* Fixed issue for out of range in KNN API
* solved conflicts
* conflict solved
* Remove logging info from update KNN API
* tooltip changes
* format and lint fixes
* responsiveness changes
* Fixed issue for total pages GCS, S3
* UI polishing (#428)
  * button and tooltip changes
  * checking validation on change
  * settings module populate fix
  * format fixes
* opening the modal after auth success
* removed the limit
* added the scrollbar for dropdowns
* speech state (#426)
  * Button Details changes
  * delete wording change
* Total pages in buckets (#431)
  * page number N/A for buckets
  * added N/A for gcs and s3 pages
  * total pages for gcs
  * remove unwanted logger
* removed the max width
* Update FileTable.tsx
* Update the docker file
* Modified prompt (#438)
* Update Dockerfile
* Update Dockerfile
* Update Dockerfile
* rendering fix
* Local file upload gcs (#442)
  * Upload file to GCS
  * GCS local upload: fixed issue, and delete file from GCS after processing or when failed/cancelled
  * Add life cycle rule on uploaded bucket
  * pdf upload local and gcs bucket check
  * delete files when processed and extract changes
* Modified chat length and entities used (#443)
* metadata for unstructured files (#446)
* Unstructured file metadata (#447)
  * metadata for unstructured files
  * sleep in gcs upload
  * updated
* icons added to chunks (#435)
  * info modal icons

Co-authored-by: aashipandya <156318202+aashipandya@users.noreply.github.com>
Co-authored-by: abhishekkumar-27 <164544129+abhishekkumar-27@users.noreply.github.com>
Co-authored-by: kartikpersistent <101251502+kartikpersistent@users.noreply.github.com>
Co-authored-by: vasanthasaikalluri <165021735+vasanthasaikalluri@users.noreply.github.com>
Co-authored-by: Prakriti Solankey <156313631+prakriti-solankey@users.noreply.github.com>
Co-authored-by: Ajay Meena <meenajy1996@gmail.com>
Co-authored-by: Morgan Senechal <morgan@neo4j.com>
Co-authored-by: karanchellani <142801957+karanchellani@users.noreply.github.com>
1 parent bdc2f29 · commit 8158d29

File tree

15 files changed: +312 −204 lines


backend/Dockerfile

Lines changed: 2 additions & 11 deletions
@@ -6,7 +6,6 @@ EXPOSE 8000
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
         libgl1-mesa-glx \
-        libreoffice \
         cmake \
         poppler-utils \
         tesseract-ocr && \
@@ -19,13 +18,5 @@ COPY requirements.txt /code/
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
 # Copy application code
 COPY . /code
-RUN apt-get update \
-    && apt-get install -y libgl1-mesa-glx cmake \
-    && apt-get install -y poppler-utils \
-    && apt install -y tesseract-ocr \
-    && export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \
-    && pip install --no-cache-dir --upgrade -r /code/requirements.txt
-
-# CMD ["uvicorn", "score:app", "--host", "0.0.0.0", "--port", "8000","--workers", "4"]
-CMD ["gunicorn", "score:app","--workers","4","--worker-class","uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"]
-
+# Set command
+CMD ["gunicorn", "score:app", "--workers", "2", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"]

backend/example.env

Lines changed: 1 addition & 9 deletions
@@ -20,12 +20,4 @@ LANGCHAIN_API_KEY = ""
 LANGCHAIN_PROJECT = ""
 LANGCHAIN_TRACING_V2 = ""
 LANGCHAIN_ENDPOINT = ""
-NUMBER_OF_CHUNKS_TO_COMBINE = ""
-# NUMBER_OF_CHUNKS_ALLOWED = ""
-# Enable Gemini (default is True)
-GEMINI_ENABLED = True|False
-# Enable Google Cloud logs (default is True)
-GCP_LOG_METRICS_ENABLED = True|False
-UPDATE_GRAPH_CHUNKS_PROCESSED = 20
-NEO4J_USER_AGENT = ""
-UPDATE_GRAPH_CHUNKS_PROCESSED = 20
+GCS_FILE_CACHE = "" # save the file into GCS or local; should be True or False
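
The score.py hunk below reads this flag with os.environ.get and compares it to the string 'True', so the value is a raw string rather than a boolean. A minimal sketch of the consuming side (the print statement is illustrative only):

import os

# GCS_FILE_CACHE arrives as a plain string; only the exact value 'True'
# enables the GCS staging path, mirroring the comparison in score.py.
use_gcs = os.environ.get('GCS_FILE_CACHE') == 'True'
print('Staging uploads in GCS' if use_gcs else 'Staging uploads on local disk')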

backend/score.py

Lines changed: 10 additions & 9 deletions
@@ -168,7 +168,7 @@ async def extract_knowledge_graph_from_file(
     graphDb_data_Access = graphDBdataAccess(graph)
     if source_type == 'local file':
         result = await asyncio.to_thread(
-            extract_graph_from_file_local_file, graph, model, merged_file_path, file_name, allowedNodes, allowedRelationship, uri)
+            extract_graph_from_file_local_file, graph, model, merged_file_path, file_name, allowedNodes, allowedRelationship)

     elif source_type == 's3 bucket' and source_url:
         result = await asyncio.to_thread(
@@ -198,12 +198,7 @@ async def extract_knowledge_graph_from_file(
         graphDb_data_Access.update_exception_db(file_name,error_message)
         gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
         if source_type == 'local file':
-            if gcs_file_cache == 'True':
-                folder_name = create_gcs_bucket_folder_name_hashed(uri,file_name)
-                delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name)
-            else:
-                logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
-                delete_uploaded_local_file(merged_file_path,file_name)
+            delete_file_from_gcs(BUCKET_UPLOAD,file_name)
         josn_obj = {'message':message,'error_message':error_message, 'file_name': file_name,'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type}
         logger.log_struct(josn_obj)
         logging.exception(f'File Failed in extraction: {josn_obj}')
@@ -350,8 +345,14 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
                                         originalname=Form(None), model=Form(None), uri=Form(None), userName=Form(None),
                                         password=Form(None), database=Form(None)):
     try:
-        result = await asyncio.to_thread(upload_file,uri,userName,password,database,model,file,chunkNumber,totalChunks,originalname)
-        return create_api_response('Success', message=result)
+        graph = create_graph_database_connection(uri, userName, password, database)
+        result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, uri, CHUNK_DIR, MERGED_DIR)
+        josn_obj = {'api_name':'upload','db_url':uri}
+        logger.log_struct(josn_obj)
+        if int(chunkNumber) == int(totalChunks):
+            return create_api_response('Success',data=result, message='Source Node Created Successfully')
+        else:
+            return create_api_response('Success', message=result)
     except Exception as e:
         job_status = "Failed"
         message="Unable to upload large file into chunks or saving the chunks"

backend/src/QA_integration_new.py

Lines changed: 23 additions & 31 deletions
@@ -38,7 +38,7 @@
 MATCH (chunk)-[:PART_OF]->(d:Document)
 CALL { WITH chunk
 MATCH (chunk)-[:HAS_ENTITY]->(e)
-MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,3}(:!Chunk&!Document)
+MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
 UNWIND rels as r
 RETURN collect(distinct r) as rels
 }
@@ -49,23 +49,27 @@
 WITH d, score,
 apoc.text.join(texts,"\n----\n") +
 apoc.text.join(entities,"\n")
-as text, entities, chunkIds, page_numbers
-RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkIds:chunkIds, page_numbers:page_numbers} as metadata
+as text, entities, chunkIds, page_numbers ,start_times
+RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkIds:chunkIds, page_numbers:page_numbers,start_times:start_times,entities:entities} as metadata
 """

 SYSTEM_TEMPLATE = """
-You are an AI-powered question-answering agent. Your task is to provide accurate and concise responses to user queries based on the given context, chat history, and available resources.
+You are an AI-powered question-answering agent. Your task is to provide accurate and comprehensive responses to user queries based on the given context, chat history, and available resources.

 ### Response Guidelines:
-1. **Direct Answers**: Provide straightforward answers to the user's queries without headers unless requested. Avoid speculative responses.
+1. **Direct Answers**: Provide clear and thorough answers to the user's queries without headers unless requested. Avoid speculative responses.
 2. **Utilize History and Context**: Leverage relevant information from previous interactions, the current user input, and the context provided below.
 3. **No Greetings in Follow-ups**: Start with a greeting in initial interactions. Avoid greetings in subsequent responses unless there's a significant break or the chat restarts.
 4. **Admit Unknowns**: Clearly state if an answer is unknown. Avoid making unsupported statements.
 5. **Avoid Hallucination**: Only provide information based on the context provided. Do not invent information.
-6. **Response Length**: Keep responses concise and relevant. Aim for clarity and completeness within 2-3 sentences unless more detail is requested.
+6. **Response Length**: Keep responses concise and relevant. Aim for clarity and completeness within 4-5 sentences unless more detail is requested.
 7. **Tone and Style**: Maintain a professional and informative tone. Be friendly and approachable.
 8. **Error Handling**: If a query is ambiguous or unclear, ask for clarification rather than providing a potentially incorrect answer.
 9. **Fallback Options**: If the required information is not available in the provided context, provide a polite and helpful response. Example: "I don't have that information right now." or "I'm sorry, but I don't have that information. Is there something else I can help with?"
+10. **Context Availability**: If the context is empty, do not provide answers based solely on internal knowledge. Instead, respond appropriately by indicating the lack of information.
+
+
+**IMPORTANT** : DO NOT ANSWER FROM YOUR KNOWLEDGE BASE USE THE BELOW CONTEXT

 ### Context:
 <context>
@@ -77,15 +81,18 @@
 AI Response: 'Hello there! How can I assist you today?'

 User: "What is Langchain?"
-AI Response: "Langchain is a framework that enables the development of applications powered by large language models, such as chatbots."
+AI Response: "Langchain is a framework that enables the development of applications powered by large language models, such as chatbots. It simplifies the integration of language models into various applications by providing useful tools and components."

 User: "Can you explain how to use memory management in Langchain?"
-AI Response: "Langchain's memory management involves utilizing built-in mechanisms to manage conversational context effectively, ensuring a coherent user experience."
+AI Response: "Langchain's memory management involves utilizing built-in mechanisms to manage conversational context effectively. It ensures that the conversation remains coherent and relevant by maintaining the history of interactions and using it to inform responses."

 User: "I need help with PyCaret's classification model."
-AI Response: "PyCaret simplifies the process of building and deploying machine learning models. For classification tasks, you can use PyCaret's setup function to prepare your data, then compare and tune models."
+AI Response: "PyCaret simplifies the process of building and deploying machine learning models. For classification tasks, you can use PyCaret's setup function to prepare your data. After setup, you can compare multiple models to find the best one, and then fine-tune it for better performance."

-Note: This system does not generate answers based solely on internal knowledge. It answers from the information provided in the user's current and previous inputs, and from explicitly referenced external sources.
+User: "What can you tell me about the latest realtime trends in AI?"
+AI Response: "I don't have that information right now. Is there something else I can help with?"
+
+Note: This system does not generate answers based solely on internal knowledge. It answers from the information provided in the user's current and previous inputs, and from the context.
 """

 # def get_llm(model: str,max_tokens=CHAT_MAX_TOKENS) -> Any:
@@ -316,27 +323,12 @@ def QA_RAG(graph,model,question,session_id):
             "messages":messages
         }
     )
-    formatted_docs,sources = format_documents(docs)
-    doc_retrieval_time = time.time() - start_time
-    logging.info(f"Modified question and Documents retrieved in {doc_retrieval_time:.2f} seconds")
-
-    start_time = time.time()
-    rag_chain = get_rag_chain(llm=llm)
-    ai_response = rag_chain.invoke(
-        {
-            "messages" : messages[:-1],
-            "context" : formatted_docs,
-            "input" : question
-        }
-    )
-    result = get_sources_and_chunks(sources,docs)
-    content = ai_response.content
-    if "Gemini" in model:
-        total_tokens = ai_response.response_metadata['usage_metadata']['prompt_token_count']
-    else:
-        total_tokens = ai_response.response_metadata['token_usage']['total_tokens']
-    predict_time = time.time() - start_time
-    logging.info(f"Final Response predicted in {predict_time:.2f} seconds")
+    if docs:
+        # print(docs)
+        formatted_docs,sources = format_documents(docs)
+
+        doc_retrieval_time = time.time() - start_time
+        logging.info(f"Modified question and Documents retrieved in {doc_retrieval_time:.2f} seconds")

     start_time = time.time()
     messages.append(ai_response)
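
Two things change together here: the Cypher expansion is tightened from up to three relationship hops per entity ({0,3}) to at most two ({0,2}), and the retrieval metadata now carries start_times and entities alongside page numbers; the same tightening appears in chunkid_entities.py below. A sketch of the metadata shape a returned Document now has (all values here are made-up placeholders):

from langchain_core.documents import Document

# Keys follow the RETURN clause above; values are for illustration only.
doc = Document(
    page_content="chunk text ...",
    metadata={"source": "https://youtu.be/abc123", "chunkIds": ["c1", "c2"],
              "page_numbers": [], "start_times": [42, 185],
              "entities": ["Neo4j", "LangChain"]},
)
print(doc.metadata["start_times"])  # timestamps now travel with each source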

backend/src/chunkid_entities.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 MATCH (chunk)-[:PART_OF]->(d:Document)
 CALL {WITH chunk
 MATCH (chunk)-[:HAS_ENTITY]->(e)
-MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,3}(:!Chunk&!Document)
+MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
 UNWIND rels as r
 RETURN collect(distinct r) as rels
 }

backend/src/document_sources/gcs_bucket.py

Lines changed: 90 additions & 6 deletions
@@ -1,7 +1,13 @@
 import os
 import logging
 from google.cloud import storage
-from langchain_community.document_loaders import GCSFileLoader
+from langchain_community.document_loaders import GCSFileLoader, GCSDirectoryLoader
+from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_core.documents import Document
+from PyPDF2 import PdfReader
+import io
+from google.oauth2.credentials import Credentials
+import time

 def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, creds):
     storage_client = storage.Client(project=gcs_project_id, credentials=creds)
@@ -36,7 +42,7 @@ def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder
 def load_pdf(file_path):
     return PyMuPDFLoader(file_path)

-def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename):
+def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):

     if gcs_bucket_folder is not None:
         if gcs_bucket_folder.endswith('/'):
@@ -47,8 +53,86 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
         blob_name = gcs_blob_filename
     #credentials, project_id = google.auth.default()
     logging.info(f"GCS project_id : {gcs_project_id}")
-    loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name)
-    pages = loader.load()
-    file_name = gcs_blob_filename
-    return file_name, pages
+    #loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_pdf)
+    # pages = loader.load()
+    # file_name = gcs_blob_filename
+    #creds= Credentials(access_token)
+    if access_token is None:
+        storage_client = storage.Client(project=gcs_project_id)
+    else:
+        creds= Credentials(access_token)
+        storage_client = storage.Client(project=gcs_project_id, credentials=creds)
+    print(f'BLOB Name: {blob_name}')
+    bucket = storage_client.bucket(gcs_bucket_name)
+    blob = bucket.blob(blob_name)
+    content = blob.download_as_bytes()
+    pdf_file = io.BytesIO(content)
+    pdf_reader = PdfReader(pdf_file)
+
+    # Extract text from all pages
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    pages = [Document(page_content = text)]
+    return gcs_blob_filename, pages
+
+def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name):
+    storage_client = storage.Client()
+
+    file_name = f'{original_file_name}_part_{chunk_number}'
+    bucket = storage_client.bucket(bucket_name)
+    file_data = file_chunk.file.read()
+    # print(f'data after read {file_data}')

+    blob = bucket.blob(file_name)
+    file_io = io.BytesIO(file_data)
+    blob.upload_from_file(file_io)
+    # Define the lifecycle rule to delete objects after 6 hours
+    # rule = {
+    #     "action": {"type": "Delete"},
+    #     "condition": {"age": 1}  # Age in days (24 hours = 1 days)
+    # }
+
+    # # Get the current lifecycle policy
+    # lifecycle = list(bucket.lifecycle_rules)
+
+    # # Add the new rule
+    # lifecycle.append(rule)
+
+    # # Set the lifecycle policy on the bucket
+    # bucket.lifecycle_rules = lifecycle
+    # bucket.patch()
+    time.sleep(1)
+    logging.info('Chunk uploaded successfully in gcs')
+
+def merge_file_gcs(bucket_name, original_file_name: str):
+    storage_client = storage.Client()
+    # Retrieve chunks from GCS
+    blobs = storage_client.list_blobs(bucket_name, prefix=f"{original_file_name}_part_")
+    chunks = []
+    for blob in blobs:
+        chunks.append(blob.download_as_bytes())
+        blob.delete()
+
+    # Merge chunks into a single file
+    merged_file = b"".join(chunks)
+    blob = storage_client.bucket(bucket_name).blob(original_file_name)
+    logging.info('save the merged file from chunks in gcs')
+    file_io = io.BytesIO(merged_file)
+    blob.upload_from_file(file_io)
+    pdf_reader = PdfReader(file_io)
+    file_size = len(merged_file)
+    total_pages = len(pdf_reader.pages)
+
+    return total_pages, file_size
+
+def delete_file_from_gcs(bucket_name, file_name):
+    try:
+        storage_client = storage.Client()
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.blob(file_name)
+        if blob.exists():
+            blob.delete()
+        logging.info('File deleted from GCS successfully')
+    except:
+        raise Exception('BLOB not exists in GCS')
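
Taken together, these new helpers implement the chunked-upload lifecycle: stage each part in the bucket, merge once all parts have arrived, and delete after processing. A usage sketch under stated assumptions: the import path and bucket name are placeholders, and `parts` stands in for the UploadFile chunks the upload endpoint receives:

from src.document_sources.gcs_bucket import (
    upload_file_to_gcs, merge_file_gcs, delete_file_from_gcs)

BUCKET = "my-upload-bucket"  # placeholder bucket name

for n, part in enumerate(parts, start=1):   # part: an UploadFile-like object
    upload_file_to_gcs(part, n, "report.pdf", BUCKET)

# After the last part: stitch the chunks back into one blob and measure it.
total_pages, file_size = merge_file_gcs(BUCKET, "report.pdf")
print(f"report.pdf merged: {total_pages} pages, {file_size} bytes")

delete_file_from_gcs(BUCKET, "report.pdf")  # cleanup once processing is done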
