STAGING to main #448
Merged 1 commit from STAGING into main on Jun 18, 2024
29 changes: 18 additions & 11 deletions backend/Dockerfile
@@ -1,15 +1,22 @@
FROM python:3.10
FROM python:3.10-slim
WORKDIR /code
ENV PORT 8000
EXPOSE 8000
# Install dependencies and clean up in one layer
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libgl1-mesa-glx \
cmake \
poppler-utils \
tesseract-ocr && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
# Copy requirements file and install Python dependencies
COPY requirements.txt /code/
RUN pip install --no-cache-dir --upgrade -r requirements.txt
# Copy application code
COPY . /code
RUN apt-get update \
&& apt-get install -y libgl1-mesa-glx cmake \
&& apt-get install -y poppler-utils \
&& apt install -y tesseract-ocr \
&& export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \
&& pip install --no-cache-dir --upgrade -r /code/requirements.txt

# CMD ["uvicorn", "score:app", "--host", "0.0.0.0", "--port", "8000","--workers", "4"]
CMD ["gunicorn", "score:app","--workers","4","--worker-class","uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"]

# Set command
CMD ["gunicorn", "score:app", "--workers", "2", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"]
1 change: 1 addition & 0 deletions backend/example.env
@@ -20,3 +20,4 @@ LANGCHAIN_API_KEY = ""
LANGCHAIN_PROJECT = ""
LANGCHAIN_TRACING_V2 = ""
LANGCHAIN_ENDPOINT = ""
GCS_FILE_CACHE = "" # Save the file into GCS or local storage; should be True or False
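The new `GCS_FILE_CACHE` flag controls whether uploaded files are cached in GCS or on local disk, which is why the error paths below now call `delete_file_from_gcs` instead of `delete_uploaded_local_file`. A minimal sketch of how such a flag could gate the two cleanup paths (the helper itself is hypothetical; the imports follow the modules visible elsewhere in this diff):

```python
import os

from src.document_sources.gcs_bucket import delete_file_from_gcs
from src.shared.common_fn import delete_uploaded_local_file
from src.shared.constants import BUCKET_UPLOAD


def cleanup_uploaded_file(merged_file_path: str, file_name: str) -> None:
    """Hypothetical helper: choose the delete path based on GCS_FILE_CACHE."""
    # Environment values are strings, so normalise "True"/"False" explicitly.
    use_gcs = os.environ.get("GCS_FILE_CACHE", "False").strip().lower() == "true"
    if use_gcs:
        delete_file_from_gcs(BUCKET_UPLOAD, file_name)
    else:
        delete_uploaded_local_file(merged_file_path, file_name)
```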
2 changes: 1 addition & 1 deletion backend/requirements.txt
@@ -43,7 +43,7 @@ google-cloud-aiplatform
google-cloud-bigquery==3.19.0
google-cloud-core==2.4.1
google-cloud-resource-manager==1.12.3
google-cloud-storage==2.16.0
google-cloud-storage
google-crc32c==1.5.0
google-resumable-media==2.7.0
googleapis-common-protos==1.63.0
6 changes: 3 additions & 3 deletions backend/score.py
@@ -162,7 +162,7 @@ async def extract_knowledge_graph_from_file(
merged_file_path = os.path.join(MERGED_DIR,file_name)
logging.info(f'File path:{merged_file_path}')
result = await asyncio.to_thread(
extract_graph_from_file_local_file, graph, model, file_name, merged_file_path, allowedNodes, allowedRelationship)
extract_graph_from_file_local_file, graph, model, merged_file_path, file_name, allowedNodes, allowedRelationship)

elif source_type == 's3 bucket' and source_url:
result = await asyncio.to_thread(
@@ -191,7 +191,7 @@ async def extract_knowledge_graph_from_file(
error_message = str(e)
graphDb_data_Access.update_exception_db(file_name,error_message)
if source_type == 'local file':
delete_uploaded_local_file(merged_file_path, file_name)
delete_file_from_gcs(BUCKET_UPLOAD,file_name)
josn_obj = {'message':message,'error_message':error_message, 'file_name': file_name,'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type}
logger.log_struct(josn_obj)
logging.exception(f'File Failed in extraction: {josn_obj}')
@@ -342,7 +342,7 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
password=Form(None), database=Form(None)):
try:
graph = create_graph_database_connection(uri, userName, password, database)
result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, CHUNK_DIR, MERGED_DIR)
result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, uri, CHUNK_DIR, MERGED_DIR)
josn_obj = {'api_name':'upload','db_url':uri}
logger.log_struct(josn_obj)
if int(chunkNumber) == int(totalChunks):
27 changes: 18 additions & 9 deletions backend/src/QA_integration_new.py
@@ -33,7 +33,7 @@
MATCH (chunk)-[:PART_OF]->(d:Document)
CALL { WITH chunk
MATCH (chunk)-[:HAS_ENTITY]->(e)
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,3}(:!Chunk&!Document)
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
UNWIND rels as r
RETURN collect(distinct r) as rels
}
@@ -45,22 +45,26 @@
apoc.text.join(texts,"\n----\n") +
apoc.text.join(entities,"\n")
as text, entities, chunkIds, page_numbers ,start_times
RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkIds:chunkIds, page_numbers:page_numbers,start_times:start_times} as metadata
RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkIds:chunkIds, page_numbers:page_numbers,start_times:start_times,entities:entities} as metadata
"""

SYSTEM_TEMPLATE = """
You are an AI-powered question-answering agent. Your task is to provide accurate and concise responses to user queries based on the given context, chat history, and available resources.
You are an AI-powered question-answering agent. Your task is to provide accurate and comprehensive responses to user queries based on the given context, chat history, and available resources.

### Response Guidelines:
1. **Direct Answers**: Provide straightforward answers to the user's queries without headers unless requested. Avoid speculative responses.
1. **Direct Answers**: Provide clear and thorough answers to the user's queries without headers unless requested. Avoid speculative responses.
2. **Utilize History and Context**: Leverage relevant information from previous interactions, the current user input, and the context provided below.
3. **No Greetings in Follow-ups**: Start with a greeting in initial interactions. Avoid greetings in subsequent responses unless there's a significant break or the chat restarts.
4. **Admit Unknowns**: Clearly state if an answer is unknown. Avoid making unsupported statements.
5. **Avoid Hallucination**: Only provide information based on the context provided. Do not invent information.
6. **Response Length**: Keep responses concise and relevant. Aim for clarity and completeness within 2-3 sentences unless more detail is requested.
6. **Response Length**: Keep responses concise and relevant. Aim for clarity and completeness within 4-5 sentences unless more detail is requested.
7. **Tone and Style**: Maintain a professional and informative tone. Be friendly and approachable.
8. **Error Handling**: If a query is ambiguous or unclear, ask for clarification rather than providing a potentially incorrect answer.
9. **Fallback Options**: If the required information is not available in the provided context, provide a polite and helpful response. Example: "I don't have that information right now." or "I'm sorry, but I don't have that information. Is there something else I can help with?"
10. **Context Availability**: If the context is empty, do not provide answers based solely on internal knowledge. Instead, respond appropriately by indicating the lack of information.


**IMPORTANT** : DO NOT ANSWER FROM YOUR KNOWLEDGE BASE USE THE BELOW CONTEXT

### Context:
<context>
Expand All @@ -72,15 +76,18 @@
AI Response: 'Hello there! How can I assist you today?'

User: "What is Langchain?"
AI Response: "Langchain is a framework that enables the development of applications powered by large language models, such as chatbots."
AI Response: "Langchain is a framework that enables the development of applications powered by large language models, such as chatbots. It simplifies the integration of language models into various applications by providing useful tools and components."

User: "Can you explain how to use memory management in Langchain?"
AI Response: "Langchain's memory management involves utilizing built-in mechanisms to manage conversational context effectively, ensuring a coherent user experience."
AI Response: "Langchain's memory management involves utilizing built-in mechanisms to manage conversational context effectively. It ensures that the conversation remains coherent and relevant by maintaining the history of interactions and using it to inform responses."

User: "I need help with PyCaret's classification model."
AI Response: "PyCaret simplifies the process of building and deploying machine learning models. For classification tasks, you can use PyCaret's setup function to prepare your data, then compare and tune models."
AI Response: "PyCaret simplifies the process of building and deploying machine learning models. For classification tasks, you can use PyCaret's setup function to prepare your data. After setup, you can compare multiple models to find the best one, and then fine-tune it for better performance."

User: "What can you tell me about the latest realtime trends in AI?"
AI Response: "I don't have that information right now. Is there something else I can help with?"

Note: This system does not generate answers based solely on internal knowledge. It answers from the information provided in the user's current and previous inputs, and from explicitly referenced external sources.
Note: This system does not generate answers based solely on internal knowledge. It answers from the information provided in the user's current and previous inputs, and from the context.
"""

def get_neo4j_retriever(graph, index_name="vector", search_k=CHAT_SEARCH_KWARG_K, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
@@ -288,7 +295,9 @@ def QA_RAG(graph,model,question,session_id):
}
)
if docs:
# print(docs)
formatted_docs,sources = format_documents(docs)

doc_retrieval_time = time.time() - start_time
logging.info(f"Modified question and Documents retrieved in {doc_retrieval_time:.2f} seconds")

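The reworked `SYSTEM_TEMPLATE` now allows 4-5 sentence answers and explicitly forbids answering from the model's own knowledge when the retrieved context is empty. A sketch of how the template can be wired into a chat prompt (assuming LangChain's `ChatPromptTemplate`/`MessagesPlaceholder` API and that the template exposes a `{context}` placeholder inside the `<context>` tags; the variable names here are illustrative, not necessarily what `QA_RAG` uses):

```python
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# SYSTEM_TEMPLATE is the string defined above; {context} is filled with the
# formatted documents returned by format_documents().
rag_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_TEMPLATE),
        MessagesPlaceholder(variable_name="messages"),  # prior chat turns
        ("human", "{input}"),
    ]
)

# Example wiring (llm is any LangChain chat model):
# chain = rag_prompt | llm
# answer = chain.invoke({"messages": history.messages,
#                        "context": formatted_docs,
#                        "input": question})
```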
2 changes: 1 addition & 1 deletion backend/src/chunkid_entities.py
@@ -10,7 +10,7 @@
MATCH (chunk)-[:PART_OF]->(d:Document)
CALL {WITH chunk
MATCH (chunk)-[:HAS_ENTITY]->(e)
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,3}(:!Chunk&!Document)
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
UNWIND rels as r
RETURN collect(distinct r) as rels
}
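Both retrieval queries in this PR shrink the entity-neighbourhood expansion from `{0,3}` to `{0,2}` hops, which keeps the traversal and the amount of graph context returned per chunk smaller. A sketch of keeping that bound in one constant (a hypothetical helper, not the project's actual `CHUNK_QUERY`; only the quantified path pattern is taken from the diff, the surrounding MATCH/RETURN clauses are illustrative, and the hop bound is interpolated into the string because Cypher does not accept it as a query parameter):

```python
from langchain_community.graphs import Neo4jGraph

MAX_NEIGHBOUR_HOPS = 2  # the bound this PR settles on: {0,2} instead of {0,3}

CHUNK_NEIGHBOURHOOD_QUERY = f"""
MATCH (chunk:Chunk) WHERE chunk.id IN $chunk_ids
MATCH (chunk)-[:PART_OF]->(d:Document)
CALL {{ WITH chunk
    MATCH (chunk)-[:HAS_ENTITY]->(e)
    MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){{0,{MAX_NEIGHBOUR_HOPS}}}(:!Chunk&!Document)
    UNWIND rels AS r
    RETURN collect(DISTINCT r) AS rels
}}
RETURN chunk, d, rels
"""


def get_chunk_neighbourhood(graph: Neo4jGraph, chunk_ids: list[str]):
    # Neo4jGraph.query runs raw Cypher with bound parameters.
    return graph.query(CHUNK_NEIGHBOURHOOD_QUERY, params={"chunk_ids": chunk_ids})
```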
73 changes: 69 additions & 4 deletions backend/src/document_sources/gcs_bucket.py
@@ -7,6 +7,7 @@
from PyPDF2 import PdfReader
import io
from google.oauth2.credentials import Credentials
import time

def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, creds):
storage_client = storage.Client(project=gcs_project_id, credentials=creds)
@@ -41,7 +42,7 @@ def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder
def load_pdf(file_path):
return PyMuPDFLoader(file_path)

def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token):
def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):

if gcs_bucket_folder is not None:
if gcs_bucket_folder.endswith('/'):
@@ -56,8 +57,12 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
# pages = loader.load()
# file_name = gcs_blob_filename
#creds= Credentials(access_token)
creds= Credentials(access_token)
storage_client = storage.Client(project=gcs_project_id, credentials=creds)
if access_token is None:
storage_client = storage.Client(project=gcs_project_id)
else:
creds= Credentials(access_token)
storage_client = storage.Client(project=gcs_project_id, credentials=creds)
print(f'BLOB Name: {blob_name}')
bucket = storage_client.bucket(gcs_bucket_name)
blob = bucket.blob(blob_name)
content = blob.download_as_bytes()
@@ -70,4 +75,64 @@
text += page.extract_text()
pages = [Document(page_content = text)]
return gcs_blob_filename, pages


def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name):
storage_client = storage.Client()

file_name = f'{original_file_name}_part_{chunk_number}'
bucket = storage_client.bucket(bucket_name)
file_data = file_chunk.file.read()
# print(f'data after read {file_data}')

blob = bucket.blob(file_name)
file_io = io.BytesIO(file_data)
blob.upload_from_file(file_io)
# Define the lifecycle rule to delete objects after 6 hours
# rule = {
# "action": {"type": "Delete"},
# "condition": {"age": 1} # Age in days (24 hours = 1 days)
# }

# # Get the current lifecycle policy
# lifecycle = list(bucket.lifecycle_rules)

# # Add the new rule
# lifecycle.append(rule)

# # Set the lifecycle policy on the bucket
# bucket.lifecycle_rules = lifecycle
# bucket.patch()
time.sleep(1)
logging.info('Chunk uploaded successfully in gcs')

def merge_file_gcs(bucket_name, original_file_name: str):
storage_client = storage.Client()
# Retrieve chunks from GCS
blobs = storage_client.list_blobs(bucket_name, prefix=f"{original_file_name}_part_")
chunks = []
for blob in blobs:
chunks.append(blob.download_as_bytes())
blob.delete()

# Merge chunks into a single file
merged_file = b"".join(chunks)
blob = storage_client.bucket(bucket_name).blob(original_file_name)
logging.info('save the merged file from chunks in gcs')
file_io = io.BytesIO(merged_file)
blob.upload_from_file(file_io)
pdf_reader = PdfReader(file_io)
file_size = len(merged_file)
total_pages = len(pdf_reader.pages)

return total_pages, file_size

def delete_file_from_gcs(bucket_name, file_name):
try:
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(file_name)
if blob.exists():
blob.delete()
logging.info('File deleted from GCS successfully')
except:
raise Exception('BLOB not exists in GCS')
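The new GCS helpers mirror the chunked-upload flow that previously ran against local disk: each incoming chunk is stored as `<original_name>_part_<n>`, the parts are merged back into a single blob once the last chunk arrives, and the blob is deleted when the document is removed. A minimal usage sketch (the bucket name, file name and driver function here are illustrative; in the backend, `upload_file` orchestrates this):

```python
from src.document_sources.gcs_bucket import (
    delete_file_from_gcs,
    merge_file_gcs,
    upload_file_to_gcs,
)

BUCKET = "my-upload-bucket"   # assumed bucket name, for illustration only
ORIGINAL_NAME = "report.pdf"


def handle_chunk(file_chunk, chunk_number: int, total_chunks: int):
    """Hypothetical driver: store one chunk, merge when the last one arrives."""
    upload_file_to_gcs(file_chunk, chunk_number, ORIGINAL_NAME, BUCKET)
    if int(chunk_number) == int(total_chunks):
        # Stitch report.pdf_part_1..n back into report.pdf and report its
        # page count and size, as merge_file_gcs does above.
        total_pages, file_size = merge_file_gcs(BUCKET, ORIGINAL_NAME)
        return total_pages, file_size
    return None

# Later, when the document is deleted from the graph:
# delete_file_from_gcs(BUCKET, ORIGINAL_NAME)
```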
7 changes: 5 additions & 2 deletions backend/src/document_sources/local_file.py
@@ -43,13 +43,17 @@ def get_documents_from_file_by_path(file_path,file_name):

if page.metadata['page_number']>page_number:
page_number+=1
if not metadata:
metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content, metadata=metadata))
page_content=''

if page == unstructured_pages[-1]:
if not metadata:
metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content, metadata=metadata))

elif page.metadata['category']=='PageBreak':
elif page.metadata['category']=='PageBreak' and page!=unstructured_pages[0]:
page_number+=1
pages.append(Document(page_content = page_content, metadata=metadata))
page_content=''
@@ -65,5 +69,4 @@ def get_documents_from_file_by_path(file_path,file_name):
else:
logging.info(f'File {file_name} does not exist')
raise Exception(f'File {file_name} does not exist')

return file_name, pages , file_extension
6 changes: 3 additions & 3 deletions backend/src/graphDB_dataAccess.py
@@ -2,8 +2,8 @@
import os
from datetime import datetime
from langchain_community.graphs import Neo4jGraph
from src.shared.common_fn import delete_uploaded_local_file
from src.api_response import create_api_response
from src.document_sources.gcs_bucket import delete_file_from_gcs
from src.shared.constants import BUCKET_UPLOAD
from src.entities.source_node import sourceNode
import json

@@ -175,7 +175,7 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
merged_file_path = os.path.join(merged_dir, file_name)
if source_type == 'local file':
logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
delete_uploaded_local_file(merged_file_path, file_name)
delete_file_from_gcs(BUCKET_UPLOAD,file_name)

query_to_delete_document="""
MATCH (d:Document) where d.fileName in $filename_list and d.fileSource in $source_types_list