35 changes: 31 additions & 4 deletions backend/Dockerfile
@@ -1,26 +1,53 @@
FROM python:3.10-slim
FROM python:3.10-slim-bullseye
WORKDIR /code
ENV PORT 8000
EXPOSE 8000
# Install dependencies and clean up in one layer

# Clean up apt cache and install keyring packages
RUN rm -rf /var/lib/apt/lists/* && \
apt-get clean && \
apt-get -o Acquire::AllowInsecureRepositories=true update && \
apt-get install -y --no-install-recommends --no-install-suggests \
debian-keyring \
debian-archive-keyring \
gnupg \
ca-certificates && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Now update and install dependencies including build tools
RUN apt-get update && \
apt-get install -y --no-install-recommends \
apt-get install -y --no-install-recommends --no-install-suggests \
libmagic1 \
libgl1-mesa-glx \
libreoffice \
cmake \
poppler-utils \
tesseract-ocr && \
tesseract-ocr \
build-essential \
python3-dev \
gcc \
autoconf \
automake \
libtool \
pkg-config && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Manually fetch the latest Debian signing key used to verify apt repositories
RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138

# Set LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH

# Copy requirements file and install Python dependencies
COPY requirements.txt /code/
# --no-cache-dir --upgrade
RUN pip install -r requirements.txt

# Copy application code
COPY . /code

# Set command
# CMD ["gunicorn", "score:app", "--workers", "8","--threads", "8", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"] # original line
CMD ["gunicorn", "score:app", "--workers", "2","--threads", "2", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"]
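
As a quick sanity check that the system libraries installed above are reachable from the Python layer, a minimal sketch like the following could be run inside the built container (python-magic, pinned in requirements.txt, binds to the libmagic1 package installed here; the test bytes are illustrative):

import magic  # python-magic, backed by the libmagic1 system package

# "%PDF-" is the standard PDF magic prefix, so libmagic should
# report this buffer's MIME type as application/pdf.
print(magic.from_buffer(b"%PDF-1.7\n", mime=True))
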
3 changes: 3 additions & 0 deletions backend/Pipfile
@@ -41,6 +41,7 @@ python-dotenv = "==1.0.1"
python-magic = "==0.4.27"
pypdf2 = "==3.0.1"
pymupdf = "==1.24.14"
pycryptodome = "*"
starlette = "==0.41.3"
sse-starlette = "==2.1.3"
starlette-session = "==0.4.3"
@@ -64,6 +65,8 @@ ragas = "==0.2.6"
rouge-score = "==0.1.2"
langchain-neo4j = "==0.1.1"
unstructured = "==0.16.11"
pi-heif = "*"
"pdfminer.six" = "==20240706"

[dev-packages]

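The three new packages plausibly round out document handling: pycryptodome supplies the AES primitives that pypdf2 and pdfminer.six need for encrypted PDFs, and pi-heif adds HEIF image support for unstructured's image pipeline. A minimal sketch of the decryption path this enables (the file name and password below are invented for illustration):

from PyPDF2 import PdfReader  # pypdf2 3.0.1, pinned above

reader = PdfReader("report-encrypted.pdf")  # hypothetical file
if reader.is_encrypted:
    # AES-encrypted PDFs need pycryptodome installed at runtime.
    reader.decrypt("s3cret")  # hypothetical password
print(len(reader.pages))
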
2,853 changes: 1,505 additions & 1,348 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions backend/requirements.txt
@@ -1,3 +1,4 @@
pycryptodome==3.22.0
asyncio==3.4.3
boto3==1.35.69
botocore==1.35.69
@@ -59,3 +60,4 @@ Secweb==1.11.0
ragas==0.2.6
rouge_score==0.1.2
langchain-neo4j==0.1.1
pdfminer.six==20240706
56 changes: 49 additions & 7 deletions backend/score.py
@@ -18,7 +18,7 @@
from src.communities import create_communities
from src.neighbours import get_neighbour_nodes
import json
from typing import List
from typing import List, Optional, Dict, Any
from starlette.middleware.sessions import SessionMiddleware
from google.oauth2.credentials import Credentials
import os
@@ -34,22 +34,51 @@
from langchain_neo4j import Neo4jGraph
from pydantic import BaseModel

# this load_dotenv call was added by ian for non-docker environments
from dotenv import load_dotenv

# Get the absolute path to the .env file
env_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.env')
logging.info(f"Loading .env file from: {env_path}")

# Debug: Print contents of .env file
try:
with open(env_path, 'r') as f:
env_contents = f.read()
logging.info(f"\nContents of main .env file:\n{env_contents}")
except Exception as e:
logging.error(f"Error reading .env file: {e}")

load_dotenv(env_path)

class Message(BaseModel):
role: str
content: str

class MessageData(BaseModel):
messages: List[Message] # Wrap the messages array in an object
question: str
filter_properties: Optional[Dict[str, Any]] = None
requireGrounding: bool = True
# tools: List[str]

logger = CustomLogger()
CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
MERGED_DIR = os.path.join(os.path.dirname(__file__), "merged_files")

def printEnvKey(key):
value = os.environ.get(key)
logging.info(f"{key} = {value}")
if key in os.environ:
value = os.environ.get(key)
logging.info(f"{key} exists and has value: {value}")
else:
logging.info(f"{key} does NOT exist in environment variables")

def printAllEnvVars():
logging.info("All environment variables:")
for key, value in os.environ.items():
logging.info(f"{key} = {value}")

# printAllEnvVars()

def healthy_condition():
output = {"healthy": True}
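
The new MessageData fields above surface directly in the chat endpoint's request body. For illustration, a request exercising them might look like this (the host, route path, and the keys inside filter_properties are assumptions, not shown in this diff):

import requests  # hypothetical client call; the route path is an assumption

payload = {
    "messages": [{"role": "user", "content": "Summarize unit 3."}],
    "question": "Summarize unit 3.",
    # Forwarded down to the vector retriever as a metadata filter;
    # the "grade" key is purely illustrative.
    "filter_properties": {"grade": "6"},
    # False switches the RAG chain to the ungrounded system template.
    "requireGrounding": True,
}
print(requests.post("http://localhost:8000/chat_bot", json=payload).json())
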
@@ -388,9 +417,12 @@ async def magic_trek_chat_bot(
logging.info(messageData)
messages = messageData.messages
question = messageData.question
filter_properties = messageData.filter_properties
requireGrounding = messageData.requireGrounding if hasattr(messageData, 'requireGrounding') else True
print(f'question = {question}')
print(messages)
print(len(messages))
logging.info(f"requireGrounding = {requireGrounding}")
logging.info(f"IAN-TEST called at {datetime.now()}")
qa_rag_start_time = time.time()

@@ -401,10 +433,18 @@
## added by ian
# mode = "graph_vector_fulltext" ## added by ian
mode = "graph_vector" ## changed by ian from graph_vector_fulltext to graph_vector april 1, 2025
uri = "neo4j+s://939cd2a2.databases.neo4j.io"

# magic trek rag grade 6
# uri = "neo4j+s://939cd2a2.databases.neo4j.io"
# userName = "neo4j"
# password = "4SG1SUwI22yLo6DMNkRZe1ItvGp778LIEFpCE_aOEL0"
# database = "neo4j"
# trek-labs-rag-db-dev
uri = "neo4j+s://90ea8c8c.databases.neo4j.io"
userName = "neo4j"
password = "4SG1SUwI22yLo6DMNkRZe1ItvGp778LIEFpCE_aOEL0"
password = "5MJ_WCRv7htDYH5XeQbQU1vVjuw9GC7F-bkPE8kKymk"
database = "neo4j"

# model = "openai_gpt_3.5" # changed from this model by ian friday mar 21, 2025
model = "openai_gpt_4o_mini"
# model = "gpt-4-turbo-preview"
@@ -421,15 +461,17 @@
graph_DB_dataAccess = graphDBdataAccess(graph)
write_access = graph_DB_dataAccess.check_account_access(database=database)
result = await asyncio.to_thread(
MAGIC_TREK_QA_RAG ,
MAGIC_TREK_QA_RAG,
graph=graph,
model=model,
messages=messages,
question=question,
document_names=document_names,
session_id=session_id,
mode=mode,
write_access=write_access
write_access=write_access,
filter_properties=filter_properties,
requireGrounding=requireGrounding
)

total_call_time = time.time() - qa_rag_start_time
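
The endpoint wraps the synchronous RAG pipeline in asyncio.to_thread so the event loop keeps serving other requests during retrieval and generation. A self-contained sketch of that pattern (the sleep stands in for retrieval plus LLM latency):

import asyncio
import time

def blocking_rag(question: str) -> str:
    time.sleep(1)  # stands in for retrieval + LLM latency
    return f"answer to {question!r}"

async def handler() -> None:
    # Runs blocking_rag in a worker thread; the loop stays free.
    answer = await asyncio.to_thread(blocking_rag, "What is in unit 3?")
    print(answer)

asyncio.run(handler())
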
49 changes: 34 additions & 15 deletions backend/src/QA_integration.py
@@ -8,6 +8,10 @@
from typing import Any
from dotenv import load_dotenv

# Set tokenizer parallelism to false to avoid warnings and deadlock
# added by ian because of the Hugging Face tokenizer threading deadlock warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain_neo4j import Neo4jVector
from langchain_neo4j import Neo4jChatMessageHistory
from langchain_neo4j import GraphCypherQAChain
@@ -75,6 +79,7 @@ def on_llm_end(
) -> None:
logging.info("question transformed")
self.transformed_question = response.generations[0][0].text.strip()
logging.info(self.transformed_question)

def get_history_by_session_id(session_id):
try:
@@ -226,13 +231,13 @@ def format_documents(documents, model):

return "\n\n".join(formatted_docs), sources,entities,global_communities

def process_documents(docs, question, messages, llm, model,chat_mode_settings):
def process_documents(docs, question, messages, llm, model,chat_mode_settings, requireGrounding=True):
start_time = time.time()

try:
formatted_docs, sources, entitydetails, communities = format_documents(docs, model)

rag_chain = get_rag_chain(llm=llm)
logging.info(f"process_documents requireGrounding = {requireGrounding}")
rag_chain = get_rag_chain(llm=llm, system_template=CHAT_SYSTEM_TEMPLATE if requireGrounding else CHAT_SYSTEM_TEMPLATE_UNGROUNDED)

ai_response = rag_chain.invoke({
"messages": messages[:-1],
@@ -301,6 +306,8 @@ def create_document_retriever_chain(llm, retriever):
MessagesPlaceholder(variable_name="messages")
]
)
logging.info("query_transform_prompt created")
logging.info(query_transform_prompt)

output_parser = StrOutputParser()

@@ -376,7 +383,7 @@ def initialize_neo4j_vector(graph, chat_mode_settings):
raise
return neo_db

def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_threshold):
def create_retriever(neo_db, document_names, chat_mode_settings, search_k, score_threshold, filter_properties=None):
if document_names and chat_mode_settings["document_filter"]:
retriever = neo_db.as_retriever(
search_type="similarity_score_threshold",
@@ -388,19 +395,26 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
)
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold} for documents {document_names}")
else:
search_kwargs = {
'k': search_k,
'score_threshold': score_threshold
}
if filter_properties is not None:
search_kwargs['filter'] = filter_properties

retriever = neo_db.as_retriever(
search_type="similarity_score_threshold",
search_kwargs={'k': search_k, 'score_threshold': score_threshold}
search_kwargs=search_kwargs
)
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
return retriever

def get_neo4j_retriever(graph, document_names,chat_mode_settings, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
def get_neo4j_retriever(graph, document_names, chat_mode_settings, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD, filter_properties=None):
try:
neo_db = initialize_neo4j_vector(graph, chat_mode_settings)
# document_names= list(map(str.strip, json.loads(document_names)))
search_k = chat_mode_settings["top_k"]
retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold)
retriever = create_retriever(neo_db, document_names, chat_mode_settings, search_k, score_threshold, filter_properties)
return retriever
except Exception as e:
index_name = chat_mode_settings.get("index_name")
@@ -451,7 +465,7 @@ def extract_tool_calls_direct(model, messages):
tools = list(map(remap_tool_names, tools))
return tools

def setup_chat(model, graph, document_names, chat_mode_settings):
def setup_chat(model, graph, document_names, chat_mode_settings, filter_properties=None):
start_time = time.time()
try:
if model == "diffbot":
Expand All @@ -460,7 +474,7 @@ def setup_chat(model, graph, document_names, chat_mode_settings):
llm, model_name = get_llm(model=model)
logging.info(f"Model called in chat: {model} (version: {model_name})")

retriever = get_neo4j_retriever(graph=graph, chat_mode_settings=chat_mode_settings, document_names=document_names)
retriever = get_neo4j_retriever(graph=graph, chat_mode_settings=chat_mode_settings, document_names=document_names, filter_properties=filter_properties)
doc_retriever = create_document_retriever_chain(llm, retriever)

chat_setup_time = time.time() - start_time
@@ -507,9 +521,9 @@ def extract_tool_calls(model, messages):
return tools


def process_chat_response(messages, history, question, model, graph, document_names, chat_mode_settings, extract_tools = False):
def process_chat_response(messages, history, question, model, graph, document_names, chat_mode_settings, extract_tools=False, filter_properties=None, requireGrounding=True):
try:
llm, doc_retriever, model_version = setup_chat(model, graph, document_names, chat_mode_settings)
llm, doc_retriever, model_version = setup_chat(model, graph, document_names, chat_mode_settings, filter_properties)

# Shared variable to store tool calls, initialized as empty list
tool_calls = []
@@ -535,7 +549,7 @@ def extract_tools_thread():

if docs:
logging.info("documents found, process_documents about to be called")
content, result, total_tokens, formatted_docs = process_documents(docs, question, messages, llm, model, chat_mode_settings)
content, result, total_tokens, formatted_docs = process_documents(docs, question, messages, llm, model, chat_mode_settings, requireGrounding)
else:
logging.info("No document clause running")
content = "I couldn't find any relevant documents to answer your question."
@@ -777,11 +791,12 @@ def convert_messages_to_langchain(messages):

return langchain_messages

def MAGIC_TREK_QA_RAG(graph,model, messages, question, document_names, session_id, mode, write_access=True):
def MAGIC_TREK_QA_RAG(graph,model, messages, question, document_names, session_id, mode, write_access=True, filter_properties=None, requireGrounding=True):
logging.info(f"Chat Mode: {mode}")
document_names = "[]"

logging.info(f"question = {question}")
logging.info(f"filter_properties = {filter_properties}")
# receive the message history from our frontend
messages = convert_messages_to_langchain(messages)
logging.info("translated message history:")
@@ -802,6 +817,7 @@ def MAGIC_TREK_QA_RAG(graph,model, messages, question, document_names, session_i
logging.info(messages)

if mode == CHAT_GRAPH_MODE:
logging.info("process_graph_response called")
result = process_graph_response(model, graph, question, messages, history=None)
else:
chat_mode_settings = get_chat_mode_settings(mode=mode)
@@ -823,6 +839,7 @@ def MAGIC_TREK_QA_RAG(graph,model, messages, question, document_names, session_i
"user": "chatbot"
}
else:
logging.info("process_chat_response called")
result = process_chat_response(
messages=messages,
history=None, # TODO: once we set up history pass it again instead of None
Expand All @@ -831,7 +848,9 @@ def MAGIC_TREK_QA_RAG(graph,model, messages, question, document_names, session_i
graph=graph,
document_names=document_names,
chat_mode_settings=chat_mode_settings,
extract_tools=True
extract_tools=True,
filter_properties=filter_properties,
requireGrounding=requireGrounding
)

# result["session_id"] = session_id
Expand Down Expand Up @@ -876,7 +895,7 @@ def QA_RAG(graph,model, question, document_names, session_id, mode, write_access
"user": "chatbot"
}
else:
result = process_chat_response(messages,history, question, model, graph, document_names,chat_mode_settings, extract_tools=False)
result = process_chat_response(messages,history, question, model, graph, document_names,chat_mode_settings, extract_tools=False, filter_properties={})

result["session_id"] = session_id
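
Taken together, the changes in this file let a caller thread a metadata filter from the request all the way down to the Neo4j vector retriever. A sketch of the resulting call under assumed values (the "grade" key is illustrative; graph is assumed to be an already-connected Neo4jGraph, and get_chat_mode_settings / get_neo4j_retriever are this module's own helpers):

# Illustrative only: thread a metadata filter down to the retriever.
chat_mode_settings = get_chat_mode_settings(mode="graph_vector")
retriever = get_neo4j_retriever(
    graph=graph,
    document_names=[],  # no per-document filter
    chat_mode_settings=chat_mode_settings,
    filter_properties={"grade": "6"},  # hypothetical metadata key
)
docs = retriever.invoke("What topics does unit 3 cover?")
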

2 changes: 1 addition & 1 deletion backend/src/chunkid_entities.py
Expand Up @@ -98,7 +98,7 @@ def process_chunkids(driver, chunk_ids, entities):
logging.info(f"Query process completed successfully for chunk ids: {chunk_ids}")
return result
except Exception as e:
logging.error(f"chunkid_entities module: Error processing chunk ids: {chunk_ids}. Error: {e}")
logging.error(f"_entities module: Error processing chunk ids: {chunk_ids}. Error: {e}")
raise

def remove_duplicate_nodes(nodes,property="element_id"):
Expand Down