Updating langchain-neo4j package #891

Merged: 12 commits, Dec 3, 2024

206 changes: 40 additions & 166 deletions backend/requirements.txt
@@ -1,183 +1,57 @@
aiohttp==3.9.3
aiosignal==1.3.1
annotated-types==0.6.0
antlr4-python3-runtime==4.9.3
anyio==4.3.0
async-timeout==4.0.3
asyncio==3.4.3
attrs==23.2.0
backoff==2.2.1
beautifulsoup4==4.12.3
boto3==1.34.140
botocore==1.34.140
cachetools==5.3.3
certifi==2024.2.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
contourpy==1.2.0
cryptography==42.0.2
cycler==0.12.1
dataclasses-json==0.6.4
dataclasses-json-speakeasy==0.5.11
Deprecated==1.2.14
distro==1.9.0
docstring_parser==0.16
effdet==0.4.1
emoji==2.10.1
exceptiongroup==1.2.0
fastapi==0.111.0
boto3==1.35.69
botocore==1.35.69
certifi==2024.8.30
fastapi==0.115.5
fastapi-health==0.4.0
filelock==3.13.1
filetype==1.2.0
flatbuffers==23.5.26
fonttools==4.49.0
frozenlist==1.4.1
fsspec==2024.2.0
google-api-core==2.18.0
google-auth==2.29.0
google_auth_oauthlib==1.2.0
google-cloud-aiplatform==1.58.0
google-cloud-bigquery==3.19.0
google-api-core==2.23.0
google-auth==2.36.0
google_auth_oauthlib==1.2.1
google-cloud-core==2.4.1
google-cloud-resource-manager==1.12.3
google-cloud-storage==2.17.0
google-crc32c==1.5.0
google-resumable-media==2.7.0
googleapis-common-protos==1.63.0
greenlet==3.0.3
grpc-google-iam-v1==0.13.0
grpcio==1.62.1
google-ai-generativelanguage==0.6.6
grpcio-status==1.62.1
h11==0.14.0
httpcore==1.0.4
httpx==0.27.0
huggingface-hub
humanfriendly==10.0
idna==3.6
importlib-resources==6.1.1
json-repair==0.30.2
pip-install==1.3.5
iopath==0.1.10
Jinja2==3.1.3
jmespath==1.0.1
joblib==1.3.2
jsonpatch==1.33
jsonpath-python==1.0.6
jsonpointer==2.4
json-repair==0.25.2
kiwisolver==1.4.5
langchain==0.3.0
langchain-aws==0.2.1
langchain-anthropic==0.2.1
langchain-fireworks==0.2.0
langchain-google-genai==2.0.0
langchain-community==0.3.0
langchain-core==0.3.5
langchain-experimental==0.3.1
langchain-google-vertexai==2.0.1
langchain-groq==0.2.0
langchain-openai==0.2.0
langchain-text-splitters==0.3.0
langchain==0.3.8
langchain-aws==0.2.7
langchain-anthropic==0.3.0
langchain-fireworks==0.2.5
langchain-community==0.3.8
langchain-core==0.3.21
langchain-experimental==0.3.3
langchain-google-vertexai==2.0.7
langchain-groq==0.2.1
langchain-openai==0.2.9
langchain-text-splitters==0.3.2
langchain-huggingface==0.1.2
langdetect==1.0.9
langsmith==0.1.128
layoutparser==0.3.4
langsmith==0.1.146
langserve==0.3.0
#langchain-cli==0.0.25
lxml==5.1.0
MarkupSafe==2.1.5
marshmallow==3.20.2
matplotlib==3.7.2
mpmath==1.3.0
multidict==6.0.5
mypy-extensions==1.0.0
neo4j-rust-ext
networkx==3.2.1
nltk==3.8.1
numpy==1.26.4
omegaconf==2.3.0
onnx==1.16.1
onnxruntime==1.18.1
openai==1.47.1
opencv-python==4.8.0.76
orjson==3.9.15
packaging==23.2
pandas==2.2.0
pdf2image==1.17.0
pdfminer.six==20221105
pdfplumber==0.10.4
pikepdf==8.11.0
pillow==10.2.0
pillow_heif==0.15.0
portalocker==2.8.2
proto-plus==1.23.0
protobuf==4.23.4
psutil==6.0.0
pyasn1==0.6.0
pyasn1_modules==0.4.0
pycocotools==2.0.7
pycparser==2.21
pydantic==2.8.2
pydantic_core==2.20.1
pyparsing==3.0.9
pypdf==4.0.1
PyPDF2==3.0.1
pypdfium2==4.27.0
pytesseract==0.3.10
python-dateutil==2.8.2
nltk==3.9.1
openai==1.55.1
opencv-python==4.10.0.84
psutil==6.1.0
pydantic==2.9.0
python-dotenv==1.0.1
python-iso639==2024.2.7
python-magic==0.4.27
python-multipart==0.0.9
pytube==15.0.0
pytz==2024.1
PyYAML==6.0.1
rapidfuzz==3.6.1
regex==2023.12.25
requests==2.32.3
rsa==4.9
s3transfer==0.10.1
safetensors==0.4.1
shapely==2.0.3
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
starlette==0.37.2
sse-starlette==2.1.2
PyPDF2==3.0.1
PyMuPDF==1.24.14
starlette==0.41.3
sse-starlette==2.1.3
starlette-session==0.4.3
sympy==1.12
tabulate==0.9.0
tenacity==8.2.3
tiktoken==0.7.0
timm==0.9.12
tokenizers==0.19
tqdm==4.66.2
transformers==4.42.3
types-protobuf
types-requests
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.1
unstructured==0.14.9
unstructured-client==0.23.8
unstructured-inference==0.7.36
unstructured.pytesseract==0.3.12
unstructured[all-docs]==0.14.9
tqdm==4.67.1
unstructured[all-docs]==0.16.6
urllib3==2.2.2
uvicorn==0.30.1
gunicorn==22.0.0
uvicorn==0.32.1
gunicorn==23.0.0
wikipedia==1.4.0
wrapt==1.16.0
yarl==1.9.4
youtube-transcript-api==0.6.2
youtube-transcript-api==0.6.3
zipp==3.17.0
sentence-transformers==3.0.1
google-cloud-logging==3.10.0
PyMuPDF==1.24.5
sentence-transformers==3.3.1
google-cloud-logging==3.11.3
pypandoc==1.13
graphdatascience==1.10
graphdatascience==1.12
Secweb==1.11.0
ragas==0.2.2
ragas==0.2.6
rouge_score==0.1.2
langchain-neo4j==0.1.1
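
The core of this dependency update is the new langchain-neo4j==0.1.1 pin, which takes over the Neo4j integrations previously shipped in langchain-community. A minimal smoke test, assuming the updated requirements are installed, is to confirm that the classes migrated in the files below now resolve from the new package:

```python
# Minimal sketch (not part of the PR): verify that the classes this PR
# migrates now import from langchain_neo4j instead of langchain_community.
from langchain_neo4j import (
    Neo4jGraph,               # used in score.py, create_chunks.py, diffbot_transformer.py
    Neo4jVector,              # used in QA_integration.py
    Neo4jChatMessageHistory,  # used in QA_integration.py
    GraphCypherQAChain,       # used in QA_integration.py
)
```
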
6 changes: 3 additions & 3 deletions backend/score.py
@@ -36,6 +36,7 @@
from src.ragas_eval import *
from starlette.types import ASGIApp, Message, Receive, Scope, Send
import gzip
from langchain_neo4j import Neo4jGraph

logger = CustomLogger()
CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
@@ -581,7 +582,6 @@ async def generate():
graph = create_graph_database_connection(uri, userName, decoded_password, database)
graphDb_data_Access = graphDBdataAccess(graph)
result = graphDb_data_Access.get_current_status_document_node(file_name)
# print(f'Result of document status in SSE : {result}')
if len(result) > 0:
status = json.dumps({'fileName':file_name,
'status':result[0]['Status'],
@@ -668,7 +668,7 @@ async def get_document_status(file_name, url, userName, password, database):
}
else:
status = {'fileName':file_name, 'status':'Failed'}
print(f'Result of document status in refresh : {result}')
logging.info(f'Result of document status in refresh : {result}')
return create_api_response('Success',message="",file_name=status)
except Exception as e:
message=f"Unable to get the document status"
@@ -961,7 +961,7 @@ async def fetch_chunktext(
async def backend_connection_configuation():
try:
graph = Neo4jGraph()
print(f'login connection status of object: {graph}')
logging.info(f'login connection status of object: {graph}')
if graph is not None:
graph_connection = True
isURI = os.getenv('NEO4J_URI')
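
For context on the `Neo4jGraph()` call in the health check above: when constructed without arguments, the class falls back to connection settings from the environment, which is consistent with the `os.getenv('NEO4J_URI')` check in the same function. A hedged sketch with placeholder local credentials:

```python
import os
import logging
from langchain_neo4j import Neo4jGraph

# Placeholder values for illustration only; in the app these come from the
# deployment environment rather than being set in code.
os.environ.setdefault("NEO4J_URI", "neo4j://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")
os.environ.setdefault("NEO4J_PASSWORD", "password")

graph = Neo4jGraph()  # reads NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD
logging.info(f'login connection status of object: {graph}')
```
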
11 changes: 3 additions & 8 deletions backend/src/QA_integration.py
@@ -4,15 +4,13 @@
import logging

import threading
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Any
from dotenv import load_dotenv


# LangChain imports
from langchain_community.vectorstores.neo4j_vector import Neo4jVector
from langchain_community.chat_message_histories import Neo4jChatMessageHistory
from langchain_neo4j import Neo4jVector
from langchain_neo4j import Neo4jChatMessageHistory
from langchain_neo4j import GraphCypherQAChain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableBranch
@@ -21,7 +19,6 @@
from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline
from langchain_text_splitters import TokenTextSplitter
from langchain_core.messages import HumanMessage, AIMessage
from langchain.chains import GraphCypherQAChain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.callbacks import StdOutCallbackHandler, BaseCallbackHandler

@@ -38,8 +35,6 @@
from src.llm import get_llm
from src.shared.common_fn import load_embedding_model
from src.shared.constants import *
from src.graphDB_dataAccess import graphDBdataAccess
from src.ragas_eval import get_ragas_metrics
load_dotenv()

EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
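
The net effect of these import changes is that Neo4jVector, Neo4jChatMessageHistory, and GraphCypherQAChain now come from langchain_neo4j rather than langchain_community and langchain.chains. A hedged usage sketch of the migrated chat-history class, assuming it keeps the langchain_community signature; the session id is hypothetical and credentials fall back to the NEO4J_* environment variables:

```python
from langchain_neo4j import Neo4jChatMessageHistory

# "demo-session" is a hypothetical id; each session's turns are stored
# as nodes in Neo4j, keyed by this value.
history = Neo4jChatMessageHistory(session_id="demo-session")
history.add_user_message("What does this graph contain?")
messages = history.messages  # list of HumanMessage / AIMessage objects
```
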
4 changes: 1 addition & 3 deletions backend/src/create_chunks.py
@@ -1,8 +1,7 @@
from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document
from langchain_community.graphs import Neo4jGraph
from langchain_neo4j import Neo4jGraph
import logging
import os
from src.document_sources.youtube import get_chunks_with_timestamps, get_calculated_timestamps
import re

@@ -25,7 +24,6 @@ def split_file_into_chunks(self):
A list of chunks each of which is a langchain Document.
"""
logging.info("Split file into smaller chunks")
# number_of_chunks_allowed = int(os.environ.get('NUMBER_OF_CHUNKS_ALLOWED'))
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
if 'page' in self.pages[0].metadata:
chunks = []
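
The splitter configuration itself is unchanged by this PR; a self-contained sketch of the same 200-token / 20-token-overlap setup on a sample document (the text is illustrative only):

```python
from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document

# Same parameters as split_file_into_chunks above:
# 200-token chunks with a 20-token overlap.
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
pages = [Document(page_content="some long source text " * 200)]
chunks = text_splitter.split_documents(pages)
```
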
3 changes: 2 additions & 1 deletion backend/src/diffbot_transformer.py
@@ -1,5 +1,6 @@
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
from langchain_community.graphs import Neo4jGraph
#from langchain_community.graphs import Neo4jGraph
from langchain_neo4j import Neo4jGraph
from langchain.docstore.document import Document
from typing import List
import os
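
Only the Neo4jGraph import changes here; the transformer still comes from langchain_experimental. A hedged sketch of how DiffbotGraphTransformer is typically driven (the DIFFBOT_API_KEY variable name is an assumption for illustration):

```python
import os
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
from langchain.docstore.document import Document

# The env var name is hypothetical; the constructor takes a Diffbot API token.
transformer = DiffbotGraphTransformer(diffbot_api_key=os.getenv("DIFFBOT_API_KEY"))
docs = [Document(page_content="Neo4j develops a graph database of the same name.")]
graph_documents = transformer.convert_to_graph_documents(docs)
```
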
5 changes: 1 addition & 4 deletions backend/src/document_sources/gcs_bucket.py
@@ -101,15 +101,12 @@ def merge_file_gcs(bucket_name, original_file_name: str, folder_name_sha1_hashed
try:
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
# Retrieve chunks from GCS
# blobs = storage_client.list_blobs(bucket_name, prefix=folder_name_sha1_hashed)
# print(f'before sorted blobs: {blobs}')
chunks = []
for i in range(1,total_chunks+1):
blob_name = folder_name_sha1_hashed + '/' + f"{original_file_name}_part_{i}"
blob = bucket.blob(blob_name)
if blob.exists():
print(f'Blob Name: {blob.name}')
logging.info(f'Blob Name: {blob.name}')
chunks.append(blob.download_as_bytes())
blob.delete()

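
For reference, the loop above follows the standard google-cloud-storage pattern: download each numbered part blob, then delete it. A self-contained sketch with placeholder names:

```python
import logging
from google.cloud import storage

# Placeholder identifiers for illustration only.
bucket_name = "example-bucket"
folder_name_sha1_hashed = "abc123"
original_file_name = "report.pdf"
total_chunks = 3

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
chunks = []
for i in range(1, total_chunks + 1):
    blob = bucket.blob(f"{folder_name_sha1_hashed}/{original_file_name}_part_{i}")
    if blob.exists():
        logging.info(f'Blob Name: {blob.name}')
        chunks.append(blob.download_as_bytes())
        blob.delete()  # each part is removed once it has been read
merged = b"".join(chunks)
```
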
2 changes: 0 additions & 2 deletions backend/src/document_sources/local_file.py
@@ -20,10 +20,8 @@

def load_document_content(file_path):
if Path(file_path).suffix.lower() == '.pdf':
print("in if")
return PyMuPDFLoader(file_path)
else:
print("in else")
return UnstructuredFileLoader(file_path, mode="elements",autodetect_encoding=True)

def get_documents_from_file_by_path(file_path,file_name):
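
The change above only removes debug prints; the loader-selection logic is unchanged. A runnable sketch of that logic, with a hypothetical file path:

```python
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredFileLoader

def load_document_content(file_path):
    # PDFs go through PyMuPDF; everything else through Unstructured's
    # element-mode loader with encoding autodetection, as in the file above.
    if Path(file_path).suffix.lower() == '.pdf':
        return PyMuPDFLoader(file_path)
    return UnstructuredFileLoader(file_path, mode="elements", autodetect_encoding=True)

documents = load_document_content("example.pdf").load()  # "example.pdf" is hypothetical
```
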