neo4j-labs · karanchellani · Feb 27, 2025 · Feb 27, 2025
diff --git a/backend/src/document_sources/gcs_bucket.py b/backend/src/document_sources/gcs_bucket.py
@@ -1,8 +1,7 @@
 import os
 import logging
 from google.cloud import storage
-from langchain_community.document_loaders import GCSFileLoader, GCSDirectoryLoader
-from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import GCSFileLoader
 from langchain_core.documents import Document
 from PyPDF2 import PdfReader
 import io
@@ -42,8 +41,9 @@ def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder
       logging.exception(f'Exception Stack trace: {error_message}')
       raise LLMGraphBuilderException(error_message)
 
-def load_pdf(file_path):
-    return PyMuPDFLoader(file_path)
+def gcs_loader_func(file_path):
+   """Adapter passed as GCSFileLoader's loader_func: return only the loader from load_document_content, dropping its encoding flag."""
+   return load_document_content(file_path)[0]
 
 def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):
   nltk.download('punkt')
@@ -64,7 +64,7 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
     blob = bucket.blob(blob_name) 
 
     if blob.exists():
-        loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_document_content)
+        loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
         pages = loader.load() 
     else :
       raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')

diff --git a/backend/src/document_sources/local_file.py b/backend/src/document_sources/local_file.py
@@ -4,7 +4,15 @@
 from langchain_community.document_loaders import UnstructuredFileLoader
 from langchain_core.documents import Document
 import chardet
+from langchain_core.document_loaders import BaseLoader
 
+class ListLoader(BaseLoader):
+   """Serve an in-memory list of Documents through BaseLoader; implements lazy_load so the inherited load() is kept."""
+   def __init__(self, documents):
+       self.documents = documents
+   def lazy_load(self):
+       yield from self.documents
+
 def detect_encoding(file_path):
    """Detects the file encoding to avoid UnicodeDecodeError."""
    with open(file_path, 'rb') as f:
@@ -27,7 +35,7 @@ def load_document_content(file_path):
         else:
             with open(file_path, encoding=encoding, errors="replace") as f:
                content = f.read()
-            loader = [Document(page_content=content, metadata={"source": file_path})]
+            loader = ListLoader([Document(page_content=content, metadata={"source": file_path})])
             encoding_flag =  True
             return loader,encoding_flag
     else:
@@ -36,27 +44,20 @@ def load_document_content(file_path):
 
 def get_documents_from_file_by_path(file_path,file_name):
+    """Load a local file's pages; return (file_name, pages, file_extension). Raises Exception if the file is missing or cannot be read."""
     file_path = Path(file_path)
-    if file_path.exists():
-        logging.info(f'file {file_name} processing')        
-        file_extension = file_path.suffix.lower()
-        try:
-            loader,encoding_flag = load_document_content(file_path)
-            if file_extension == ".pdf":
-                pages = loader.load()
-            elif file_extension == ".txt":
-                if encoding_flag:
-                    pages = loader
-                else:
-                    unstructured_pages = loader.load()   
-                    pages= get_pages_with_page_numbers(unstructured_pages)   
-            else:
-                unstructured_pages = loader.load()   
-                pages= get_pages_with_page_numbers(unstructured_pages)      
-        except Exception as e:
-            raise Exception(f'Error while reading the file content or metadata ,{e}')
-    else:
+    if not file_path.exists():
         logging.info(f'File {file_name} does not exist')
         raise Exception(f'File {file_name} does not exist')
+    logging.info(f'file {file_name} processing')
+    file_extension = file_path.suffix.lower()
+    try:
+        loader, encoding_flag = load_document_content(file_path)
+        pages = loader.load()
+        # PDFs and .txt files flagged by encoding_flag are used exactly as loaded; every other loader output goes through get_pages_with_page_numbers.
+        if file_extension != ".pdf" and not (file_extension == ".txt" and encoding_flag):
+            pages = get_pages_with_page_numbers(pages)
+    except Exception as e:
+        raise Exception(f'Error while reading the file content or metadata, {e}') from e
     return file_name, pages , file_extension
 
 def get_pages_with_page_numbers(unstructured_pages):