feat: Removed Hard Dependency on Unstructured.io #123
Changes from all commits
First changed file (document upload routes):

@@ -7,7 +7,7 @@
 from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
 from app.users import current_active_user
 from app.utils.check_ownership import check_ownership
-from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document, add_crawled_url_document, add_youtube_video_document
+from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud
 from app.config import config as app_config
 # Force asyncio to use standard event loop before unstructured imports
 import asyncio

@@ -101,8 +101,7 @@ async def create_documents(
             content = await file.read()
             with open(temp_path, "wb") as f:
                 f.write(content)

             # Process in background to avoid uvloop conflicts
             fastapi_background_tasks.add_task(
                 process_file_in_background_with_new_session,
                 temp_path,

@@ -191,36 +190,74 @@ async def process_file_in_background(
                 search_space_id
             )
         else:
-            # Use synchronous unstructured API to avoid event loop issues
-            from langchain_unstructured import UnstructuredLoader
-
-            # Process the file
-            loader = UnstructuredLoader(
-                file_path,
-                mode="elements",
-                post_processors=[],
-                languages=["eng"],
-                include_orig_elements=False,
-                include_metadata=False,
-                strategy="auto",
-            )
+            if app_config.ETL_SERVICE == "UNSTRUCTURED":
+                from langchain_unstructured import UnstructuredLoader
Comment on lines +194 to +195

🛠️ Refactor suggestion

Move imports to module level. Importing modules inside functions is an anti-pattern that affects performance and readability. Move all imports to the top of the file. Add these imports at the module level:

from langchain_unstructured import UnstructuredLoader
from llama_cloud_services import LlamaParse
from llama_cloud_services.parse.utils import ResultType

Then remove the redundant in-function imports.

Also applies to: 210-210, 224-225, 241-241

🪛 Pylint (3.3.7): [convention] 194-194: Import outside toplevel (langchain_unstructured.UnstructuredLoader) (C0415); [convention] 195-195: Trailing whitespace (C0303)
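For reference, one way to act on this suggestion without reintroducing a hard dependency on either backend is to guard the module-level imports on the configured ETL service. This is only a sketch, not code from the PR; it reuses the `app_config.ETL_SERVICE` values and import paths that appear elsewhere in this diff:

```python
# Sketch only: imports are at module level, but conditional on the configured
# backend, so the unselected ETL library does not need to be installed at all.
from app.config import config as app_config

if app_config.ETL_SERVICE == "UNSTRUCTURED":
    from langchain_unstructured import UnstructuredLoader
elif app_config.ETL_SERVICE == "LLAMACLOUD":
    from llama_cloud_services import LlamaParse
    from llama_cloud_services.parse.utils import ResultType
```

Plain top-of-file imports, as the comment suggests, are simpler but would require both packages to be importable, which cuts against the PR's goal of making Unstructured optional.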
+                # Process the file
+                loader = UnstructuredLoader(
+                    file_path,
+                    mode="elements",
+                    post_processors=[],
+                    languages=["eng"],
+                    include_orig_elements=False,
+                    include_metadata=False,
+                    strategy="auto",
+                )
+
-            docs = await loader.aload()
+                docs = await loader.aload()
+
-            # Clean up the temp file
-            import os
-            try:
-                os.unlink(file_path)
-            except:
-                pass
-
-            # Pass the documents to the existing background task
-            await add_received_file_document(
-                session,
-                filename,
-                docs,
-                search_space_id
-            )
+                # Clean up the temp file
+                import os
+                try:
+                    os.unlink(file_path)
+                except:
+                    pass
Comment on lines +213 to +214

Replace bare except with specific exception handling. A bare except silences every exception, including ones that should surface; catch only the errors expected when deleting the temp file.

-                except:
-                    pass
+                except (OSError, IOError):
+                    pass  # File might already be deleted

Also applies to: 244-245

🪛 Ruff (0.11.9): 211-214: Use contextlib.suppress(OSError) instead of try-except-pass (SIM105); 213-213: Do not use bare except (E722)
🪛 Pylint (3.3.7): [warning] 213-214: No exception type(s) specified (W0702)
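Ruff's SIM105 finding points at an alternative to the suggested exception tuple: wrap the cleanup in contextlib.suppress. A minimal sketch follows (the helper name is made up for illustration). Note that IOError is an alias of OSError in Python 3, so suppressing OSError covers the reviewer's (OSError, IOError) pair:

```python
import contextlib
import os


def _cleanup_temp_file(file_path: str) -> None:
    """Remove a temporary file, ignoring the case where it is already gone."""
    # Equivalent to try/except OSError/pass, but states the intent explicitly.
    with contextlib.suppress(OSError):
        os.unlink(file_path)
```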
+                # Pass the documents to the existing background task
+                await add_received_file_document_using_unstructured(
+                    session,
+                    filename,
+                    docs,
+                    search_space_id
+                )
+            elif app_config.ETL_SERVICE == "LLAMACLOUD":
+                from llama_cloud_services import LlamaParse
+                from llama_cloud_services.parse.utils import ResultType
+
+                # Create LlamaParse parser instance
+                parser = LlamaParse(
+                    api_key=app_config.LLAMA_CLOUD_API_KEY,
+                    num_workers=1,  # Use single worker for file processing
+                    verbose=True,
+                    language="en",
+                    result_type=ResultType.MD
+                )
+
+                # Parse the file asynchronously
+                result = await parser.aparse(file_path)
+
+                # Clean up the temp file
+                import os
+                try:
+                    os.unlink(file_path)
+                except:
+                    pass
+
+                # Get markdown documents from the result
+                markdown_documents = await result.aget_markdown_documents(split_by_page=False)
+
+                for doc in markdown_documents:
+                    # Extract text content from the markdown documents
+                    markdown_content = doc.text
+
+                    # Process the documents using our LlamaCloud background task
+                    await add_received_file_document_using_llamacloud(
+                        session,
+                        filename,
+                        llamacloud_markdown_document=markdown_content,
+                        search_space_id=search_space_id
+                    )
     except Exception as e:
         import logging
         logging.error(f"Error processing file in background: {str(e)}")

@@ -442,3 +479,5 @@ async def process_youtube_video_with_new_session(
     except Exception as e:
         import logging
         logging.error(f"Error processing YouTube video: {str(e)}")
+
+
Second changed file (app/tasks/background_tasks.py):

@@ -268,7 +268,6 @@ async def add_received_markdown_file_document(
             document_type=DocumentType.FILE,
             document_metadata={
                 "FILE_NAME": file_name,
-                "SAVED_AT": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             },
             content=summary_content,
             embedding=summary_embedding,

@@ -289,7 +288,7 @@ async def add_received_markdown_file_document(
         raise RuntimeError(f"Failed to process file document: {str(e)}")


-async def add_received_file_document(
+async def add_received_file_document_using_unstructured(
     session: AsyncSession,
     file_name: str,
     unstructured_processed_elements: List[LangChainDocument],

@@ -336,7 +335,7 @@ async def add_received_file_document(
             document_type=DocumentType.FILE,
             document_metadata={
                 "FILE_NAME": file_name,
-                "SAVED_AT": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                "ETL_SERVICE": "UNSTRUCTURED",
             },
             content=summary_content,
             embedding=summary_embedding,

@@ -357,6 +356,83 @@
         raise RuntimeError(f"Failed to process file document: {str(e)}")


+async def add_received_file_document_using_llamacloud(
+    session: AsyncSession,
+    file_name: str,
+    llamacloud_markdown_document: str,
+    search_space_id: int,
+) -> Optional[Document]:
+    """
+    Process and store document content parsed by LlamaCloud.
+
+    Args:
+        session: Database session
+        file_name: Name of the processed file
+        llamacloud_markdown_documents: List of markdown content from LlamaCloud parsing
+        search_space_id: ID of the search space
Comment on lines +371 to +372

Fix docstring parameter description. The docstring incorrectly describes the parameter as a list when it's actually a single string.

-        llamacloud_markdown_documents: List of markdown content from LlamaCloud parsing
+        llamacloud_markdown_document: Markdown content from LlamaCloud parsing
+
+    Returns:
+        Document object if successful, None if failed
+    """
+    try:
+        # Combine all markdown documents into one
+        file_in_markdown = llamacloud_markdown_document
+
+        content_hash = generate_content_hash(file_in_markdown)
+
+        # Check if document with this content hash already exists
+        existing_doc_result = await session.execute(
+            select(Document).where(Document.content_hash == content_hash)
+        )
+        existing_document = existing_doc_result.scalars().first()
+
+        if existing_document:
+            logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
+            return existing_document
+
+        # Generate summary
+        summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance
+        summary_result = await summary_chain.ainvoke({"document": file_in_markdown})
+        summary_content = summary_result.content
+        summary_embedding = config.embedding_model_instance.embed(summary_content)
+
+        # Process chunks
+        chunks = [
+            Chunk(
+                content=chunk.text,
+                embedding=config.embedding_model_instance.embed(chunk.text),
+            )
+            for chunk in config.chunker_instance.chunk(file_in_markdown)
+        ]
+
+        # Create and store document
+        document = Document(
+            search_space_id=search_space_id,
+            title=file_name,
+            document_type=DocumentType.FILE,
+            document_metadata={
+                "FILE_NAME": file_name,
+                "ETL_SERVICE": "LLAMACLOUD",
+            },
+            content=summary_content,
+            embedding=summary_embedding,
+            chunks=chunks,
+            content_hash=content_hash,
+        )
+
+        session.add(document)
+        await session.commit()
+        await session.refresh(document)
+
+        return document
+    except SQLAlchemyError as db_error:
+        await session.rollback()
+        raise db_error
+    except Exception as e:
+        await session.rollback()
+        raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}")
Use explicit exception chaining. When re-raising inside an except block, add "from e" so the original traceback is preserved.

-        raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}")
+        raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}") from e

🪛 Ruff (0.11.9): 434-434: Within an except clause, raise exceptions with raise ... from err or raise ... from None to distinguish them from errors in exception handling (B904)
🪛 Pylint (3.3.7): [warning] 434-434: Consider explicitly re-raising using 'raise RuntimeError(f'Failed to process file document using LlamaCloud: {str(e)}') from e' (W0707)
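A self-contained illustration of what "from e" buys (the function below is hypothetical, not from this PR): the original error is attached as __cause__, so tracebacks read "The above exception was the direct cause of the following exception" instead of the more confusing "During handling of the above exception, another exception occurred".

```python
# Illustrative only: wrap a low-level error in a domain-specific one while
# keeping the causal link via "from e".
def parse_upload(raw: bytes) -> str:
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError as e:
        # Sets RuntimeError.__cause__ to the original UnicodeDecodeError.
        raise RuntimeError("Failed to decode uploaded file") from e


try:
    parse_upload(b"\xff\xfe\x00bad")
except RuntimeError as err:
    assert isinstance(err.__cause__, UnicodeDecodeError)
```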
+
+
 async def add_youtube_video_document(
     session: AsyncSession, url: str, search_space_id: int
 ):
🛠️ Refactor suggestion

Add validation and error handling for ETL service configuration.

The conditional loading logic is correct, but consider adding validation to ensure robustness:

 # ETL Service
 ETL_SERVICE = os.getenv("ETL_SERVICE")
-if ETL_SERVICE == "UNSTRUCTURED":
+if ETL_SERVICE == "UNSTRUCTURED":
     # Unstructured API Key
     UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
-
-elif ETL_SERVICE == "LLAMACLOUD":
+    if not UNSTRUCTURED_API_KEY:
+        raise ValueError("UNSTRUCTURED_API_KEY is required when ETL_SERVICE is set to 'UNSTRUCTURED'")
+
+elif ETL_SERVICE == "LLAMACLOUD":
     # LlamaCloud API Key
     LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
-
-
+    if not LLAMA_CLOUD_API_KEY:
+        raise ValueError("LLAMA_CLOUD_API_KEY is required when ETL_SERVICE is set to 'LLAMACLOUD'")
+
+elif ETL_SERVICE is not None:
+    raise ValueError(f"Invalid ETL_SERVICE value: '{ETL_SERVICE}'. Must be 'UNSTRUCTURED' or 'LLAMACLOUD'")
+else:
+    raise ValueError("ETL_SERVICE environment variable is required")

🪛 Pylint (3.3.7): [convention] 101-101: Trailing whitespace (C0303); [convention] 105-105: Trailing whitespace (C0303); [convention] 109-109: Trailing whitespace (C0303)
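A compact variant of the same fail-fast idea, sketched with a lookup table; the names ALLOWED_ETL_SERVICES, API_KEY_ENV_BY_SERVICE, and ETL_API_KEY are illustrative and do not come from the PR or the repo's config module:

```python
import os

ALLOWED_ETL_SERVICES = {"UNSTRUCTURED", "LLAMACLOUD"}
API_KEY_ENV_BY_SERVICE = {
    "UNSTRUCTURED": "UNSTRUCTURED_API_KEY",
    "LLAMACLOUD": "LLAMA_CLOUD_API_KEY",
}

ETL_SERVICE = os.getenv("ETL_SERVICE")
if ETL_SERVICE not in ALLOWED_ETL_SERVICES:
    raise ValueError(
        f"Invalid ETL_SERVICE value: {ETL_SERVICE!r}. "
        f"Must be one of {sorted(ALLOWED_ETL_SERVICES)}"
    )

# Look up the API key required by the selected backend and fail fast if missing.
_key_env = API_KEY_ENV_BY_SERVICE[ETL_SERVICE]
ETL_API_KEY = os.getenv(_key_env)
if not ETL_API_KEY:
    raise ValueError(f"{_key_env} is required when ETL_SERVICE={ETL_SERVICE}")
```

Keeping the service-to-env-var mapping in one table means a third ETL backend later only touches that table rather than another if/elif branch.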