1 change: 1 addition & 0 deletions backend/Pipfile
@@ -67,6 +67,7 @@ langchain-neo4j = "==0.4.0"
pypandoc-binary = "==1.15"
chardet = "==5.2.0"
unstructured = "==0.17.2"
contextgem = "==0.8.1"

[dev-packages]

768 changes: 489 additions & 279 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -67,3 +67,4 @@ rouge_score==0.1.2
langchain-neo4j==0.4.0
pypandoc-binary==1.15
chardet==5.2.0
contextgem==0.8.1
16 changes: 16 additions & 0 deletions backend/src/document_metadata_extractor/__init__.py
@@ -0,0 +1,16 @@
from .metadata_processor import MetadataProcessor, DocumentMetadata, MetadataExtractor
from .domain_extractor import DomainExtractor

def create_metadata_processor() -> MetadataProcessor:
    """Create and configure a metadata processor with default extractors"""
    processor = MetadataProcessor()
    processor.add_extractor(DomainExtractor())
    return processor

__all__ = [
    'MetadataProcessor',
    'DocumentMetadata',
    'MetadataExtractor',
    'DomainExtractor',
    'create_metadata_processor'
]
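
For reference, a minimal usage sketch of the package as wired above (the file name and page text are made up for illustration; the DomainExtractor call additionally needs a reachable Ollama model):

from langchain.schema import Document
from src.document_metadata_extractor import create_metadata_processor

# Hypothetical pages; in the pipeline these come from the document loaders in main.py.
pages = [Document(page_content="Plasma cell disorders, staging, and treatment of multiple myeloma...")]

processor = create_metadata_processor()
metadata = processor.process_document(pages, "myeloma_handbook.pdf")
print(metadata.domain)  # e.g. "Multiple Myeloma", or None if no label was assigned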
54 changes: 54 additions & 0 deletions backend/src/document_metadata_extractor/domain_extractor.py
@@ -0,0 +1,54 @@
from contextgem import Document as ContextGemDocument, DocumentLLM, LabelConcept
from langchain.schema import Document
from .metadata_processor import MetadataExtractor, DocumentMetadata
from typing import List

DOMAIN_LABELS = [
    "Multiple Myeloma",
    "Emotional Intelligence",
    "Business Management",
    "Instruction Manual",
    "Sales Training",
    "Leadership Coaching",
]

class DomainExtractor(MetadataExtractor):
    """Classifies a document into one of the known knowledge domains"""

    def __init__(self):
        super().__init__()
        self.num_pages = 20

    def extract(self, pages: List[Document], file_name: str) -> DocumentMetadata:
        # Prefix the file name as a classification hint, then append the first num_pages pages.
        raw_text = f"filename: {file_name}\n" + "\n".join([page.page_content for page in pages[:self.num_pages]])
        print("classification raw text: ", raw_text)
        contextgem_doc = ContextGemDocument(raw_text=raw_text)

        # Define a LabelConcept for domain classification
        document_type_concept = LabelConcept(
            name="Document Domain Label",
            description="Classify the knowledge domain of the document",
            labels=DOMAIN_LABELS,
            classification_type="multi_class",  # only one label can be selected (mutually exclusive labels)
            singular_occurrence=True,  # expect only one classification result
        )

        contextgem_doc.add_concepts([document_type_concept])

        model = "ollama/phi4:14b"
        llm = DocumentLLM(
            model=model,
            # api_key=os.getenv("CONTEXTGEM_OPENAI_API_KEY"),
        )

        concept_label = llm.extract_concepts_from_document(contextgem_doc)[0]

        if concept_label.extracted_items:
            # Get the classified domain label
            classified_type = concept_label.extracted_items[0].value
            print(f"Document: {file_name}, classified as: {classified_type}")
            return DocumentMetadata(domain=classified_type)
        else:
            print(f"No label assigned to document {file_name}")
            return DocumentMetadata(domain=None)
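
A quick smoke test for this extractor in isolation might look like the following sketch (hypothetical content and file name; assumes a local Ollama server with phi4:14b pulled, which is what the "ollama/phi4:14b" model string resolves to):

from langchain.schema import Document
from src.document_metadata_extractor.domain_extractor import DomainExtractor

extractor = DomainExtractor()
pages = [Document(page_content="Objection-handling drills and pipeline reviews for new account executives.")]
result = extractor.extract(pages, "sales_onboarding.pdf")
print(result.domain)  # expected to land on "Sales Training" if the model cooperates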

66 changes: 66 additions & 0 deletions backend/src/document_metadata_extractor/metadata_processor.py
@@ -0,0 +1,66 @@
from typing import Dict, Any, List, Optional
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
import logging
from datetime import datetime

from langchain.schema import Document

@dataclass
class DocumentMetadata:
    """Container for document-level metadata"""
    title: Optional[str] = None
    author: Optional[str] = None
    date: Optional[datetime] = None
    language: Optional[str] = None
    domain: Optional[str] = None
    keywords: List[str] = field(default_factory=list)
    summary: Optional[str] = None
    custom_metadata: Dict[str, Any] = field(default_factory=dict)

class MetadataExtractor(ABC):
    """Abstract base class for metadata extractors"""

    @abstractmethod
    def extract(self, pages: List[Document], file_name: str) -> DocumentMetadata:
        """Extract metadata from the document's pages"""
        pass

class MetadataProcessor:
    """Orchestrates metadata extraction using multiple extractors"""

    def __init__(self):
        self.extractors: List[MetadataExtractor] = []

    def add_extractor(self, extractor: MetadataExtractor):
        """Add a metadata extractor to the pipeline"""
        self.extractors.append(extractor)

    def process_document(self, pages: List[Document], file_name: str) -> DocumentMetadata:
        """Process the document through all extractors and combine the results"""
        metadata = DocumentMetadata()

        for extractor in self.extractors:
            try:
                extracted = extractor.extract(pages, file_name)
                # Merge extracted metadata; non-None fields from later extractors win
                for field_name in metadata.__dataclass_fields__:
                    value = getattr(extracted, field_name)
                    if value is not None:
                        setattr(metadata, field_name, value)
            except Exception as e:
                logging.error(f"Error in {extractor.__class__.__name__}: {str(e)}")

        return metadata

# Factory function to create a configured metadata processor
def create_metadata_processor() -> MetadataProcessor:
    """Create and configure a metadata processor with default extractors"""
    processor = MetadataProcessor()
    # processor.add_extractor(TitleExtractor())
    # processor.add_extractor(DomainClassifier())
    return processor
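
To illustrate the plug-in pattern this class is built around, here is a hypothetical extractor (not part of this PR) that fills the language field with a naive stopword heuristic; it shows the minimal surface a new extractor has to implement:

from typing import List
from langchain.schema import Document
from src.document_metadata_extractor.metadata_processor import MetadataExtractor, DocumentMetadata

class NaiveLanguageExtractor(MetadataExtractor):
    """Hypothetical example: labels a document as English if common English stopwords dominate."""

    def extract(self, pages: List[Document], file_name: str) -> DocumentMetadata:
        sample = " ".join(page.page_content for page in pages[:3]).lower()
        hits = sum(sample.count(f" {word} ") for word in ("the", "and", "of", "to"))
        return DocumentMetadata(language="en" if hits > 5 else None)

# Registered the same way as DomainExtractor:
# processor = MetadataProcessor()
# processor.add_extractor(NaiveLanguageExtractor())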
21 changes: 21 additions & 0 deletions backend/src/main.py
@@ -32,6 +32,7 @@
import urllib.parse
import json
from src.shared.llm_graph_builder_exception import LLMGraphBuilderException
from src.document_metadata_extractor import create_metadata_processor, DocumentMetadata

warnings.filterwarnings("ignore")
load_dotenv()
@@ -322,6 +323,16 @@ async def processing_source(uri, userName, password, database, model, file_name,
uri_latency["create_connection"] = f'{elapsed_create_connection:.2f}'
graphDb_data_Access = graphDBdataAccess(graph)
create_chunk_vector_index(graph)

# Process document metadata
# if pages and len(pages) > 0:
metadata_processor = create_metadata_processor()
# document_content = " ".join([page.page_content for page in pages])
metadata = metadata_processor.process_document(pages, file_name)
logging.info(f"Extracted metadata for {file_name}: {metadata}")
# else:
# metadata = DocumentMetadata()

start_get_chunkId_chunkDoc_list = time.time()
total_chunks, chunkId_chunkDoc_list = get_chunkId_chunkDoc_list(graph, file_name, pages, token_chunk_size, chunk_overlap, retry_condition)
end_get_chunkId_chunkDoc_list = time.time()
@@ -330,6 +341,16 @@ async def processing_source(uri, userName, password, database, model, file_name,
uri_latency["create_list_chunk_and_document"] = f'{elapsed_get_chunkId_chunkDoc_list:.2f}'
uri_latency["total_chunks"] = total_chunks

# this was autogenerated by cursor - we may want to write the data to the document differently
# Update document node with metadata
# if metadata.domain:
# query = """
# MATCH (d:Document {fileName: $file_name})
# SET d.domain = $domain
# """
# graph.query(query, {"file_name": file_name, "domain": metadata.domain})
# end of cursor generated code

start_status_document_node = time.time()
result = graphDb_data_Access.get_current_status_document_node(file_name)
end_status_document_node = time.time()