1 change: 1 addition & 0 deletions backend/Pipfile
@@ -67,6 +67,7 @@ langchain-neo4j = "==0.4.0"
pypandoc-binary = "==1.15"
chardet = "==5.2.0"
unstructured = "==0.17.2"
contextgem = "==0.8.1"

[dev-packages]

768 changes: 489 additions & 279 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -67,3 +67,4 @@ rouge_score==0.1.2
langchain-neo4j==0.4.0
pypandoc-binary==1.15
chardet==5.2.0
contextgem==0.8.1
16 changes: 16 additions & 0 deletions backend/src/document_metadata_extractor/__init__.py
@@ -0,0 +1,16 @@
from .metadata_processor import MetadataProcessor, DocumentMetadata, MetadataExtractor
from .domain_extractor import DomainExtractor

def create_metadata_processor() -> MetadataProcessor:
    """Create and configure a metadata processor with default extractors"""
    processor = MetadataProcessor()
    processor.add_extractor(DomainExtractor())
    return processor

__all__ = [
    'MetadataProcessor',
    'DocumentMetadata',
    'MetadataExtractor',
    'DomainExtractor',
    'create_metadata_processor'
]
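
For reference, a minimal usage sketch of the package as wired above (the file name and page text are made up for illustration; the DomainExtractor call additionally needs a reachable Ollama model):

from langchain.schema import Document
from src.document_metadata_extractor import create_metadata_processor

# Hypothetical pages; in the pipeline these come from the document loaders in main.py.
pages = [Document(page_content="Plasma cell disorders, staging, and treatment of multiple myeloma...")]

processor = create_metadata_processor()
metadata = processor.process_document(pages, "myeloma_handbook.pdf")
print(metadata.domain)  # e.g. "Multiple Myeloma", or None if no label was assigned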
54 changes: 54 additions & 0 deletions backend/src/document_metadata_extractor/domain_extractor.py
@@ -0,0 +1,54 @@
from contextgem import Document as ContextGemDocument, DocumentLLM, LabelConcept
from langchain.schema import Document
from .metadata_processor import MetadataExtractor, DocumentMetadata
from typing import List

DOMAIN_LABELS = [
    "Multiple Myeloma",
    "Emotional Intelligence",
    "Business Management",
    "Instruction Manual",
    "Sales Training",
    "Leadership Coaching",
]

class DomainExtractor(MetadataExtractor):
    """Classifies a document into one of the known knowledge domains"""

    def __init__(self):
        super().__init__()
        self.num_pages = 20

    def extract(self, pages: List[Document], file_name: str) -> DocumentMetadata:
        # Prefix the file name as a classification hint, then append the first num_pages pages.
        raw_text = f"filename: {file_name}\n" + "\n".join([page.page_content for page in pages[:self.num_pages]])
        print("classification raw text: ", raw_text)
        contextgem_doc = ContextGemDocument(raw_text=raw_text)

        # Define a LabelConcept for domain classification
        document_type_concept = LabelConcept(
            name="Document Domain Label",
            description="Classify the knowledge domain of the document",
            labels=DOMAIN_LABELS,
            classification_type="multi_class",  # only one label can be selected (mutually exclusive labels)
            singular_occurrence=True,  # expect only one classification result
        )

        contextgem_doc.add_concepts([document_type_concept])

        model = "ollama/phi4:14b"
        llm = DocumentLLM(
            model=model,
            # api_key=os.getenv("CONTEXTGEM_OPENAI_API_KEY"),
        )

        concept_label = llm.extract_concepts_from_document(contextgem_doc)[0]

        if concept_label.extracted_items:
            # Get the classified domain label
            classified_type = concept_label.extracted_items[0].value
            print(f"Document: {file_name}, classified as: {classified_type}")
            return DocumentMetadata(domain=classified_type)
        else:
            print(f"No label assigned to document {file_name}")
            return DocumentMetadata(domain=None)
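
A quick smoke test for this extractor in isolation might look like the following sketch (hypothetical content and file name; assumes a local Ollama server with phi4:14b pulled, which is what the "ollama/phi4:14b" model string resolves to):

from langchain.schema import Document
from src.document_metadata_extractor.domain_extractor import DomainExtractor

extractor = DomainExtractor()
pages = [Document(page_content="Objection-handling drills and pipeline reviews for new account executives.")]
result = extractor.extract(pages, "sales_onboarding.pdf")
print(result.domain)  # expected to land on "Sales Training" if the model cooperates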

66 changes: 66 additions & 0 deletions backend/src/document_metadata_extractor/metadata_processor.py
@@ -0,0 +1,66 @@
from typing import Dict, Any, List, Optional
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
import logging
from datetime import datetime

from langchain.schema import Document

@dataclass
class DocumentMetadata:
    """Container for document-level metadata"""
    title: Optional[str] = None
    author: Optional[str] = None
    date: Optional[datetime] = None
    language: Optional[str] = None
    domain: Optional[str] = None
    keywords: List[str] = field(default_factory=list)
    summary: Optional[str] = None
    custom_metadata: Dict[str, Any] = field(default_factory=dict)

class MetadataExtractor(ABC):
    """Abstract base class for metadata extractors"""

    @abstractmethod
    def extract(self, pages: List[Document], file_name: str) -> DocumentMetadata:
        """Extract metadata from the document's pages"""
        pass

class MetadataProcessor:
    """Orchestrates metadata extraction using multiple extractors"""

    def __init__(self):
        self.extractors: List[MetadataExtractor] = []

    def add_extractor(self, extractor: MetadataExtractor):
        """Add a metadata extractor to the pipeline"""
        self.extractors.append(extractor)

    def process_document(self, pages: List[Document], file_name: str) -> DocumentMetadata:
        """Process the document through all extractors and combine the results"""
        metadata = DocumentMetadata()

        for extractor in self.extractors:
            try:
                extracted = extractor.extract(pages, file_name)
                # Merge extracted metadata; non-None fields from later extractors win
                for field_name in metadata.__dataclass_fields__:
                    value = getattr(extracted, field_name)
                    if value is not None:
                        setattr(metadata, field_name, value)
            except Exception as e:
                logging.error(f"Error in {extractor.__class__.__name__}: {str(e)}")

        return metadata

# Factory function to create a configured metadata processor
def create_metadata_processor() -> MetadataProcessor:
    """Create and configure a metadata processor with default extractors"""
    processor = MetadataProcessor()
    # processor.add_extractor(TitleExtractor())
    # processor.add_extractor(DomainClassifier())
    return processor
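
To illustrate the plug-in pattern this class is built around, here is a hypothetical extractor (not part of this PR) that fills the language field with a naive stopword heuristic; it shows the minimal surface a new extractor has to implement:

from typing import List
from langchain.schema import Document
from src.document_metadata_extractor.metadata_processor import MetadataExtractor, DocumentMetadata

class NaiveLanguageExtractor(MetadataExtractor):
    """Hypothetical example: labels a document as English if common English stopwords dominate."""

    def extract(self, pages: List[Document], file_name: str) -> DocumentMetadata:
        sample = " ".join(page.page_content for page in pages[:3]).lower()
        hits = sum(sample.count(f" {word} ") for word in ("the", "and", "of", "to"))
        return DocumentMetadata(language="en" if hits > 5 else None)

# Registered the same way as DomainExtractor:
# processor = MetadataProcessor()
# processor.add_extractor(NaiveLanguageExtractor())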
21 changes: 21 additions & 0 deletions backend/src/main.py
@@ -32,6 +32,7 @@
import urllib.parse
import json
from src.shared.llm_graph_builder_exception import LLMGraphBuilderException
from src.document_metadata_extractor import create_metadata_processor, DocumentMetadata

warnings.filterwarnings("ignore")
load_dotenv()
@@ -322,6 +323,16 @@ async def processing_source(uri, userName, password, database, model, file_name,
uri_latency["create_connection"] = f'{elapsed_create_connection:.2f}'
graphDb_data_Access = graphDBdataAccess(graph)
create_chunk_vector_index(graph)

# Process document metadata
# if pages and len(pages) > 0:
metadata_processor = create_metadata_processor()
# document_content = " ".join([page.page_content for page in pages])
metadata = metadata_processor.process_document(pages, file_name)
logging.info(f"Extracted metadata for {file_name}: {metadata}")
# else:
# metadata = DocumentMetadata()

start_get_chunkId_chunkDoc_list = time.time()
total_chunks, chunkId_chunkDoc_list = get_chunkId_chunkDoc_list(graph, file_name, pages, token_chunk_size, chunk_overlap, retry_condition)
end_get_chunkId_chunkDoc_list = time.time()
@@ -330,6 +341,16 @@ async def processing_source(uri, userName, password, database, model, file_name,
uri_latency["create_list_chunk_and_document"] = f'{elapsed_get_chunkId_chunkDoc_list:.2f}'
uri_latency["total_chunks"] = total_chunks

# this was autogenerated by cursor - we may want to write the data to the document differently
# Update document node with metadata
# if metadata.domain:
# query = """
# MATCH (d:Document {fileName: $file_name})
# SET d.domain = $domain
# """
# graph.query(query, {"file_name": file_name, "domain": metadata.domain})
# end of cursor generated code

start_status_document_node = time.time()
result = graphDb_data_Access.get_current_status_document_node(file_name)
end_status_document_node = time.time()