Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ aider-chat>=0.35.0

# Knowledge graph database client
neo4j
docling

# Configuration file parsing
PyYAML
Expand Down
64 changes: 52 additions & 12 deletions src/knowledge/learners/paper_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@
# Extracts knowledge from research papers (PDFs).
# Parses sections, abstracts, formulas, and key findings.

import os
from typing import Any, Dict, List

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions, PictureDescriptionApiOptions
from docling_core.types.doc.document import PictureDescriptionData

from src.knowledge.learners.base import Learner, KnowledgeChunk
from src.knowledge.learners.factory import register_learner


@register_learner("paper")
class PaperLearner(Learner):
"""
Expand All @@ -34,22 +39,30 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:

Args:
source_data: Dict with "path" (local file) or "url" (remote PDF)

Returns:
List of KnowledgeChunk from the paper
"""
path = source_data.get("path", source_data.get("url", ""))

pipeline_options = PdfPipelineOptions(
do_formula_enrichment = True,
do_picture_description = True,
picture_description_options=self._create_picture_description_options(),
enable_remote_services=True,
)

converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options
)
}
)
result = converter.convert(path)
markdown_content = result.document.export_to_markdown()

chunks = []

# TODO: Implement actual PDF parsing
# 1. Load PDF (local or download from URL)
# 2. Extract text using PyPDF2 or pdfplumber
# 3. Identify sections (Abstract, Methods, Results, etc.)
# 4. Extract formulas using OCR if needed
# 5. Create structured chunks per section

# Placeholder: Create a single chunk indicating the source
# TODO: Convert markdown to KG.
chunks = []
chunks.append(KnowledgeChunk(
content=f"Paper knowledge from {path}",
chunk_type="text",
Expand All @@ -60,3 +73,30 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:
print(f"[PaperLearner] Learned from paper: {path}")
return chunks

def _create_picture_description_options(self) -> PictureDescriptionApiOptions:
    """
    Build the docling picture-description options that route image
    captioning through the OpenAI chat-completions API.

    Returns:
        PictureDescriptionApiOptions configured with the OpenAI endpoint,
        bearer-auth header, model parameters, and the description prompt.

    Raises:
        RuntimeError: if OPENAI_API_KEY is unset — fail fast with a clear
            message instead of sending an "Authorization: Bearer None"
            header and getting an opaque 401 from the API.
    """
    # Prompt sent verbatim to the vision model for every picture in the PDF.
    # (Fixed typo: "exampel" -> "example".)
    image_description_prompt = """
    Describe the picture in details. Make sure to include all the details, for example, convert flows and diagrams to text.
    Ignore examples, and details of messy diagrams. Only extract and summarize the main content and idea of the picture.
    put your description in the following format:
    <image_description>
    Textual description of the picture.
    </image_description>
    """

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY must be set to use picture description."
        )

    # TODO: Add compatibility for other LLM provider APIs.
    return PictureDescriptionApiOptions(
        url="https://api.openai.com/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}"
        },
        params=dict(
            model="gpt-4o",
            max_completion_tokens=500,
        ),
        prompt=image_description_prompt,
        timeout=90,
    )

12 changes: 12 additions & 0 deletions tests/learners/test_paper_learner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from src.knowledge.learners.paper_learner import PaperLearner


def test_paper_learner():
    """
    Smoke-test PaperLearner.learn against a public arXiv PDF.

    NOTE(review): this downloads a remote PDF and (via docling's picture
    description) may call a remote LLM API — it is an integration test,
    not a unit test. Consider marking it as such or mocking the converter.
    """
    learner = PaperLearner(params={})

    test_data = {"url": "https://arxiv.org/pdf/1706.03762"}
    chunks = learner.learn(test_data)

    # Previously this only printed the result and could never fail on bad
    # output; assert the documented contract instead.
    assert isinstance(chunks, list)
    assert chunks, "learn() should produce at least one KnowledgeChunk"

# Allow running this test directly as a script, without pytest.
if __name__ == "__main__":
    test_paper_learner()