Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ aider-chat>=0.35.0

# Knowledge graph database client
neo4j
docling

# Configuration file parsing
PyYAML
Expand Down
64 changes: 52 additions & 12 deletions src/knowledge/learners/paper_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@
# Extracts knowledge from research papers (PDFs).
# Parses sections, abstracts, formulas, and key findings.

import os
from typing import Any, Dict, List

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions, PictureDescriptionApiOptions
from docling_core.types.doc.document import PictureDescriptionData

from src.knowledge.learners.base import Learner, KnowledgeChunk
from src.knowledge.learners.factory import register_learner


@register_learner("paper")
class PaperLearner(Learner):
"""
Expand All @@ -34,22 +39,30 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:

Args:
source_data: Dict with "path" (local file) or "url" (remote PDF)

Returns:
List of KnowledgeChunk from the paper
"""
path = source_data.get("path", source_data.get("url", ""))

pipeline_options = PdfPipelineOptions(
do_formula_enrichment = True,
do_picture_description = True,
picture_description_options=self._create_picture_description_options(),
enable_remote_services=True,
)

converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options
)
}
)
result = converter.convert(path)
markdown_content = result.document.export_to_markdown()

chunks = []

# TODO: Implement actual PDF parsing
# 1. Load PDF (local or download from URL)
# 2. Extract text using PyPDF2 or pdfplumber
# 3. Identify sections (Abstract, Methods, Results, etc.)
# 4. Extract formulas using OCR if needed
# 5. Create structured chunks per section

# Placeholder: Create a single chunk indicating the source
# TODO: Convert markdown to KG.
chunks = []
chunks.append(KnowledgeChunk(
content=f"Paper knowledge from {path}",
chunk_type="text",
Expand All @@ -60,3 +73,30 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:
print(f"[PaperLearner] Learned from paper: {path}")
return chunks

def _create_picture_description_options(self) -> PictureDescriptionApiOptions:
    """
    Build the docling picture-description options that route image
    captioning through the OpenAI chat-completions API.

    Returns:
        PictureDescriptionApiOptions configured with the OpenAI endpoint,
        bearer-auth header, model parameters, and the description prompt.

    Raises:
        RuntimeError: if OPENAI_API_KEY is unset — fail fast with a clear
            message instead of sending an "Authorization: Bearer None"
            header and getting an opaque 401 from the API.
    """
    # Prompt sent verbatim to the vision model for every picture in the PDF.
    # (Fixed typo: "exampel" -> "example".)
    image_description_prompt = """
    Describe the picture in details. Make sure to include all the details, for example, convert flows and diagrams to text.
    Ignore examples, and details of messy diagrams. Only extract and summarize the main content and idea of the picture.
    put your description in the following format:
    <image_description>
    Textual description of the picture.
    </image_description>
    """

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY must be set to use picture description."
        )

    # TODO: Add compatibility for other LLM provider APIs.
    return PictureDescriptionApiOptions(
        url="https://api.openai.com/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}"
        },
        params=dict(
            model="gpt-4o",
            max_completion_tokens=500,
        ),
        prompt=image_description_prompt,
        timeout=90,
    )

12 changes: 12 additions & 0 deletions tests/learners/test_paper_learner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from src.knowledge.learners.paper_learner import PaperLearner


def test_paper_learner():
    """
    Smoke-test PaperLearner.learn against a public arXiv PDF.

    NOTE(review): this downloads a remote PDF and (via docling's picture
    description) may call a remote LLM API — it is an integration test,
    not a unit test. Consider marking it as such or mocking the converter.
    """
    learner = PaperLearner(params={})

    test_data = {"url": "https://arxiv.org/pdf/1706.03762"}
    chunks = learner.learn(test_data)

    # Previously this only printed the result and could never fail on bad
    # output; assert the documented contract instead.
    assert isinstance(chunks, list)
    assert chunks, "learn() should produce at least one KnowledgeChunk"

# Allow running this test directly as a script, without pytest.
if __name__ == "__main__":
    test_paper_learner()