Skip to content

Commit

Permalink
Fix docling issues (crewAIInc#1909)
Browse files Browse the repository at this point in the history
* Fix docling issues

* update docs
  • Loading branch information
bhancockio authored Jan 16, 2025
1 parent b5779dc commit cc129a0
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 5 deletions.
6 changes: 6 additions & 0 deletions docs/concepts/knowledge.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ result = crew.kickoff(inputs={"question": "What city does John live in and how o

Here's another example using `CrewDoclingSource`, which is quite versatile and can handle multiple file formats, including TXT, PDF, DOCX, HTML, and more.

<Note>
You need to install `docling` for the following example to work: `uv add docling`
</Note>



```python Code
from crewai import LLM, Agent, Crew, Process, Task
from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
Expand Down
11 changes: 6 additions & 5 deletions src/crewai/knowledge/source/crew_docling_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from docling.exceptions import ConversionError
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.types.doc.document import DoclingDocument

DOCLING_AVAILABLE = True
except ImportError:
DOCLING_AVAILABLE = False
Expand Down Expand Up @@ -38,8 +39,8 @@ def __init__(self, *args, **kwargs):
file_paths: List[Union[Path, str]] = Field(default_factory=list)
chunks: List[str] = Field(default_factory=list)
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
content: List[DoclingDocument] = Field(default_factory=list)
document_converter: DocumentConverter = Field(
content: List["DoclingDocument"] = Field(default_factory=list)
document_converter: "DocumentConverter" = Field(
default_factory=lambda: DocumentConverter(
allowed_formats=[
InputFormat.MD,
Expand All @@ -65,7 +66,7 @@ def model_post_init(self, _) -> None:
self.safe_file_paths = self.validate_content()
self.content = self._load_content()

def _load_content(self) -> List[DoclingDocument]:
def _load_content(self) -> List["DoclingDocument"]:
try:
return self._convert_source_to_docling_documents()
except ConversionError as e:
Expand All @@ -87,11 +88,11 @@ def add(self) -> None:
self.chunks.extend(list(new_chunks_iterable))
self._save_documents()

def _convert_source_to_docling_documents(self) -> List[DoclingDocument]:
def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
return [result.document for result in conv_results_iter]

def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
def _chunk_doc(self, doc: "DoclingDocument") -> Iterator[str]:
chunker = HierarchicalChunker()
for chunk in chunker.chunk(doc):
yield chunk.text
Expand Down

0 comments on commit cc129a0

Please sign in to comment.