diff --git a/cookbook/chunking/semantic_chunking.py b/cookbook/chunking/semantic_chunking.py new file mode 100644 index 000000000..f532796ab --- /dev/null +++ b/cookbook/chunking/semantic_chunking.py @@ -0,0 +1,21 @@ +from phi.agent import Agent +from phi.document.chunking.semantic import SemanticChunking +from phi.document.reader.pdf import PDFUrlReader +from phi.knowledge.pdf import PDFUrlKnowledgeBase +from phi.vectordb.pgvector import PgVector + +db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai" + +knowledge_base = PDFUrlKnowledgeBase( + urls=["https://phi-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"], + vector_db=PgVector(table_name="recipes", db_url=db_url), + reader=PDFUrlReader(chunking_strategy=SemanticChunking()), +) +knowledge_base.load(recreate=False) # Comment out after first run + +agent = Agent( + knowledge_base=knowledge_base, + search_knowledge=True, +) + +agent.print_response("How to make Thai curry?", markdown=True) diff --git a/phi/document/chunking/agentic.py b/phi/document/chunking/agentic.py index f8b3e1c17..e9b5ef7be 100644 --- a/phi/document/chunking/agentic.py +++ b/phi/document/chunking/agentic.py @@ -1,6 +1,6 @@ from typing import List, Optional -from phi.document.chunking.base import ChunkingStrategy +from phi.document.chunking.strategy import ChunkingStrategy from phi.document.base import Document from phi.model.openai import OpenAIChat from phi.model.base import Model diff --git a/phi/document/chunking/document.py b/phi/document/chunking/document.py index b0ad81e85..51267459f 100644 --- a/phi/document/chunking/document.py +++ b/phi/document/chunking/document.py @@ -1,6 +1,6 @@ from typing import List -from phi.document.chunking.base import ChunkingStrategy +from phi.document.chunking.strategy import ChunkingStrategy from phi.document.base import Document diff --git a/phi/document/chunking/fixed.py b/phi/document/chunking/fixed.py index c1fbf81d8..95ffe3fb0 100644 --- a/phi/document/chunking/fixed.py +++ 
b/phi/document/chunking/fixed.py @@ -1,10 +1,10 @@ from typing import List from phi.document.base import Document -from phi.document.chunking.base import ChunkingStrategy +from phi.document.chunking.strategy import ChunkingStrategy -class FixedChunking(ChunkingStrategy): +class FixedSizeChunking(ChunkingStrategy): """Chunking strategy that splits text into fixed-size chunks with optional overlap""" def __init__(self, chunk_size: int = 5000, overlap: int = 0): diff --git a/phi/document/chunking/recursive.py b/phi/document/chunking/recursive.py index bb3460083..662a9218c 100644 --- a/phi/document/chunking/recursive.py +++ b/phi/document/chunking/recursive.py @@ -1,7 +1,7 @@ from typing import List from phi.document.base import Document -from phi.document.chunking.base import ChunkingStrategy +from phi.document.chunking.strategy import ChunkingStrategy class RecursiveChunking(ChunkingStrategy): diff --git a/phi/document/chunking/semantic.py b/phi/document/chunking/semantic.py index 728d7ff88..655571697 100644 --- a/phi/document/chunking/semantic.py +++ b/phi/document/chunking/semantic.py @@ -1,6 +1,8 @@ from typing import List, Optional -from phi.document.chunking.base import ChunkingStrategy +from pydantic import Field + +from phi.document.chunking.strategy import ChunkingStrategy from phi.document.base import Document from phi.embedder.base import Embedder from phi.embedder.openai import OpenAIEmbedder @@ -15,26 +17,21 @@ class SemanticChunking(ChunkingStrategy): """Chunking strategy that splits text into semantic chunks using chonkie""" - def __init__( - self, - embedding_model: Optional[Embedder] = None, - chunk_size: int = 5000, - similarity_threshold: Optional[float] = 0.5, - ): - self.embedding_model = embedding_model or OpenAIEmbedder(model="text-embedding-3-small") - self.chunk_size = chunk_size - self.similarity_threshold = similarity_threshold - self.chunker = SemanticChunker( - embedding_model=self.embedding_model, - max_chunk_size=self.chunk_size, - 
similarity_threshold=self.similarity_threshold, - ) + embedding_model: Embedder = Field(default_factory=lambda: OpenAIEmbedder(model="text-embedding-3-small")) + chunk_size: int = 5000 + similarity_threshold: Optional[float] = 0.5 def chunk(self, document: Document) -> List[Document]: """Split document into semantic chunks using chonkie""" if not document.content: return [document] + self.chunker = SemanticChunker( + embedding_model=self.embedding_model, + chunk_size=self.chunk_size, + similarity_threshold=self.similarity_threshold, + ) + # Use chonkie to split into semantic chunks chunks = self.chunker.chunk(self.clean_text(document.content)) diff --git a/phi/document/chunking/base.py b/phi/document/chunking/strategy.py similarity index 100% rename from phi/document/chunking/base.py rename to phi/document/chunking/strategy.py diff --git a/phi/document/reader/base.py b/phi/document/reader/base.py index 044e99a81..2ec0e328c 100644 --- a/phi/document/reader/base.py +++ b/phi/document/reader/base.py @@ -1,19 +1,22 @@ from typing import Any, List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, Field -from phi.document.chunking.base import ChunkingStrategy -from phi.document.chunking.fixed import FixedChunking +from phi.document.chunking.strategy import ChunkingStrategy +from phi.document.chunking.fixed import FixedSizeChunking from phi.document.base import Document class Reader(BaseModel): + """Base class for reading documents""" + chunk: bool = True chunk_size: int = 3000 separators: List[str] = ["\n", "\n\n", "\r", "\r\n", "\n\r", "\t", " ", " "] + chunking_strategy: ChunkingStrategy = Field(default_factory=FixedSizeChunking) - def __init__(self, chunking_strategy: Optional[ChunkingStrategy] = None): - self.chunking_strategy = chunking_strategy or FixedChunking() + def __init__(self, **kwargs): + super().__init__(**kwargs) def read(self, obj: Any) -> List[Document]: raise NotImplementedError diff --git a/phi/document/reader/pdf.py 
b/phi/document/reader/pdf.py index a2cae897a..883666445 100644 --- a/phi/document/reader/pdf.py +++ b/phi/document/reader/pdf.py @@ -1,7 +1,10 @@ +from pydantic import Field from pathlib import Path -from typing import List, Union, IO, Any +from typing import List, Optional, Union, IO, Any from phi.document.base import Document +from phi.document.chunking.strategy import ChunkingStrategy +from phi.document.chunking.fixed import FixedSizeChunking from phi.document.reader.base import Reader from phi.utils.log import logger