
Commit

update
manthanguptaa committed Dec 4, 2024
1 parent b667447 commit 7dfb970
Showing 9 changed files with 50 additions and 26 deletions.
21 changes: 21 additions & 0 deletions cookbook/chunking/semantic_chunking.py
@@ -0,0 +1,21 @@
from phi.agent import Agent
from phi.document.chunking.semantic import SemanticChunking
from phi.document.reader.pdf import PDFUrlReader
from phi.knowledge.pdf import PDFUrlKnowledgeBase
from phi.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://phi-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes", db_url=db_url),
    reader=PDFUrlReader(chunking_strategy=SemanticChunking()),
)
knowledge_base.load(recreate=False) # Comment out after first run

agent = Agent(
    knowledge_base=knowledge_base,
    search_knowledge=True,
)

agent.print_response("How to make Thai curry?", markdown=True)
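Since the reader's chunking_strategy is pluggable, the same cookbook pattern works with any other strategy from this commit. A minimal variant using FixedSizeChunking is sketched below; the table name recipes_fixed and the chunk_size/overlap values are illustrative, and a pgvector instance reachable at the db_url above is assumed.

```python
from phi.agent import Agent
from phi.document.chunking.fixed import FixedSizeChunking
from phi.document.reader.pdf import PDFUrlReader
from phi.knowledge.pdf import PDFUrlKnowledgeBase
from phi.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

# Same knowledge base as above, but split into fixed-size windows instead of semantic chunks.
knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://phi-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_fixed", db_url=db_url),  # illustrative table name
    reader=PDFUrlReader(chunking_strategy=FixedSizeChunking(chunk_size=2000, overlap=200)),
)
knowledge_base.load(recreate=False)  # Comment out after first run

agent = Agent(knowledge_base=knowledge_base, search_knowledge=True)
agent.print_response("How to make Thai curry?", markdown=True)
```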
2 changes: 1 addition & 1 deletion phi/document/chunking/agentic.py
@@ -1,6 +1,6 @@
 from typing import List, Optional
 
-from phi.document.chunking.base import ChunkingStrategy
+from phi.document.chunking.strategy import ChunkingStrategy
 from phi.document.base import Document
 from phi.model.openai import OpenAIChat
 from phi.model.base import Model
2 changes: 1 addition & 1 deletion phi/document/chunking/document.py
@@ -1,6 +1,6 @@
 from typing import List
 
-from phi.document.chunking.base import ChunkingStrategy
+from phi.document.chunking.strategy import ChunkingStrategy
 from phi.document.base import Document
 
 
4 changes: 2 additions & 2 deletions phi/document/chunking/fixed.py
@@ -1,10 +1,10 @@
 from typing import List
 
 from phi.document.base import Document
-from phi.document.chunking.base import ChunkingStrategy
+from phi.document.chunking.strategy import ChunkingStrategy
 
 
-class FixedChunking(ChunkingStrategy):
+class FixedSizeChunking(ChunkingStrategy):
     """Chunking strategy that splits text into fixed-size chunks with optional overlap"""
 
     def __init__(self, chunk_size: int = 5000, overlap: int = 0):
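For reference, a quick sketch of the renamed strategy used on its own; it assumes Document accepts content= and that FixedSizeChunking exposes the same chunk(document) method as the other strategies in this commit, with illustrative sizes.

```python
from phi.document.base import Document
from phi.document.chunking.fixed import FixedSizeChunking

# Split one document into ~200-character chunks that overlap by 20 characters.
chunker = FixedSizeChunking(chunk_size=200, overlap=20)
chunks = chunker.chunk(Document(content="Tom Kha Gai is a Thai coconut soup. " * 20))
print(len(chunks))
```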
2 changes: 1 addition & 1 deletion phi/document/chunking/recursive.py
@@ -1,7 +1,7 @@
 from typing import List
 
 from phi.document.base import Document
-from phi.document.chunking.base import ChunkingStrategy
+from phi.document.chunking.strategy import ChunkingStrategy
 
 
 class RecursiveChunking(ChunkingStrategy):
27 changes: 12 additions & 15 deletions phi/document/chunking/semantic.py
@@ -1,6 +1,8 @@
 from typing import List, Optional
 
-from phi.document.chunking.base import ChunkingStrategy
+from pydantic import Field
+
+from phi.document.chunking.strategy import ChunkingStrategy
 from phi.document.base import Document
 from phi.embedder.base import Embedder
 from phi.embedder.openai import OpenAIEmbedder
@@ -15,26 +17,21 @@
 class SemanticChunking(ChunkingStrategy):
     """Chunking strategy that splits text into semantic chunks using chonkie"""
 
-    def __init__(
-        self,
-        embedding_model: Optional[Embedder] = None,
-        chunk_size: int = 5000,
-        similarity_threshold: Optional[float] = 0.5,
-    ):
-        self.embedding_model = embedding_model or OpenAIEmbedder(model="text-embedding-3-small")
-        self.chunk_size = chunk_size
-        self.similarity_threshold = similarity_threshold
-        self.chunker = SemanticChunker(
-            embedding_model=self.embedding_model,
-            max_chunk_size=self.chunk_size,
-            similarity_threshold=self.similarity_threshold,
-        )
+    embedding_model: Embedder = Field(default_factory=OpenAIEmbedder(model="text-embedding-3-small"))
+    chunk_size: int = 5000
+    similarity_threshold: Optional[float] = 0.5
 
     def chunk(self, document: Document) -> List[Document]:
         """Split document into semantic chunks using chonkie"""
         if not document.content:
             return [document]
 
+        self.chunker = SemanticChunker(
+            embedding_model=self.embedding_model,
+            chunk_size=self.chunk_size,
+            similarity_threshold=self.similarity_threshold,
+        )
+
         # Use chonkie to split into semantic chunks
         chunks = self.chunker.chunk(self.clean_text(document.content))
 
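A minimal usage sketch of the field-based SemanticChunking, assuming chonkie is installed, an OpenAI API key is available for the embedder, and Document accepts content=; all values are illustrative.

```python
from phi.document.base import Document
from phi.document.chunking.semantic import SemanticChunking
from phi.embedder.openai import OpenAIEmbedder

# The strategy is now configured through pydantic fields rather than an __init__ override.
chunker = SemanticChunking(
    embedding_model=OpenAIEmbedder(model="text-embedding-3-small"),
    chunk_size=2000,
    similarity_threshold=0.5,
)

doc = Document(content="Tom Kha Gai is a coconut soup. Pad Thai is a stir-fried noodle dish.")
chunks = chunker.chunk(doc)  # chonkie's SemanticChunker performs the actual split
```

One caveat on the default shown above: pydantic's Field(default_factory=...) expects a zero-argument callable, so a default embedder is typically supplied via a lambda (or default=) rather than as an already-constructed instance.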
phi/document/chunking/base.py → phi/document/chunking/strategy.py
File renamed without changes.
13 changes: 8 additions & 5 deletions phi/document/reader/base.py
@@ -1,19 +1,22 @@
 from typing import Any, List, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
-from phi.document.chunking.base import ChunkingStrategy
-from phi.document.chunking.fixed import FixedChunking
+from phi.document.chunking.strategy import ChunkingStrategy
+from phi.document.chunking.fixed import FixedSizeChunking
 from phi.document.base import Document
 
 
 class Reader(BaseModel):
     """Base class for reading documents"""
 
     chunk: bool = True
     chunk_size: int = 3000
     separators: List[str] = ["\n", "\n\n", "\r", "\r\n", "\n\r", "\t", " ", "  "]
+    chunking_strategy: ChunkingStrategy = Field(default_factory=FixedSizeChunking)
 
-    def __init__(self, chunking_strategy: Optional[ChunkingStrategy] = None):
-        self.chunking_strategy = chunking_strategy or FixedChunking()
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
 
     def read(self, obj: Any) -> List[Document]:
         raise NotImplementedError
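Because chunking_strategy is now a plain pydantic field with a FixedSizeChunking default, any Reader subclass can be handed a different strategy at construction time. A small sketch under that assumption; StringReader is hypothetical and not part of the library, and Document is assumed to accept content=.

```python
from typing import Any, List

from phi.document.base import Document
from phi.document.chunking.semantic import SemanticChunking
from phi.document.reader.base import Reader


class StringReader(Reader):
    """Hypothetical reader that wraps a plain string in a single Document."""

    def read(self, obj: Any) -> List[Document]:
        return [Document(content=str(obj))]


default_reader = StringReader()  # falls back to FixedSizeChunking via default_factory
semantic_reader = StringReader(chunking_strategy=SemanticChunking())  # override per reader
```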
5 changes: 4 additions & 1 deletion phi/document/reader/pdf.py
@@ -1,7 +1,10 @@
+from dataclasses import Field
 from pathlib import Path
-from typing import List, Union, IO, Any
+from typing import List, Optional, Union, IO, Any
 
 from phi.document.base import Document
+from phi.document.chunking.base import ChunkingStrategy
+from phi.document.chunking.fixed import FixedChunking
 from phi.document.reader.base import Reader
 from phi.utils.log import logger
 
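Assuming this module also exposes a local-file PDFReader alongside PDFUrlReader (the class name, path, and chunk size below are assumptions, not confirmed by the diff), the same chunking_strategy keyword would apply:

```python
from pathlib import Path

from phi.document.chunking.fixed import FixedSizeChunking
from phi.document.reader.pdf import PDFReader  # assumed local-file counterpart of PDFUrlReader

# Read a local PDF and chunk it with the renamed fixed-size strategy.
reader = PDFReader(chunking_strategy=FixedSizeChunking(chunk_size=3000))
documents = reader.read(Path("data/ThaiRecipes.pdf"))  # illustrative path
print(len(documents))
```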
