Skip to content

Commit

Permalink
Feat: CSV Url Knowledgebase (#1543)
Browse files Browse the repository at this point in the history
## Description
- This PR adds `CSVUrlKnowledgeBase`, which allows loading CSV files from
a URL

## Type of change

Please check the options that are relevant:

- [ ] Bug fix (non-breaking change which fixes an issue)
- [x] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [ ] Model update
- [ ] Infrastructure change

## Checklist

- [x] My code follows Phidata's style guidelines and best practices
- [ ] I have performed a self-review of my code
- [x] I have added docstrings and comments for complex logic
- [ ] My changes generate no new warnings or errors
- [ ] I have added cookbook examples for my new addition (if needed)
- [ ] I have updated requirements.txt/pyproject.toml (if needed)
- [ ] I have verified my changes in a clean environment

## Additional Notes

Include any deployment notes, performance implications, or other
relevant information:

---------

Co-authored-by: Dirk Brand <dirk@volter.ai>
  • Loading branch information
manthanguptaa and dirkvolter authored Dec 12, 2024
1 parent a2497d4 commit e5c11ac
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 5 deletions.
18 changes: 18 additions & 0 deletions cookbook/knowledge/csv_url_kb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from phi.agent import Agent
from phi.knowledge.csv import CSVUrlKnowledgeBase
from phi.vectordb.pgvector import PgVector

# Connection string for the Postgres database that backs the vector store.
db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

# Knowledge base populated from a CSV file fetched over HTTP.
knowledge_base = CSVUrlKnowledgeBase(
    urls=[
        "https://phi-public.s3.amazonaws.com/csvs/employees.csv",
    ],
    vector_db=PgVector(
        table_name="csv_documents",
        db_url=db_url,
    ),
)
# Comment out after first run
knowledge_base.load(recreate=False)

# Agent that is allowed to search the knowledge base when answering.
agent = Agent(
    knowledge_base=knowledge_base,
    search_knowledge=True,
)

agent.print_response(
    "What is the average salary of employees in the Marketing department?",
    markdown=True,
)
37 changes: 37 additions & 0 deletions phi/document/reader/csv_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import csv
import os
from pathlib import Path
from typing import List, Union, IO, Any
from urllib.parse import urlparse

from phi.document.base import Document
from phi.document.reader.base import Reader
from phi.utils.log import logger
Expand Down Expand Up @@ -48,3 +51,37 @@ def read(self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str
except Exception as e:
logger.error(f"Error reading: {file.name if isinstance(file, IO) else file}: {e}")
return []


class CSVUrlReader(Reader):
    """Reader for CSV files that are downloaded from a URL."""

    def read(self, url: str) -> List[Document]:
        """Download the CSV at ``url`` and parse it with ``CSVReader``.

        Args:
            url: Address of the CSV file to fetch.

        Returns:
            The documents produced by ``CSVReader().read`` for the
            downloaded content.

        Raises:
            ValueError: If ``url`` is empty.
            ImportError: If ``httpx`` is not installed.
            httpx.HTTPStatusError: If the response carries an error status.
        """
        if not url:
            raise ValueError("No URL provided")

        try:
            import httpx
        except ImportError:
            raise ImportError("`httpx` not installed")

        # Imported here so the block does not depend on a module-level
        # `import io`, which is not among the imports visible for this file.
        from io import BytesIO

        logger.info(f"Reading: {url}")
        response = httpx.get(url)

        try:
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
            raise

        # Derive a file name from the URL path; fall back to a generic name
        # when the path has no final component (e.g. it ends with "/").
        parsed_url = urlparse(url)
        filename = os.path.basename(parsed_url.path) or "data.csv"

        # The context manager guarantees the in-memory buffer is released
        # even if parsing raises.
        with BytesIO(response.content) as file_obj:
            # CSVReader looks at `file.name`, so give the buffer one.
            file_obj.name = filename
            return CSVReader().read(file=file_obj)
6 changes: 6 additions & 0 deletions phi/document/reader/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ def read(self, url: str) -> List[Document]:
logger.info(f"Reading: {url}")
response = httpx.get(url)

try:
response.raise_for_status()
except httpx.HTTPStatusError as e:
logger.error(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
raise

doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
doc_reader = DocumentReader(BytesIO(response.content))

Expand Down
16 changes: 15 additions & 1 deletion phi/knowledge/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from typing import Union, List, Iterator

from phi.document import Document
from phi.document.reader.csv_reader import CSVReader
from phi.document.reader.csv_reader import CSVReader, CSVUrlReader
from phi.knowledge.agent import AgentKnowledge
from phi.utils.log import logger


class CSVKnowledgeBase(AgentKnowledge):
Expand All @@ -26,3 +27,16 @@ def document_lists(self) -> Iterator[List[Document]]:
yield self.reader.read(file=_csv)
elif _csv_path.exists() and _csv_path.is_file() and _csv_path.suffix == ".csv":
yield self.reader.read(file=_csv_path)


class CSVUrlKnowledgeBase(AgentKnowledge):
    """Knowledge base whose documents come from CSV files fetched by URL."""

    # URLs of the CSV files to load.
    urls: List[str]
    # Reader used to download and parse each URL.
    reader: CSVUrlReader = CSVUrlReader()

    @property
    def document_lists(self) -> Iterator[List[Document]]:
        """Yield one list of documents per URL that ends with ``.csv``.

        URLs without that suffix are logged as errors and skipped.
        """
        for csv_url in self.urls:
            # Guard clause: report and skip anything that is not a .csv URL.
            if not csv_url.endswith(".csv"):
                logger.error(f"Unsupported URL: {csv_url}")
                continue
            yield self.reader.read(url=csv_url)
7 changes: 6 additions & 1 deletion phi/knowledge/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from phi.document import Document
from phi.document.reader.pdf import PDFReader, PDFUrlReader, PDFImageReader, PDFUrlImageReader
from phi.knowledge.agent import AgentKnowledge
from phi.utils.log import logger


class PDFKnowledgeBase(AgentKnowledge):
Expand Down Expand Up @@ -42,4 +43,8 @@ def document_lists(self) -> Iterator[List[Document]]:
"""

for url in self.urls:
yield self.reader.read(url=url)
if url.endswith(".pdf"):
yield self.reader.read(url=url)
else:
logger.error(f"Unsupported URL: {url}")

2 changes: 1 addition & 1 deletion phi/tools/models_labs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(
self.register(self.generate_media)

def generate_media(self, agent: Agent, prompt: str) -> str:
"""Use this function to generate a video given a prompt.
"""Use this function to generate a video or image given a prompt.
Args:
prompt (str): A text description of the desired video.
Expand Down
4 changes: 2 additions & 2 deletions phi/vectordb/pgvector/pgvector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
from sqlalchemy.sql.expression import text, func, select, desc, bindparam
from sqlalchemy.types import DateTime, String
except ImportError:
raise ImportError("`sqlalchemy` not installed")
raise ImportError("`sqlalchemy` not installed. Please install using `pip install sqlalchemy psycopg`")

try:
from pgvector.sqlalchemy import Vector
except ImportError:
raise ImportError("`pgvector` not installed")
raise ImportError("`pgvector` not installed. Please install using `pip install pgvector`")

from phi.document import Document
from phi.embedder import Embedder
Expand Down

0 comments on commit e5c11ac

Please sign in to comment.