Adding CSV Parser (#1996)
* Create csvparser.py

I am adding the CSV parser Python code.
It works with basic CSV files.

* Update prepdocs.py

Updating the CSV parser code and importing the CsvParser class

* Create test_csvparser.py

Added CSV test file

* Update test_csvparser.py

Formatted the file

* Update csvparser.py

Formatted the file

* Update prepdocs.py

* Update prepdocs.py

* Update csvparser.py

* Update prepdocs.py

* Fix prepdocs and tests to match main

---------

Co-authored-by: Pamela Fox <pamelafox@microsoft.com>
Co-authored-by: Pamela Fox <pamela.fox@gmail.com>
3 people authored Oct 2, 2024
1 parent a127523 commit 2dd7ba9
Showing 4 changed files with 94 additions and 4 deletions.
2 changes: 2 additions & 0 deletions app/backend/prepdocs.py
@@ -10,6 +10,7 @@

from load_azd_env import load_azd_env
from prepdocslib.blobmanager import BlobManager
+from prepdocslib.csvparser import CsvParser
from prepdocslib.embeddings import (
    AzureOpenAIEmbeddingService,
    ImageEmbeddings,
@@ -190,6 +191,7 @@ def setup_file_processors(
        ".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
        ".md": FileProcessor(TextParser(), sentence_text_splitter),
        ".txt": FileProcessor(TextParser(), sentence_text_splitter),
+        ".csv": FileProcessor(CsvParser(), sentence_text_splitter),
    }
    # These require either a Python package or Document Intelligence
    if pdf_parser is not None:
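
The new ".csv" entry above simply extends the extension-to-processor map used by prepdocs. As a rough sketch of how such a map is typically consulted (the helper name and error handling here are assumptions, not code from this commit), a lookup by file extension could look like this:

# Hypothetical lookup over a file_processors mapping like the one built in
# setup_file_processors; only the ".csv" key and FileProcessor values mirror the diff above.
import os


def pick_processor(file_processors: dict, filename: str):
    # Normalize the extension and raise for unsupported file types.
    ext = os.path.splitext(filename)[1].lower()
    processor = file_processors.get(ext)
    if processor is None:
        raise ValueError(f"No processor registered for extension {ext!r}")
    return processor
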
31 changes: 31 additions & 0 deletions app/backend/prepdocslib/csvparser.py
@@ -0,0 +1,31 @@
import csv
from typing import IO, AsyncGenerator

from .page import Page
from .parser import Parser


class CsvParser(Parser):
    """
    Concrete parser that can parse CSV into Page objects. Each row becomes a Page object.
    """

    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
        # Check if content is in bytes (binary file) and decode to string
        content_str: str
        if isinstance(content, (bytes, bytearray)):
            content_str = content.decode("utf-8")
        elif hasattr(content, "read"):  # Handle BufferedReader
            content_str = content.read().decode("utf-8")

        # Create a CSV reader from the text content
        reader = csv.reader(content_str.splitlines())
        offset = 0

        # Skip the header row
        next(reader, None)

        for i, row in enumerate(reader):
            page_text = ",".join(row)
            yield Page(i, offset, page_text)
            offset += len(page_text) + 1  # Account for newline character
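
As a quick illustration of how the new parser behaves (a standalone sketch, not part of this commit), the async generator can be driven directly with asyncio. The sample rows and the print call are made up; CsvParser, parse, and the Page fields come straight from the code and tests in this diff, and the import assumes prepdocslib is importable (e.g. running from app/backend).

# Minimal usage sketch (not from this commit): feed CsvParser a binary file-like
# object and iterate the yielded Page objects. The header row is skipped, and each
# subsequent row becomes one Page whose offset advances by len(text) + 1.
import asyncio
import io

from prepdocslib.csvparser import CsvParser


async def main():
    data = io.BytesIO(b"name,city\nAlice,Seattle\nBob,Paris")  # hypothetical CSV content
    parser = CsvParser()
    async for page in parser.parse(data):
        print(page.page_num, page.offset, page.text)


if __name__ == "__main__":
    asyncio.run(main())
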
8 changes: 4 additions & 4 deletions tests/test_app_config.py
@@ -63,7 +63,7 @@ async def test_app_user_upload_processors(monkeypatch, minimal_env):
    async with quart_app.test_app():
        ingester = quart_app.config[app.CONFIG_INGESTER]
        assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 5
+        assert len(ingester.file_processors.keys()) == 6


@pytest.mark.asyncio
@@ -77,7 +77,7 @@ async def test_app_user_upload_processors_docint(monkeypatch, minimal_env):
    async with quart_app.test_app():
        ingester = quart_app.config[app.CONFIG_INGESTER]
        assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 14
+        assert len(ingester.file_processors.keys()) == 15


@pytest.mark.asyncio
@@ -92,7 +92,7 @@ async def test_app_user_upload_processors_docint_localpdf(monkeypatch, minimal_env):
    async with quart_app.test_app():
        ingester = quart_app.config[app.CONFIG_INGESTER]
        assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 14
+        assert len(ingester.file_processors.keys()) == 15
        assert ingester.file_processors[".pdf"] is not ingester.file_processors[".pptx"]


@@ -108,7 +108,7 @@ async def test_app_user_upload_processors_docint_localhtml(monkeypatch, minimal_env):
    async with quart_app.test_app():
        ingester = quart_app.config[app.CONFIG_INGESTER]
        assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 14
+        assert len(ingester.file_processors.keys()) == 15
        assert ingester.file_processors[".html"] is not ingester.file_processors[".pptx"]


57 changes: 57 additions & 0 deletions tests/test_csvparser.py
@@ -0,0 +1,57 @@
import io

import pytest

from prepdocslib.csvparser import CsvParser  # Adjust import to the correct module


@pytest.mark.asyncio
async def test_csvparser_single_row():
    # Mock CSV content with a single row in binary format
    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3")
    file.name = "test.csv"
    csvparser = CsvParser()

    # Parse the file
    pages = [page async for page in csvparser.parse(file)]

    # Assertions
    assert len(pages) == 1
    assert pages[0].page_num == 0
    assert pages[0].offset == 0
    assert pages[0].text == "value1,value2,value3"


@pytest.mark.asyncio
async def test_csvparser_multiple_rows():
    # Mock CSV content with multiple rows in binary format
    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
    file.name = "test.csv"
    csvparser = CsvParser()

    # Parse the file
    pages = [page async for page in csvparser.parse(file)]

    # Assertions
    assert len(pages) == 2  # Expect only data rows, skipping the header
    assert pages[0].page_num == 0
    assert pages[0].offset == 0
    assert pages[0].text == "value1,value2,value3"

    assert pages[1].page_num == 1
    assert pages[1].offset == len(pages[0].text) + 1  # Length of the first row plus a newline
    assert pages[1].text == "value4,value5,value6"


@pytest.mark.asyncio
async def test_csvparser_empty_file():
    # Mock empty CSV content in binary format
    file = io.BytesIO(b"")
    file.name = "test.csv"
    csvparser = CsvParser()

    # Parse the file
    pages = [page async for page in csvparser.parse(file)]

    # Assertions
    assert len(pages) == 0  # No rows should be parsed from an empty file
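
These tests rely on @pytest.mark.asyncio, so they assume pytest with an async plugin such as pytest-asyncio is installed. A minimal sketch (not part of this commit) for running just this file programmatically:

# Assumes pytest and pytest-asyncio are available and the tests live at
# tests/test_csvparser.py relative to the working directory.
import sys

import pytest

if __name__ == "__main__":
    # Run only the CSV parser tests quietly and exit with pytest's return code.
    sys.exit(pytest.main(["-q", "tests/test_csvparser.py"]))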
