Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce pdf support (#7318) #7325

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
first version of tests for pdf
  • Loading branch information
yabramuvdi committed Dec 19, 2024
commit 0dcbdf90f4c8122771904d6d284deb88c45dd42f
59 changes: 59 additions & 0 deletions tests/features/test_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pytest

from datasets import Dataset, Features, Pdf

from ..utils import require_pdfplumber


def _read_pdf_bytes(pdf_path):
    """Return the raw bytes of the file at ``pdf_path``.

    A ``with`` block is used so the file handle is closed deterministically
    instead of being left for the garbage collector, which would otherwise
    trigger ``ResourceWarning`` for every parametrized case that reads bytes.
    """
    with open(pdf_path, "rb") as f:
        return f.read()


@require_pdfplumber
@pytest.mark.parametrize(
    "build_example",
    [
        # plain path string
        lambda pdf_path: pdf_path,
        # raw bytes
        _read_pdf_bytes,
        # dicts with every combination of "path" / "bytes" being set, None, or absent
        lambda pdf_path: {"path": pdf_path},
        lambda pdf_path: {"path": pdf_path, "bytes": None},
        lambda pdf_path: {"path": pdf_path, "bytes": _read_pdf_bytes(pdf_path)},
        lambda pdf_path: {"path": None, "bytes": _read_pdf_bytes(pdf_path)},
        lambda pdf_path: {"bytes": _read_pdf_bytes(pdf_path)},
    ],
)
def test_pdf_feature_encode_example(shared_datadir, build_example):
    """Check that ``Pdf.encode_example`` normalizes every supported input form.

    Args:
        shared_datadir: Fixture directory that contains ``test_pdf.pdf``.
        build_example: Callable turning the PDF path into the value handed to
            ``Pdf.encode_example`` (a path, raw bytes, or a dict).

    The encoded value must be a dict with exactly the keys ``"bytes"`` and
    ``"path"``, at least one of them non-``None``, and decoding it must yield
    a ``pdfplumber.pdf.PDF`` instance.
    """
    # Imported lazily so the module can still be collected when pdfplumber is
    # missing; ``require_pdfplumber`` is expected to skip the test in that case.
    import pdfplumber

    pdf_path = str(shared_datadir / "test_pdf.pdf")
    pdf = Pdf()
    encoded_example = pdf.encode_example(build_example(pdf_path))
    assert isinstance(encoded_example, dict)
    assert encoded_example.keys() == {"bytes", "path"}
    assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
    decoded_example = pdf.decode_example(encoded_example)
    assert isinstance(decoded_example, pdfplumber.pdf.PDF)


@require_pdfplumber
def test_dataset_with_pdf_feature(shared_datadir):
    """Check that a ``Pdf`` column decodes to pdfplumber objects on access.

    A one-row dataset is built first from a file path and then from raw bytes;
    single-row, slice and column access are verified for the path-based
    dataset, and single-row access for the bytes-based one.
    """
    import pdfplumber

    expected_cls = pdfplumber.pdf.PDF
    pdf_path = str(shared_datadir / "test_pdf.pdf")
    features = Features({"pdf": Pdf()})

    # --- dataset built from a path -------------------------------------
    path_dset = Dataset.from_dict({"pdf": [pdf_path]}, features=features)

    # single-row access
    row = path_dset[0]
    assert row.keys() == {"pdf"}
    assert isinstance(row["pdf"], expected_cls)

    # slice access
    batch = path_dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"pdf"}
    assert isinstance(batch["pdf"], list)
    for decoded in batch["pdf"]:
        assert isinstance(decoded, expected_cls)

    # column access
    column = path_dset["pdf"]
    assert len(column) == 1
    assert isinstance(column, list)
    for decoded in column:
        assert isinstance(decoded, expected_cls)

    # --- dataset built from raw bytes ----------------------------------
    with open(pdf_path, "rb") as fh:
        raw_data = {"pdf": [fh.read()]}
    bytes_dset = Dataset.from_dict(raw_data, features=features)
    row = bytes_dset[0]
    assert row.keys() == {"pdf"}
    assert isinstance(row["pdf"], expected_cls)