Skip to content

Commit cd295de

Browse files
authored
Robust pdf loading for empty pages (#115)
* more robust PDF loading for "empty PDFs"
* Tests
* fix tests
1 parent 79bd1af commit cd295de

File tree

3 files changed

+12
-1
lines changed

3 files changed

+12
-1
lines changed

src/layoutparser/io/pdf.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ def extract_words_for_page(
5353
)
5454

5555
df = pd.DataFrame(tokens)
56+
57+
if len(df) == 0:
58+
return Layout()
59+
5660
df[["x0", "x1"]] = (
5761
df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")
5862
)

tests/fixtures/io/empty.pdf

17.8 KB
Binary file not shown.

tests/test_io.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,11 @@ def test_pdf():
7878
assert attr_name in page_layout.page_data
7979

8080
assert len(set(ele.type for ele in page_layout)) == 3
81-
# Only three types of font show-up in the file
81+
# Only three types of font show-up in the file
82+
83+
def test_empty_pdf():
84+
pdf_layout = load_pdf("tests/fixtures/io/empty.pdf")
85+
assert len(pdf_layout) == 1 # Only one page
86+
87+
page_layout = pdf_layout[0]
88+
assert len(page_layout) == 0 # No selectable tokens on the page

0 commit comments

Comments (0)