Robust PDF loading for empty pages (#115)

lolipopshock · web-flow · commit cd295de9b923 · 2022-02-02T10:26:57.000-05:00
* More robust PDF loading for "empty PDFs"

* Add tests

* Fix tests
diff --git a/src/layoutparser/io/pdf.py b/src/layoutparser/io/pdf.py
@@ -53,6 +53,10 @@ def extract_words_for_page(
     )
 
     df = pd.DataFrame(tokens)
+    
+    if len(df) == 0:
+        return Layout()
+    
     df[["x0", "x1"]] = (
         df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")
     )
diff --git a/tests/fixtures/io/empty.pdf b/tests/fixtures/io/empty.pdf
diff --git a/tests/test_io.py b/tests/test_io.py
@@ -78,4 +78,11 @@ def test_pdf():
         assert attr_name in page_layout.page_data
 
     assert len(set(ele.type for ele in page_layout)) == 3
-    # Only three types of font show-up in the file
+    # Only three types of font show-up in the file
+    
+def test_empty_pdf():
+    pdf_layout = load_pdf("tests/fixtures/io/empty.pdf")
+    assert len(pdf_layout) == 1 # Only one page
+    
+    page_layout = pdf_layout[0]
+    assert len(page_layout) == 0 # No selectable tokens on the page

Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,10 @@ def extract_words_for_page(`
`53`	`53`	`    )`
`54`	`54`
`55`	`55`	`    df = pd.DataFrame(tokens)`
	`56`	`+`
	`57`	`+    if len(df) == 0:`
	`58`	`+        return Layout()`
	`59`	`+`
`56`	`60`	`    df[["x0", "x1"]] = (`
`57`	`61`	`        df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")`
`58`	`62`	`    )`