Some utility methods for logical structure (#1095)
dhdaines authored Mar 3, 2024
1 parent efca277 commit 207312e
Showing 4 changed files with 271 additions and 12 deletions.
15 changes: 15 additions & 0 deletions docs/structure.md
@@ -59,3 +59,18 @@ In this case, because marked content IDs are specific to a given page,
each element will also have a `page_number` attribute, which is the
number of the page containing (partially or completely) this element,
indexed from 1 (for consistency with `pdfplumber.Page`).
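
For instance, to see which page each top-level element of the
document-wide structure tree starts on (a minimal sketch, assuming the
document-wide `structure_tree` property described above; the file name
is hypothetical):

import pdfplumber

with pdfplumber.open("document.pdf") as pdf:
    # Each element dict carries a 1-indexed page_number
    # (the key is absent if the page is unknown)
    for elem in pdf.structure_tree:
        print(elem["type"], elem.get("page_number"))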

You can also access the underlying `PDFStructTree` object for more
flexibility, including visual debugging. For instance, to plot the
bounding boxes of the contents of all of the `TD` elements on the
first page of a document:

from pdfplumber.structure import PDFStructTree

page = pdf.pages[0]
stree = PDFStructTree(pdf, page)
img = page.to_image()
img.draw_rects(stree.element_bbox(td) for td in stree.find_all("TD"))

The `find_all` method works rather like the same method in
[BeautifulSoup](https://beautiful-soup-4.readthedocs.io/en/latest/#searching-the-tree) -
it takes an element name, a regular expression, or a matching
function.
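
For example, to search a whole document's structure tree in each of
these three ways (a minimal sketch; the file name is hypothetical):

import re
from pdfplumber.structure import PDFStructTree

pdf = pdfplumber.open("document.pdf")
stree = PDFStructTree(pdf)

# Exact structure-element name
tables = list(stree.find_all("Table"))
# Compiled regular expression matched against the element name
headings = list(stree.find_all(re.compile(r"^H[1-6]$")))
# Predicate function called with each PDFStructElement
leaves = list(stree.find_all(lambda el: not el.children))

The companion `find` method returns the first match, or `None` if
there is none.
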
165 changes: 154 additions & 11 deletions pdfplumber/structure.py
@@ -1,15 +1,29 @@
import itertools
import logging
import re
from collections import deque
from dataclasses import asdict, dataclass, field
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Pattern,
Tuple,
Union,
)

from pdfminer.data_structures import NumberTree
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjRef, resolve1
from pdfminer.psparser import PSLiteral

from .utils import decode_text
from ._typing import T_bbox, T_obj
from .utils import decode_text, geometry

logger = logging.getLogger(__name__)

@@ -19,8 +33,73 @@
from .pdf import PDF


MatchFunc = Callable[["PDFStructElement"], bool]


def _find_all(
elements: Iterable["PDFStructElement"],
matcher: Union[str, Pattern[str], MatchFunc],
) -> Iterator["PDFStructElement"]:
"""
Common code for `find_all()` in trees and elements.
"""

def match_tag(x: "PDFStructElement") -> bool:
"""Match an element name."""
return x.type == matcher

def match_regex(x: "PDFStructElement") -> bool:
"""Match an element name by regular expression."""
return matcher.match(x.type) # type: ignore

if isinstance(matcher, str):
match_func = match_tag
elif isinstance(matcher, re.Pattern):
match_func = match_regex
else:
match_func = matcher # type: ignore
d = deque(elements)
while d:
el = d.popleft()
if match_func(el):
yield el
d.extendleft(reversed(el.children))


class Findable:
"""find() and find_all() methods that can be inherited to avoid
repeating oneself"""

children: List["PDFStructElement"]

def find_all(
self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Iterator["PDFStructElement"]:
"""Iterate depth-first over matching elements in subtree.
The `matcher` argument is either an element name, a regular
expression, or a function taking a `PDFStructElement` and
returning `True` if the element matches.
"""
return _find_all(self.children, matcher)

def find(
self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Optional["PDFStructElement"]:
"""Find the first matching element in subtree.
The `matcher` argument is either an element name, a regular
expression, or a function taking a `PDFStructElement` and
returning `True` if the element matches.
"""
try:
return next(_find_all(self.children, matcher))
except StopIteration:
return None


@dataclass
class PDFStructElement:
class PDFStructElement(Findable):
type: str
revision: Optional[int]
id: Optional[str]
@@ -36,9 +115,24 @@ class PDFStructElement:
def __iter__(self) -> Iterator["PDFStructElement"]:
return iter(self.children)

def all_mcids(self) -> Iterator[Tuple[Optional[int], int]]:
"""Collect all MCIDs (with their page numbers, if there are
multiple pages in the tree) inside a structure element.
"""
# Collect them depth-first to preserve ordering
for mcid in self.mcids:
yield self.page_number, mcid
d = deque(self.children)
while d:
el = d.popleft()
for mcid in el.mcids:
yield el.page_number, mcid
d.extendleft(reversed(el.children))

def to_dict(self) -> Dict[str, Any]:
"""Return a compacted dict representation."""
r = asdict(self)
# Prune empty values (does not matter in which order)
d = deque([r])
while d:
el = d.popleft()
@@ -54,7 +148,7 @@ class StructTreeMissing(ValueError):
pass


class PDFStructTree:
class PDFStructTree(Findable):
"""Parse the structure tree of a PDF.
The constructor takes a `pdfplumber.PDF` and optionally a
@@ -72,7 +166,7 @@ class PDFStructTree:
"""

page: Optional[PDFPage]
page: Optional["Page"]

def __init__(self, doc: "PDF", page: Optional["Page"] = None):
self.doc = doc.doc
@@ -88,7 +182,8 @@ def __init__(self, doc: "PDF", page: Optional["Page"] = None):
# span multiple pages, and the "Pg" attribute is *optional*,
# so this is the approved way to get a page's structure...
if page is not None:
self.page = page.page_obj
self.page = page
self.pages = {page.page_number: page}
self.page_dict = None
# ...EXCEPT that the ParentTree is sometimes missing, in which
# case we fall back to the non-approved way.
@@ -102,9 +197,9 @@ def __init__(self, doc: "PDF", page: Optional["Page"] = None):
# structure tree) then there is no `StructParents`.
# Note however that if there are XObjects in a page,
# *they* may have `StructParent` (not `StructParents`)
if "StructParents" not in self.page.attrs:
if "StructParents" not in self.page.page_obj.attrs:
return
parent_id = self.page.attrs["StructParents"]
parent_id = self.page.page_obj.attrs["StructParents"]
# NumberTree should have a `get` method like it does in pdf.js...
parent_array = resolve1(
next(array for num, array in parent_tree.values if num == parent_id)
@@ -113,8 +208,9 @@ def __init__(self, doc: "PDF", page: Optional["Page"] = None):
else:
self.page = None
# Overhead of creating pages shouldn't be too bad we hope!
self.pages = {page.page_number: page for page in doc.pages}
self.page_dict = {
page.page_obj.pageid: page.page_number for page in doc.pages
page.page_obj.pageid: page.page_number for page in self.pages.values()
}
self._parse_struct_tree()

@@ -246,7 +342,7 @@ def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
return page_objid in self.page_dict
if self.page is not None:
# We have to do this to satisfy mypy
if page_objid != self.page.pageid:
if page_objid != self.page.page_obj.pageid:
return False
return True

@@ -364,3 +460,50 @@ def _resolve_children(self, seen: Dict[str, Any]) -> None:

def __iter__(self) -> Iterator[PDFStructElement]:
return iter(self.children)

def element_bbox(self, el: PDFStructElement) -> T_bbox:
"""Get the bounding box for an element for visual debugging."""
page = None
if self.page is not None:
page = self.page
elif el.page_number is not None:
page = self.pages[el.page_number]
bbox = el.attributes.get("BBox", None)
if page is not None and bbox is not None:
from .page import CroppedPage, _invert_box, _normalize_box

# Use secret knowledge of CroppedPage (cannot use
# page.height because it is the *cropped* dimension, but
# cropping does not actually translate coordinates)
bbox = _invert_box(
_normalize_box(bbox), page.mediabox[3] - page.mediabox[1]
)
# Use more secret knowledge of CroppedPage
if isinstance(page, CroppedPage):
rect = geometry.bbox_to_rect(bbox)
rects = page._crop_fn([rect])
if not rects:
raise IndexError("Element no longer on page")
return geometry.obj_to_bbox(rects[0])
else:
# Not sure why mypy complains here
return bbox # type: ignore
else:
mcid_objs = []
for page_number, mcid in el.all_mcids():
objects: Iterable[T_obj]
if page_number is None:
if page is not None:
objects = itertools.chain.from_iterable(page.objects.values())
else:
objects = [] # pragma: nocover
else:
objects = itertools.chain.from_iterable(
self.pages[page_number].objects.values()
)
for c in objects:
if c["mcid"] == mcid:
mcid_objs.append(c)
if not mcid_objs:
raise IndexError("No objects found") # pragma: nocover
return geometry.objects_to_bbox(mcid_objs)
3 changes: 2 additions & 1 deletion pdfplumber/utils/geometry.py
@@ -84,7 +84,8 @@ def clip_obj(obj: T_obj, bbox: T_bbox) -> Optional[T_obj]:
copy[attr] = dims[attr]

diff = dims["top"] - obj["top"]
copy["doctop"] = obj["doctop"] + diff
if "doctop" in copy:
copy["doctop"] = obj["doctop"] + diff
copy["width"] = copy["x1"] - copy["x0"]
copy["height"] = copy["bottom"] - copy["top"]

100 changes: 100 additions & 0 deletions tests/test_structure.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

import os
import re
import unittest
from collections import deque

@@ -863,6 +864,105 @@ def test_structure_tree_class(self):
doc_elem = next(iter(stree))
assert [k.type for k in doc_elem] == ["P", "P", "Figure"]

def test_find_all_tree(self):
"""
Test find_all() and find() on trees
"""
path = os.path.join(HERE, "pdfs/image_structure.pdf")
pdf = pdfplumber.open(path)
stree = PDFStructTree(pdf, pdf.pages[0])
figs = list(stree.find_all("Figure"))
assert len(figs) == 1
fig = stree.find("Figure")
assert fig == figs[0]
assert stree.find("Fogure") is None
figs = list(stree.find_all(re.compile(r"Fig.*")))
assert len(figs) == 1
figs = list(stree.find_all(lambda x: x.type == "Figure"))
assert len(figs) == 1
figs = list(stree.find_all("Foogure"))
assert len(figs) == 0
figs = list(stree.find_all(re.compile(r"Fog.*")))
assert len(figs) == 0
figs = list(stree.find_all(lambda x: x.type == "Flogger"))
assert len(figs) == 0

def test_find_all_element(self):
"""
Test find_all() and find() on elements
"""
path = os.path.join(HERE, "pdfs/pdf_structure.pdf")
pdf = pdfplumber.open(path)
stree = PDFStructTree(pdf)
for list_elem in stree.find_all("L"):
items = list(list_elem.find_all("LI"))
assert items
for item in items:
body = list(item.find_all("LBody"))
assert body
body1 = item.find("LBody")
assert body1 == body[0]
assert item.find("Loonie") is None

def test_all_mcids(self):
"""
Test all_mcids()
"""
path = os.path.join(HERE, "pdfs/2023-06-20-PV.pdf")
pdf = pdfplumber.open(path)
# Make sure we can get them with page numbers
stree = PDFStructTree(pdf)
sect = next(stree.find_all("Sect"))
mcids = list(sect.all_mcids())
pages = set(page for page, mcid in mcids)
assert 1 in pages
assert 2 in pages
# If we take only a single page there are no page numbers
# (FIXME: may wish to reconsider this API decision...)
page = pdf.pages[1]
stree = PDFStructTree(pdf, page)
sect = next(stree.find_all("Sect"))
mcids = list(sect.all_mcids())
pages = set(page for page, mcid in mcids)
assert None in pages
assert 1 not in pages
assert 2 not in pages
# Assure that we get the MCIDs for a content element
for p in sect.find_all("P"):
assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids)

def test_element_bbox(self):
"""
Test various ways of getting element bboxes
"""
path = os.path.join(HERE, "pdfs/pdf_structure.pdf")
pdf = pdfplumber.open(path)
stree = PDFStructTree(pdf)
# As BBox attribute
table = next(stree.find_all("Table"))
assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 555.3, 542.25)
# With child elements
tr = next(table.find_all("TR"))
assert tuple(stree.element_bbox(tr)) == (56.8, 495.9, 328.312, 507.9)
# From a specific page it should also work
stree = PDFStructTree(pdf, pdf.pages[0])
table = next(stree.find_all("Table"))
assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 555.3, 542.25)
tr = next(table.find_all("TR"))
assert tuple(stree.element_bbox(tr)) == (56.8, 495.9, 328.312, 507.9)
# Yeah but what happens if you crop the page?
page = pdf.pages[0].crop((10, 400, 500, 500))
stree = PDFStructTree(pdf, page)
table = next(stree.find_all("Table"))
# The element gets cropped too
assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 500, 500)
# And if you crop it out of the page?
page = pdf.pages[0].crop((0, 0, 560, 400))
stree = PDFStructTree(pdf, page)
table = next(stree.find_all("Table"))
with self.assertRaises(IndexError):
_ = stree.element_bbox(table)


class TestUnparsed(unittest.TestCase):
"""Test handling of PDFs with unparsed pages."""
