Skip to content

Commit 98d4425

Browse files
authored
ENH: Add incremental capability to PdfWriter (#2811)
Closes #2780.
1 parent b85c171 commit 98d4425

File tree

13 files changed

+613
-93
lines changed

13 files changed

+613
-93
lines changed

pypdf/_doc_common.py

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,7 @@
6565
from .constants import FieldDictionaryAttributes as FA
6666
from .constants import PageAttributes as PG
6767
from .constants import PagesAttributes as PA
68-
from .errors import (
69-
PdfReadError,
70-
)
68+
from .errors import PdfReadError, PyPdfError
7169
from .generic import (
7270
ArrayObject,
7371
BooleanObject,
@@ -254,6 +252,8 @@ class PdfDocCommon:
254252

255253
_encryption: Optional[Encryption] = None
256254

255+
_readonly: bool = False
256+
257257
@property
258258
@abstractmethod
259259
def root_object(self) -> DictionaryObject:
@@ -349,7 +349,7 @@ def get_num_pages(self) -> int:
349349
return self.root_object["/Pages"]["/Count"] # type: ignore
350350
else:
351351
if self.flattened_pages is None:
352-
self._flatten()
352+
self._flatten(self._readonly)
353353
assert self.flattened_pages is not None
354354
return len(self.flattened_pages)
355355

@@ -366,10 +366,49 @@ def get_page(self, page_number: int) -> PageObject:
366366
A :class:`PageObject<pypdf._page.PageObject>` instance.
367367
"""
368368
if self.flattened_pages is None:
369-
self._flatten()
369+
self._flatten(self._readonly)
370370
assert self.flattened_pages is not None, "hint for mypy"
371371
return self.flattened_pages[page_number]
372372

373+
def _get_page_in_node(
    self,
    page_number: int,
) -> Tuple[DictionaryObject, int]:
    """
    Locate the page tree node whose /Kids array directly holds a page.

    The tree is walked depth-first starting at the document's /Pages
    entry; the /Count of each intermediate node is used to skip subtrees
    that cannot contain the requested page.

    Args:
        page_number: Zero-based index of the page to locate.

    Returns:
        A tuple ``(node, index)`` such that ``node["/Kids"][index]`` is the
        requested page. If ``page_number`` is not below the /Count of the
        top node, the top node and -1 are returned instead.

    Raises:
        PyPdfError: If the /Kids of a node are exhausted without the page
            being found.
    """
    root = cast(DictionaryObject, self.root_object["/Pages"])

    def search(
        current: DictionaryObject, first_index: int
    ) -> Tuple[Optional[PdfObject], int]:
        # Result is (match, value):
        #   match is None -> value is the index of the first page located
        #                    after ``current``
        #   value < 0     -> match is the page itself (or the top node when
        #                    the page number is out of range)
        #   otherwise     -> match is the parent node, value the position
        #                    of the page within its /Kids
        span = cast(int, current.get("/Count", 1))  # default 1 for /Page types
        if current["/Type"] == "/Page":
            is_wanted = page_number == first_index
            return (current, -1) if is_wanted else (None, first_index + 1)
        if (page_number - first_index) >= span:
            # The page is not within the nodes below this one.
            return (root, -1) if current == root else (None, first_index + span)
        next_index = first_index
        for position, child_ref in enumerate(cast(ArrayObject, current["/Kids"])):
            child = cast(DictionaryObject, child_ref.get_object())
            match, value = search(child, next_index)
            if match is None:
                # Not in this child: continue with the following sibling.
                next_index = value
                continue
            if value < 0:
                # The child itself is the page, so this node is its parent.
                return current, position
            # Found deeper down; pass the result up unchanged.
            return match, value
        raise PyPdfError("Unexpectedly cannot find the node.")

    found_node, found_position = search(root, 0)
    assert isinstance(found_node, DictionaryObject), "mypy"
    return found_node, found_position
411+
373412
@property
374413
def named_destinations(self) -> Dict[str, Any]:
375414
"""
@@ -1082,10 +1121,20 @@ def page_mode(self) -> Optional[PagemodeType]:
10821121

10831122
def _flatten(
10841123
self,
1124+
list_only: bool = False,
10851125
pages: Union[None, DictionaryObject, PageObject] = None,
10861126
inherit: Optional[Dict[str, Any]] = None,
10871127
indirect_reference: Optional[IndirectObject] = None,
10881128
) -> None:
1129+
"""
1130+
Prepare the document pages to ease searching
1131+
1132+
Args:
1133+
list_only: Only list the pages in flattened_pages, without updating each page object from its dictionary.
1134+
pages:
1135+
inherit:
1136+
indirect_reference: Used recursively to flatten the /Pages object.
1137+
"""
10891138
inheritable_page_attributes = (
10901139
NameObject(PG.RESOURCES),
10911140
NameObject(PG.MEDIABOX),
@@ -1122,7 +1171,7 @@ def _flatten(
11221171
if obj:
11231172
# damaged file may have invalid child in /Pages
11241173
try:
1125-
self._flatten(obj, inherit, **addt)
1174+
self._flatten(list_only, obj, inherit, **addt)
11261175
except RecursionError:
11271176
raise PdfReadError(
11281177
"Maximum recursion depth reached during page flattening."
@@ -1134,7 +1183,8 @@ def _flatten(
11341183
if attr_in not in pages:
11351184
pages[attr_in] = value
11361185
page_obj = PageObject(self, indirect_reference)
1137-
page_obj.update(pages)
1186+
if not list_only:
1187+
page_obj.update(pages)
11381188

11391189
# TODO: Could flattened_pages be None at this point?
11401190
self.flattened_pages.append(page_obj) # type: ignore
@@ -1158,7 +1208,7 @@ def remove_page(
11581208
or destinations to reference a detached page.
11591209
"""
11601210
if self.flattened_pages is None:
1161-
self._flatten()
1211+
self._flatten(self._readonly)
11621212
assert self.flattened_pages is not None
11631213
if isinstance(page, IndirectObject):
11641214
p = page.get_object()

pypdf/_page.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,22 @@ def __init__(
492492
self.inline_images: Optional[Dict[str, ImageFile]] = None
493493
# below Union for mypy but actually Optional[List[str]]
494494
self.indirect_reference = indirect_reference
495+
if indirect_reference is not None:
496+
self.update(cast(DictionaryObject, indirect_reference.get_object()))
497+
498+
def hash_bin(self) -> int:
    """
    Compute a hash used to detect whether the object has been modified.

    Note: the hash is tagged with ``DictionaryObject`` rather than with the
    class of this object; this method is overridden so that it returns the
    same result as a DictionaryObject.

    Returns:
        Hash built from the type tag and the ``hash_bin()`` of every value,
        paired with its key.
    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))
495511

496512
def hash_value_data(self) -> bytes:
497513
data = super().hash_value_data()
@@ -2399,27 +2415,33 @@ def __delitem__(self, index: Union[int, slice]) -> None:
23992415
raise IndexError("index out of range")
24002416
ind = self[index].indirect_reference
24012417
assert ind is not None
2402-
parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None)
2418+
parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
2419+
"/Parent", None
2420+
)
2421+
first = True
24032422
while parent is not None:
24042423
parent = cast(DictionaryObject, parent.get_object())
24052424
try:
2406-
i = parent["/Kids"].index(ind)
2407-
del parent["/Kids"][i]
2425+
i = cast(ArrayObject, parent["/Kids"]).index(ind)
2426+
del cast(ArrayObject, parent["/Kids"])[i]
2427+
first = False
24082428
try:
24092429
assert ind is not None
24102430
del ind.pdf.flattened_pages[index] # case of page in a Reader
24112431
except Exception: # pragma: no cover
24122432
pass
24132433
if "/Count" in parent:
2414-
parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1)
2415-
if len(parent["/Kids"]) == 0:
2434+
parent[NameObject("/Count")] = NumberObject(
2435+
cast(int, parent["/Count"]) - 1
2436+
)
2437+
if len(cast(ArrayObject, parent["/Kids"])) == 0:
24162438
# No more objects in this part of this sub tree
24172439
ind = parent.indirect_reference
2418-
parent = cast(DictionaryObject, parent.get("/Parent", None))
2419-
else:
2420-
parent = None
2440+
parent = parent.get("/Parent", None)
24212441
except ValueError: # from index
2422-
raise PdfReadError(f"Page Not Found in Page Tree {ind}")
2442+
if first:
2443+
raise PdfReadError(f"Page not found in page tree: {ind}")
2444+
break
24232445

24242446
def __iter__(self) -> Iterator[PageObject]:
24252447
for i in range(len(self)):

pypdf/_protocols.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
7474
_objects: List[Any]
7575
_id_translated: Dict[int, Dict[int, int]]
7676

77+
incremental: bool
78+
_reader: Any # PdfReader
79+
7780
@abstractmethod
7881
def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
7982
... # pragma: no cover

pypdf/_reader.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def __init__(
136136
with open(stream, "rb") as fh:
137137
stream = BytesIO(fh.read())
138138
self._stream_opened = True
139+
self._startxref: int = 0
139140
self.read(stream)
140141
self.stream = stream
141142

@@ -563,6 +564,7 @@ def read(self, stream: StreamType) -> None:
563564
self._basic_validation(stream)
564565
self._find_eof_marker(stream)
565566
startxref = self._find_startxref_pos(stream)
567+
self._startxref = startxref
566568

567569
# check and eventually correct the startxref only in not strict
568570
xref_issue_nr = self._get_xref_issues(stream, startxref)

0 commit comments

Comments
 (0)