6565from .constants import FieldDictionaryAttributes as FA
6666from .constants import PageAttributes as PG
6767from .constants import PagesAttributes as PA
68- from .errors import (
69- PdfReadError ,
70- )
68+ from .errors import PdfReadError , PyPdfError
7169from .generic import (
7270 ArrayObject ,
7371 BooleanObject ,
@@ -254,6 +252,8 @@ class PdfDocCommon:
254252
255253 _encryption : Optional [Encryption ] = None
256254
255+ _readonly : bool = False
256+
257257 @property
258258 @abstractmethod
259259 def root_object (self ) -> DictionaryObject :
@@ -349,7 +349,7 @@ def get_num_pages(self) -> int:
349349 return self .root_object ["/Pages" ]["/Count" ] # type: ignore
350350 else :
351351 if self .flattened_pages is None :
352- self ._flatten ()
352+ self ._flatten (self . _readonly )
353353 assert self .flattened_pages is not None
354354 return len (self .flattened_pages )
355355
@@ -366,10 +366,49 @@ def get_page(self, page_number: int) -> PageObject:
366366 A :class:`PageObject<pypdf._page.PageObject>` instance.
367367 """
368368 if self .flattened_pages is None :
369- self ._flatten ()
369+ self ._flatten (self . _readonly )
370370 assert self .flattened_pages is not None , "hint for mypy"
371371 return self .flattened_pages [page_number ]
372372
373+ def _get_page_in_node (
374+ self ,
375+ page_number : int ,
376+ ) -> Tuple [DictionaryObject , int ]:
377+ """
378+ Retrieve the node whose /Kids directly contains the page, and the index of the page within that /Kids array.
379+ If page_number (0-based) is greater than or equal to the /Count of the top /Pages node, it returns the top node, -1.
380+ """
381+ top = cast (DictionaryObject , self .root_object ["/Pages" ])
382+
383+ def recursive_call (
384+ node : DictionaryObject , mi : int # mi: 0-based index of the first page at or under node
385+ ) -> Tuple [Optional [PdfObject ], int ]:
386+ ma = cast (int , node .get ("/Count" , 1 )) # number of pages under node; falls back to 1 when /Count is absent (/Page types)
387+ if node ["/Type" ] == "/Page" :
388+ if page_number == mi :
389+ return node , - 1
390+ # else
391+ return None , mi + 1
392+ if (page_number - mi ) >= ma : # page_number lies beyond the pages under this node
393+ if node == top :
394+ return top , - 1
395+ # else
396+ return None , mi + ma
397+ for idx , kid in enumerate (cast (ArrayObject , node ["/Kids" ])):
398+ kid = cast (DictionaryObject , kid .get_object ())
399+ n , i = recursive_call (kid , mi )
400+ if n is not None : # page has just been found ...
401+ if i < 0 : # ... just below: kid is the page itself, so report this node and the kid's index
402+ return node , idx
403+ # else: # ... at lower levels: pass the deeper (node, index) pair up unchanged
404+ return n , i
405+ mi = i
406+ raise PyPdfError ("Unexpectedly cannot find the node." )
407+
408+ node , idx = recursive_call (top , 0 )
409+ assert isinstance (node , DictionaryObject ), "mypy"
410+ return node , idx
411+
373412 @property
374413 def named_destinations (self ) -> Dict [str , Any ]:
375414 """
@@ -1082,10 +1121,20 @@ def page_mode(self) -> Optional[PagemodeType]:
10821121
10831122 def _flatten (
10841123 self ,
1124+ list_only : bool = False ,
10851125 pages : Union [None , DictionaryObject , PageObject ] = None ,
10861126 inherit : Optional [Dict [str , Any ]] = None ,
10871127 indirect_reference : Optional [IndirectObject ] = None ,
10881128 ) -> None :
1129+ """
1130+ Prepare the document pages to ease searching
1131+
1132+ Args:
1133+ list_only: Will only list the pages within flattened_pages, without copying each page dictionary into its PageObject.
1134+ pages:
1135+ inherit:
1136+ indirect_reference: Used recursively to flatten the /Pages object.
1137+ """
10891138 inheritable_page_attributes = (
10901139 NameObject (PG .RESOURCES ),
10911140 NameObject (PG .MEDIABOX ),
@@ -1122,7 +1171,7 @@ def _flatten(
11221171 if obj :
11231172 # damaged file may have invalid child in /Pages
11241173 try :
1125- self ._flatten (obj , inherit , ** addt )
1174+ self ._flatten (list_only , obj , inherit , ** addt )
11261175 except RecursionError :
11271176 raise PdfReadError (
11281177 "Maximum recursion depth reached during page flattening."
@@ -1134,7 +1183,8 @@ def _flatten(
11341183 if attr_in not in pages :
11351184 pages [attr_in ] = value
11361185 page_obj = PageObject (self , indirect_reference )
1137- page_obj .update (pages )
1186+ if not list_only :
1187+ page_obj .update (pages )
11381188
11391189 # TODO: Could flattened_pages be None at this point?
11401190 self .flattened_pages .append (page_obj ) # type: ignore
@@ -1158,7 +1208,7 @@ def remove_page(
11581208 or destinations to reference a detached page.
11591209 """
11601210 if self .flattened_pages is None :
1161- self ._flatten ()
1211+ self ._flatten (self . _readonly )
11621212 assert self .flattened_pages is not None
11631213 if isinstance (page , IndirectObject ):
11641214 p = page .get_object ()
0 commit comments