From 79345ed519723fb82c27db2375c88e231bc3137c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 28 Sep 2024 14:49:04 +0200 Subject: [PATCH] ROB: Repair PDF with invalid Root object (#2880) Closes #2875. --- pypdf/_doc_common.py | 5 +-- pypdf/_reader.py | 39 ++++++++++++++++---- pypdf/generic/_base.py | 3 +- tests/test_reader.py | 82 +++++++++++++++++++++++++++++++++++++----- 4 files changed, 111 insertions(+), 18 deletions(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 4ea436714..b496b5db3 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -1148,8 +1148,9 @@ def _flatten( # Fix issue 327: set flattened_pages attribute only for # decrypted file catalog = self.root_object - pages = catalog["/Pages"].get_object() # type: ignore - assert isinstance(pages, DictionaryObject) + pages = catalog.get("/Pages").get_object() # type: ignore + if not isinstance(pages, DictionaryObject): + raise PdfReadError("Invalid object in /Pages") self.flattened_pages = [] if PA.TYPE in pages: diff --git a/pypdf/_reader.py b/pypdf/_reader.py index d1515bb28..d55b03d10 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -127,6 +127,8 @@ def __init__( # map page indirect_reference number to page number self._page_id2num: Optional[Dict[Any, Any]] = None + self._validated_root: Optional[DictionaryObject] = None + self._initialize_stream(stream) self._override_encryption = False @@ -197,10 +199,35 @@ def close(self) -> None: @property def root_object(self) -> DictionaryObject: """Provide access to "/Root". Standardized with PdfWriter.""" - root = self.trailer[TK.ROOT] - if root is None: - raise PdfReadError('Cannot find "/Root" key in trailer') - return cast(DictionaryObject, root.get_object()) + if self._validated_root: + return self._validated_root + root = self.trailer.get(TK.ROOT) + if is_null_or_none(root): + logger_warning('Cannot find "/Root" key in trailer', __name__) + elif ( + cast(DictionaryObject, cast(PdfObject, root).get_object()).get("/Type") + == "/Catalog" + ): + self._validated_root = cast( + DictionaryObject, cast(PdfObject, root).get_object() + ) + else: + logger_warning("Invalid Root object in trailer", __name__) + if self._validated_root is None: + logger_warning('Searching object with "/Catalog" key', __name__) + nb = cast(int, self.trailer.get("/Size", 0)) + for i in range(nb): + try: + o = self.get_object(i + 1) + except Exception: # to be sure to capture all errors + o = None + if isinstance(o, DictionaryObject) and o.get("/Type") == "/Catalog": + self._validated_root = o + logger_warning(f"Root found at {o.indirect_reference!r}", __name__) + break + if self._validated_root is None: + raise PdfReadError("Cannot find Root object in pdf") + return self._validated_root @property def _info(self) -> Optional[DictionaryObject]: @@ -215,11 +242,11 @@ def _info(self) -> Optional[DictionaryObject]: return None else: info = info.get_object() - if info == None: # noqa: E711 + if not isinstance(info, DictionaryObject): raise PdfReadError( "Trailer not found or does not point to document information directory" ) - return cast(DictionaryObject, info) + return info @property def _ID(self) -> Optional[ArrayObject]: diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 77caa4736..40fd45568 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -879,5 +879,6 @@ def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject] True if x is None or NullObject. """ return x is None or ( - isinstance(x, PdfObject) and isinstance(x.get_object(), NullObject) + isinstance(x, PdfObject) + and (x.get_object() is None or isinstance(x.get_object(), NullObject)) ) diff --git a/tests/test_reader.py b/tests/test_reader.py index 30da20adb..66d5cdd65 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -137,14 +137,14 @@ def test_iss1943(): def test_broken_meta_data(pdf_path): with open(pdf_path, "rb") as f: reader = PdfReader(f) - with pytest.raises( - PdfReadError, - match=( - "Trailer not found or does not point to document " - "information directory" - ), - ): - reader.metadata + assert reader.metadata is None + + with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as f: + b = f.read(-1) + reader = PdfReader(BytesIO(b.replace(b"/Info 2 0 R", b"/Info 2 "))) + with pytest.raises(PdfReadError) as exc: + reader.metadata + assert "does not point to document information directory" in repr(exc) @pytest.mark.parametrize( @@ -621,7 +621,7 @@ def test_read_unknown_zero_pages(caplog): assert normalize_warnings(caplog.text) == warnings with pytest.raises(PdfReadError) as exc: len(reader.pages) - assert exc.value.args[0] == 'Cannot find "/Root" key in trailer' + assert exc.value.args[0] == "Invalid object in /Pages" def test_read_encrypted_without_decryption(): @@ -1712,3 +1712,67 @@ def test_unbalanced_brackets_in_dictionary_object(caplog): name = "iss2877.pdf" # reused reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert len(reader.pages) == 43 # note: /Count = 46 but 3 kids are None + + +@pytest.mark.enable_socket() +def test_repair_root(caplog): + """Cf #2877""" + url = "https://github.com/user-attachments/files/17162216/crash-6620e8b1abfe3da639b654595da859b87f985748.pdf" + name = "iss2875.pdf" + + b = get_data_from_url(url, name=name) + reader = PdfReader(BytesIO(b)) + assert len(reader.pages) == 1 + assert all( + msg in caplog.text + for msg in ( + "Invalid Root object", + 'Searching object with "/Catalog" key', + "Root found at IndirectObject(2, 0,", + ) + ) + + # no /Root Entry + reader = PdfReader(BytesIO(b.replace(b"/Root", b"/Roo "))) + caplog.clear() + assert len(reader.pages) == 1 + assert all( + msg in caplog.text + for msg in ( + 'Cannot find "/Root" key in trailer', + 'Searching object with "/Catalog" key', + "Root found at IndirectObject(2, 0,", + ) + ) + + # Invalid /Root Entry + caplog.clear() + reader = PdfReader( + BytesIO( + b.replace(b"/Root 1 0 R", b"/Root 2 0 R").replace(b"/Catalog", b"/Catalo ") + ) + ) + with pytest.raises(PdfReadError): + len(reader.pages) + assert all( + msg in caplog.text + for msg in ( + "Invalid Root object in trailer", + 'Searching object with "/Catalog" key', + ) + ) + + # Invalid /Root Entry + error in get_object + caplog.clear() + b = b.replace(b"/Root 1 0 R", b"/Root 2 0 R").replace(b"/Catalog", b"/Catalo ") + b = b[:5124] + b"A" + b[5125:] + reader = PdfReader(BytesIO(b)) + with pytest.raises(PdfReadError): + len(reader.pages) + assert all( + msg in caplog.text + for msg in ( + "Invalid Root object in trailer", + 'Searching object with "/Catalog" key', + ) + )