From 81a9987f5385823951eaa023c2839822a8fffe1c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 5 Jun 2022 16:52:04 +0200 Subject: [PATCH] ROB: Fix some conversion errors on non-conforming PDFs (#932) Issue: Due to an error in the stream, the offset pointed to the \n before the object, so a ValueError (invalid literal for int() with base 10: b'') was raised. Fix: Move to the next non-whitespace character. Closes #925 --- PyPDF2/_reader.py | 2 ++ tests/test_reader.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 8c9661393..e82d8b0e3 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -980,6 +980,8 @@ def _get_object_from_stream( if self.strict and idx != i: raise PdfReadError("Object is in wrong index.") stream_data.seek(int(obj_stm["/First"] + offset), 0) # type: ignore + read_non_whitespace(stream_data) # to cope with some case where the 'pointer' is on a white space + stream_data.seek(-1, 1) try: obj = read_object(stream_data, self) except PdfStreamError as exc: diff --git a/tests/test_reader.py b/tests/test_reader.py index d84a9fdd1..397331918 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -2,6 +2,7 @@ import os import time from io import BytesIO +import urllib.request import pytest @@ -666,3 +667,16 @@ def test_convertToInt_deprecated(): match=msg, ): assert convertToInt(b"\x01", 8) == 1 + + +def test_iss925(): + reader = PdfReader(BytesIO(urllib.request.urlopen( + "https://github.com/py-pdf/PyPDF2/files/8796328/1.pdf").read())) + + for page_sliced in reader.pages: + page_object = page_sliced.get_object() + # Extracts the PDF's Annots (Annotations and Commenting): + annots = page_object.get("/Annots") + if annots is not None: + for annot in annots: + annot.get_object()