Skip to content

Commit 9d54f63

Browse files
authored
ENH: Robustify parsing for Object streams in XRef rebuilding (#2818)
Closes #2817.
1 parent 98d4425 commit 9d54f63

File tree

3 files changed

+73
-15
lines changed

3 files changed

+73
-15
lines changed

pypdf/_reader.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
NullObject,
7878
NumberObject,
7979
PdfObject,
80+
StreamObject,
8081
TextStringObject,
8182
read_object,
8283
)
@@ -316,8 +317,6 @@ def _get_object_from_stream(
316317
obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore
317318
# This is an xref to a stream, so its type better be a stream
318319
assert cast(str, obj_stm["/Type"]) == "/ObjStm"
319-
# /N is the number of indirect objects in the stream
320-
assert idx < obj_stm["/N"]
321320
stream_data = BytesIO(obj_stm.get_data())
322321
for i in range(obj_stm["/N"]): # type: ignore
323322
read_non_whitespace(stream_data)
@@ -999,6 +998,41 @@ def _rebuild_xref_table(self, stream: StreamType) -> None:
999998
if generation not in self.xref:
1000999
self.xref[generation] = {}
10011000
self.xref[generation][idnum] = m.start(1)
1001+
1002+
logger_warning("parsing for Object Streams", __name__)
1003+
for g in self.xref:
1004+
for i in self.xref[g]:
1005+
# manual equivalent of get_object: seek to the recorded offset and parse the object directly
1006+
stream.seek(self.xref[g][i], 0)
1007+
try:
1008+
_ = self.read_object_header(stream)
1009+
o = cast(StreamObject, read_object(stream, self))
1010+
if o.get("/Type", "") != "/ObjStm":
1011+
continue
1012+
strm = BytesIO(o.get_data())
1013+
cpt = 0
1014+
while True:
1015+
s = read_until_whitespace(strm)
1016+
if not s.isdigit():
1017+
break
1018+
_i = int(s)
1019+
skip_over_whitespace(strm)
1020+
strm.seek(-1, 1)
1021+
s = read_until_whitespace(strm)
1022+
if not s.isdigit(): # pragma: no cover
1023+
break # pragma: no cover
1024+
_o = int(s)
1025+
self.xref_objStm[_i] = (i, _o)
1026+
cpt += 1
1027+
if cpt != o.get("/N"): # pragma: no cover
1028+
logger_warning( # pragma: no cover
1029+
f"found {cpt} objects within Object({i},{g})"
1030+
f" whereas {o.get('/N')} expected",
1031+
__name__,
1032+
)
1033+
except Exception: # parsing can fail for many reasons; skip this object
1034+
pass
1035+
10021036
stream.seek(0, 0)
10031037
for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_):
10041038
stream.seek(m.start(1), 0)

tests/test_filters.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from io import BytesIO
66
from itertools import product as cartesian_product
77
from pathlib import Path
8-
from unittest.mock import patch
98

109
import pytest
1110
from PIL import Image
@@ -225,14 +224,11 @@ def test_ccitt_fax_decode():
225224

226225

227226
@pytest.mark.enable_socket()
228-
@patch("pypdf._reader.logger_warning")
229-
def test_decompress_zlib_error(mock_logger_warning):
227+
def test_decompress_zlib_error(caplog):
230228
reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf")))
231229
for page in reader.pages:
232230
page.extract_text()
233-
mock_logger_warning.assert_called_with(
234-
"incorrect startxref pointer(3)", "pypdf._reader"
235-
)
231+
assert "incorrect startxref pointer(3)" in caplog.text
236232

237233

238234
@pytest.mark.enable_socket()

tests/test_reader.py

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -276,14 +276,22 @@ def test_get_images(src, expected_images):
276276
False,
277277
0,
278278
False,
279-
["startxref on same line as offset", "incorrect startxref pointer(1)"],
279+
[
280+
"startxref on same line as offset",
281+
"incorrect startxref pointer(1)",
282+
"parsing for Object Streams",
283+
],
280284
), # error on startxref, but not strict => xref rebuilt, no failure
281285
(
282286
False,
283287
True,
284288
0,
285289
False,
286-
["startxref on same line as offset", "incorrect startxref pointer(1)"],
290+
[
291+
"startxref on same line as offset",
292+
"incorrect startxref pointer(1)",
293+
"parsing for Object Streams",
294+
],
287295
),
288296
],
289297
)
@@ -344,7 +352,10 @@ def test_issue297(caplog):
344352
assert caplog.text == ""
345353
assert "Broken xref table" in exc.value.args[0]
346354
reader = PdfReader(path, strict=False)
347-
assert normalize_warnings(caplog.text) == ["incorrect startxref pointer(1)"]
355+
assert normalize_warnings(caplog.text) == [
356+
"incorrect startxref pointer(1)",
357+
"parsing for Object Streams",
358+
]
348359
reader.pages[0]
349360

350361

@@ -898,23 +909,28 @@ def test_form_topname_with_and_without_acroform(caplog):
898909
def test_extract_text_xref_issue_2(caplog):
899910
# pdf/0264cf510015b2a4b395a15cb23c001e.pdf
900911
url = "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf"
901-
msg = "incorrect startxref pointer(2)"
912+
msg = [
913+
"incorrect startxref pointer(2)",
914+
"parsing for Object Streams",
915+
]
902916
reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-981961.pdf")))
903917
for page in reader.pages:
904918
page.extract_text()
905-
assert normalize_warnings(caplog.text) == [msg]
919+
assert normalize_warnings(caplog.text) == msg
906920

907921

908922
@pytest.mark.enable_socket()
909923
@pytest.mark.slow()
910924
def test_extract_text_xref_issue_3(caplog):
911925
# pdf/0264cf510015b2a4b395a15cb23c001e.pdf
912926
url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf"
913-
msg = "incorrect startxref pointer(3)"
927+
msg = [
928+
"incorrect startxref pointer(3)",
929+
]
914930
reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-977774.pdf")))
915931
for page in reader.pages:
916932
page.extract_text()
917-
assert normalize_warnings(caplog.text) == [msg]
933+
assert normalize_warnings(caplog.text) == msg
918934

919935

920936
@pytest.mark.enable_socket()
@@ -1589,3 +1605,15 @@ def test_iss2761():
15891605
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False)
15901606
with pytest.raises(PdfReadError):
15911607
reader.pages[0].extract_text()
1608+
1609+
1610+
@pytest.mark.enable_socket()
1611+
def test_iss2817():
1612+
"""Test for rebuilding Xref_ObjStm"""
1613+
url = "https://github.com/user-attachments/files/16764070/crash-7e1356f1179b4198337f282304cb611aea26a199.pdf"
1614+
name = "iss2817.pdf"
1615+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
1616+
assert (
1617+
reader.pages[0]["/Annots"][0].get_object()["/Contents"]
1618+
== "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
1619+
)

0 commit comments

Comments
 (0)