Skip to content

Commit fee8ee8

Browse files
committed
Merge remote-tracking branch 'py-pdf/main' into Merger
2 parents 938fc4a + c4e95bd commit fee8ee8

File tree

5 files changed

+75
-17
lines changed

5 files changed

+75
-17
lines changed

pypdf/_reader.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
NullObject,
7878
NumberObject,
7979
PdfObject,
80+
StreamObject,
8081
TextStringObject,
8182
read_object,
8283
)
@@ -316,8 +317,6 @@ def _get_object_from_stream(
316317
obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore
317318
# This is an xref to a stream, so its type better be a stream
318319
assert cast(str, obj_stm["/Type"]) == "/ObjStm"
319-
# /N is the number of indirect objects in the stream
320-
assert idx < obj_stm["/N"]
321320
stream_data = BytesIO(obj_stm.get_data())
322321
for i in range(obj_stm["/N"]): # type: ignore
323322
read_non_whitespace(stream_data)
@@ -999,6 +998,41 @@ def _rebuild_xref_table(self, stream: StreamType) -> None:
999998
if generation not in self.xref:
1000999
self.xref[generation] = {}
10011000
self.xref[generation][idnum] = m.start(1)
1001+
1002+
logger_warning("parsing for Object Streams", __name__)
1003+
for g in self.xref:
1004+
for i in self.xref[g]:
1005+
# manual equivalent of get_object
1006+
stream.seek(self.xref[g][i], 0)
1007+
try:
1008+
_ = self.read_object_header(stream)
1009+
o = cast(StreamObject, read_object(stream, self))
1010+
if o.get("/Type", "") != "/ObjStm":
1011+
continue
1012+
strm = BytesIO(o.get_data())
1013+
cpt = 0
1014+
while True:
1015+
s = read_until_whitespace(strm)
1016+
if not s.isdigit():
1017+
break
1018+
_i = int(s)
1019+
skip_over_whitespace(strm)
1020+
strm.seek(-1, 1)
1021+
s = read_until_whitespace(strm)
1022+
if not s.isdigit(): # pragma: no cover
1023+
break # pragma: no cover
1024+
_o = int(s)
1025+
self.xref_objStm[_i] = (i, _o)
1026+
cpt += 1
1027+
if cpt != o.get("/N"): # pragma: no cover
1028+
logger_warning( # pragma: no cover
1029+
f"found {cpt} objects within Object({i},{g})"
1030+
f" whereas {o.get('/N')} expected",
1031+
__name__,
1032+
)
1033+
except Exception: # could have many causes
1034+
pass
1035+
10021036
stream.seek(0, 0)
10031037
for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_):
10041038
stream.seek(m.start(1), 0)

pypdf/annotations/_non_markup_annotations.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def __init__(
3636
if is_external and is_internal:
3737
raise ValueError(
3838
"Either 'url' or 'target_page_index' have to be provided. "
39-
f"url={url}, target_page_index={target_page_index}"
39+
f"{url=}, {target_page_index=}"
4040
)
4141

4242
border_arr: BorderArrayType

pypdf/generic/_rectangle.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def __init__(
2626
ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr]) # type: ignore
2727

2828
def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]:
29-
if not isinstance(value, (NumberObject, FloatObject)):
29+
if not isinstance(value, (FloatObject, NumberObject)):
3030
value = FloatObject(value)
3131
return value
3232

tests/test_filters.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from io import BytesIO
66
from itertools import product as cartesian_product
77
from pathlib import Path
8-
from unittest.mock import patch
98

109
import pytest
1110
from PIL import Image
@@ -225,14 +224,11 @@ def test_ccitt_fax_decode():
225224

226225

227226
@pytest.mark.enable_socket()
228-
@patch("pypdf._reader.logger_warning")
229-
def test_decompress_zlib_error(mock_logger_warning):
227+
def test_decompress_zlib_error(caplog):
230228
reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf")))
231229
for page in reader.pages:
232230
page.extract_text()
233-
mock_logger_warning.assert_called_with(
234-
"incorrect startxref pointer(3)", "pypdf._reader"
235-
)
231+
assert "incorrect startxref pointer(3)" in caplog.text
236232

237233

238234
@pytest.mark.enable_socket()

tests/test_reader.py

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -276,14 +276,22 @@ def test_get_images(src, expected_images):
276276
False,
277277
0,
278278
False,
279-
["startxref on same line as offset", "incorrect startxref pointer(1)"],
279+
[
280+
"startxref on same line as offset",
281+
"incorrect startxref pointer(1)",
282+
"parsing for Object Streams",
283+
],
280284
), # error on startxref, but not strict => xref rebuilt, no failure
281285
(
282286
False,
283287
True,
284288
0,
285289
False,
286-
["startxref on same line as offset", "incorrect startxref pointer(1)"],
290+
[
291+
"startxref on same line as offset",
292+
"incorrect startxref pointer(1)",
293+
"parsing for Object Streams",
294+
],
287295
),
288296
],
289297
)
@@ -344,7 +352,10 @@ def test_issue297(caplog):
344352
assert caplog.text == ""
345353
assert "Broken xref table" in exc.value.args[0]
346354
reader = PdfReader(path, strict=False)
347-
assert normalize_warnings(caplog.text) == ["incorrect startxref pointer(1)"]
355+
assert normalize_warnings(caplog.text) == [
356+
"incorrect startxref pointer(1)",
357+
"parsing for Object Streams",
358+
]
348359
reader.pages[0]
349360

350361

@@ -898,23 +909,28 @@ def test_form_topname_with_and_without_acroform(caplog):
898909
def test_extract_text_xref_issue_2(caplog):
899910
# pdf/0264cf510015b2a4b395a15cb23c001e.pdf
900911
url = "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf"
901-
msg = "incorrect startxref pointer(2)"
912+
msg = [
913+
"incorrect startxref pointer(2)",
914+
"parsing for Object Streams",
915+
]
902916
reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-981961.pdf")))
903917
for page in reader.pages:
904918
page.extract_text()
905-
assert normalize_warnings(caplog.text) == [msg]
919+
assert normalize_warnings(caplog.text) == msg
906920

907921

908922
@pytest.mark.enable_socket()
909923
@pytest.mark.slow()
910924
def test_extract_text_xref_issue_3(caplog):
911925
# pdf/0264cf510015b2a4b395a15cb23c001e.pdf
912926
url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf"
913-
msg = "incorrect startxref pointer(3)"
927+
msg = [
928+
"incorrect startxref pointer(3)",
929+
]
914930
reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-977774.pdf")))
915931
for page in reader.pages:
916932
page.extract_text()
917-
assert normalize_warnings(caplog.text) == [msg]
933+
assert normalize_warnings(caplog.text) == msg
918934

919935

920936
@pytest.mark.enable_socket()
@@ -1589,3 +1605,15 @@ def test_iss2761():
15891605
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False)
15901606
with pytest.raises(PdfReadError):
15911607
reader.pages[0].extract_text()
1608+
1609+
1610+
@pytest.mark.enable_socket()
1611+
def test_iss2817():
1612+
"""Test for rebuilding Xref_ObjStm"""
1613+
url = "https://github.com/user-attachments/files/16764070/crash-7e1356f1179b4198337f282304cb611aea26a199.pdf"
1614+
name = "iss2817.pdf"
1615+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
1616+
assert (
1617+
reader.pages[0]["/Annots"][0].get_object()["/Contents"]
1618+
== "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
1619+
)

0 commit comments

Comments
 (0)