@@ -276,14 +276,22 @@ def test_get_images(src, expected_images):
276276 False ,
277277 0 ,
278278 False ,
279- ["startxref on same line as offset" , "incorrect startxref pointer(1)" ],
279+ [
280+ "startxref on same line as offset" ,
281+ "incorrect startxref pointer(1)" ,
282+ "parsing for Object Streams" ,
283+ ],
280284 ), # error on startxref, but not strict => xref rebuilt, no failure
281285 (
282286 False ,
283287 True ,
284288 0 ,
285289 False ,
286- ["startxref on same line as offset" , "incorrect startxref pointer(1)" ],
290+ [
291+ "startxref on same line as offset" ,
292+ "incorrect startxref pointer(1)" ,
293+ "parsing for Object Streams" ,
294+ ],
287295 ),
288296 ],
289297)
@@ -344,7 +352,10 @@ def test_issue297(caplog):
344352 assert caplog .text == ""
345353 assert "Broken xref table" in exc .value .args [0 ]
346354 reader = PdfReader (path , strict = False )
347- assert normalize_warnings (caplog .text ) == ["incorrect startxref pointer(1)" ]
355+ assert normalize_warnings (caplog .text ) == [
356+ "incorrect startxref pointer(1)" ,
357+ "parsing for Object Streams" ,
358+ ]
348359 reader .pages [0 ]
349360
350361
@@ -898,23 +909,28 @@ def test_form_topname_with_and_without_acroform(caplog):
898909def test_extract_text_xref_issue_2 (caplog ):
899910 # pdf/0264cf510015b2a4b395a15cb23c001e.pdf
900911 url = "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf"
901- msg = "incorrect startxref pointer(2)"
912+ msg = [
913+ "incorrect startxref pointer(2)" ,
914+ "parsing for Object Streams" ,
915+ ]
902916 reader = PdfReader (BytesIO (get_data_from_url (url , name = "tika-981961.pdf" )))
903917 for page in reader .pages :
904918 page .extract_text ()
905- assert normalize_warnings (caplog .text ) == [ msg ]
919+ assert normalize_warnings (caplog .text ) == msg
906920
907921
908922@pytest .mark .enable_socket ()
909923@pytest .mark .slow ()
910924def test_extract_text_xref_issue_3 (caplog ):
911925 # pdf/0264cf510015b2a4b395a15cb23c001e.pdf
912926 url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf"
913- msg = "incorrect startxref pointer(3)"
927+ msg = [
928+ "incorrect startxref pointer(3)" ,
929+ ]
914930 reader = PdfReader (BytesIO (get_data_from_url (url , name = "tika-977774.pdf" )))
915931 for page in reader .pages :
916932 page .extract_text ()
917- assert normalize_warnings (caplog .text ) == [ msg ]
933+ assert normalize_warnings (caplog .text ) == msg
918934
919935
920936@pytest .mark .enable_socket ()
@@ -1589,3 +1605,15 @@ def test_iss2761():
15891605 reader = PdfReader (BytesIO (get_data_from_url (url , name = name )), strict = False )
15901606 with pytest .raises (PdfReadError ):
15911607 reader .pages [0 ].extract_text ()
1608+
1609+
1610+ @pytest .mark .enable_socket ()
1611+ def test_iss2817 ():
1612+ """Test for rebuilding Xref_ObjStm"""
1613+ url = "https://github.com/user-attachments/files/16764070/crash-7e1356f1179b4198337f282304cb611aea26a199.pdf"
1614+ name = "iss2817.pdf"
1615+ reader = PdfReader (BytesIO (get_data_from_url (url , name = name )))
1616+ assert (
1617+ reader .pages [0 ]["/Annots" ][0 ].get_object ()["/Contents" ]
1618+ == "A\xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 B"
1619+ )
0 commit comments