Skip to content

Commit

Permalink
MAINT: Small refactoring after #788
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Apr 27, 2022
1 parent 904b0df commit e4e67a9
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 59 deletions.
110 changes: 59 additions & 51 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1837,35 +1837,11 @@ def read(self, stream):
if line[:9] != b_("startxref"):
raise PdfReadError("startxref not found")

#check and eventually correct the startxref only in not strict
rebuildXrefTable = False
try:
stream.seek(startxref - 1,0) #-1 to check character before
line=stream.read(1)
if line not in b_("\r\n \t"):
raise PdfReadWarning("incorrect startxref pointer(1)",line)
line = stream.read(4)
if line != b_("xref"):
#not an xref so check if it is an XREF object
line = b_("")
while line in b_("0123456789 \t"):
line = stream.read(1)
if line == b_(""):
raise PdfReadWarning("incorrect startxref pointer(2)")
line += stream.read(2) #1 char already read, +2 to check "obj"
if line.lower() != b_("obj"):
raise PdfReadWarning("incorrect startxref pointer(3)")
while stream.read(1) in b_(" \t\r\n"):
pass;
line=stream.read(256) # check that it is xref obj
if b_("/xref") not in line.lower():
raise PdfReadWarning("incorrect startxref pointer(4)")
except PdfReadWarning as e:
warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning)
if( not self.strict):
rebuildXrefTable = True
else:
raise
# check and eventually correct the startxref only in not strict
rebuildXrefTable = self.is_xref_broken(stream, startxref)
if self.strict and rebuildXrefTable:
raise PdfReadError("Broken xref table")

# read all cross reference tables and their trailers
self.xref = {}
self.xref_objStm = {}
Expand Down Expand Up @@ -1952,28 +1928,7 @@ def read(self, stream):
else:
break
elif rebuildXrefTable:
self.xref={}
stream.seek(0,0)
f_ = stream.read(-1)
import re
for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
idnum = int(m.group(1))
generation = int(m.group(2))
if generation not in self.xref:
self.xref[generation] = {}
self.xref[generation][idnum] = m.start(1)
trailerPos = f_.rfind(b"trailer") - len(f_) + 7
stream.seek(trailerPos,2)
#code below duplicated
readNonWhitespace(stream)
stream.seek(-1, 1)
newTrailer = readObject(stream, self)
for key, value in list(newTrailer.items()):
if key not in self.trailer:
self.trailer[key] = value
#if "/Prev" in newTrailer:
# startxref = newTrailer["/Prev"]
#else:
self._rebuild_xref_table(stream)
break
elif x.isdigit():
# PDF 1.5+ Cross-Reference Stream
Expand Down Expand Up @@ -2071,6 +2026,59 @@ def used_before(num, generation):
# if not, then either it's just plain wrong, or the non-zero-index is actually correct
stream.seek(loc, 0) # return to where it was

@staticmethod
def is_xref_broken(stream, startxref):
    """
    Check whether ``startxref`` points at something that looks like an
    xref section.

    The pointer is accepted when the byte before it is whitespace and the
    data at the offset is either the keyword ``xref`` or an indirect
    object header (digits/blanks followed by ``obj``) whose following
    bytes contain ``/xref`` (compared case-insensitively).

    Every failed check issues a ``PdfReadWarning`` with a numbered
    message before returning.

    :param stream: seekable binary stream holding the PDF. Its position
        is changed by this call.
    :param int startxref: byte offset taken from the ``startxref`` entry.
    :return: ``True`` if the pointer looks wrong, ``False`` otherwise.
    :rtype: bool
    """
    if startxref < 1:
        # There is no preceding byte to inspect, and seeking to a negative
        # offset would raise instead of reporting a broken pointer.
        warnings.warn("incorrect startxref pointer(0)", PdfReadWarning)
        return True

    stream.seek(startxref - 1, 0)  # -1 to check character before
    line = stream.read(1)
    if line not in b_("\r\n \t"):
        warnings.warn("incorrect startxref pointer(1)", PdfReadWarning)
        return True

    line = stream.read(4)
    if line == b_("xref"):
        return False

    # Not an xref table, so check whether it is an XREF object.
    # Skip the remainder of the "<id> <generation>" header.
    line = b_("")
    while line in b_("0123456789 \t"):
        line = stream.read(1)
        if line == b_(""):
            warnings.warn("incorrect startxref pointer(2)", PdfReadWarning)
            return True

    line += stream.read(2)  # 1 char already read, +2 to check "obj"
    if line.lower() != b_("obj"):
        warnings.warn("incorrect startxref pointer(3)", PdfReadWarning)
        return True

    # Skip whitespace after "obj"; the first non-whitespace byte is
    # consumed as well.  The explicit end-of-stream test is required:
    # read() returns an empty bytes object at EOF, and an empty bytes
    # object is contained in every bytes object, so a bare membership
    # test would loop forever.
    while True:
        char = stream.read(1)
        if not char or char not in b_(" \t\r\n"):
            break

    line = stream.read(256)  # check that it is xref obj
    if b_("/xref") not in line.lower():
        warnings.warn("incorrect startxref pointer(4)", PdfReadWarning)
        return True
    return False

def _rebuild_xref_table(self, stream):
    """
    Rebuild ``self.xref`` by scanning the whole file for object headers.

    Every ``<id> <generation> obj`` header that is preceded by whitespace
    is recorded as ``self.xref[generation][id] = offset of <id>``.
    Afterwards the object following the last ``trailer`` keyword is
    parsed and its entries are copied into ``self.trailer`` for all keys
    that are not present there yet.

    If the file contains no ``trailer`` keyword, a ``PdfReadWarning`` is
    issued and ``self.trailer`` is left untouched; the rebuilt xref table
    is kept.

    :param stream: seekable binary stream holding the PDF. It is read
        completely into memory.
    """
    import re

    self.xref = {}
    stream.seek(0, 0)
    f_ = stream.read(-1)

    for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"), f_):
        idnum = int(m.group(1))
        generation = int(m.group(2))
        # A later header with the same id/generation overwrites an earlier one.
        self.xref.setdefault(generation, {})[idnum] = m.start(1)

    keyword = b"trailer"
    keyword_pos = f_.rfind(keyword)
    if keyword_pos < 0:
        # rfind() signals "not found" with -1; using that value in offset
        # arithmetic would make the parser read unrelated bytes near the
        # start of the file as if they were the trailer.
        warnings.warn(
            "trailer keyword not found while rebuilding the xref table",
            PdfReadWarning,
        )
        return

    # The whole stream was read from offset 0, so an index into f_ is also
    # an absolute stream offset.  Continue right after the keyword.
    stream.seek(keyword_pos + len(keyword), 0)

    # NOTE: the original comment marks the trailer handling below as
    # duplicated code.
    readNonWhitespace(stream)
    stream.seek(-1, 1)  # step back onto the first non-whitespace byte
    new_trailer = readObject(stream, self)
    for key, value in new_trailer.items():
        if key not in self.trailer:
            self.trailer[key] = value

def _read_xref_subsections(self, idx_pairs, getEntry, used_before):
last_end = 0
for start, size in self._pairs(idx_pairs):
Expand Down
14 changes: 6 additions & 8 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,14 +229,12 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
)
pdf_stream = io.BytesIO(pdf_data)
if should_fail:
with pytest.raises(Exception) as exc:
with pytest.raises(PdfReadError) as exc:
PdfFileReader(pdf_stream, strict=strict)
if startx_correction != -1:
assert exc.type == PdfReadWarning
else:
assert exc.type == PdfReadError
if startx_correction == -1:
assert (
exc.type == PdfReadError
and exc.value.args[0]
exc.value.args[0]
== "/Prev=0 in the trailer (try opening with strict=False)"
)
else:
Expand All @@ -245,10 +243,10 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):

def test_issue297():
path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
with pytest.raises(PdfReadWarning) as exc:
with pytest.raises(PdfReadError) as exc:
reader = PdfFileReader(path, strict=True)
reader.getPage(0)
assert "startxref" in exc.value.args[0]
assert "Broken xref table" in exc.value.args[0]
reader = PdfFileReader(path, strict=False)
reader.getPage(0)

Expand Down

0 comments on commit e4e67a9

Please sign in to comment.