From e4e67a9334cc0285af508bda9860eae0370bae39 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Wed, 27 Apr 2022 13:51:19 +0200 Subject: [PATCH] MAINT: Small refactoring after #788 --- PyPDF2/pdf.py | 110 +++++++++++++++++++++++-------------------- Tests/test_reader.py | 14 +++--- 2 files changed, 65 insertions(+), 59 deletions(-) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 24a4e64d1..9ff7ecbf6 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -1837,35 +1837,11 @@ def read(self, stream): if line[:9] != b_("startxref"): raise PdfReadError("startxref not found") - #check and eventually correct the startxref only in not strict - rebuildXrefTable = False - try: - stream.seek(startxref - 1,0) #-1 to check character before - line=stream.read(1) - if line not in b_("\r\n \t"): - raise PdfReadWarning("incorrect startxref pointer(1)",line) - line = stream.read(4) - if line != b_("xref"): - #not an xref so check if it is an XREF object - line = b_("") - while line in b_("0123456789 \t"): - line = stream.read(1) - if line == b_(""): - raise PdfReadWarning("incorrect startxref pointer(2)") - line += stream.read(2) #1 char already read, +2 to check "obj" - if line.lower() != b_("obj"): - raise PdfReadWarning("incorrect startxref pointer(3)") - while stream.read(1) in b_(" \t\r\n"): - pass; - line=stream.read(256) # check that it is xref obj - if b_("/xref") not in line.lower(): - raise PdfReadWarning("incorrect startxref pointer(4)") - except PdfReadWarning as e: - warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning) - if( not self.strict): - rebuildXrefTable = True - else: - raise + # check and eventually correct the startxref only in not strict + rebuildXrefTable = self.is_xref_broken(stream, startxref) + if self.strict and rebuildXrefTable: + raise PdfReadError("Broken xref table") + # read all cross reference tables and their trailers self.xref = {} self.xref_objStm = {} @@ -1952,28 +1928,7 @@ def read(self, stream): else: break elif rebuildXrefTable: - self.xref={} - stream.seek(0,0) - f_ = stream.read(-1) - import re - for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_): - idnum = int(m.group(1)) - generation = int(m.group(2)) - if generation not in self.xref: - self.xref[generation] = {} - self.xref[generation][idnum] = m.start(1) - trailerPos = f_.rfind(b"trailer") - len(f_) + 7 - stream.seek(trailerPos,2) - #code below duplicated - readNonWhitespace(stream) - stream.seek(-1, 1) - newTrailer = readObject(stream, self) - for key, value in list(newTrailer.items()): - if key not in self.trailer: - self.trailer[key] = value - #if "/Prev" in newTrailer: - # startxref = newTrailer["/Prev"] - #else: + self._rebuild_xref_table(stream) break elif x.isdigit(): # PDF 1.5+ Cross-Reference Stream @@ -2071,6 +2026,59 @@ def used_before(num, generation): # if not, then either it's just plain wrong, or the non-zero-index is actually correct stream.seek(loc, 0) # return to where it was + @staticmethod + def is_xref_broken(stream, startxref): + stream.seek(startxref - 1, 0) # -1 to check character before + line = stream.read(1) + if line not in b_("\r\n \t"): + warnings.warn("incorrect startxref pointer(1)", PdfReadWarning) + return True + line = stream.read(4) + if line != b_("xref"): + # not an xref so check if it is an XREF object + line = b_("") + while line in b_("0123456789 \t"): + line = stream.read(1) + if line == b_(""): + warnings.warn("incorrect startxref pointer(2)", PdfReadWarning) + return True + line += stream.read(2) # 1 char already read, +2 to check "obj" + if line.lower() != b_("obj"): + warnings.warn("incorrect startxref pointer(3)", PdfReadWarning) + return True + while stream.read(1) in b_(" \t\r\n"): + pass + line = stream.read(256) # check that it is xref obj + if b_("/xref") not in line.lower(): + warnings.warn("incorrect startxref pointer(4)", PdfReadWarning) + return True + return False + + def _rebuild_xref_table(self, stream): + self.xref = {} + stream.seek(0, 0) + f_ = stream.read(-1) + import re + + for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"), f_): + idnum = int(m.group(1)) + generation = int(m.group(2)) + if generation not in self.xref: + self.xref[generation] = {} + self.xref[generation][idnum] = m.start(1) + trailerPos = f_.rfind(b"trailer") - len(f_) + 7 + stream.seek(trailerPos, 2) + # code below duplicated + readNonWhitespace(stream) + stream.seek(-1, 1) + newTrailer = readObject(stream, self) + for key, value in list(newTrailer.items()): + if key not in self.trailer: + self.trailer[key] = value + # if "/Prev" in newTrailer: + # startxref = newTrailer["/Prev"] + # else: + def _read_xref_subsections(self, idx_pairs, getEntry, used_before): last_end = 0 for start, size in self._pairs(idx_pairs): diff --git a/Tests/test_reader.py b/Tests/test_reader.py index ec8521fef..4512e433c 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -229,14 +229,12 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail): ) pdf_stream = io.BytesIO(pdf_data) if should_fail: - with pytest.raises(Exception) as exc: + with pytest.raises(PdfReadError) as exc: PdfFileReader(pdf_stream, strict=strict) - if startx_correction != -1: - assert exc.type == PdfReadWarning - else: + assert exc.type == PdfReadError + if startx_correction == -1: assert ( - exc.type == PdfReadError - and exc.value.args[0] + exc.value.args[0] == "/Prev=0 in the trailer (try opening with strict=False)" ) else: @@ -245,10 +243,10 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail): def test_issue297(): path = os.path.join(RESOURCE_ROOT, "issue-297.pdf") - with pytest.raises(PdfReadWarning) as exc: + with pytest.raises(PdfReadError) as exc: reader = PdfFileReader(path, strict=True) reader.getPage(0) - assert "startxref" in exc.value.args[0] + assert "Broken xref table" in exc.value.args[0] reader = PdfFileReader(path, strict=False) reader.getPage(0)