From e4e67a9334cc0285af508bda9860eae0370bae39 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Wed, 27 Apr 2022 13:51:19 +0200
Subject: [PATCH] MAINT: Small refactoring after #788

---
 PyPDF2/pdf.py        | 110 +++++++++++++++++++++++--------------------
 Tests/test_reader.py |  14 +++---
 2 files changed, 65 insertions(+), 59 deletions(-)

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
index 24a4e64d1..9ff7ecbf6 100644
--- a/PyPDF2/pdf.py
+++ b/PyPDF2/pdf.py
@@ -1837,35 +1837,11 @@ def read(self, stream):
             if line[:9] != b_("startxref"):
                 raise PdfReadError("startxref not found")
 
-        #check and eventually correct the startxref only in not strict
-        rebuildXrefTable = False
-        try:
-            stream.seek(startxref - 1,0) #-1 to check character before
-            line=stream.read(1)
-            if line not in b_("\r\n \t"):
-                raise PdfReadWarning("incorrect startxref pointer(1)",line)
-            line = stream.read(4)
-            if line != b_("xref"):
-                #not an xref so check if it is an XREF object
-                line = b_("")
-                while line in b_("0123456789 \t"):
-                    line = stream.read(1)
-                    if line == b_(""):
-                        raise PdfReadWarning("incorrect startxref pointer(2)")
-                line += stream.read(2)   #1 char already read, +2 to check "obj"
-                if line.lower() != b_("obj"):
-                    raise PdfReadWarning("incorrect startxref pointer(3)")
-                while stream.read(1) in b_(" \t\r\n"):
-                    pass;
-                line=stream.read(256) # check that it is xref obj
-                if b_("/xref") not in line.lower():
-                    raise PdfReadWarning("incorrect startxref pointer(4)")
-        except PdfReadWarning as e:
-            warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning)
-            if( not self.strict):
-                rebuildXrefTable = True
-            else:
-                raise
+        # check the startxref pointer; the xref table is rebuilt only when not strict
+        rebuildXrefTable = self.is_xref_broken(stream, startxref)
+        if self.strict and rebuildXrefTable:
+            raise PdfReadError("Broken xref table")
+
         # read all cross reference tables and their trailers
         self.xref = {}
         self.xref_objStm = {}
@@ -1952,28 +1928,7 @@ def read(self, stream):
                 else:
                     break
             elif rebuildXrefTable:
-                self.xref={}
-                stream.seek(0,0)
-                f_ = stream.read(-1)
-                import re
-                for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
-                    idnum = int(m.group(1))
-                    generation = int(m.group(2))
-                    if generation not in self.xref:
-                        self.xref[generation] = {}
-                    self.xref[generation][idnum] = m.start(1)
-                trailerPos = f_.rfind(b"trailer") - len(f_) + 7
-                stream.seek(trailerPos,2)
-                #code below duplicated
-                readNonWhitespace(stream)
-                stream.seek(-1, 1)
-                newTrailer = readObject(stream, self)
-                for key, value in list(newTrailer.items()):
-                    if key not in self.trailer:
-                        self.trailer[key] = value
-                #if "/Prev" in newTrailer:
-                #    startxref = newTrailer["/Prev"]
-                #else:
+                self._rebuild_xref_table(stream)
                 break
             elif x.isdigit():
                 # PDF 1.5+ Cross-Reference Stream
@@ -2071,6 +2026,59 @@ def used_before(num, generation):
                     # if not, then either it's just plain wrong, or the non-zero-index is actually correct
             stream.seek(loc, 0) # return to where it was
 
+    @staticmethod
+    def is_xref_broken(stream, startxref):
+        stream.seek(startxref - 1, 0)  # step back one byte to check the character before startxref
+        line = stream.read(1)
+        if line not in b_("\r\n \t"):
+            warnings.warn("incorrect startxref pointer(1)", PdfReadWarning)
+            return True
+        line = stream.read(4)
+        if line != b_("xref"):
+            # not an xref table, so check whether it is an xref object (PDF 1.5+ cross-reference stream)
+            line = b_("")
+            while line in b_("0123456789 \t"):
+                line = stream.read(1)
+                if line == b_(""):
+                    warnings.warn("incorrect startxref pointer(2)", PdfReadWarning)
+                    return True
+            line += stream.read(2)  # 1 char already read; read 2 more to compare against "obj"
+            if line.lower() != b_("obj"):
+                warnings.warn("incorrect startxref pointer(3)", PdfReadWarning)
+                return True
+            while stream.read(1) in b_(" \t\r\n"):
+                pass
+            line = stream.read(256)  # an xref object must contain "/xref" (case-insensitive) in its next 256 bytes
+            if b_("/xref") not in line.lower():
+                warnings.warn("incorrect startxref pointer(4)", PdfReadWarning)
+                return True
+        return False
+
+    def _rebuild_xref_table(self, stream):
+        self.xref = {}
+        stream.seek(0, 0)
+        f_ = stream.read(-1)
+        import re
+
+        for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"), f_):
+            idnum = int(m.group(1))
+            generation = int(m.group(2))
+            if generation not in self.xref:
+                self.xref[generation] = {}
+            self.xref[generation][idnum] = m.start(1)
+        trailerPos = f_.rfind(b"trailer") - len(f_) + 7
+        stream.seek(trailerPos, 2)
+        # NOTE: the trailer-reading code below duplicates logic found elsewhere (TODO: confirm and deduplicate)
+        readNonWhitespace(stream)
+        stream.seek(-1, 1)
+        newTrailer = readObject(stream, self)
+        for key, value in list(newTrailer.items()):
+            if key not in self.trailer:
+                self.trailer[key] = value
+        # if "/Prev" in newTrailer:
+        #    startxref = newTrailer["/Prev"]
+        # else:
+
     def _read_xref_subsections(self, idx_pairs, getEntry, used_before):
         last_end = 0
         for start, size in self._pairs(idx_pairs):
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
index ec8521fef..4512e433c 100644
--- a/Tests/test_reader.py
+++ b/Tests/test_reader.py
@@ -229,14 +229,12 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
     )
     pdf_stream = io.BytesIO(pdf_data)
     if should_fail:
-        with pytest.raises(Exception) as exc:
+        with pytest.raises(PdfReadError) as exc:
             PdfFileReader(pdf_stream, strict=strict)
-        if startx_correction != -1:
-            assert exc.type == PdfReadWarning
-        else:
+        assert exc.type == PdfReadError
+        if startx_correction == -1:
             assert (
-                exc.type == PdfReadError
-                and exc.value.args[0]
+                exc.value.args[0]
                 == "/Prev=0 in the trailer (try opening with strict=False)"
             )
     else:
@@ -245,10 +243,10 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
 
 def test_issue297():
     path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
-    with pytest.raises(PdfReadWarning) as exc:
+    with pytest.raises(PdfReadError) as exc:
         reader = PdfFileReader(path, strict=True)
         reader.getPage(0)
-    assert "startxref" in exc.value.args[0]
+    assert "Broken xref table" in exc.value.args[0]
     reader = PdfFileReader(path, strict=False)
     reader.getPage(0)