BUG: Cope with encoding with too many differences (#2873)

pubpub-zz · web-flow · commit 3b890621267f · 2024-09-26T21:08:59.000+02:00
Closes #2836.
diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -207,7 +207,8 @@ def parse_encoding(
                 x = o
             else:  # isinstance(o,str):
                 try:
-                    encoding[x] = adobe_glyphs[o]  # type: ignore
+                    if x < len(encoding):
+                        encoding[x] = adobe_glyphs[o]  # type: ignore
                 except Exception:
                     encoding[x] = o  # type: ignore
                     if o == " ":
diff --git a/tests/test_cmap.py b/tests/test_cmap.py
@@ -248,3 +248,14 @@ def test_unigb_utf16():
     name = "iss2812.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     assert "《中国能源展望 2060（2024 年版）》编写委员会" in reader.pages[1].extract_text()
+
+
+@pytest.mark.enable_socket()
+def test_too_many_differences():
+    """Regression test for #2836: too many encoding differences must not crash."""
+    url = (
+        "https://github.com/user-attachments/files/16911741/dumb_extract_text_crash.pdf"
+    )
+    name = "iss2836.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert reader.pages[0].extract_text() == ""