BUG: Fix undefined variable for text extraction (regression) (#2934)

stefan6419846 · web-flow · commit 5b50f4786d23 · 2024-11-04T19:24:56.000+01:00
diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -527,6 +527,8 @@ def _type1_alternative(
                         v = chr(int(words[2][4:], 16))
                     except ValueError:  # pragma: no cover
                         continue
+                else:
+                    continue
             map_dict[chr(i)] = v
             int_entry.append(i)
     return map_dict, int_entry
diff --git a/tests/test_cmap.py b/tests/test_cmap.py
@@ -259,3 +259,13 @@ def test_too_many_differences():
     name = "iss2836.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     assert reader.pages[0].extract_text() == ""
+
+
+@pytest.mark.enable_socket
+def test_iss2925():
+    url = (
+        "https://github.com/user-attachments/files/17621508/2305.09315.pdf"
+    )
+    name = "iss2925.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert "slicing on the PDG to extract the relevant contextual" in reader.pages[3].extract_text()