Skip to content

Commit 3b89062

Browse files
authored
BUG: Cope with encoding with too many differences (#2873)
Closes #2836.
1 parent dcd15aa commit 3b89062

File tree

2 files changed

+13
-1
lines changed

2 files changed

+13
-1
lines changed

pypdf/_cmap.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -207,7 +207,8 @@ def parse_encoding(
207207
x = o
208208
else: # isinstance(o,str):
209209
try:
210-
encoding[x] = adobe_glyphs[o] # type: ignore
210+
if x < len(encoding):
211+
encoding[x] = adobe_glyphs[o] # type: ignore
211212
except Exception:
212213
encoding[x] = o # type: ignore
213214
if o == " ":

tests/test_cmap.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -248,3 +248,14 @@ def test_unigb_utf16():
248248
name = "iss2812.pdf"
249249
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
250250
assert "《中国能源展望 2060(2024 年版)》编写委员会" in reader.pages[1].extract_text()
251+
252+
253+
@pytest.mark.enable_socket()
254+
def test_too_many_differences():
255+
"""Cf #2836"""
256+
url = (
257+
"https://github.com/user-attachments/files/16911741/dumb_extract_text_crash.pdf"
258+
)
259+
name = "iss2836.pdf"
260+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
261+
assert reader.pages[0].extract_text() == ""

0 commit comments

Comments
 (0)