ROB: Cope with 2-digit codes in bfchar (#1310)

Fixes #1293
py-pdf · Sep 2, 2022 · 1e089c0 · 1e089c0
1 parent 3326cb7
commit 1e089c0
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 1 deletion.
diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -290,7 +290,7 @@ def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> No
         # placeholder (see above) means empty string
         if lst[1] != b".":
             map_to = unhexlify(lst[1]).decode(
-                "utf-16-be", "surrogatepass"
+                "charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
             )  # join is here as some cases where the code was split
         map_dict[
             unhexlify(lst[0]).decode(

diff --git a/tests/test_cmap.py b/tests/test_cmap.py
@@ -48,6 +48,15 @@ def test_get_font_width_from_default():  # L40
         page.extract_text()
 
 
+def test_bfchar_on_2_chars():
+    # iss #1293
+    url = "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf"
+    name = "ASurveyofImageClassificationBasedTechniques.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    for page in reader.pages:
+        page.extract_text()
+
+
 def test_ascii_charset():
     # iss #1312
     url = "https://github.com/py-pdf/PyPDF2/files/9472500/main.pdf"