Split build_char_map

py-pdf · Jun 6, 2022 · da7dc02 · da7dc02
1 parent 6957c00
commit da7dc02
Show file tree

Hide file tree

Showing 2 changed files with 122 additions and 106 deletions.
diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -8,111 +8,133 @@
 
 
 # code freely inspired from @twiggy ; see #711
-def _build_char_map(
+def build_char_map(
     font_name: str, space_width: float, obj: DictionaryObject
 ) -> Tuple[str, float, Dict, Dict]:
-    map_dict: Any = {}
-    process_rg: bool = False
-    process_char: bool = False
-    encoding: List[str] = []
     ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
     font_type: str = cast(str, ft["/Subtype"])
-    sp_width: float = space_width * 2  # default value
-    w = []
-    # encoding
+    # 32 is the default space code; each parse_* helper may return a different one.
     space_code = 32
-    if "/Encoding" in ft:
-        enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object()  # type: ignore
-        if isinstance(enc, str):
-            try:
-                if enc in ("/Identity-H", "/Identity-V"):
-                    encoding = []
-                else:
-                    encoding = charset_encoding[enc].copy()
-            except Exception:
-                warnings.warn(
-                    f"Advanced encoding {encoding} not implemented yet",
-                    PdfReadWarning,
-                )
-                encoding = charset_encoding["/StandardCoding"].copy()
-        elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
-            try:
-                encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
-            except Exception:
-                warnings.warn(
-                    f"Advanced encoding {encoding} not implemented yet",
-                    PdfReadWarning,
-                )
-                encoding = charset_encoding["/StandardCoding"].copy()
-        else:
+    encoding, space_code = parse_encoding(ft, space_code)  # returns a 2-tuple
+    map_dict, space_code = parse_to_unicode(ft, space_code)
+    sp_width = compute_space_width(space_width, ft, space_code)
+
+    return (
+        font_type,
+        float(sp_width / 2),
+        dict(zip(range(256), encoding)),
+        # https://github.com/python/mypy/issues/4374
+        "".maketrans(map_dict),  # type: ignore
+    )
+
+
+def parse_encoding(ft: DictionaryObject, space_code: int) -> Tuple[List[str], int]:
+    encoding: List[str] = []  # stays empty when the font has no /Encoding entry
+    if "/Encoding" not in ft:
+        return encoding, space_code
+    enc: Union[str, DictionaryObject] = ft["/Encoding"].get_object()  # type: ignore
+    if isinstance(enc, str):
+        try:
+            if enc in ("/Identity-H", "/Identity-V"):
+                encoding = []
+            else:
+                encoding = charset_encoding[enc].copy()
+        except Exception:
+            warnings.warn(
+                f"Advanced encoding {enc} not implemented yet",
+                PdfReadWarning,
+            )
             encoding = charset_encoding["/StandardCoding"].copy()
-        if "/Differences" in enc:
-            x = 0
-            for o in cast(
-                DictionaryObject, cast(DictionaryObject, enc)["/Differences"]
-            ):
-                if isinstance(o, int):
-                    x = o
-                else:
-                    try:
-                        encoding[x] = adobe_glyphs[o]
-                    except Exception:
-                        encoding[x] = o
-                        if o == " ":
-                            space_code = x
-                    x += 1
-    if "/ToUnicode" in ft:
-        cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8")
-        for l in (
-            cm.strip()
-            .replace("<", " ")
-            .replace(">", "")
-            .replace("[", " [ ")
-            .replace("]", " ] ")
-            .split("\n")
-        ):
-            if l == "":
-                continue
-            if "beginbfrange" in l:
-                process_rg = True
-            elif "endbfrange" in l:
-                process_rg = False
-            elif "beginbfchar" in l:
-                process_char = True
-            elif "endbfchar" in l:
-                process_char = False
-            elif process_rg:
-                lst = [x for x in l.split(" ") if x]
-                a = int(lst[0], 16)
-                b = int(lst[1], 16)
-                if lst[2] == "[":
-                    for sq in lst[3:]:
-                        if "]":
-                            break
-                        map_dict[a] = unhexlify(sq).decode("utf-16-be")
-                        a += 1
-                        assert a > b
-                else:
-                    c = int(lst[2], 16)
-                    fmt = b"%%0%dX" % len(lst[2])
-                    while a <= b:
-                        map_dict[a] = unhexlify(fmt % c).decode("utf-16-be")
-                        a += 1
-                        c += 1
-            elif process_char:
-                lst = [x for x in l.split(" ") if x]
-                a = int(lst[0], 16)
-                map_dict[a] = unhexlify("".join(lst[1:])).decode(
-                    "utf-16-be"
-                )  # join is here as some cases where the code was split
+    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
+        try:
+            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
+        except Exception:
+            warnings.warn(
+                f"Advanced encoding {enc['/BaseEncoding']} not implemented yet",
+                PdfReadWarning,
+            )
+            encoding = charset_encoding["/StandardCoding"].copy()
+    else:
+        encoding = charset_encoding["/StandardCoding"].copy()
+    if "/Differences" in enc:
+        x = 0  # current character code; each int in /Differences resets it
+        for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
+            if isinstance(o, int):
+                x = o
+            else:
+                try:
+                    encoding[x] = adobe_glyphs[o]
+                except Exception:
+                    encoding[x] = o
+                    if o == " ":
+                        space_code = x
+                x += 1
+    return encoding, space_code
+
 
-        # get
-        for a in map_dict:
-            if map_dict[a] == " ":
-                space_code = a
+def parse_to_unicode(ft: DictionaryObject, space_code: int) -> Tuple[Dict, int]:
+    map_dict: Dict[Any, Any] = {}  # character code -> decoded unicode string
+    if "/ToUnicode" not in ft:
+        return map_dict, space_code
+    process_rg: bool = False  # True while between beginbfrange / endbfrange
+    process_char: bool = False  # True while between beginbfchar / endbfchar
+    cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8")
+    for l in (
+        cm.strip()
+        .replace("<", " ")
+        .replace(">", "")
+        .replace("[", " [ ")
+        .replace("]", " ] ")
+        .split("\n")
+    ):
+        if l == "":
+            continue
+        if "beginbfrange" in l:
+            process_rg = True
+        elif "endbfrange" in l:
+            process_rg = False
+        elif "beginbfchar" in l:
+            process_char = True
+        elif "endbfchar" in l:
+            process_char = False
+        elif process_rg:
+            lst = [x for x in l.split(" ") if x]
+            a = int(lst[0], 16)
+            b = int(lst[1], 16)
+            if lst[2] == "[":
+                for sq in lst[3:]:
+                    if sq == "]":
+                        break
+                    map_dict[a] = unhexlify(sq).decode("utf-16-be")
+                    a += 1
+                    # no per-entry check against b: a only passes b after the last entry
+            else:
+                c = int(lst[2], 16)
+                fmt = b"%%0%dX" % len(lst[2])
+                while a <= b:
+                    map_dict[a] = unhexlify(fmt % c).decode("utf-16-be")
+                    a += 1
+                    c += 1
+        elif process_char:
+            lst = [x for x in l.split(" ") if x]
+            a = int(lst[0], 16)
+            map_dict[a] = unhexlify("".join(lst[1:])).decode(
+                "utf-16-be"
+            )  # join is here as some cases where the code was split
 
-    # compute space width
-    st: int = 0  # declaration for mypy
+    # a code whose ToUnicode mapping is " " replaces the incoming space code
+    for a in map_dict:
+        if map_dict[a] == " ":
+            space_code = a
+    return map_dict, space_code
+
+
+def compute_space_width(
+    space_width: float, ft: DictionaryObject, space_code: int
+) -> float:
+    sp_width: float = space_width * 2  # default value
+    w = []
+    st: int = 0
     if "/W" in ft:
         if "/DW" in ft:
             sp_width = cast(float, ft["/DW"])
@@ -159,10 +181,4 @@ def _build_char_map(
                         m += x
                         cpt += 1
                 sp_width = m / max(1, cpt) / 2
-
-    return (
-        font_type,
-        float(sp_width / 2),
-        dict(zip(range(256), encoding)),
-        "".maketrans(map_dict),
-    )
+    return sp_width
diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -45,7 +45,7 @@
     cast,
 )
 
-from ._cmap import _build_char_map
+from ._cmap import build_char_map
 from ._utils import (
     CompressedTransformationMatrix,
     TransformationMatrixType,
@@ -1122,7 +1122,7 @@ def _extract_text(
         resources_dict = cast(DictionaryObject, obj["/Resources"])
         if "/Font" in resources_dict:
             for f in cast(DictionaryObject, resources_dict["/Font"]):
-                cmaps[f] = _build_char_map(f, space_width, obj)
+                cmaps[f] = build_char_map(f, space_width, obj)
         cmap: Union[str, Dict[int, str]] = {}
         content = obj[content_key].get_object() if isinstance(content_key, str) else obj
         if not isinstance(content, ContentStream):