From da7dc0260e368a335cb648375576186b3a68bfb8 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 12:11:31 +0200 Subject: [PATCH] Split build_char_map --- PyPDF2/_cmap.py | 224 ++++++++++++++++++++++++++---------------------- PyPDF2/_page.py | 4 +- 2 files changed, 122 insertions(+), 106 deletions(-) diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index c10e4bba25..841b239a1d 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -8,111 +8,133 @@ # code freely inspired from @twiggy ; see #711 -def _build_char_map( +def build_char_map( font_name: str, space_width: float, obj: DictionaryObject ) -> Tuple[str, float, Dict, Dict]: - map_dict: Any = {} - process_rg: bool = False - process_char: bool = False - encoding: List[str] = [] ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore font_type: str = cast(str, ft["/Subtype"]) - sp_width: float = space_width * 2 # default value - w = [] - # encoding + space_code = 32 - if "/Encoding" in ft: - enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore - if isinstance(enc, str): - try: - if enc in ("/Identity-H", "/Identity-V"): - encoding = [] - else: - encoding = charset_encoding[enc].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: - try: - encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - else: + encoding = parse_encoding(ft, space_code) + map_dict, space_code = parse_to_unicode(ft, space_code) + sp_width = compute_space_width(space_width, ft, space_code) + + return ( + font_type, + float(sp_width / 2), + dict(zip(range(256), encoding)), + # https://github.com/python/mypy/issues/4374 + "".maketrans(map_dict), # type: ignore + ) + + +def parse_encoding(ft: DictionaryObject, space_code: int) -> Tuple[List[str], int]: + encoding: List[str] = [] + if "/Encoding" not in ft: + return encoding, space_code + enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore + if isinstance(enc, str): + try: + if enc in ("/Identity-H", "/Identity-V"): + encoding = [] + else: + encoding = charset_encoding[enc].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) encoding = charset_encoding["/StandardCoding"].copy() - if "/Differences" in enc: - x = 0 - for o in cast( - DictionaryObject, cast(DictionaryObject, enc)["/Differences"] - ): - if isinstance(o, int): - x = o - else: - try: - encoding[x] = adobe_glyphs[o] - except Exception: - encoding[x] = o - if o == " ": - space_code = x - x += 1 - if "/ToUnicode" in ft: - cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8") - for l in ( - cm.strip() - .replace("<", " ") - .replace(">", "") - .replace("[", " [ ") - .replace("]", " ] ") - .split("\n") - ): - if l == "": - continue - if "beginbfrange" in l: - process_rg = True - elif "endbfrange" in l: - process_rg = False - elif "beginbfchar" in l: - process_char = True - elif "endbfchar" in l: - process_char = False - elif process_rg: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - b = int(lst[1], 16) - if lst[2] == "[": - for sq in lst[3:]: - if "]": - break - map_dict[a] = unhexlify(sq).decode("utf-16-be") - a += 1 - assert a > b - else: - c = int(lst[2], 16) - fmt = b"%%0%dX" % len(lst[2]) - while a <= b: - map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") - a += 1 - c += 1 - elif process_char: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - map_dict[a] = unhexlify("".join(lst[1:])).decode( - "utf-16-be" - ) # join is here as some cases where the code was split + elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: + try: + encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) + encoding = charset_encoding["/StandardCoding"].copy() + else: + encoding = charset_encoding["/StandardCoding"].copy() + if "/Differences" in enc: + x = 0 + for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]): + if isinstance(o, int): + x = o + else: + try: + encoding[x] = adobe_glyphs[o] + except Exception: + encoding[x] = o + if o == " ": + space_code = x + x += 1 + return encoding, space_code + - # get - for a in map_dict: - if map_dict[a] == " ": - space_code = a +def parse_to_unicode(ft: DictionaryObject, space_code: int) -> Tuple[Dict, int]: + map_dict: Dict[Any, Any] = {} + if "/ToUnicode" not in ft: + return map_dict, space_code + process_rg: bool = False + process_char: bool = False + cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8") + for l in ( + cm.strip() + .replace("<", " ") + .replace(">", "") + .replace("[", " [ ") + .replace("]", " ] ") + .split("\n") + ): + if l == "": + continue + if "beginbfrange" in l: + process_rg = True + elif "endbfrange" in l: + process_rg = False + elif "beginbfchar" in l: + process_char = True + elif "endbfchar" in l: + process_char = False + elif process_rg: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + b = int(lst[1], 16) + if lst[2] == "[": + for sq in lst[3:]: + if "]": + break + map_dict[a] = unhexlify(sq).decode("utf-16-be") + a += 1 + assert a > b + else: + c = int(lst[2], 16) + fmt = b"%%0%dX" % len(lst[2]) + while a <= b: + map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") + a += 1 + c += 1 + elif process_char: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + map_dict[a] = unhexlify("".join(lst[1:])).decode( + "utf-16-be" + ) # join is here as some cases where the code was split - # compute space width - st: int = 0 # declaration for mypy + # get + for a in map_dict: + if map_dict[a] == " ": + space_code = a + return map_dict, space_code + + +def compute_space_width( + space_width: float, ft: DictionaryObject, space_code: int +) -> float: + sp_width: float = space_width * 2 # default value + w = [] + st: int = 0 if "/W" in ft: if "/DW" in ft: sp_width = cast(float, ft["/DW"]) @@ -159,10 +181,4 @@ def _build_char_map( m += x cpt += 1 sp_width = m / max(1, cpt) / 2 - - return ( - font_type, - float(sp_width / 2), - dict(zip(range(256), encoding)), - "".maketrans(map_dict), - ) + return sp_width diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 8051779589..5333e95c67 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -45,7 +45,7 @@ cast, ) -from ._cmap import _build_char_map +from ._cmap import build_char_map from ._utils import ( CompressedTransformationMatrix, TransformationMatrixType, @@ -1122,7 +1122,7 @@ def _extract_text( resources_dict = cast(DictionaryObject, obj["/Resources"]) if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): - cmaps[f] = _build_char_map(f, space_width, obj) + cmaps[f] = build_char_map(f, space_width, obj) cmap: Union[str, Dict[int, str]] = {} content = obj[content_key].get_object() if isinstance(content_key, str) else obj if not isinstance(content, ContentStream):