Skip to content

Commit

Permalink
Split build_char_map
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Jun 6, 2022
1 parent 6957c00 commit da7dc02
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 106 deletions.
224 changes: 120 additions & 104 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,111 +8,133 @@


# Code freely inspired by @twiggy; see #711
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Dict, Dict]:
    """
    Build the character-decoding tables for one font.

    :param font_name: key of the font inside ``obj["/Resources"]["/Font"]``.
    :param space_width: default space width, forwarded to
        :func:`compute_space_width`.
    :param obj: object whose ``/Resources`` dictionary holds the font.
    :return: a 4-tuple ``(font_type, space_width, encoding, map)`` where
        ``font_type`` is the font's ``/Subtype``, ``space_width`` is half of
        the value returned by :func:`compute_space_width`, ``encoding`` is a
        dict pairing the codes 0..255 with the entries of the parsed encoding
        list, and ``map`` is a ``str.maketrans`` table built from the
        ``/ToUnicode`` mapping.
    """
    ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    font_type: str = cast(str, ft["/Subtype"])

    space_code = 32
    # parse_encoding returns (encoding, space_code); both must be unpacked,
    # otherwise the tuple itself would be zipped into the encoding dict below.
    encoding, space_code = parse_encoding(ft, space_code)
    map_dict, space_code = parse_to_unicode(ft, space_code)
    sp_width = compute_space_width(space_width, ft, space_code)

    return (
        font_type,
        float(sp_width / 2),
        dict(zip(range(256), encoding)),
        # https://github.com/python/mypy/issues/4374
        "".maketrans(map_dict),  # type: ignore
    )


def parse_encoding(ft: DictionaryObject, space_code: int) -> Tuple[List[str], int]:
    """
    Derive the code -> character list described by a font's ``/Encoding``.

    :param ft: font dictionary.
    :param space_code: character code currently assumed to be the space;
        returned unchanged unless a ``/Differences`` entry equal to ``" "``
        is found.
    :return: ``(encoding, space_code)``. ``encoding`` is empty when the font
        has no ``/Encoding`` entry or uses ``/Identity-H`` / ``/Identity-V``.
        Names missing from ``charset_encoding`` trigger a ``PdfReadWarning``
        and fall back to ``/StandardCoding``.
    """
    encoding: List[str] = []
    if "/Encoding" not in ft:
        return encoding, space_code
    enc: Union[str, DictionaryObject] = ft["/Encoding"].get_object()  # type: ignore
    if isinstance(enc, str):
        try:
            if enc in ("/Identity-H", "/Identity-V"):
                encoding = []
            else:
                encoding = charset_encoding[enc].copy()
        except Exception:
            # Report the offending encoding name, not the (still empty) list.
            warnings.warn(
                f"Advanced encoding {enc} not implemented yet",
                PdfReadWarning,
            )
            encoding = charset_encoding["/StandardCoding"].copy()
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding = cast(str, enc["/BaseEncoding"])
        try:
            encoding = charset_encoding[base_encoding].copy()
        except Exception:
            warnings.warn(
                f"Advanced encoding {base_encoding} not implemented yet",
                PdfReadWarning,
            )
            encoding = charset_encoding["/StandardCoding"].copy()
    else:
        encoding = charset_encoding["/StandardCoding"].copy()

    # Only an encoding dictionary can carry /Differences; on a name string the
    # ``in`` test would be a substring match and the lookup would fail.
    if not isinstance(enc, str) and "/Differences" in enc:
        code = 0
        for entry in cast(
            DictionaryObject, cast(DictionaryObject, enc)["/Differences"]
        ):
            if isinstance(entry, int):
                # An integer sets the code for the next glyph name.
                code = entry
            else:
                try:
                    encoding[code] = adobe_glyphs[entry]
                except Exception:
                    # Best effort: keep the raw entry when it cannot be
                    # translated through adobe_glyphs.
                    encoding[code] = entry
                if entry == " ":
                    space_code = code
                code += 1
    return encoding, space_code


# get
for a in map_dict:
if map_dict[a] == " ":
space_code = a
def parse_to_unicode(ft: DictionaryObject, space_code: int) -> Tuple[Dict, int]:
    """
    Parse the ``bfrange`` / ``bfchar`` sections of a font's ``/ToUnicode`` CMap.

    :param ft: font dictionary.
    :param space_code: character code currently assumed to be the space;
        replaced by a code that the CMap maps to ``" "`` when there is one.
    :return: ``(map_dict, space_code)`` where ``map_dict`` maps integer
        character codes to the decoded (UTF-16-BE) strings. The dict is empty
        when the font has no ``/ToUnicode`` entry.
    """
    map_dict: Dict[Any, Any] = {}
    if "/ToUnicode" not in ft:
        return map_dict, space_code
    process_rg: bool = False
    process_char: bool = False
    cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8")
    # Turn the hex-string delimiters into spaces and isolate the array
    # brackets so that every line can be tokenised on whitespace.
    for line in (
        cm.strip()
        .replace("<", " ")
        .replace(">", "")
        .replace("[", " [ ")
        .replace("]", " ] ")
        .split("\n")
    ):
        # split() without arguments also discards tabs and a trailing "\r",
        # which would otherwise stay glued to the hex tokens.
        lst = line.split()
        if not lst:
            continue
        if "beginbfrange" in line:
            process_rg = True
        elif "endbfrange" in line:
            process_rg = False
        elif "beginbfchar" in line:
            process_char = True
        elif "endbfchar" in line:
            process_char = False
        elif process_rg:
            a = int(lst[0], 16)
            b = int(lst[1], 16)
            if lst[2] == "[":
                # Array form: one destination string per source code.
                for sq in lst[3:]:
                    # Stop at the closing bracket. The previous test,
                    # ``if "]"``, was always true, so nothing was ever mapped.
                    if sq == "]":
                        break
                    map_dict[a] = unhexlify(sq).decode("utf-16-be")
                    a += 1
            else:
                # Range form: consecutive codes map to consecutive values,
                # formatted with the same number of hex digits as the start.
                c = int(lst[2], 16)
                fmt = b"%%0%dX" % len(lst[2])
                while a <= b:
                    map_dict[a] = unhexlify(fmt % c).decode("utf-16-be")
                    a += 1
                    c += 1
        elif process_char:
            a = int(lst[0], 16)
            map_dict[a] = unhexlify("".join(lst[1:])).decode(
                "utf-16-be"
            )  # join is here as some cases where the code was split

    # A code that decodes to a plain space becomes the space code
    # (the last such code wins when there are several).
    for code, char in map_dict.items():
        if char == " ":
            space_code = code
    return map_dict, space_code


def compute_space_width(
space_width: float, ft: DictionaryObject, space_code: int
) -> float:
sp_width: float = space_width * 2 # default value
w = []
st: int = 0
if "/W" in ft:
if "/DW" in ft:
sp_width = cast(float, ft["/DW"])
Expand Down Expand Up @@ -159,10 +181,4 @@ def _build_char_map(
m += x
cpt += 1
sp_width = m / max(1, cpt) / 2

return (
font_type,
float(sp_width / 2),
dict(zip(range(256), encoding)),
"".maketrans(map_dict),
)
return sp_width
4 changes: 2 additions & 2 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
cast,
)

from ._cmap import _build_char_map
from ._cmap import build_char_map
from ._utils import (
CompressedTransformationMatrix,
TransformationMatrixType,
Expand Down Expand Up @@ -1122,7 +1122,7 @@ def _extract_text(
resources_dict = cast(DictionaryObject, obj["/Resources"])
if "/Font" in resources_dict:
for f in cast(DictionaryObject, resources_dict["/Font"]):
cmaps[f] = _build_char_map(f, space_width, obj)
cmaps[f] = build_char_map(f, space_width, obj)
cmap: Union[str, Dict[int, str]] = {}
content = obj[content_key].get_object() if isinstance(content_key, str) else obj
if not isinstance(content, ContentStream):
Expand Down

0 comments on commit da7dc02

Please sign in to comment.