4949 overload ,
5050)
5151
52- from ._cmap import build_char_map , build_font_width_map , compute_font_width , unknown_char_map
52+ from ._cmap import (
53+ build_char_map ,
54+ build_font_width_map ,
55+ compute_font_width ,
56+ parse_encoding ,
57+ parse_to_unicode ,
58+ unknown_char_map ,
59+ )
5360from ._protocols import PdfCommonDocProtocol
5461from ._text_extraction import (
5562 OrientationNotFoundError ,
5663 _layout_mode ,
5764 crlf_space_check ,
58- handle_tj ,
65+ get_display_str ,
66+ get_text_operands ,
5967 mult ,
6068)
6169from ._utils import (
8492 PdfObject ,
8593 RectangleObject ,
8694 StreamObject ,
95+ TextStringObject ,
8796 is_null_or_none ,
8897)
8998
@@ -496,7 +505,7 @@ def __init__(
496505 if not is_null_or_none (indirect_reference ):
497506 assert indirect_reference is not None , "mypy"
498507 self .update (cast (DictionaryObject , indirect_reference .get_object ()))
499- self ._font_width_maps : Dict [str , Dict [str , float ]] = {}
508+ self ._font_width_maps : Dict [str , Tuple [ Dict [str , float ], str , float ]] = {}
500509
501510 def hash_bin (self ) -> int :
502511 """
@@ -1722,19 +1731,78 @@ def _get_acutual_font_widths(
17221731 cmap : Tuple [
17231732 Union [str , Dict [int , str ]], Dict [str , str ], str , Optional [DictionaryObject ]
17241733 ],
1725- add_text : str ,
1734+ text_operands : str ,
17261735 font_size : float ,
1727- default_space_width : float
1736+ space_width : float
17281737 ) -> Tuple [float , float , float ]:
17291738 font_widths : float = 0
17301739 font_name : str = cmap [2 ]
17311740 if font_name not in self ._font_width_maps :
1732- self ._font_width_maps [font_name ] = build_font_width_map (cmap [3 ], cmap [1 ], default_space_width * 2 )
1733- font_width_map : Dict [Any , float ] = self ._font_width_maps [font_name ]
1734- if add_text :
1735- for char in add_text :
1741+ if cmap [3 ] is None :
1742+ font_width_map : Dict [Any , float ] = {}
1743+ space_char = " "
1744+ actual_space_width : float = space_width
1745+ font_width_map ["default" ] = actual_space_width * 2
1746+ else :
1747+ space_code = 32
1748+ _ , space_code = parse_encoding (cmap [3 ], space_code )
1749+ _ , space_code , _ = parse_to_unicode (cmap [3 ], space_code )
1750+ if isinstance (space_code , str ):
1751+ space_char = space_code
1752+ else :
1753+ space_char = chr (space_code )
1754+ font_width_map = build_font_width_map (cmap [3 ], space_width * 2 )
1755+ actual_space_width = compute_font_width (font_width_map , space_char )
1756+ if actual_space_width == 0 :
1757+ actual_space_width = space_width
1758+ self ._font_width_maps [font_name ] = (font_width_map , space_char , actual_space_width )
1759+ font_width_map = self ._font_width_maps [font_name ][0 ]
1760+ space_char = self ._font_width_maps [font_name ][1 ]
1761+ actual_space_width = self ._font_width_maps [font_name ][2 ]
1762+
1763+ if text_operands :
1764+ for char in text_operands :
1765+ if char == space_char :
1766+ font_widths += actual_space_width
1767+ continue
17361768 font_widths += compute_font_width (font_width_map , char )
1737- return (font_widths * font_size , default_space_width * font_size , font_size )
1769+ return (font_widths * font_size , space_width * font_size , font_size )
1770+
def _handle_tj(
    self,
    text: str,
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    space_width: float,
    actual_str_size: Dict[str, float],
) -> Tuple[str, bool, Dict[str, float]]:
    """
    Append the text of one ``Tj`` operation and update the string metrics.

    The operands are first decoded with ``get_text_operands``. When that
    helper reports plain string operands, the decoded text is appended to
    ``text`` as-is; otherwise ``get_display_str`` builds the updated text
    and writing direction. In both cases the decoded operands are measured
    with ``_get_acutual_font_widths`` and the result is folded into
    ``actual_str_size``.

    Args:
        text: Text accumulated so far.
        operands: Operands of the ``Tj`` operation.
        cm_matrix: Current transformation matrix.
        tm_matrix: Current text matrix.
        cmap: Character-map tuple of the current font; forwarded unchanged
            to the helpers.
        orientations: Forwarded to ``get_text_operands``.
        font_size: Current font size.
        rtl_dir: Current right-to-left state.
        visitor_text: Optional callback forwarded to ``get_display_str``.
        space_width: Space width handed to ``_get_acutual_font_widths``.
        actual_str_size: Metrics dict; ``"space_width"`` and
            ``"str_height"`` are overwritten and ``"str_widths"`` is
            incremented. The dict is mutated in place.

    Returns:
        The updated text, the updated right-to-left flag, and the same
        ``actual_str_size`` object that was passed in.

    """
    text_operands, is_str_operands = get_text_operands(
        operands, cm_matrix, tm_matrix, cmap, orientations
    )
    if is_str_operands:
        text += text_operands
    else:
        text, rtl_dir = get_display_str(
            text,
            cm_matrix,
            tm_matrix,  # text matrix
            cmap,
            text_operands,
            font_size,
            rtl_dir,
            visitor_text,
        )

    # Measure the operands just handled; the per-string metrics replace the
    # previous values while the width accumulates until the caller resets it.
    font_widths, measured_space_width, measured_height = (
        self._get_acutual_font_widths(cmap, text_operands, font_size, space_width)
    )
    actual_str_size["space_width"] = measured_space_width
    actual_str_size["str_height"] = measured_height
    actual_str_size["str_widths"] += font_widths

    return text, rtl_dir, actual_str_size
17381806
17391807 def _extract_text (
17401808 self ,
@@ -1818,11 +1886,8 @@ def _extract_text(
18181886 TL = 0.0
18191887 font_size = 12.0 # init just in case of
18201888
1821- def current_spacewidth () -> float :
1822- return _space_width / 1000.0
1823-
1824- def current_strwidths () -> float :
1825- return _actual_str_size ["str_widths" ] / 1000.0
def compute_strwidths(str_widths: float) -> float:
    """
    Return ``str_widths`` divided by 1000.

    The caller applies this to both the accumulated string width and the
    space width before handing them to the CR/LF-space check.
    NOTE(review): presumably converts thousandths of a text-space unit
    into text-space units; confirm against the font width maps.
    """
    return str_widths / 1000.0
18261891
18271892 def process_operation (operator : bytes , operands : List [Any ]) -> None :
18281893 nonlocal cm_matrix , cm_stack , tm_matrix , cm_prev , tm_prev , memo_cm , memo_tm
@@ -1945,7 +2010,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19452010 ty = float (operands [1 ])
19462011 tm_matrix [4 ] += tx * tm_matrix [0 ] + ty * tm_matrix [2 ]
19472012 tm_matrix [5 ] += tx * tm_matrix [1 ] + ty * tm_matrix [3 ]
1948- str_widths = current_strwidths ( )
2013+ str_widths = compute_strwidths ( _actual_str_size [ "str_widths" ] )
19492014 _actual_str_size ["str_widths" ] = 0.0
19502015 elif operator == b"Tm" :
19512016 check_crlf_space = True
@@ -1957,28 +2022,26 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19572022 float (operands [4 ]),
19582023 float (operands [5 ]),
19592024 ]
1960- str_widths = current_strwidths ( )
2025+ str_widths = compute_strwidths ( _actual_str_size [ "str_widths" ] )
19612026 _actual_str_size ["str_widths" ] = 0.0
19622027 elif operator == b"T*" :
19632028 check_crlf_space = True
19642029 tm_matrix [5 ] -= TL
19652030 elif operator == b"Tj" :
19662031 check_crlf_space = True
1967- text , rtl_dir , add_text = handle_tj (
2032+ text , rtl_dir , _actual_str_size = self . _handle_tj (
19682033 text ,
19692034 operands ,
19702035 cm_matrix ,
19712036 tm_matrix , # text matrix
19722037 cmap ,
19732038 orientations ,
1974- output ,
19752039 font_size ,
19762040 rtl_dir ,
19772041 visitor_text ,
2042+ _space_width ,
2043+ _actual_str_size ,
19782044 )
1979- current_font_widths , _actual_str_size ["space_width" ], _actual_str_size ["str_height" ] = (
1980- self ._get_acutual_font_widths (cmap , add_text , font_size , current_spacewidth ()))
1981- _actual_str_size ["str_widths" ] += current_font_widths
19822045 else :
19832046 return None
19842047 if check_crlf_space :
@@ -1994,7 +2057,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19942057 font_size ,
19952058 visitor_text ,
19962059 str_widths ,
1997- _actual_str_size ["space_width" ],
2060+ compute_strwidths ( _actual_str_size ["space_width" ]) ,
19982061 _actual_str_size ["str_height" ]
19992062 )
20002063 if text == "" :
0 commit comments