Skip to content

Commit

Permalink
ROB: Improve inline image extraction (#2622)
Browse files Browse the repository at this point in the history
Closes  #2598.
  • Loading branch information
pubpub-zz authored May 27, 2024
1 parent c8d722c commit 23a81ba
Show file tree
Hide file tree
Showing 10 changed files with 591 additions and 177 deletions.
143 changes: 81 additions & 62 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
# POSSIBILITY OF SUCH DAMAGE.

import math
import re
import sys
from decimal import Decimal
from pathlib import Path
Expand Down Expand Up @@ -58,7 +57,6 @@
mult,
)
from ._utils import (
WHITESPACES_AS_REGEXP,
CompressedTransformationMatrix,
File,
ImageFile,
Expand All @@ -82,6 +80,7 @@
NameObject,
NullObject,
NumberObject,
PdfObject,
RectangleObject,
StreamObject,
)
Expand Down Expand Up @@ -335,7 +334,6 @@ def __init__(
self.pdf = pdf
self.inline_images: Optional[Dict[str, ImageFile]] = None
# below Union for mypy but actually Optional[List[str]]
self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None
self.indirect_reference = indirect_reference

def hash_value_data(self) -> bytes:
Expand Down Expand Up @@ -439,19 +437,8 @@ def _get_ids_image(
return []
else:
call_stack.append(_i)
if self.inline_images_keys is None:
content = self._get_contents_as_bytes() or b""
nb_inlines = 0
for matching in re.finditer(
WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
content,
):
start_of_string = content[: matching.start()]
if len(re.findall(b"[^\\\\]\\(", start_of_string)) == len(
re.findall(b"[^\\\\]\\)", start_of_string)
):
nb_inlines += 1
self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
if self.inline_images is None:
self.inline_images = self._get_inline_images()
if obj is None:
obj = self
if ancest is None:
Expand All @@ -460,7 +447,7 @@ def _get_ids_image(
if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
DictionaryObject, obj[PG.RESOURCES]
):
return self.inline_images_keys
return [] if self.inline_images is None else list(self.inline_images.keys())

x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for o in x_object:
Expand All @@ -470,7 +457,9 @@ def _get_ids_image(
lst.append(o if len(ancest) == 0 else ancest + [o])
else: # is a form with possible images inside
lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack))
return lst + self.inline_images_keys
assert self.inline_images is not None
lst.extend(list(self.inline_images.keys()))
return lst

def _get_image(
self,
Expand Down Expand Up @@ -551,6 +540,46 @@ def images(self) -> List[ImageFile]:
"""
return _VirtualListImages(self._get_ids_image, self._get_image) # type: ignore

def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image dictionary.

    Abbreviated color space and filter names (for example ``/G`` or
    ``/Fl``) are expanded to their full names; full names are returned
    as an equal ``NameObject``.  Any other ``NameObject`` is treated as
    a custom name and resolved through the page's
    ``/Resources`` -> ``/ColorSpace`` dictionary.  Values that are not
    names (or are unhashable) are returned unchanged.

    Args:
        k: key the value belongs to; only used in the error message.
        v: value to translate.

    Returns:
        The translated value.

    Raises:
        PdfReadError: if a custom name cannot be found in the
            ``/ColorSpace`` resources of the page.
    """
    try:
        v = NameObject(
            {
                # abbreviated names
                "/G": "/DeviceGray",
                "/RGB": "/DeviceRGB",
                "/CMYK": "/DeviceCMYK",
                "/I": "/Indexed",
                "/AHx": "/ASCIIHexDecode",
                "/A85": "/ASCII85Decode",
                "/LZW": "/LZWDecode",
                "/Fl": "/FlateDecode",
                "/RL": "/RunLengthDecode",
                "/CCF": "/CCITTFaxDecode",
                "/DCT": "/DCTDecode",
                # full names map to themselves
                "/DeviceGray": "/DeviceGray",
                "/DeviceRGB": "/DeviceRGB",
                "/DeviceCMYK": "/DeviceCMYK",
                "/Indexed": "/Indexed",
                "/ASCIIHexDecode": "/ASCIIHexDecode",
                "/ASCII85Decode": "/ASCII85Decode",
                "/LZWDecode": "/LZWDecode",
                "/FlateDecode": "/FlateDecode",
                "/RunLengthDecode": "/RunLengthDecode",
                "/CCITTFaxDecode": "/CCITTFaxDecode",
                "/DCTDecode": "/DCTDecode",
            }[cast(str, v)]
        )
    except (TypeError, KeyError):
        # TypeError: v is unhashable; KeyError: v is not a known name.
        if isinstance(v, NameObject):
            # It is a custom name, thus we have to look in resources.
            # The only applicable case is for ColorSpace.
            try:
                res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
                v = cast(DictionaryObject, res)[v]
            except KeyError as exc:  # for res and v
                # Chain explicitly so the failing lookup stays visible
                # as the cause of the read error.
                raise PdfReadError(
                    f"Cannot find resource entry {v} for {k}"
                ) from exc
    return v

def _get_inline_images(self) -> Dict[str, ImageFile]:
"""
get inline_images
Expand Down Expand Up @@ -593,51 +622,39 @@ def _get_inline_images(self) -> Dict[str, ImageFile]:
"/Length": len(ii["__streamdata__"]),
}
for k, v in ii["settings"].items():
try:
v = NameObject(
{
"/G": "/DeviceGray",
"/RGB": "/DeviceRGB",
"/CMYK": "/DeviceCMYK",
"/I": "/Indexed",
"/AHx": "/ASCIIHexDecode",
"/A85": "/ASCII85Decode",
"/LZW": "/LZWDecode",
"/Fl": "/FlateDecode",
"/RL": "/RunLengthDecode",
"/CCF": "/CCITTFaxDecode",
"/DCT": "/DCTDecode",
}[v]
)
except (TypeError, KeyError):
if isinstance(v, NameObject):
# it is a custom name : we have to look in resources :
# the only applicable case is for ColorSpace
try:
res = cast(DictionaryObject, self["/Resources"])[
"/ColorSpace"
]
v = cast(DictionaryObject, res)[v]
except KeyError: # for res and v
raise PdfReadError(
f"Can not find resource entry {v} for {k}"
)
init[
NameObject(
{
"/BPC": "/BitsPerComponent",
"/CS": "/ColorSpace",
"/D": "/Decode",
"/DP": "/DecodeParms",
"/F": "/Filter",
"/H": "/Height",
"/W": "/Width",
"/I": "/Interpolate",
"/Intent": "/Intent",
"/IM": "/ImageMask",
}[k]
if k in {"/Length", "/L"}: # no length is expected
continue
if isinstance(v, list):
v = ArrayObject(
[self._translate_value_inlineimage(k, x) for x in v]
)
] = v
else:
v = self._translate_value_inlineimage(k, v)
k = NameObject(
{
"/BPC": "/BitsPerComponent",
"/CS": "/ColorSpace",
"/D": "/Decode",
"/DP": "/DecodeParms",
"/F": "/Filter",
"/H": "/Height",
"/W": "/Width",
"/I": "/Interpolate",
"/Intent": "/Intent",
"/IM": "/ImageMask",
"/BitsPerComponent": "/BitsPerComponent",
"/ColorSpace": "/ColorSpace",
"/Decode": "/Decode",
"/DecodeParms": "/DecodeParms",
"/Filter": "/Filter",
"/Height": "/Height",
"/Width": "/Width",
"/Interpolate": "/Interpolate",
"/ImageMask": "/ImageMask",
}[k]
)
if k not in init:
init[k] = v
ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)
extension, byte_stream, img = _xobj_to_image(ii["object"])
files[f"~{num}~"] = ImageFile(
Expand Down Expand Up @@ -934,6 +951,8 @@ def replace_contents(
# as a backup solution, we put content as an object although not in accordance with pdf ref
# this will be fixed with the _add_object
self[NameObject(PG.CONTENTS)] = content
# forces recalculation of inline_images
self.inline_images = None

def merge_page(
self, page2: "PageObject", expand: bool = False, over: bool = True
Expand Down
3 changes: 2 additions & 1 deletion pypdf/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,8 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:


WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00")
WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]"
WHITESPACES_AS_BYTES = b"".join(WHITESPACES)
WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"


def paeth_predictor(left: int, up: int, up_left: int) -> int:
Expand Down
38 changes: 19 additions & 19 deletions pypdf/_xobj_image_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@


try:
from PIL import Image
from PIL import Image, UnidentifiedImageError # noqa: F401
except ImportError:
raise ImportError(
"pillow is required to do image extraction. "
Expand Down Expand Up @@ -123,6 +123,24 @@ def _get_imagemode(
return mode, mode == "CMYK"


def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
    """
    Unpack packed image samples into one byte per sample.

    Samples are read most-significant bits first, and every row starts
    on a fresh byte: unused bits left at the end of a row are skipped.

    Args:
        data: packed sample data.
        size: ``(width, height)`` of the image in samples.
        bits: number of bits per sample.

    Returns:
        ``width * height`` bytes, one sample value per byte, row by row.
    """
    width = size[0]
    height = size[1]
    mask = (1 << bits) - 1
    top_shift = 8 - bits  # shift of the first sample inside a byte
    unpacked = bytearray(width * height)
    by = 0
    bit = top_shift
    for y in range(height):
        # Realign to the next byte whenever the previous row stopped in
        # the middle of a byte.  This includes the case where exactly one
        # sample slot is left (bit == 0): those bits are row padding and
        # must not be read as the first sample of this row.
        if bit != top_shift:
            by += 1
            bit = top_shift
        row_start = y * width
        for x in range(width):
            unpacked[row_start + x] = (data[by] >> bit) & mask
            bit -= bits
            if bit < 0:
                by += 1
                bit = top_shift
    return bytes(unpacked)


def _extended_image_frombytes(
mode: str, size: Tuple[int, int], data: bytes
) -> Image.Image:
Expand Down Expand Up @@ -150,24 +168,6 @@ def _handle_flate(
Process image encoded in flateEncode
Returns img, image_format, extension, color inversion
"""

def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
mask = (2 << bits) - 1
nbuff = bytearray(size[0] * size[1])
by = 0
bit = 8 - bits
for y in range(size[1]):
if (bit != 0) and (bit != 8 - bits):
by += 1
bit = 8 - bits
for x in range(size[0]):
nbuff[y * size[0] + x] = (data[by] >> bit) & mask
bit -= bits
if bit < 0:
by += 1
bit = 8 - bits
return bytes(nbuff)

extension = ".png" # mime_type = "image/png"
image_format = "PNG"
lookup: Any
Expand Down
45 changes: 21 additions & 24 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@
import math
import struct
import zlib
from base64 import a85decode
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ._utils import (
WHITESPACES_AS_BYTES,
b_,
deprecate_with_replacement,
deprecation_no_replacement,
Expand Down Expand Up @@ -467,7 +469,7 @@ def decode(
Decode an LZW encoded data stream.
Args:
data: bytes`` or ``str`` text to decode.
data: ``bytes`` or ``str`` text to decode.
decode_parms: a dictionary of parameter values.
Returns:
Expand All @@ -487,29 +489,20 @@ def decode(
decode_parms: Optional[DictionaryObject] = None,
**kwargs: Any,
) -> bytes:
# decode_parms is unused here
"""
Decode an Ascii85 encoded data stream.
Args:
data: ``bytes`` or ``str`` text to decode.
decode_parms: a dictionary of parameter values.
Returns:
decoded data.
"""
if isinstance(data, str):
data = data.encode("ascii")
group_index = b = 0
out = bytearray()
for char in data:
if ord("!") <= char <= ord("u"):
group_index += 1
b = b * 85 + (char - 33)
if group_index == 5:
out += struct.pack(b">L", b)
group_index = b = 0
elif char == ord("z"):
assert group_index == 0
out += b"\0\0\0\0"
elif char == ord("~"):
if group_index:
for _ in range(5 - group_index):
b = b * 85 + 84
out += struct.pack(b">L", b)[: group_index - 1]
break
return bytes(out)
data = data.encode()
data = data.strip(WHITESPACES_AS_BYTES)
return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)


class DCTDecode:
Expand Down Expand Up @@ -742,6 +735,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
"""
from ._xobj_image_helpers import (
Image,
UnidentifiedImageError,
_extended_image_frombytes,
_get_imagemode,
_handle_flate,
Expand Down Expand Up @@ -808,13 +802,16 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
# I'm not sure if the following logic is correct.
# There might not be any relationship between the filters and the
# extension
if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]:
if lfilters in (FT.LZW_DECODE, FT.CCITT_FAX_DECODE):
extension = ".tiff" # mime_type = "image/tiff"
image_format = "TIFF"
else:
extension = ".png" # mime_type = "image/png"
image_format = "PNG"
img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
try:
img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
except UnidentifiedImageError:
img = _extended_image_frombytes(mode, size, data)
elif lfilters == FT.DCT_DECODE:
img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
# invert_color kept unchanged
Expand Down
Loading

0 comments on commit 23a81ba

Please sign in to comment.