ROB: Improve inline image extraction (#2622)

Closes #2598.
py-pdf · May 27, 2024 · 23a81ba · 23a81ba
1 parent c8d722c
commit 23a81ba
Show file tree

Hide file tree

Showing 10 changed files with 591 additions and 177 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -28,7 +28,6 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 import math
-import re
 import sys
 from decimal import Decimal
 from pathlib import Path
@@ -58,7 +57,6 @@
     mult,
 )
 from ._utils import (
-    WHITESPACES_AS_REGEXP,
     CompressedTransformationMatrix,
     File,
     ImageFile,
@@ -82,6 +80,7 @@
     NameObject,
     NullObject,
     NumberObject,
+    PdfObject,
     RectangleObject,
     StreamObject,
 )
@@ -335,7 +334,6 @@ def __init__(
         self.pdf = pdf
         self.inline_images: Optional[Dict[str, ImageFile]] = None
         # below Union for mypy but actually Optional[List[str]]
-        self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None
         self.indirect_reference = indirect_reference
 
     def hash_value_data(self) -> bytes:
@@ -439,19 +437,8 @@ def _get_ids_image(
             return []
         else:
             call_stack.append(_i)
-        if self.inline_images_keys is None:
-            content = self._get_contents_as_bytes() or b""
-            nb_inlines = 0
-            for matching in re.finditer(
-                WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
-                content,
-            ):
-                start_of_string = content[: matching.start()]
-                if len(re.findall(b"[^\\\\]\\(", start_of_string)) == len(
-                    re.findall(b"[^\\\\]\\)", start_of_string)
-                ):
-                    nb_inlines += 1
-            self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
+        if self.inline_images is None:
+            self.inline_images = self._get_inline_images()
         if obj is None:
             obj = self
         if ancest is None:
@@ -460,7 +447,7 @@ def _get_ids_image(
         if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
             DictionaryObject, obj[PG.RESOURCES]
         ):
-            return self.inline_images_keys
+            return [] if self.inline_images is None else list(self.inline_images.keys())
 
         x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
         for o in x_object:
@@ -470,7 +457,9 @@ def _get_ids_image(
                 lst.append(o if len(ancest) == 0 else ancest + [o])
             else:  # is a form with possible images inside
                 lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack))
-        return lst + self.inline_images_keys
+        assert self.inline_images is not None
+        lst.extend(list(self.inline_images.keys()))
+        return lst
 
     def _get_image(
         self,
@@ -551,6 +540,46 @@ def images(self) -> List[ImageFile]:
         """
         return _VirtualListImages(self._get_ids_image, self._get_image)  # type: ignore
 
+    def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject:
+        """Translate values used in inline image"""
+        try:
+            v = NameObject(
+                {
+                    "/G": "/DeviceGray",
+                    "/RGB": "/DeviceRGB",
+                    "/CMYK": "/DeviceCMYK",
+                    "/I": "/Indexed",
+                    "/AHx": "/ASCIIHexDecode",
+                    "/A85": "/ASCII85Decode",
+                    "/LZW": "/LZWDecode",
+                    "/Fl": "/FlateDecode",
+                    "/RL": "/RunLengthDecode",
+                    "/CCF": "/CCITTFaxDecode",
+                    "/DCT": "/DCTDecode",
+                    "/DeviceGray": "/DeviceGray",
+                    "/DeviceRGB": "/DeviceRGB",
+                    "/DeviceCMYK": "/DeviceCMYK",
+                    "/Indexed": "/Indexed",
+                    "/ASCIIHexDecode": "/ASCIIHexDecode",
+                    "/ASCII85Decode": "/ASCII85Decode",
+                    "/LZWDecode": "/LZWDecode",
+                    "/FlateDecode": "/FlateDecode",
+                    "/RunLengthDecode": "/RunLengthDecode",
+                    "/CCITTFaxDecode": "/CCITTFaxDecode",
+                    "/DCTDecode": "/DCTDecode",
+                }[cast(str, v)]
+            )
+        except (TypeError, KeyError):
+            if isinstance(v, NameObject):
+                # It is a custom name, thus we have to look in resources.
+                # The only applicable case is for ColorSpace.
+                try:
+                    res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
+                    v = cast(DictionaryObject, res)[v]
+                except KeyError:  # for res and v
+                    raise PdfReadError(f"Cannot find resource entry {v} for {k}")
+        return v
+
     def _get_inline_images(self) -> Dict[str, ImageFile]:
         """
         get inline_images
@@ -593,51 +622,39 @@ def _get_inline_images(self) -> Dict[str, ImageFile]:
                 "/Length": len(ii["__streamdata__"]),
             }
             for k, v in ii["settings"].items():
-                try:
-                    v = NameObject(
-                        {
-                            "/G": "/DeviceGray",
-                            "/RGB": "/DeviceRGB",
-                            "/CMYK": "/DeviceCMYK",
-                            "/I": "/Indexed",
-                            "/AHx": "/ASCIIHexDecode",
-                            "/A85": "/ASCII85Decode",
-                            "/LZW": "/LZWDecode",
-                            "/Fl": "/FlateDecode",
-                            "/RL": "/RunLengthDecode",
-                            "/CCF": "/CCITTFaxDecode",
-                            "/DCT": "/DCTDecode",
-                        }[v]
-                    )
-                except (TypeError, KeyError):
-                    if isinstance(v, NameObject):
-                        #  it is a custom name : we have to look in resources :
-                        # the only applicable case is for ColorSpace
-                        try:
-                            res = cast(DictionaryObject, self["/Resources"])[
-                                "/ColorSpace"
-                            ]
-                            v = cast(DictionaryObject, res)[v]
-                        except KeyError:  # for res and v
-                            raise PdfReadError(
-                                f"Can not find resource entry {v} for {k}"
-                            )
-                init[
-                    NameObject(
-                        {
-                            "/BPC": "/BitsPerComponent",
-                            "/CS": "/ColorSpace",
-                            "/D": "/Decode",
-                            "/DP": "/DecodeParms",
-                            "/F": "/Filter",
-                            "/H": "/Height",
-                            "/W": "/Width",
-                            "/I": "/Interpolate",
-                            "/Intent": "/Intent",
-                            "/IM": "/ImageMask",
-                        }[k]
+                if k in {"/Length", "/L"}:  # no length is expected
+                    continue
+                if isinstance(v, list):
+                    v = ArrayObject(
+                        [self._translate_value_inlineimage(k, x) for x in v]
                     )
-                ] = v
+                else:
+                    v = self._translate_value_inlineimage(k, v)
+                k = NameObject(
+                    {
+                        "/BPC": "/BitsPerComponent",
+                        "/CS": "/ColorSpace",
+                        "/D": "/Decode",
+                        "/DP": "/DecodeParms",
+                        "/F": "/Filter",
+                        "/H": "/Height",
+                        "/W": "/Width",
+                        "/I": "/Interpolate",
+                        "/Intent": "/Intent",
+                        "/IM": "/ImageMask",
+                        "/BitsPerComponent": "/BitsPerComponent",
+                        "/ColorSpace": "/ColorSpace",
+                        "/Decode": "/Decode",
+                        "/DecodeParms": "/DecodeParms",
+                        "/Filter": "/Filter",
+                        "/Height": "/Height",
+                        "/Width": "/Width",
+                        "/Interpolate": "/Interpolate",
+                        "/ImageMask": "/ImageMask",
+                    }[k]
+                )
+                if k not in init:
+                    init[k] = v
             ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)
             extension, byte_stream, img = _xobj_to_image(ii["object"])
             files[f"~{num}~"] = ImageFile(
@@ -934,6 +951,8 @@ def replace_contents(
                 # as a backup solution, we put content as an object although not in accordance with pdf ref
                 # this will be fixed with the _add_object
                 self[NameObject(PG.CONTENTS)] = content
+        # forces recalculation of inline_images
+        self.inline_images = None
 
     def merge_page(
         self, page2: "PageObject", expand: bool = False, over: bool = True

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -386,7 +386,8 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
 
 
 WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00")
-WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]"
+WHITESPACES_AS_BYTES = b"".join(WHITESPACES)
+WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"
 
 
 def paeth_predictor(left: int, up: int, up_left: int) -> int:

diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
@@ -29,7 +29,7 @@
 
 
 try:
-    from PIL import Image
+    from PIL import Image, UnidentifiedImageError  # noqa: F401
 except ImportError:
     raise ImportError(
         "pillow is required to do image extraction. "
@@ -123,6 +123,24 @@ def _get_imagemode(
     return mode, mode == "CMYK"
 
 
+def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
+    mask = (1 << bits) - 1
+    nbuff = bytearray(size[0] * size[1])
+    by = 0
+    bit = 8 - bits
+    for y in range(size[1]):
+        if (bit != 0) and (bit != 8 - bits):
+            by += 1
+            bit = 8 - bits
+        for x in range(size[0]):
+            nbuff[y * size[0] + x] = (data[by] >> bit) & mask
+            bit -= bits
+            if bit < 0:
+                by += 1
+                bit = 8 - bits
+    return bytes(nbuff)
+
+
 def _extended_image_frombytes(
     mode: str, size: Tuple[int, int], data: bytes
 ) -> Image.Image:
@@ -150,24 +168,6 @@ def _handle_flate(
     Process image encoded in flateEncode
     Returns img, image_format, extension, color inversion
     """
-
-    def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
-        mask = (2 << bits) - 1
-        nbuff = bytearray(size[0] * size[1])
-        by = 0
-        bit = 8 - bits
-        for y in range(size[1]):
-            if (bit != 0) and (bit != 8 - bits):
-                by += 1
-                bit = 8 - bits
-            for x in range(size[0]):
-                nbuff[y * size[0] + x] = (data[by] >> bit) & mask
-                bit -= bits
-                if bit < 0:
-                    by += 1
-                    bit = 8 - bits
-        return bytes(nbuff)
-
     extension = ".png"  # mime_type = "image/png"
     image_format = "PNG"
     lookup: Any

diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -37,10 +37,12 @@
 import math
 import struct
 import zlib
+from base64 import a85decode
 from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from ._utils import (
+    WHITESPACES_AS_BYTES,
     b_,
     deprecate_with_replacement,
     deprecation_no_replacement,
@@ -467,7 +469,7 @@ def decode(
         Decode an LZW encoded data stream.
 
         Args:
-          data: bytes`` or ``str`` text to decode.
+          data: ``bytes`` or ``str`` text to decode.
           decode_parms: a dictionary of parameter values.
 
         Returns:
@@ -487,29 +489,20 @@ def decode(
         decode_parms: Optional[DictionaryObject] = None,
         **kwargs: Any,
     ) -> bytes:
-        # decode_parms is unused here
+        """
+        Decode an Ascii85 encoded data stream.
+
+        Args:
+          data: ``bytes`` or ``str`` text to decode.
+          decode_parms: a dictionary of parameter values.
 
+        Returns:
+          decoded data.
+        """
         if isinstance(data, str):
-            data = data.encode("ascii")
-        group_index = b = 0
-        out = bytearray()
-        for char in data:
-            if ord("!") <= char <= ord("u"):
-                group_index += 1
-                b = b * 85 + (char - 33)
-                if group_index == 5:
-                    out += struct.pack(b">L", b)
-                    group_index = b = 0
-            elif char == ord("z"):
-                assert group_index == 0
-                out += b"\0\0\0\0"
-            elif char == ord("~"):
-                if group_index:
-                    for _ in range(5 - group_index):
-                        b = b * 85 + 84
-                    out += struct.pack(b">L", b)[: group_index - 1]
-                break
-        return bytes(out)
+            data = data.encode()
+        data = data.strip(WHITESPACES_AS_BYTES)
+        return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
 
 
 class DCTDecode:
@@ -742,6 +735,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
     """
     from ._xobj_image_helpers import (
         Image,
+        UnidentifiedImageError,
         _extended_image_frombytes,
         _get_imagemode,
         _handle_flate,
@@ -808,13 +802,16 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
         # I'm not sure if the following logic is correct.
         # There might not be any relationship between the filters and the
         # extension
-        if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]:
+        if lfilters in (FT.LZW_DECODE, FT.CCITT_FAX_DECODE):
             extension = ".tiff"  # mime_type = "image/tiff"
             image_format = "TIFF"
         else:
             extension = ".png"  # mime_type = "image/png"
             image_format = "PNG"
-        img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
+        try:
+            img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
+        except UnidentifiedImageError:
+            img = _extended_image_frombytes(mode, size, data)
     elif lfilters == FT.DCT_DECODE:
         img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
         # invert_color kept unchanged