Skip to content

Commit

Permalink
ENH: Add decode_as_image() to ContentStreams (#2615)
Browse files Browse the repository at this point in the history
Closes #2613
  • Loading branch information
pubpub-zz authored Jun 9, 2024
1 parent 4b086ef commit 26d1615
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 0 deletions.
22 changes: 22 additions & 0 deletions docs/user/extract-images.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,25 @@ for image_file_object in page.images:
fp.write(image_file_object.data)
count += 1
```

# Other images

Some other objects can contain images, such as stamp annotations.

For example, this document contains such stamps:
[test_stamp.pdf](https://github.com/user-attachments/files/15751424/test_stamp.pdf)

You can extract the image from the annotation with the following code:

```python
from pypdf import PdfReader

reader = PdfReader("test_stamp.pdf")
# Walk from the first annotation of the first page down to the image
# XObject stored in its normal appearance stream, then decode it.
im = (
reader.pages[0]["/Annots"][0]
# Resolve the annotation, then follow /AP -> /N -> /Resources -> /XObject.
.get_object()["/AP"]["/N"]["/Resources"]["/XObject"]["/Im4"]
# Returns a PIL image when decoding succeeds.
.decode_as_image()
)

# Display the decoded image.
im.show()
```
25 changes: 25 additions & 0 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,31 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
retval._data = FlateDecode.encode(b_(self._data), level)
return retval

def decode_as_image(self) -> Any:
    """
    Try to decode the stream object as an image.

    If the stream's ``/Subtype`` is not ``/Image``, a warning is logged
    and decoding is attempted anyway.

    Returns:
        A PIL image if proper decoding has been found, ``None`` if no
        image extension could be determined.

    Raises:
        Exception: Any error raised during decoding (for example due to
            an invalid object) is propagated to the caller.
            It is recommended to catch exceptions to prevent
            stops in your program.

    """
    # Imported at call time rather than at module level.
    # NOTE(review): presumably avoids a circular import with ..filters - confirm.
    from ..filters import _xobj_to_image

    if self.get("/Subtype", "") != "/Image":
        # Prefer the indirect reference for the message; fall back to the
        # object's repr when the attribute is not set on this instance.
        try:
            msg = f"{self.indirect_reference} does not seem to be an Image"  # pragma: no cover
        except AttributeError:
            msg = f"{self!r} object does not seem to be an Image"  # pragma: no cover
        logger_warning(msg, __name__)
    # Only the extension (as success marker) and the decoded image are needed;
    # the raw byte stream returned alongside them is discarded.
    extension, _, img = _xobj_to_image(self)
    if extension is None:
        return None  # pragma: no cover
    return img


# Subclass of StreamObject that adds no members of its own in this view.
class DecodedStreamObject(StreamObject):
    pass
Expand Down
21 changes: 21 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,3 +441,24 @@ def test_inline_image_extraction():
name = "iss2598d.png"
img = Image.open(BytesIO(get_data_from_url(url, name=name)))
assert image_similarity(reader.pages[0].images[0].image, img) == 1


@pytest.mark.enable_socket()
def test_extract_image_from_object(caplog):
    """
    Check decode_as_image() on an image XObject and on a non-image stream.

    The non-image case is exercised twice on the same content stream: once
    as returned by get_contents(), and once after setting
    ``indirect_reference`` so that the warning message is built from it.
    """
    url = "https://github.com/py-pdf/pypdf/files/15176076/B2.pdf"
    name = "iss2613.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    image = reader.pages[0]["/Resources"]["/Pattern"]["/P1"]["/Resources"]["/XObject"][
        "/X1"
    ].decode_as_image()
    assert isinstance(image, Image.Image)

    # Fetch the content stream once, outside pytest.raises, so that only
    # decode_as_image() is expected to raise.
    co = reader.pages[0].get_contents()
    with pytest.raises(Exception):
        co.decode_as_image()
    assert "does not seem to be an Image" in caplog.text
    caplog.clear()

    # Reuse the same object: re-fetching it would discard the attribute
    # set here and the indirect_reference branch would not be tested.
    co.indirect_reference = "for_test"
    with pytest.raises(Exception):
        co.decode_as_image()
    assert "for_test does not seem to be an Image" in caplog.text

0 comments on commit 26d1615

Please sign in to comment.