Skip to content

Commit 05ffe34

Browse files
committed
bugfix: allow loading truncated images, enable Pillow 10.4.0
1 parent d343739 commit 05ffe34

File tree

4 files changed: 9 additions (+9) and 5 deletions (-5).

Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ RUN apt-get -qq -y update \
2525
# image processing, djvu
2626
imagemagick-common imagemagick mdbtools djvulibre-bin \
2727
libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
28-
libtiff-tools ghostscript librsvg2-bin jbig2dec \
28+
libtiff-tools ghostscript librsvg2-bin jbig2dec libopenjp2-7-dev \
2929
pst-utils \
3030
### tesseract
3131
tesseract-ocr-eng \
@@ -126,7 +126,7 @@ RUN mkdir /models/ && \
126126
COPY requirements.txt /tmp/
127127
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
128128
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
129-
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
129+
RUN pip3 install --no-cache-dir --no-binary "tesserocr" --no-binary "Pillow" -r /tmp/requirements.txt
130130

131131
# Install spaCy models
132132
RUN python3 -m spacy download en_core_web_sm \

ingestors/media/image.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import logging
22
from io import BytesIO
3-
from PIL import Image, ExifTags
3+
from PIL import Image, ExifTags, ImageFile
44
from followthemoney import model
55

66
from ingestors.ingestor import Ingestor
@@ -10,6 +10,9 @@
1010

1111
log = logging.getLogger(__name__)
1212

13+
# from https://stackoverflow.com/a/47958486
14+
ImageFile.LOAD_TRUNCATED_IMAGES = True
15+
1316

1417
class ImageIngestor(Ingestor, OCRSupport, TimestampSupport):
1518
"""Image file ingestor class. Extracts the text from images using OCR."""

ingestors/support/ocr.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import threading
44
from hashlib import sha1
55
from normality import stringify
6-
from PIL import Image
6+
from PIL import Image, ImageFile
77
from io import BytesIO
88
from languagecodes import list_to_alpha3 as alpha3
99

@@ -13,6 +13,7 @@
1313

1414
log = logging.getLogger(__name__)
1515
TESSERACT_LOCALE = "C"
16+
ImageFile.LOAD_TRUNCATED_IMAGES = True
1617

1718

1819
class OCRSupport(CacheSupport):

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ odfpy==1.4.1
3030
cchardet==2.1.7
3131
lxml==5.0.0
3232
olefile==0.47
33-
Pillow==10.3.0
33+
Pillow==10.4.0
3434
vobject==0.9.6.1
3535
msglite==0.30.0
3636
icalendar==5.0.12

0 commit comments

Comments
 (0)