File tree Expand file tree Collapse file tree 4 files changed +9
-5
lines changed Expand file tree Collapse file tree 4 files changed +9
-5
lines changed Original file line number Diff line number Diff line change @@ -25,7 +25,7 @@ RUN apt-get -qq -y update \
25
25
# image processing, djvu
26
26
imagemagick-common imagemagick mdbtools djvulibre-bin \
27
27
libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
28
- libtiff-tools ghostscript librsvg2-bin jbig2dec \
28
+ libtiff-tools ghostscript librsvg2-bin jbig2dec libopenjp2-7-dev \
29
29
pst-utils \
30
30
# ## tesseract
31
31
tesseract-ocr-eng \
@@ -126,7 +126,7 @@ RUN mkdir /models/ && \
126
126
COPY requirements.txt /tmp/
127
127
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
128
128
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
129
- RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
129
+ RUN pip3 install --no-cache-dir --no-binary "tesserocr" --no-binary "Pillow" -r /tmp/requirements.txt
130
130
131
131
# Install spaCy models
132
132
RUN python3 -m spacy download en_core_web_sm \
Original file line number Diff line number Diff line change 1
1
import logging
2
2
from io import BytesIO
3
- from PIL import Image , ExifTags
3
+ from PIL import Image , ExifTags , ImageFile
4
4
from followthemoney import model
5
5
6
6
from ingestors .ingestor import Ingestor
10
10
11
11
log = logging .getLogger (__name__ )
12
12
13
+ # from https://stackoverflow.com/a/47958486
14
+ ImageFile .LOAD_TRUNCATED_IMAGES = True
15
+
13
16
14
17
class ImageIngestor (Ingestor , OCRSupport , TimestampSupport ):
15
18
"""Image file ingestor class. Extracts the text from images using OCR."""
Original file line number Diff line number Diff line change 3
3
import threading
4
4
from hashlib import sha1
5
5
from normality import stringify
6
- from PIL import Image
6
+ from PIL import Image , ImageFile
7
7
from io import BytesIO
8
8
from languagecodes import list_to_alpha3 as alpha3
9
9
13
13
14
14
log = logging .getLogger (__name__ )
15
15
TESSERACT_LOCALE = "C"
16
+ ImageFile .LOAD_TRUNCATED_IMAGES = True
16
17
17
18
18
19
class OCRSupport (CacheSupport ):
Original file line number Diff line number Diff line change @@ -30,7 +30,7 @@ odfpy==1.4.1
30
30
cchardet == 2.1.7
31
31
lxml == 5.0.0
32
32
olefile == 0.47
33
- Pillow == 10.3.0
33
+ Pillow == 10.4.0
34
34
vobject == 0.9.6.1
35
35
msglite == 0.30.0
36
36
icalendar == 5.0.12
You can’t perform that action at this time.
0 commit comments