1
- FROM ubuntu:20.04
1
+ FROM python:3.9-bookworm
2
2
# Keep apt/debconf from prompting during the package installs below.
ENV DEBIAN_FRONTEND=noninteractive
3
3
4
4
LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors"
5
5
LABEL org.opencontainers.image.licenses="MIT"
6
6
LABEL org.opencontainers.image.source="https://github.com/alephdata/ingest-file"
7
7
8
8
# Enable non-free archive for `unrar`.
9
- # RUN echo "deb http://http.us.debian.org/debian stretch non-free" >/etc/apt/sources.list.d/nonfree.list
10
- RUN apt-get -qq -y update \
11
- && apt-get -qq -y install build-essential locales ca-certificates \
9
+ RUN echo "deb http://http.us.debian.org/debian bookworm non-free" >/etc/apt/sources.list.d/nonfree.list \
10
+ && apt-get -qq -y update \
11
+ && apt-get -qq -y install build-essential locales \
12
12
# python deps (mostly to install their dependencies)
13
- python3-pip python3-dev python3-pil \
13
+ python3-dev \
14
14
# tesseract
15
- tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
15
+ tesseract-ocr libtesseract-dev libleptonica-dev \
16
16
# libraries
17
- libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
18
- zlib1g-dev libicu-dev libxml2-dev \
17
+ libldap2-dev libsasl2-dev \
19
18
# package tools
20
19
unrar p7zip-full \
21
20
# audio & video metadata
22
21
libmediainfo-dev \
23
22
# image processing, djvu
24
- imagemagick-common imagemagick mdbtools djvulibre-bin \
25
- libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
23
+ mdbtools djvulibre-bin \
24
+ libtiff5-dev \
26
25
libtiff-tools ghostscript librsvg2-bin jbig2dec \
27
- pst-utils \
26
+ pst-utils libgif-dev \
28
27
# ## tesseract
29
28
tesseract-ocr-eng \
30
29
tesseract-ocr-swa \
@@ -98,7 +97,7 @@ RUN apt-get -qq -y update \
98
97
tesseract-ocr-uzb \
99
98
# ## pdf convert: libreoffice + a bunch of fonts
100
99
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
101
- hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
100
+ hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-extra \
102
101
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
103
102
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
104
103
fonts-tlwg-purisa \
@@ -121,11 +120,7 @@ RUN groupadd -g 1000 -r app \
121
120
RUN mkdir /models/ && \
122
121
# --fail: abort the build on an HTTP error instead of saving the error page as the model file.
curl --fail -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
123
122
124
- # Having updated pip/setuptools seems to break the test run for some reason (12/01/2022)
125
- # RUN pip3 install --no-cache-dir -U pip setuptools
126
123
COPY requirements.txt /tmp/
127
- RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
128
- RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
129
124
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
130
125
131
126
# Install spaCy models
@@ -148,14 +143,15 @@ RUN python3 -m spacy download el_core_news_sm \
148
143
149
144
COPY . /ingestors
150
145
WORKDIR /ingestors
151
- RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
146
+ RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
152
147
RUN chown -R app:app /ingestors
153
148
154
149
ENV ARCHIVE_TYPE=file \
155
150
ARCHIVE_PATH=/data \
156
151
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
157
152
REDIS_URL=redis://redis:6379/0 \
158
- TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
153
+ TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \
154
+ LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
159
155
160
156
# USER app
161
157
# Exec form: run `ingestors process` directly rather than under `/bin/sh -c`,
# so signals sent to the container reach the process itself.
CMD ["ingestors", "process"]
0 commit comments