Skip to content

Commit 57e78de

Browse files
committed
Bump Python image to 3.9
1 parent c67c3fe commit 57e78de

File tree

5 files changed

+32
-29
lines changed

5 files changed

+32
-29
lines changed

Dockerfile

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,29 @@
1-
FROM ubuntu:20.04
1+
FROM python:3.9-bookworm
22
ENV DEBIAN_FRONTEND noninteractive
33

44
LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
55
LABEL org.opencontainers.image.licenses MIT
66
LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file
77

88
# Enable non-free archive for `unrar`.
9-
# RUN echo "deb http://http.us.debian.org/debian stretch non-free" >/etc/apt/sources.list.d/nonfree.list
10-
RUN apt-get -qq -y update \
11-
&& apt-get -qq -y install build-essential locales ca-certificates \
9+
RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
10+
&& apt-get -qq -y update \
11+
&& apt-get -qq -y install build-essential locales \
1212
# python deps (mostly to install their dependencies)
13-
python3-pip python3-dev python3-pil \
13+
python3-dev \
1414
# tesseract
15-
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\
15+
tesseract-ocr libtesseract-dev libleptonica-dev \
1616
# libraries
17-
libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
18-
zlib1g-dev libicu-dev libxml2-dev \
17+
libldap2-dev libsasl2-dev \
1918
# package tools
2019
unrar p7zip-full \
2120
# audio & video metadata
2221
libmediainfo-dev \
2322
# image processing, djvu
24-
imagemagick-common imagemagick mdbtools djvulibre-bin \
25-
libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
23+
mdbtools djvulibre-bin \
24+
libtiff5-dev \
2625
libtiff-tools ghostscript librsvg2-bin jbig2dec \
27-
pst-utils \
26+
pst-utils libgif-dev \
2827
### tesseract
2928
tesseract-ocr-eng \
3029
tesseract-ocr-swa \
@@ -98,7 +97,7 @@ RUN apt-get -qq -y update \
9897
tesseract-ocr-uzb \
9998
### pdf convert: libreoffice + a bunch of fonts
10099
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
101-
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
100+
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-extra \
102101
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
103102
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
104103
fonts-tlwg-purisa \
@@ -121,11 +120,7 @@ RUN groupadd -g 1000 -r app \
121120
RUN mkdir /models/ && \
122121
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
123122

124-
# Having updated pip/setuptools seems to break the test run for some reason (12/01/2022)
125-
# RUN pip3 install --no-cache-dir -U pip setuptools
126123
COPY requirements.txt /tmp/
127-
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
128-
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
129124
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
130125

131126
# Install spaCy models
@@ -148,14 +143,15 @@ RUN python3 -m spacy download el_core_news_sm \
148143

149144
COPY . /ingestors
150145
WORKDIR /ingestors
151-
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
146+
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
152147
RUN chown -R app:app /ingestors
153148

154149
ENV ARCHIVE_TYPE=file \
155150
ARCHIVE_PATH=/data \
156151
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
157152
REDIS_URL=redis://redis:6379/0 \
158-
TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
153+
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \
154+
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
159155

160156
# USER app
161157
CMD ingestors process

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
INGEST=ghcr.io/alephdata/ingest-file
2-
COMPOSE=docker-compose
2+
COMPOSE=docker compose
33
DOCKER=$(COMPOSE) run --rm ingest-file
44

55
.PHONY: build
@@ -36,7 +36,7 @@ format-check:
3636
black --check .
3737

3838
test: services
39-
$(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term
39+
PYTHONDEVMODE=1 PYTHONTRACEMALLOC=1 $(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term
4040

4141
restart: build
4242
$(COMPOSE) up --force-recreate --no-deps --detach ingest-file

docker-compose.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
version: "3.2"
2-
31
services:
42
postgres:
53
image: postgres:10.0
@@ -12,6 +10,9 @@ services:
1210
image: redis:alpine
1311
command: ["redis-server", "--save", "3600", "10"]
1412

13+
rabbitmq:
14+
image: rabbitmq:3.9-management-alpine
15+
1516
ingest-file:
1617
build:
1718
context: .
@@ -22,7 +23,7 @@ services:
2223
- /data:mode=777
2324
environment:
2425
FTM_STORE_URI: postgresql://ingest:ingest@postgres/ingest
25-
LOG_FORMAT: TEXT # TEXT or JSON
26+
LOG_FORMAT: TEXT # TEXT or JSON
2627
volumes:
2728
- "./ingestors:/ingestors/ingestors"
2829
- "./tests:/ingestors/tests"
@@ -33,3 +34,4 @@ services:
3334
depends_on:
3435
- postgres
3536
- redis
37+
- rabbitmq

requirements.txt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,17 @@ normality==2.5.0
33
pantomime==0.6.1
44
followthemoney==3.5.9
55
followthemoney-store[postgresql]==3.1.0
6-
servicelayer[google,amazon]==1.22.2
6+
servicelayer[google,amazon]==1.23.0
77
languagecodes==1.1.1
88
countrytagger==0.1.2
99
pyicu==2.12
1010
google-cloud-vision==3.7.2
11-
tesserocr==2.6.2
12-
spacy==3.6.1
11+
tesserocr==2.7.1
12+
spacy==3.6.1 # pinned because spacy 3.8 requires numpy >2 which breaks fasttext (see https://groups.google.com/g/fasttext-library/c/4EOM0-S6xHU)
13+
numpy<2.0.0 # pinned because otherwise spacy requires an incompatible numpy
1314
fingerprints==1.1.1
1415
fasttext==0.9.2
16+
pika==1.3.2
1517

1618
# Development
1719
pytest==8.2.0
@@ -38,4 +40,5 @@ cryptography==41.0.7
3840
requests[security]==2.31.0
3941
pymupdf==1.21.1
4042

43+
prometheus-client==0.17.1
4144
sentry_sdk==2.0.1

tests/test_audio.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ def test_audio(self):
1010
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
1111
self.assertEqual(entity.first("title"), "Core Media Audio")
1212
self.assertEqual(entity.first("generator"), "com.apple.VoiceMemos (iOS 11.4)")
13-
self.assertEqual(
14-
entity.first("authoredAt"),
13+
# with the change to Python 3.9 and the debian base image we are seeing a change
14+
# in the amount of data we are parsing here
15+
assert entity.get("authoredAt") == [
16+
datetime.datetime(2018, 6, 20, 12, 9, 28).isoformat(),
1517
datetime.datetime(2018, 6, 20, 12, 9, 42).isoformat(),
16-
)
18+
]
1719
self.assertEqual(entity.first("duration"), "2808")
1820
self.assertEqual(entity.first("samplingRate"), "44100")
1921
self.assertEqual(entity.schema.name, "Audio")

0 commit comments

Comments
 (0)