Skip to content

Bump Python image to 3.9 #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
- name: Set up
run: |
docker --version
docker-compose --version
docker compose --version
echo "${GITHUB_REF}"
- name: Start services
run: |
Expand Down
32 changes: 14 additions & 18 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,30 +1,29 @@
FROM ubuntu:20.04
FROM python:3.9-bookworm
ENV DEBIAN_FRONTEND noninteractive

LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
LABEL org.opencontainers.image.licenses MIT
LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file

# Enable non-free archive for `unrar`.
# RUN echo "deb http://http.us.debian.org/debian stretch non-free" >/etc/apt/sources.list.d/nonfree.list
RUN apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales ca-certificates \
RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
&& apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales \
# python deps (mostly to install their dependencies)
python3-pip python3-dev python3-pil \
python3-dev \
# tesseract
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\
tesseract-ocr libtesseract-dev libleptonica-dev \
# libraries
libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
zlib1g-dev libicu-dev libxml2-dev \
libldap2-dev libsasl2-dev \
# package tools
unrar p7zip-full \
# audio & video metadata
libmediainfo-dev \
# image processing, djvu
imagemagick-common imagemagick mdbtools djvulibre-bin \
libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
mdbtools djvulibre-bin \
libtiff5-dev \
libtiff-tools ghostscript librsvg2-bin jbig2dec \
pst-utils \
pst-utils libgif-dev \
### tesseract
tesseract-ocr-eng \
tesseract-ocr-swa \
Expand Down Expand Up @@ -98,7 +97,7 @@ RUN apt-get -qq -y update \
tesseract-ocr-uzb \
### pdf convert: libreoffice + a bunch of fonts
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-extra \
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
fonts-tlwg-purisa \
Expand All @@ -121,11 +120,7 @@ RUN groupadd -g 1000 -r app \
RUN mkdir /models/ && \
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

# Having updated pip/setuptools seems to break the test run for some reason (12/01/2022)
# RUN pip3 install --no-cache-dir -U pip setuptools
COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt

# Install spaCy models
Expand All @@ -148,14 +143,15 @@ RUN python3 -m spacy download el_core_news_sm \

COPY . /ingestors
WORKDIR /ingestors
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN chown -R app:app /ingestors

ENV ARCHIVE_TYPE=file \
ARCHIVE_PATH=/data \
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
REDIS_URL=redis://redis:6379/0 \
TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1

# USER app
CMD ingestors process
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
INGEST=ghcr.io/alephdata/ingest-file
COMPOSE=docker-compose
COMPOSE=docker compose
DOCKER=$(COMPOSE) run --rm ingest-file

.PHONY: build
Expand Down Expand Up @@ -36,7 +36,7 @@ format-check:
black --check .

test: services
$(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term
PYTHONDEVMODE=1 PYTHONTRACEMALLOC=1 $(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term

restart: build
$(COMPOSE) up --force-recreate --no-deps --detach ingest-file
Expand Down
8 changes: 5 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
version: "3.2"

services:
postgres:
image: postgres:10.0
Expand All @@ -12,6 +10,9 @@ services:
image: redis:alpine
command: ["redis-server", "--save", "3600", "10"]

rabbitmq:
image: rabbitmq:3.9-management-alpine

ingest-file:
build:
context: .
Expand All @@ -22,7 +23,7 @@ services:
- /data:mode=777
environment:
FTM_STORE_URI: postgresql://ingest:ingest@postgres/ingest
LOG_FORMAT: TEXT # TEXT or JSON
LOG_FORMAT: TEXT # TEXT or JSON
volumes:
- "./ingestors:/ingestors/ingestors"
- "./tests:/ingestors/tests"
Expand All @@ -33,3 +34,4 @@ services:
depends_on:
- postgres
- redis
- rabbitmq
9 changes: 6 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@ normality==2.5.0
pantomime==0.6.1
followthemoney==3.5.9
followthemoney-store[postgresql]==3.1.0
servicelayer[google,amazon]==1.22.2
servicelayer[google,amazon]==1.23.0
languagecodes==1.1.1
countrytagger==0.1.2
pyicu==2.12
google-cloud-vision==3.7.2
tesserocr==2.6.2
spacy==3.6.1
tesserocr==2.7.1
spacy==3.6.1 # pinned because spacy 3.8 requires numpy >2 which breaks fasttext (see https://groups.google.com/g/fasttext-library/c/4EOM0-S6xHU)
numpy<2.0.0 # pinned because otherwise spacy requires an incompatible numpy
fingerprints==1.1.1
fasttext==0.9.2
pika==1.3.2

# Development
pytest==8.2.0
Expand All @@ -38,4 +40,5 @@ cryptography==41.0.7
requests[security]==2.31.0
pymupdf==1.21.1

prometheus-client==0.17.1
sentry_sdk==2.0.1
8 changes: 5 additions & 3 deletions tests/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@ def test_audio(self):
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
self.assertEqual(entity.first("title"), "Core Media Audio")
self.assertEqual(entity.first("generator"), "com.apple.VoiceMemos (iOS 11.4)")
self.assertEqual(
entity.first("authoredAt"),
# With the move to Python 3.9 and the Debian base image, more metadata is parsed
# from this file: "authoredAt" now yields two timestamps, so assert the full list.
assert entity.get("authoredAt") == [
datetime.datetime(2018, 6, 20, 12, 9, 28).isoformat(),
datetime.datetime(2018, 6, 20, 12, 9, 42).isoformat(),
)
]
self.assertEqual(entity.first("duration"), "2808")
self.assertEqual(entity.first("samplingRate"), "44100")
self.assertEqual(entity.schema.name, "Audio")