Skip to content

Commit b4fa0d5

Browse files
authored
Dockerfile refactorings (tesserocr 2.6.2, openaleph-servicelayer etc.) (#16)
* Compile tesserocr with C++ 14; use openaleph-servicelayer
* Build tesserocr in Dockerfile.base; don't build Apple base docker image
* Separate test docker image
* Move tesserocr to ocr dependencies
* Only generate main requirements from pre-commit hook
* Move tesserocr to optional dependencies
* Add build-test to Makefile test, before running tests
* 🔖 Bump version: 5.0.0-rc3 → 5.0.0-rc4
1 parent 5743921 commit b4fa0d5

File tree

14 files changed

+397
-330
lines changed

14 files changed

+397
-330
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 5.0.0-rc3
2+
current_version = 5.0.0-rc4
33
tag_name = {new_version}
44
commit = True
55
tag = True

.github/workflows/docker-base.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545
with:
4646
context: .
4747
file: ./Dockerfile.base
48-
platforms: linux/amd64,linux/arm64
48+
platforms: linux/amd64 #,linux/arm64
4949
push: true
5050
tags: ${{ steps.meta.outputs.tags }}
5151
labels: ${{ steps.meta.outputs.labels }}

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ repos:
1919
- id: mixed-line-ending
2020
args: ["--fix=lf"]
2121
- id: trailing-whitespace
22-
exclude: ".bumpversion.cfg" # wtf
22+
exclude: ".bumpversion.cfg" # wtf
2323

2424
# - repo: https://github.com/asottile/pyupgrade
2525
# rev: v3.10.1
@@ -79,7 +79,7 @@ repos:
7979
rev: 1.9.0
8080
hooks:
8181
- id: poetry-export
82-
args: ["--without-hashes", "-o", "requirements.txt"]
82+
args: ["--without-hashes", "--with", "main", "-o", "requirements.txt"]
8383
- id: poetry-export
8484
args:
8585
["--without-hashes", "--only", "dev", "-o", "requirements-dev.txt"]

Dockerfile

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@ FROM ghcr.io/openaleph/ingest-file-base:latest
22

33
# uncomment when running on Apple Silicon
44
# ENV LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
5+
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libgomp.so.1
56

67
COPY . /ingestors
78
RUN rm -rf /ingestors/tests
89
WORKDIR /ingestors
9-
RUN pip3 install --no-cache-dir -r /ingestors/requirements.txt
10-
RUN pip3 install --no-cache-dir /ingestors
10+
11+
RUN pip3 install --no-cache-dir --no-deps -r /ingestors/requirements.txt
12+
RUN pip3 install --no-deps --no-cache-dir /ingestors
1113

1214
ENV ARCHIVE_TYPE=file \
1315
ARCHIVE_PATH=/data \
@@ -17,7 +19,4 @@ ENV ARCHIVE_TYPE=file \
1719

1820
ENV PROCRASTINATE_APP="ingestors.tasks.app"
1921

20-
RUN chmod +x /ingestors/docker-entrypoint.sh
21-
22-
ENTRYPOINT [ "/ingestors/docker-entrypoint.sh" ]
2322
CMD ["procrastinate", "worker", "-q", "ingest"]

Dockerfile.base

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ RUN apt-get -qq -y update \
1212
# python deps (mostly to install their dependencies)
1313
python3-pip python3-dev python3-pil \
1414
# tesseract
15-
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\
15+
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
1616
# libraries
1717
libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
1818
zlib1g-dev libicu-dev libxml2-dev \
@@ -116,6 +116,14 @@ ENV LANG='en_US.UTF-8' \
116116
OMP_THREAD_LIMIT='1' \
117117
OPENBLAS_NUM_THREADS='1'
118118

119+
# force compile tesserocr 2.6.2 with C++ 14
120+
# to make it compatible with Tesseract 5
121+
RUN pip download --no-binary=:all: "tesserocr==2.6.2" \
122+
&& tar -xzf tesserocr-2.6.2.tar.gz \
123+
&& sed -i "s/-std=c++11/-std=c++14/" tesserocr-2.6.2/setup.py \
124+
&& cd tesserocr-2.6.2 \
125+
&& CXXFLAGS="-std=c++14" pip install --no-cache-dir .
126+
119127
# tesseract 5
120128
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
121129

@@ -127,5 +135,3 @@ RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
127135

128136
# Install PyICU
129137
RUN pip3 install --no-binary=:pyicu: pyicu
130-
# Install TesserOCR
131-
RUN pip3 install --no-binary=:tesserocr: tesserocr

Dockerfile.test

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,25 @@ FROM ghcr.io/openaleph/ingest-file-base:latest
22

33
# uncomment when running on Apple Silicon
44
# ENV LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
5+
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libgomp.so.1
56

67
COPY . /ingestors
78
WORKDIR /ingestors
8-
RUN pip3 install --no-cache-dir -r /ingestors/requirements.txt
9-
RUN pip3 install --no-cache-dir /ingestors
109

11-
RUN pip3 install --no-cache-dir -r /ingestors/requirements-dev.txt
12-
RUN pip3 install --no-cache-dir procrastinate==3.2.2 # FIXME
10+
RUN pip3 install --no-cache-dir --no-deps -r /ingestors/requirements.txt
11+
RUN pip3 install --no-deps --no-cache-dir /ingestors
12+
13+
RUN pip3 install --no-deps -r /ingestors/requirements-dev.txt
14+
RUN pip3 install --no-cache-dir procrastinate==3.2.2
1315
RUN chown -R app:app /ingestors
1416

1517
ENV ARCHIVE_TYPE=file \
1618
ARCHIVE_PATH=/data \
1719
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
1820
REDIS_URL=redis://redis:6379/0 \
19-
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
21+
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \
22+
DEBUG=1
2023

2124
ENV PROCRASTINATE_APP="ingestors.tasks.app"
2225

23-
USER app
24-
ENTRYPOINT [ "/ingestors/docker-entrypoint.sh" ]
2526
CMD ["pytest"]

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ build-cache:
1818
docker build . --cache-from ghcr.io/openaleph/ingest-file:cache -t ghcr.io/openaleph/ingest-file:cache
1919

2020
build-test:
21-
docker build . -f Dockerfile.test -t ghcr.io/openaleph/ingest-file:test
21+
$(COMPOSE) build test-ingest-file
2222

2323
build-macos:
2424
DOCKER_BUILDKIT=0 COMPOSE_DOCKER_CLI_BUILD=0 $(COMPOSE) build --no-rm --parallel
@@ -39,7 +39,7 @@ format-check:
3939
black --check .
4040

4141
test: build-test services
42-
PYTHONDEVMODE=1 PYTHONTRACEMALLOC=1 $(COMPOSE) run -e DEBUG=1 --rm ingest-file pytest --cov=ingestors --cov-report html --cov-report term
42+
PYTHONDEVMODE=1 PYTHONTRACEMALLOC=1 $(COMPOSE) run --rm test-ingest-file pytest
4343

4444
test-e2e: build services
4545
$(COMPOSE_E2E) run --rm ingest-file

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
5.0.0-rc3
1+
5.0.0-rc4

docker-compose.yml

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ services:
1212

1313
ingest-file:
1414
build:
15-
dockerfile: Dockerfile.test
15+
dockerfile: Dockerfile
1616
hostname: ingest
1717
tmpfs:
1818
- /tmp:mode=777
@@ -21,15 +21,34 @@ services:
2121
OPENALEPH_DB_URI: postgresql://ingest:ingest@postgres/ingest
2222
FTM_STORE_URI: postgresql://ingest:ingest@postgres/ingest
2323
LOG_FORMAT: TEXT # TEXT or JSON
24-
# LD_PRELOAD: /usr/lib/x86_64-linux-gnu/libgomp.so.1
25-
LD_PRELOAD: /usr/lib/aarch64-linux-gnu/libgomp.so.1
24+
entrypoint: ["/bin/sh", "-c", "echo 'opal-procrastinate init-db' && exec \"$@\"", "--"]
2625
volumes:
2726
- "./ingestors:/ingestors/ingestors"
28-
- "./tests:/ingestors/tests"
2927
- "./data:/ingestors/data"
3028
- "./requirements.txt:/ingestors/requirements.txt"
3129
- "./setup.py:/ingestors/setup.py"
32-
- "~:/host"
30+
depends_on:
31+
- postgres
32+
- redis
33+
34+
test-ingest-file:
35+
build:
36+
context: .
37+
dockerfile: Dockerfile.test
38+
image: test-ingest-file
39+
hostname: ingest
40+
tmpfs:
41+
- /tmp:mode=777
42+
- /data:mode=777
43+
environment:
44+
OPENALEPH_DB_URI: postgresql://ingest:ingest@postgres/ingest
45+
FTM_STORE_URI: postgresql://ingest:ingest@postgres/ingest
46+
LOG_FORMAT: TEXT # TEXT or JSON
47+
DEBUG: 1
48+
entrypoint: ["/bin/sh", "-c", "echo 'opal-procrastinate init-db' && exec \"$@\"", "--"]
49+
volumes:
50+
- "./tests:/ingestors/tests"
51+
- "./data:/ingestors/data"
3352
depends_on:
3453
- postgres
3554
- redis

ingestors/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from anystore.logging import configure_logging, get_logger
66
from procrastinate import cli as procrastinate_cli
77

8-
__version__ = "5.0.0-rc3"
8+
__version__ = "5.0.0-rc4"
99

1010
configure_logging()
1111

0 commit comments

Comments
 (0)