Merge pull request #423 from alephdata/release/4.0.0
4.0.0 release
stchris authored Oct 11, 2024
2 parents 7bf441d + 3850970 commit 38edbaf
Showing 18 changed files with 228 additions and 65 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 3.22.0
current_version = 4.0.0-rc28
tag_name = {new_version}
commit = True
tag = True
4 changes: 3 additions & 1 deletion .github/workflows/build.yml
@@ -15,7 +15,7 @@ jobs:
- name: Set up
run: |
docker --version
docker-compose --version
docker compose --version
echo "${GITHUB_REF}"
- name: Start services
run: |
@@ -25,6 +25,8 @@
make cached-build
- name: Install development dependencies
run: make dev
env:
PIP_BREAK_SYSTEM_PACKAGES: 1
- name: Check formatting
run: make format-check
- name: Run linter (ruff)
2 changes: 1 addition & 1 deletion .github/workflows/daily.yml
@@ -13,7 +13,7 @@ jobs:
- name: Set up
run: |
docker --version
docker-compose --version
docker compose --version
- name: Build docker cache
run: |
make fresh-cache
19 changes: 19 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,19 @@
name: Auto Assign to Project

on:
issues:
types: [opened, labeled]
pull_request_target:
types: [opened, labeled]

jobs:
assign_one_project:
runs-on: ubuntu-latest
name: Assign to One Project
steps:
- uses: actions/add-to-project@v0.3.0
with:
# You can target a repository in a different organization
# to the issue
project-url: https://github.com/orgs/alephdata/projects/10
github-token: ${{ secrets.ALEPH_GITHUB_TOKEN }}
7 changes: 4 additions & 3 deletions Dockerfile
@@ -9,6 +9,8 @@ LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file
# RUN echo "deb http://http.us.debian.org/debian stretch non-free" >/etc/apt/sources.list.d/nonfree.list
RUN apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales ca-certificates \
# git
git \
# python deps (mostly to install their dependencies)
python3-pip python3-dev python3-pil \
# tesseract
@@ -121,8 +123,6 @@ RUN groupadd -g 1000 -r app \
RUN mkdir /models/ && \
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

# Having updated pip/setuptools seems to break the test run for some reason (12/01/2022)
# RUN pip3 install --no-cache-dir -U pip setuptools
COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
@@ -155,7 +155,8 @@ ENV ARCHIVE_TYPE=file \
ARCHIVE_PATH=/data \
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
REDIS_URL=redis://redis:6379/0 \
TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata \
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1

# USER app
CMD ingestors process
2 changes: 1 addition & 1 deletion Makefile
@@ -1,5 +1,5 @@
INGEST=ghcr.io/alephdata/ingest-file
COMPOSE=docker-compose
COMPOSE=docker compose
DOCKER=$(COMPOSE) run --rm ingest-file

.PHONY: build
6 changes: 5 additions & 1 deletion docker-compose.yml
@@ -12,6 +12,9 @@ services:
image: redis:alpine
command: ["redis-server", "--save", "3600", "10"]

rabbitmq:
image: rabbitmq:3.9-management-alpine

ingest-file:
build:
context: .
@@ -22,7 +25,7 @@ services:
- /data:mode=777
environment:
FTM_STORE_URI: postgresql://ingest:ingest@postgres/ingest
LOG_FORMAT: TEXT # TEXT or JSON
LOG_FORMAT: TEXT # TEXT or JSON
volumes:
- "./ingestors:/ingestors/ingestors"
- "./tests:/ingestors/tests"
@@ -33,3 +36,4 @@ services:
depends_on:
- postgres
- redis
- rabbitmq
3 changes: 2 additions & 1 deletion ingestors/__init__.py
@@ -2,10 +2,11 @@

import logging

__version__ = "3.22.0"
__version__ = "4.0.0-rc28"

logging.getLogger("chardet").setLevel(logging.INFO)
logging.getLogger("PIL").setLevel(logging.INFO)
logging.getLogger("google.auth").setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("msglite").setLevel(logging.WARNING)
logging.getLogger("pika").setLevel(logging.WARNING)
56 changes: 38 additions & 18 deletions ingestors/cli.py
@@ -1,23 +1,26 @@
import sys
import click
import logging
import uuid
from pprint import pprint
from random import randrange

from ftmstore import get_dataset
from servicelayer.cache import get_redis, get_fakeredis
from servicelayer.cache import get_redis
from servicelayer.logs import configure_logging
from servicelayer.jobs import Job, Dataset
from servicelayer.taskqueue import Dataset, Task
from servicelayer import settings as sl_settings
from servicelayer.archive.util import ensure_path
from servicelayer import settings as sls
from servicelayer.tags import Tags

from ingestors import settings
from ingestors.manager import Manager
from ingestors.directory import DirectoryIngestor
from ingestors.analysis import Analyzer
from ingestors.worker import IngestWorker, OP_ANALYZE, OP_INGEST
from ingestors.worker import get_worker

log = logging.getLogger(__name__)
STAGES = [OP_ANALYZE, OP_INGEST]


@click.group()
@@ -30,7 +33,7 @@ def cli():
def process(sync):
"""Start the queue and process tasks as they come. Blocks while waiting"""
num_threads = None if sync else sl_settings.WORKER_THREADS
worker = IngestWorker(stages=STAGES, num_threads=num_threads)
worker = get_worker(num_threads=num_threads)
code = worker.run()
sys.exit(code)

@@ -50,11 +53,22 @@ def killthekitten():
conn.flushall()


def _ingest_path(db, conn, dataset, path, languages=[]):
def _ingest_path(db, dataset, path, languages=[]):
context = {"languages": languages}
job = Job.create(conn, dataset)
stage = job.get_stage(OP_INGEST)
manager = Manager(db, stage, context)

priority = randrange(1, sls.RABBITMQ_MAX_PRIORITY + 1)

task = Task(
task_id=uuid.uuid4().hex,
job_id=uuid.uuid4().hex,
collection_id=dataset,
delivery_tag="",
operation=settings.STAGE_INGEST,
priority=priority,
context=context,
payload={},
)
manager = Manager(db, task)
path = ensure_path(path)
if path is not None:
if path.is_file():
@@ -76,15 +90,14 @@ def _ingest_path(db, conn, dataset, path, languages=[]):
@click.argument("path", type=click.Path(exists=True))
def ingest(path, dataset, languages=None):
"""Queue a set of files for ingest."""
conn = get_redis()
db = get_dataset(dataset, OP_INGEST)
_ingest_path(db, conn, dataset, path, languages=languages)
db = get_dataset(dataset, settings.STAGE_INGEST)
_ingest_path(db, dataset, path, languages=languages)


@cli.command()
@click.option("--dataset", required=True, help="Name of the dataset")
def analyze(dataset):
db = get_dataset(dataset, OP_ANALYZE)
db = get_dataset(dataset, settings.STAGE_ANALYZE)
analyzer = None
for entity in db.partials():
if analyzer is None or analyzer.entity.id != entity.id:
@@ -102,13 +115,20 @@ def analyze(dataset):
@click.argument("path", type=click.Path(exists=True))
def debug(path, languages=None):
"""Debug the ingest for the given path."""
conn = get_fakeredis()
settings.fts.DATABASE_URI = "sqlite:////tmp/debug.sqlite3"
db = get_dataset("debug", origin=OP_INGEST, database_uri=settings.fts.DATABASE_URI)

# collection ID that is meant for testing purposes only
debug_dataset_id = 100

db = get_dataset(
debug_dataset_id,
origin=settings.STAGE_INGEST,
database_uri=settings.fts.DATABASE_URI,
)
db.delete()
_ingest_path(db, conn, "debug", path, languages=languages)
worker = IngestWorker(conn=conn, stages=STAGES)
worker.sync()
_ingest_path(db, debug_dataset_id, path, languages=languages)
worker = get_worker()
worker.process(blocking=False)
for entity in db.iterate():
pprint(entity.to_dict())

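A note on the new _ingest_path: random.randrange excludes its upper bound, so the + 1 in randrange(1, sls.RABBITMQ_MAX_PRIORITY + 1) is what makes the configured maximum priority reachable. A small standalone illustration follows; the value 10 stands in for the servicelayer setting and is an assumption for this example only.

# Illustration only: randrange(start, stop) never returns stop, hence the + 1.
from random import randrange

RABBITMQ_MAX_PRIORITY = 10  # assumed value, standing in for sls.RABBITMQ_MAX_PRIORITY

samples = {randrange(1, RABBITMQ_MAX_PRIORITY + 1) for _ in range(10_000)}
assert min(samples) >= 1 and max(samples) <= RABBITMQ_MAX_PRIORITY
print(sorted(samples))  # typically prints 1 through 10
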
27 changes: 27 additions & 0 deletions ingestors/documents/pdf.py
@@ -19,6 +19,33 @@ class PDFIngestor(Ingestor, PDFSupport):
EXTENSIONS = ["pdf"]
SCORE = 6

def extract_xmp_metadata(self, pdf, entity):
try:
xmp = pdf.xmp_metadata
if xmp is None:
return
entity.add("messageId", xmp["xmpmm"].get("documentid"))
entity.add("title", xmp["dc"].get("title"))
entity.add("generator", xmp["pdf"].get("producer"))
entity.add("language", xmp["dc"].get("language"))
entity.add("authoredAt", xmp["xmp"].get("createdate"))
entity.add("modifiedAt", xmp["xmp"].get("modifydate"))
except Exception as ex:
log.warning("Error reading XMP: %r", ex)

def extract_metadata(self, pdf, entity):
meta = pdf.metadata
if meta is not None:
entity.add("title", meta.get("title"))
entity.add("author", meta.get("author"))
entity.add("generator", meta.get("creator"))
entity.add("generator", meta.get("producer"))
entity.add("keywords", meta.get("subject"))
if "creationdate" in meta:
entity.add("authoredAt", meta.get("creationdate"))
if "moddate" in meta:
entity.add("modifiedAt", meta.get("moddate"))

def ingest(self, file_path, entity):
"""Ingestor implementation."""
try:
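
The two helpers above copy PDF document-info and XMP fields onto the FollowTheMoney entity passed in. A rough sketch of what that amounts to, assuming the Pages schema used for paginated documents; the metadata values here are hypothetical.

# Sketch of the resulting entity; schema choice and values are assumptions.
from followthemoney import model

entity = model.make_entity("Pages")
entity.make_id("example-pdf")                      # hypothetical ID seed
entity.add("title", "Annual Report")               # from meta.get("title") / dc:title
entity.add("generator", "LibreOffice 7.4 Writer")  # from producer / creator
entity.add("authoredAt", "2023-04-01")             # from creationdate / xmp:createdate
print(entity.to_dict())
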
22 changes: 17 additions & 5 deletions ingestors/manager.py
@@ -14,6 +14,8 @@
from servicelayer.archive.util import ensure_path
from servicelayer.extensions import get_extensions
from sentry_sdk import capture_exception
from servicelayer.cache import get_redis
from servicelayer.taskqueue import queue_task, get_rabbitmq_channel
from followthemoney.helpers import entity_filename
from followthemoney.namespace import Namespace
from prometheus_client import Counter, Histogram
@@ -75,11 +77,13 @@ class Manager(object):

MAGIC = magic.Magic(mime=True)

def __init__(self, dataset, stage, context):
def __init__(self, dataset, root_task):
self.conn = get_redis()
self.dataset = dataset
self.writer = dataset.bulk()
self.stage = stage
self.context = context
self.root_task = root_task
self.collection_id = root_task.collection_id
self.context = root_task.context
self.ns = Namespace(self.context.get("namespace"))
self.work_path = ensure_path(mkdtemp(prefix="ingestor-"))
self.emitted = set()
@@ -92,7 +96,7 @@ def archive(self):

def make_entity(self, schema, parent=None):
schema = model.get(schema)
entity = model.make_entity(schema, key_prefix=self.stage.job.dataset.name)
entity = model.make_entity(schema, key_prefix=self.collection_id)
self.make_child(parent, entity)
return entity

@@ -150,7 +154,15 @@ def auction(self, file_path, entity):

def queue_entity(self, entity):
log.debug("Queue: %r", entity)
self.stage.queue(entity.to_dict(), self.context)
queue_task(
get_rabbitmq_channel(),
get_redis(),
self.collection_id,
settings.STAGE_INGEST,
self.root_task.job_id,
self.context,
**entity.to_dict(),
)

def store(self, file_path, mime_type=None):
file_path = ensure_path(file_path)
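
Child entities discovered during ingest are now re-queued through RabbitMQ rather than through a Redis-backed job stage. A sketch of the same call pattern outside the Manager, with placeholder identifiers; the argument order simply mirrors the queue_task call above.

# Illustrative re-queue call mirroring Manager.queue_entity(); all identifiers are placeholders.
from servicelayer.cache import get_redis
from servicelayer.taskqueue import get_rabbitmq_channel, queue_task

from ingestors import settings

entity_payload = {"id": "child-entity-id", "schema": "Document"}  # hypothetical entity payload
queue_task(
    get_rabbitmq_channel(),
    get_redis(),
    "123",                   # collection_id of the root task
    settings.STAGE_INGEST,   # operation to run next
    "root-job-id",           # job_id inherited from the root task
    {"languages": ["eng"]},  # task context
    **entity_payload,
)
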
9 changes: 9 additions & 0 deletions ingestors/settings.py
@@ -51,6 +51,15 @@
# Also store cached values in the SQL database
sls.TAGS_DATABASE_URI = fts.DATABASE_URI

RABBITMQ_URL = env.get("ALEPH_RABBITMQ_URL", "rabbitmq")
# Prefetch count values
# This is the number of tasks the IngestWorker will grab at any given time
RABBITMQ_QOS_INGEST_QUEUE = 1
RABBITMQ_QOS_ANALYZE_QUEUE = 1

STAGE_INGEST = "ingest"
STAGE_ANALYZE = "analyze"

# ProcessingException is thrown whenever something goes wrong with
# parsing a file. Enable this with care, it can easily eat up the
# Sentry quota of events.
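
The two QoS values above cap how many unacknowledged tasks a worker holds per queue; on the RabbitMQ side this corresponds to a per-channel basic_qos limit. A minimal sketch, assuming a plain hostname in RABBITMQ_URL and the broker's default credentials:

# Sketch only: how a prefetch value like RABBITMQ_QOS_INGEST_QUEUE maps onto
# a channel-level QoS limit. Hostname and credentials are assumptions.
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host="rabbitmq"))
channel = connection.channel()
channel.basic_qos(prefetch_count=1)  # mirrors RABBITMQ_QOS_INGEST_QUEUE
connection.close()
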
6 changes: 3 additions & 3 deletions ingestors/support/cache.py
@@ -19,8 +19,8 @@ def cache_key(self, *parts):
return make_key(*parts)

def get_cache_set(self, key):
return ensure_list(self.manager.stage.conn.smembers(key))
return ensure_list(self.manager.conn.smembers(key))

def add_cache_set(self, key, value):
self.manager.stage.conn.sadd(key, value)
self.manager.stage.conn.expire(key, REDIS_LONG)
self.manager.conn.sadd(key, value)
self.manager.conn.expire(key, REDIS_LONG)
2 changes: 1 addition & 1 deletion ingestors/support/email.py
@@ -139,7 +139,7 @@ def parse_references(self, references, in_reply_to):

def resolve_message_ids(self, entity):
# https://cr.yp.to/immhf/thread.html
ctx = self.manager.stage.job.dataset.name
ctx = self.manager.collection_id

for message_id in entity.get("messageId"):
key = self.cache_key("mid-ent", ctx, message_id)