Skip to content

Commit f162611

Browse files
authored
Make audio and video searchable (#5)
* Refactor ingest-file to prepare it for using nomenklatura (#2) * Remove README & LICENSE from .dockerignore * Refactor ingest-file to be compatible with nomenklatura * Make linter happy * Add poetry.lock * Build whisper.cpp * Add Whispercpp to Dockerfile from multi-stage build * Launch subprocess for transcription. Refactor error handling. * Make image build architecture-independent. Set model. * Aesthetic adjustments to Dockerfile * Apply transcription logic to audio and video. Add tests. * The processing of audio/video isn't a failure if transcription fails * Make linter happy * Transcription timeout as env var * Fix wrong import from settings * Cast env var timeout to int
1 parent 3c86592 commit f162611

20 files changed

+2204
-23
lines changed

.dockerignore

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
.git
22
.github
3-
LICENSE
4-
README.md
53
__pycache__
64
convert
7-
docker-compose.yml
5+
docker-compose.yml

Dockerfile

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,43 @@
1-
FROM python:3.9-bookworm
2-
ENV DEBIAN_FRONTEND noninteractive
1+
#### BUILD WHISPER.CPP
2+
#----------------------------------
3+
FROM nvidia/cuda:11.6.2-devel-ubuntu20.04 AS build
34

4-
LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
5-
LABEL org.opencontainers.image.licenses MIT
6-
LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file
5+
WORKDIR /usr/local/src
6+
RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y \
7+
bash git make wget g++ ffmpeg cmake
8+
RUN git clone https://github.com/ggml-org/whisper.cpp --depth 1
9+
10+
# whisper.cpp setup
11+
WORKDIR /usr/local/src/whisper.cpp
12+
RUN WHISPER_CUBLAS=0 make -j
13+
RUN bash ./models/download-ggml-model.sh medium-q8_0
14+
15+
#### copy the compiled binaries to the image for prod
16+
# the image above will be discarded
17+
# ----------------------------------
18+
FROM python:3.11-slim
19+
20+
# copy whisper
21+
COPY --from=build /usr/local/src/whisper.cpp /whisper
22+
COPY --from=build /lib/*/libgomp.so.1 /whisper/build/src
23+
24+
# fix some libs
25+
ENV LD_LIBRARY_PATH=/whisper/build/src/:/whisper/build/ggml/src/
26+
27+
# ingest-file
28+
ENV DEBIAN_FRONTEND="noninteractive"
29+
30+
LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors"
31+
LABEL org.opencontainers.image.licenses="MIT"
32+
LABEL org.opencontainers.image.source="https://github.com/alephdata/ingest-file"
733

834
# Enable non-free archive for `unrar`.
935
RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
1036
&& apt-get -qq -y update \
1137
&& apt-get -qq -y install build-essential locales \
1238
# python deps (mostly to install their dependencies)
13-
python3-dev \
39+
git python3-dev \
40+
pkg-config libicu-dev \
1441
# tesseract
1542
tesseract-ocr libtesseract-dev libleptonica-dev \
1643
# libraries
@@ -24,6 +51,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
2451
libtiff5-dev \
2552
libtiff-tools ghostscript librsvg2-bin jbig2dec \
2653
pst-utils libgif-dev \
54+
# necessary for python-magic
55+
libmagic1 \
2756
### tesseract
2857
tesseract-ocr-eng \
2958
tesseract-ocr-swa \
@@ -101,6 +130,7 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
101130
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
102131
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
103132
fonts-tlwg-purisa \
133+
ffmpeg \
104134
###
105135
&& apt-get -qq -y autoremove \
106136
&& apt-get clean \
@@ -121,6 +151,8 @@ RUN mkdir /models/ && \
121151
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
122152

123153
COPY requirements.txt /tmp/
154+
RUN pip3 install --no-cache-dir -q -U pip setuptools
155+
RUN pip3 install --no-binary=:pyicu: pyicu
124156
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
125157

126158
# Install spaCy models
@@ -143,7 +175,7 @@ RUN python3 -m spacy download el_core_news_sm \
143175

144176
COPY . /ingestors
145177
WORKDIR /ingestors
146-
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
178+
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
147179
RUN chown -R app:app /ingestors
148180

149181
ENV ARCHIVE_TYPE=file \

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ services:
1616
ingest-file:
1717
build:
1818
context: .
19-
image: ghcr.io/alephdata/ingest-file
19+
# image: ghcr.io/alephdata/ingest-file
2020
hostname: ingest
2121
tmpfs:
2222
- /tmp:mode=777

ingestors/analysis/extract.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from functools import lru_cache
44
from normality import collapse_spaces
55
from languagecodes import list_to_alpha3
6-
from fingerprints import clean_entity_name
6+
from fingerprints import clean_entity_prefix
77
from followthemoney.types import registry
88

99
from ingestors import settings
@@ -27,7 +27,7 @@
2727
def clean_name(text):
2828
if text is None or len(text) > NAME_MAX_LENGTH:
2929
return
30-
text = clean_entity_name(text)
30+
text = clean_entity_prefix(text)
3131
text = collapse_spaces(text)
3232
if text is None or len(text) <= NAME_MIN_LENGTH or " " not in text:
3333
return

ingestors/documents/html.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
99
"HTML file ingestor class. Extracts the text from the web page."
10+
1011
MIME_TYPES = ["text/html"]
1112
EXTENSIONS = [
1213
"htm",

ingestors/exc.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
class ProcessingException(Exception):
55
"A data-related error occurring during file processing."
6+
67
pass
78

89

ingestors/manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def ingest(self, file_path, entity, **kwargs):
193193
now_string = now.strftime("%Y-%m-%dT%H:%M:%S.%f")
194194

195195
entity.set("processingStatus", self.STATUS_FAILURE)
196-
entity.set("processingAgent", get_distribution("ingest").version)
196+
entity.set("processingAgent", get_distribution("ingestors").version)
197197
entity.set("processedAt", now_string)
198198

199199
ingestor_class = None

ingestors/media/audio.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
import logging
2+
from datetime import datetime
23
from followthemoney import model
34
from pymediainfo import MediaInfo
5+
from normality import stringify
46

57
from ingestors.ingestor import Ingestor
68
from ingestors.support.timestamp import TimestampSupport
79
from ingestors.exc import ProcessingException
10+
from ingestors.support.transcription import TranscriptionSupport
811

912
log = logging.getLogger(__name__)
1013

1114

12-
class AudioIngestor(Ingestor, TimestampSupport):
15+
class AudioIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
1316
MIME_TYPES = [
1417
"audio/mpeg",
1518
"audio/mp3",
@@ -55,7 +58,23 @@ def ingest(self, file_path, entity):
5558
entity.add("samplingRate", track.sampling_rate)
5659
entity.add("duration", track.duration)
5760
except Exception as ex:
58-
raise ProcessingException("Could not read audio: %r", ex) from ex
61+
raise ProcessingException(f"Could not read audio: {ex}") from ex
62+
try:
63+
start = datetime.now()
64+
log.info(f"Attempting to transcribe {file_path}")
65+
self.transcribe(file_path, entity)
66+
elapsed_time = datetime.now() - start
67+
# note: total_seconds() includes whole days, so elapsed times above 24h
68+
# are reported correctly (unlike timedelta.seconds, capped at [0,86400))
69+
elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
70+
log.info(
71+
f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
72+
)
73+
except Exception as ex:
74+
# If the transcription fails, the file processing should still count as a success.
75+
# The existence of a transcription is not mandatory, for now.
76+
entity.set("processingError", stringify(ex))
77+
log.error(ex)
5978

6079
@classmethod
6180
def match(cls, file_path, entity):

ingestors/media/video.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
import logging
2+
from datetime import datetime
23
from followthemoney import model
34
from pymediainfo import MediaInfo
5+
from normality import stringify
46

57
from ingestors.ingestor import Ingestor
68
from ingestors.support.timestamp import TimestampSupport
79
from ingestors.exc import ProcessingException
10+
from ingestors.support.transcription import TranscriptionSupport
811

912
log = logging.getLogger(__name__)
1013

1114

12-
class VideoIngestor(Ingestor, TimestampSupport):
15+
class VideoIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
1316
MIME_TYPES = [
1417
"application/x-shockwave-flash",
1518
"video/quicktime",
@@ -44,6 +47,23 @@ def ingest(self, file_path, entity):
4447
entity.add("duration", track.duration)
4548
except Exception as ex:
4649
raise ProcessingException("Could not read video: %r", ex) from ex
50+
try:
51+
start = datetime.now()
52+
log.info(f"Attempting to transcribe {file_path}")
53+
audio_only_file = self.extract_audio(file_path)
54+
self.transcribe(audio_only_file, entity)
55+
elapsed_time = datetime.now() - start
56+
# note: total_seconds() includes whole days, so elapsed times above 24h
57+
# are reported correctly (unlike timedelta.seconds, capped at [0,86400))
58+
elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
59+
log.info(
60+
f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
61+
)
62+
except Exception as ex:
63+
# If the transcription fails, the file processing should still count as a success.
64+
# The existence of a transcription is not mandatory, for now.
65+
entity.set("processingError", stringify(ex))
66+
log.error(ex)
4767

4868
@classmethod
4969
def match(cls, file_path, entity):

ingestors/settings.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,11 @@
5757
SENTRY_CAPTURE_PROCESSING_EXCEPTIONS = env.to_bool(
5858
"SENTRY_CAPTURE_PROCESSING_EXCEPTIONS", False
5959
)
60+
61+
WHISPER_MODEL = env.get("INGESTORS_WHISPER_MODEL", "ggml-medium-q8_0.bin")
62+
# "auto" prompts the model to detect the language
63+
WHISPER_LANGUAGE = env.get("INGESTORS_WHISPER_LANGUAGE", "auto")
64+
# timeout expressed in seconds
65+
WHISPER_TRANSCRIPTION_TIMEOUT = env.get(
66+
"INGESTORS_WHISPER_TRANSCRIPTION_TIMEOUT", 60 * 60 * 2
67+
)

ingestors/support/transcription.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import json
2+
import logging
3+
import subprocess
4+
from pathlib import Path
5+
6+
from ingestors import settings
7+
from ingestors.exc import ProcessingException
8+
9+
log = logging.getLogger(__name__)
10+
11+
12+
class TranscriptionSupport:
    """Provides helpers for transcribing audio and video files.

    Audio extraction is delegated to ``ffmpeg`` and transcription to the
    ``whisper-cli`` binary. Both run as subprocesses bounded by
    ``settings.WHISPER_TRANSCRIPTION_TIMEOUT`` (seconds) and write their
    intermediate files below ``/ingestors``.
    """

    @staticmethod
    def _timeout_seconds():
        """Return the configured subprocess timeout as an ``int``.

        The setting may be a string when it is supplied through the
        environment, and ``subprocess.run`` needs a number.
        """
        return int(settings.WHISPER_TRANSCRIPTION_TIMEOUT)

    @staticmethod
    def _scratch_path(file_path):
        """Return the extension-less scratch path used for ``file_path``.

        Everything after the first dot of the file name is dropped, so
        ``a.b.mp4`` maps to ``/ingestors/a``.
        """
        return Path("/ingestors") / file_path.parts[-1].split(".")[0]

    @staticmethod
    def _format_intervals(intervals):
        """Render whisper's transcription intervals as one string.

        Each interval becomes ``[<from> -> <to>] <text>``.
        NOTE(review): segments are concatenated without a separator, which
        matches the established output format - confirm this is intended.
        """
        return "".join(
            f"[{interval['timestamps']['from']} -> "
            f"{interval['timestamps']['to']}] {interval['text'].strip()}"
            for interval in intervals
        )

    def extract_audio(self, file_path):
        """Extract the audio track of ``file_path`` into a 16 kHz mono WAV.

        Args:
            file_path: ``pathlib.Path`` of the media file to convert.

        Returns:
            ``pathlib.Path`` of the generated ``.wav`` file. This method does
            not remove the file.

        Raises:
            subprocess.CalledProcessError: ffmpeg exited with a non-zero code.
            subprocess.TimeoutExpired: ffmpeg ran longer than the timeout.
            ProcessingException: ffmpeg succeeded but produced no file.
        """
        audio_only_path = self._scratch_path(file_path).with_suffix(".wav")

        # https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#quick-start
        cmd = [
            "ffmpeg",
            # Overwrite a stale output of the same name instead of aborting.
            "-y",
            "-i",
            file_path,
            "-ar",
            "16000",
            "-ac",
            "1",
            "-c:a",
            "pcm_s16le",
            audio_only_path,
        ]

        # check=True already raises CalledProcessError on failure; there is
        # nothing to add by catching and re-raising it here.
        subprocess.run(cmd, timeout=self._timeout_seconds(), check=True)

        if not audio_only_path.is_file():
            raise ProcessingException("Audio extraction failed.")

        return audio_only_path

    def transcribe(self, file_path, entity):
        """Transcribe ``file_path`` and add the text to ``entity``.

        Runs ``whisper-cli`` with JSON output, reads the result, removes the
        JSON file and stores the formatted transcription in the entity's
        ``indexText`` property.

        Args:
            file_path: ``pathlib.Path`` of the audio file to transcribe.
            entity: entity object receiving the transcription via ``add``.

        Raises:
            subprocess.CalledProcessError: whisper-cli exited non-zero.
            subprocess.TimeoutExpired: whisper-cli exceeded the timeout.
            ProcessingException: no JSON output was written, or it contains
                no transcription.
        """
        models_path = Path("/whisper/models")
        output_path = self._scratch_path(file_path)

        cmd = [
            "/whisper/build/bin/whisper-cli",
            "-m",
            models_path / settings.WHISPER_MODEL,
            "-f",
            file_path,
            "-oj",
            "-of",
            output_path,
            "-l",
            # setting to "auto" sometimes transcribes audio in an unintended language
            settings.WHISPER_LANGUAGE,
        ]

        log.info(cmd)
        subprocess.run(cmd, timeout=self._timeout_seconds(), check=True)

        # if the transcription succeeded, the output is written to a JSON
        output_path = output_path.with_suffix(".json")
        if not output_path.is_file():
            raise ProcessingException(
                f"Transcription failed. The file type might be unsupported for {file_path.parts[-1]}."
            )

        # Always remove the JSON file, even when reading or parsing it fails.
        try:
            with open(output_path, "r", encoding="utf-8") as f:
                transcription_dict = json.load(f)
        finally:
            self.delete_temporary_file(output_path)

        transcription_intervals = transcription_dict.get("transcription")
        if not transcription_intervals:
            raise ProcessingException(
                f"Transcription failed, no output in file {output_path}."
            )

        entity.add("indexText", self._format_intervals(transcription_intervals))

    def delete_temporary_file(self, file_path):
        """Delete ``file_path`` if it is an existing regular file.

        Missing paths and directories are left untouched.
        """
        if file_path.is_file():
            file_path.unlink()

ingestors/support/xml.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def get_xml_parser(self, **kwargs):
1717
recover=True,
1818
resolve_entities=False,
1919
no_network=True,
20-
**kwargs
20+
**kwargs,
2121
)
2222

2323
def parse_xml_path(self, file_path, **kwargs):

0 commit comments

Comments
 (0)