Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 3.20.0
current_version = 3.20.1
tag_name = {new_version}
commit = True
tag = True
Expand Down
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ RUN apt-get -qq -y update \
libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
libtiff-tools ghostscript librsvg2-bin jbig2dec \
pst-utils \
### tesseract
### tesseract
tesseract-ocr-eng \
tesseract-ocr-swa \
tesseract-ocr-swe \
Expand Down Expand Up @@ -96,7 +96,7 @@ RUN apt-get -qq -y update \
tesseract-ocr-aze \
tesseract-ocr-bel \
tesseract-ocr-uzb \
### pdf convert: libreoffice + a bunch of fonts
### pdf convert: libreoffice + a bunch of fonts
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
Expand Down Expand Up @@ -126,7 +126,7 @@ RUN mkdir /models/ && \
COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
RUN pip3 install --no-cache-dir --prefer-binary -r /tmp/requirements.txt
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt

# Install spaCy models
RUN python3 -m spacy download en_core_web_sm \
Expand All @@ -136,7 +136,7 @@ RUN python3 -m spacy download en_core_web_sm \
RUN python3 -m spacy download ru_core_news_sm \
&& python3 -m spacy download pt_core_news_sm \
&& python3 -m spacy download ro_core_news_sm \
&& python3 -m spacy download mk_core_news_sm
&& python3 -m spacy download mk_core_news_sm
RUN python3 -m spacy download el_core_news_sm \
&& python3 -m spacy download pl_core_news_sm \
&& python3 -m spacy download it_core_news_sm \
Expand Down
2 changes: 1 addition & 1 deletion ingestors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Provides a set of ingestors based on different file types."""
import logging

__version__ = "3.20.0"
__version__ = "3.20.1"

logging.getLogger("chardet").setLevel(logging.INFO)
logging.getLogger("PIL").setLevel(logging.INFO)
Expand Down
14 changes: 14 additions & 0 deletions ingestors/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from servicelayer.jobs import Job, Dataset
from servicelayer import settings as sl_settings
from servicelayer.archive.util import ensure_path
from servicelayer.tags import Tags

from ingestors import settings
from ingestors.manager import Manager
Expand Down Expand Up @@ -112,5 +113,18 @@ def debug(path, languages=None):
pprint(entity.to_dict())


# Registered as a command on the `cli` object; takes one optional positional
# argument, PREFIX, which defaults to the empty string.
@cli.command()
@click.argument("prefix", default="")
def cache_clear(prefix):
    """Delete all ingest cache entries.

    Only delete entries with the given prefix (e.g: 'ocr:', 'pdf:').
    """
    # NOTE: the docstring above is kept word-for-word on purpose; click uses a
    # command function's docstring as that command's help text.
    ingest_cache = Tags("ingest_cache")
    # The prefix is forwarded unchanged; how an empty prefix is interpreted is
    # up to Tags.delete (per the help text above, it clears every entry).
    ingest_cache.delete(prefix=prefix)


if __name__ == "__main__":
cli()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="ingest",
version="3.20.0",
version="3.20.1",
author="Organized Crime and Corruption Reporting Project",
packages=find_packages(exclude=["tests"]),
package_dir={"ingestors": "ingestors"},
Expand Down
Binary file added tests/fixtures/jpegtest.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 4 additions & 5 deletions tests/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,14 @@ def test_match(self):
def test_ingest_on_svg(self):
    # Run the "image.svg" fixture through the manager, then inspect the
    # entity that was handed to it.
    svg_path, svg_entity = self.fixture("image.svg")
    self.manager.ingest(svg_path, svg_entity)

    # The first bodyText value has to contain the marker string "TEST".
    body_text = svg_entity.first("bodyText")
    self.assertIn("TEST", body_text)

    # The entity has to carry the manager's success status.
    status = svg_entity.first("processingStatus")
    self.assertEqual(status, self.manager.STATUS_SUCCESS)

def test_ingest_hand_written_text(self):
fixture_path, entity = self.fixture("some hand wirtten veird text.jpg")
def test_ingest_on_jpeg(self):
    # Run the "jpegtest.jpg" fixture through the manager, then inspect the
    # entity that was handed to it.
    jpeg_path, jpeg_entity = self.fixture("jpegtest.jpg")
    self.manager.ingest(jpeg_path, jpeg_entity)

    # The first bodyText value has to contain the word "Debian".
    body_text = jpeg_entity.first("bodyText")
    self.assertIn("Debian", body_text)

    # The recorded MIME type has to be JPEG.
    mime_type = jpeg_entity.first("mimeType")
    self.assertEqual(mime_type, "image/jpeg")

    # The entity has to carry the manager's success status.
    status = jpeg_entity.first("processingStatus")
    self.assertEqual(status, self.manager.STATUS_SUCCESS)