Skip to content

Commit 454eb72

Browse files
authored
Merge pull request #585 from alephdata/release/3.20.0
Release/3.20.1
2 parents 04816b3 + 31d1eb1 commit 454eb72

File tree

7 files changed

+25
-12
lines changed

7 files changed

+25
-12
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 3.20.0
2+
current_version = 3.20.1
33
tag_name = {new_version}
44
commit = True
55
tag = True

Dockerfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ RUN apt-get -qq -y update \
2525
libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
2626
libtiff-tools ghostscript librsvg2-bin jbig2dec \
2727
pst-utils \
28-
### tesseract
28+
### tesseract
2929
tesseract-ocr-eng \
3030
tesseract-ocr-swa \
3131
tesseract-ocr-swe \
@@ -96,7 +96,7 @@ RUN apt-get -qq -y update \
9696
tesseract-ocr-aze \
9797
tesseract-ocr-bel \
9898
tesseract-ocr-uzb \
99-
### pdf convert: libreoffice + a bunch of fonts
99+
### pdf convert: libreoffice + a bunch of fonts
100100
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
101101
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
102102
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
@@ -126,7 +126,7 @@ RUN mkdir /models/ && \
126126
COPY requirements.txt /tmp/
127127
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
128128
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
129-
RUN pip3 install --no-cache-dir --prefer-binary -r /tmp/requirements.txt
129+
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
130130

131131
# Install spaCy models
132132
RUN python3 -m spacy download en_core_web_sm \
@@ -136,7 +136,7 @@ RUN python3 -m spacy download en_core_web_sm \
136136
RUN python3 -m spacy download ru_core_news_sm \
137137
&& python3 -m spacy download pt_core_news_sm \
138138
&& python3 -m spacy download ro_core_news_sm \
139-
&& python3 -m spacy download mk_core_news_sm
139+
&& python3 -m spacy download mk_core_news_sm
140140
RUN python3 -m spacy download el_core_news_sm \
141141
&& python3 -m spacy download pl_core_news_sm \
142142
&& python3 -m spacy download it_core_news_sm \

ingestors/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Provides a set of ingestors based on different file types."""
22
import logging
33

4-
__version__ = "3.20.0"
4+
__version__ = "3.20.1"
55

66
logging.getLogger("chardet").setLevel(logging.INFO)
77
logging.getLogger("PIL").setLevel(logging.INFO)

ingestors/cli.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from servicelayer.jobs import Job, Dataset
99
from servicelayer import settings as sl_settings
1010
from servicelayer.archive.util import ensure_path
11+
from servicelayer.tags import Tags
1112

1213
from ingestors import settings
1314
from ingestors.manager import Manager
@@ -112,5 +113,18 @@ def debug(path, languages=None):
112113
pprint(entity.to_dict())
113114

114115

116+
@cli.command()
117+
@click.argument(
118+
"prefix",
119+
default="",
120+
)
121+
def cache_clear(prefix):
122+
"""Delete all ingest cache entries.
123+
124+
When a prefix is given, only delete entries with that prefix (e.g. 'ocr:', 'pdf:').
125+
"""
126+
Tags("ingest_cache").delete(prefix=prefix)
127+
128+
115129
if __name__ == "__main__":
116130
cli()

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setup(
77
name="ingest",
8-
version="3.20.0",
8+
version="3.20.1",
99
author="Organized Crime and Corruption Reporting Project",
1010
packages=find_packages(exclude=["tests"]),
1111
package_dir={"ingestors": "ingestors"},

tests/fixtures/jpegtest.jpg

67.5 KB
Loading

tests/test_image.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,14 @@ def test_match(self):
1010
def test_ingest_on_svg(self):
1111
fixture_path, entity = self.fixture("image.svg")
1212
self.manager.ingest(fixture_path, entity)
13-
# print result.to_dict()
1413

1514
self.assertIn("TEST", entity.first("bodyText"))
16-
# self.assertIn(u'1..2..3..', result.pages[0]['text'])
1715
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
1816

19-
def test_ingest_hand_written_text(self):
20-
fixture_path, entity = self.fixture("some hand wirtten veird text.jpg")
17+
def test_ingest_on_jpeg(self):
18+
fixture_path, entity = self.fixture("jpegtest.jpg")
2119
self.manager.ingest(fixture_path, entity)
20+
self.assertIn("Debian", entity.first("bodyText"))
21+
self.assertEqual(entity.first("mimeType"), "image/jpeg")
2222

23-
# self.assert(u'Testing ingestors', result.pages[0]['text'])
2423
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)

0 commit comments

Comments
 (0)