Skip to content

Commit f160f1b

Browse files
authored
Merge pull request #587 from alephdata/chore/3.20.1/add-tests
Fix TIFF processing. Add tests to prevent regression in OCR for gif, jpg, jp2, tiff, webp
2 parents 454eb72 + 031f830 commit f160f1b

File tree

9 files changed

+132
-17
lines changed

9 files changed

+132
-17
lines changed

ingestors/media/tiff.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
import logging
21
from followthemoney import model
32

43
from ingestors.ingestor import Ingestor
54
from ingestors.support.pdf import PDFSupport
65
from ingestors.support.shell import ShellSupport
76
from ingestors.support.temp import TempFileSupport
8-
9-
log = logging.getLogger(__name__)
7+
from ingestors.exc import ProcessingException
108

119

1210
class TIFFIngestor(Ingestor, PDFSupport, TempFileSupport, ShellSupport):
@@ -22,8 +20,24 @@ class TIFFIngestor(Ingestor, PDFSupport, TempFileSupport, ShellSupport):
2220
def ingest(self, file_path, entity):
2321
entity.schema = model.get("Pages")
2422
pdf_path = self.make_work_file("tiff.pdf")
25-
self.exec_command(
26-
"tiff2pdf", file_path, "-x", "300", "-y", "300", "-o", pdf_path
27-
)
23+
try:
24+
self.exec_command(
25+
"tiff2pdf",
26+
file_path,
27+
"-n",
28+
"-j",
29+
"-x",
30+
"300",
31+
"-y",
32+
"300",
33+
"-o",
34+
pdf_path,
35+
)
36+
except ProcessingException:
37+
self.exec_command(
38+
"tiff2pdf", file_path, "-x", "300", "-y", "300", "-o", pdf_path
39+
)
40+
2841
self.assert_outfile(pdf_path)
42+
2943
self.pdf_alternative_extract(entity, pdf_path, self.manager)

tests/fixtures/regression_gif.gif

2.41 KB
Loading
File renamed without changes.
81.7 KB
Binary file not shown.
103 KB
Binary file not shown.
23.1 KB
Loading
-69.3 KB
Binary file not shown.

tests/test_image.py

Lines changed: 76 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,80 @@ def test_ingest_on_svg(self):
1414
self.assertIn("TEST", entity.first("bodyText"))
1515
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
1616

17-
def test_ingest_on_jpeg(self):
18-
fixture_path, entity = self.fixture("jpegtest.jpg")
19-
self.manager.ingest(fixture_path, entity)
20-
self.assertIn("Debian", entity.first("bodyText"))
21-
self.assertEqual(entity.first("mimeType"), "image/jpeg")
17+
def test_tesseract_ocr_regression(self):
18+
"""This test is meant to catch a regression in the OCR behaviour
19+
described in this PR: https://github.com/alephdata/ingest-file/pull/585"""
2220

23-
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
21+
test_data = {
22+
"jpeg": {
23+
"file": "regression_jpg.jpg",
24+
"content": "Debian -- Packages",
25+
"mime_type": "image/jpeg",
26+
},
27+
"gif": {
28+
"file": "regression_gif.gif",
29+
"content": "This is text inside a GIF image",
30+
"mime_type": "image/gif",
31+
},
32+
"tiff": {
33+
"file": "regression_tiff.tiff",
34+
"content": "Debian -- Packages",
35+
"mime_type": "image/tiff",
36+
},
37+
"webp": {
38+
"file": "regression_webp.webp",
39+
"content": "Debian -- Packages",
40+
"mime_type": "image/webp",
41+
},
42+
"openjpeg": {
43+
"file": "regression_openjpeg.jp2",
44+
"content": "Debian -- Packages",
45+
"mime_type": "image/jp2",
46+
},
47+
}
48+
49+
for test_image_type in test_data:
50+
fixture_path, entity = self.fixture(test_data[test_image_type]["file"])
51+
self.manager.ingest(fixture_path, entity)
52+
53+
emitted_image_entities = [
54+
x
55+
for x in self.get_emitted()
56+
if "mimeType" in x.properties and "image" in x.first("mimeType")
57+
]
58+
59+
# Have entities been emitted with a mime type that contains "image"?
60+
self.assertTrue(
61+
len(emitted_image_entities) != 0,
62+
f"Test failed for {test_data[test_image_type]['file']}",
63+
)
64+
image_entity = emitted_image_entities.pop()
65+
66+
# Is the mimeType correct?
67+
self.assertEqual(
68+
image_entity.first("mimeType"),
69+
test_data[test_image_type]["mime_type"],
70+
f"Test failed for {test_data[test_image_type]['file']}",
71+
)
72+
73+
# Is the processing status of the entity == SUCCESS?
74+
self.assertEqual(
75+
image_entity.first("processingStatus"),
76+
self.manager.STATUS_SUCCESS,
77+
f"Test failed for {test_data[test_image_type]['file']}",
78+
)
79+
80+
# Does either the bodyText prop or the indexText prop contain
81+
# the text resulting from OCR?
82+
try:
83+
self.assertIn(
84+
test_data[test_image_type]["content"],
85+
image_entity.first("bodyText"),
86+
f"Test failed for {test_data[test_image_type]['file']}",
87+
)
88+
except TypeError:
89+
self.assertIn(
90+
test_data[test_image_type]["content"],
91+
image_entity.first("indexText"),
92+
f"Test failed for {test_data[test_image_type]['file']}",
93+
)

tests/test_tiff.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,45 @@ class TIFFIngestorTest(TestCase):
55
def test_match(self):
66
fixture_path, entity = self.fixture("multipage_tiff_example.tif")
77
self.manager.ingest(fixture_path, entity)
8-
self.assertEqual(entity.first("mimeType"), "image/tiff")
9-
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
8+
9+
emitted_image_entities = [
10+
x
11+
for x in self.get_emitted()
12+
if "mimeType" in x.properties and "image" in x.first("mimeType")
13+
]
14+
15+
# Have entities been emitted with a mime type that contains "image"?
16+
self.assertTrue(
17+
len(emitted_image_entities) != 0,
18+
"Test failed for multipage_tiff_example.tif",
19+
)
20+
image_entity = emitted_image_entities.pop()
21+
22+
self.assertEqual(image_entity.first("mimeType"), "image/tiff")
23+
self.assertEqual(
24+
image_entity.first("processingStatus"), self.manager.STATUS_SUCCESS
25+
)
1026
entities = self.get_emitted()
1127
self.assertEqual(len(entities), 11)
1228

1329
def test_ingest_tiff_format(self):
1430
fixture_path, entity = self.fixture("hello_world_tiff.tif")
1531
self.manager.ingest(fixture_path, entity)
16-
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
17-
entity = self.get_emitted_by_id(entity.id)
18-
self.assertEqual(entity.first("indexText"), "HELLO WORLD")
32+
33+
emitted_image_entities = [
34+
x
35+
for x in self.get_emitted()
36+
if "mimeType" in x.properties and "image" in x.first("mimeType")
37+
]
38+
39+
# Have entities been emitted with a mime type that contains "image"?
40+
self.assertTrue(
41+
len(emitted_image_entities) != 0,
42+
"Test failed for multipage_tiff_example.tif",
43+
)
44+
image_entity = emitted_image_entities.pop()
45+
46+
self.assertEqual(
47+
image_entity.first("processingStatus"), self.manager.STATUS_SUCCESS
48+
)
49+
self.assertEqual(image_entity.first("indexText"), "HELLO WORLD")

0 commit comments

Comments
 (0)