-
Notifications
You must be signed in to change notification settings - Fork 28
/
test_image.py
93 lines (81 loc) · 3.47 KB
/
test_image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from .support import TestCase
class ImageIngestorTest(TestCase):
    """Ingest image fixtures and check the properties set on the entities."""

    def test_match(self):
        """The SVG fixture is ingested with the ``image/svg+xml`` mime type."""
        fixture_path, entity = self.fixture("image.svg")
        self.manager.ingest(fixture_path, entity)
        self.assertEqual(entity.first("mimeType"), "image/svg+xml")

    def test_ingest_on_svg(self):
        """The SVG fixture yields body text containing "TEST" and a success status."""
        fixture_path, entity = self.fixture("image.svg")
        self.manager.ingest(fixture_path, entity)
        self.assertIn("TEST", entity.first("bodyText"))
        self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)

    def test_tesseract_ocr_regression(self):
        """Catch a regression in the OCR behaviour described in this PR:
        https://github.com/alephdata/ingest-file/pull/585

        For each image format below, the fixture is ingested and the last
        emitted entity with an image mime type is checked for the expected
        mime type, a success processing status and the expected OCR text.
        """
        test_data = {
            "jpeg": {
                "file": "regression_jpg.jpg",
                "content": "Debian -- Packages",
                "mime_type": "image/jpeg",
            },
            "gif": {
                "file": "regression_gif.gif",
                "content": "This is text inside a GIF image",
                "mime_type": "image/gif",
            },
            "tiff": {
                "file": "regression_tiff.tiff",
                "content": "Debian -- Packages",
                "mime_type": "image/tiff",
            },
            "webp": {
                "file": "regression_webp.webp",
                "content": "Debian -- Packages",
                "mime_type": "image/webp",
            },
            "openjpeg": {
                "file": "regression_openjpeg.jp2",
                "content": "Debian -- Packages",
                "mime_type": "image/jp2",
            },
        }

        for image_type, expected in test_data.items():
            # Keep the file name in every assertion message so the failing
            # fixture is identifiable even where subtests are not reported.
            failure_msg = f"Test failed for {expected['file']}"

            # One subtest per format: a failure for one image type does not
            # hide the results for the remaining ones.
            with self.subTest(image_type=image_type, file=expected["file"]):
                fixture_path, entity = self.fixture(expected["file"])
                self.manager.ingest(fixture_path, entity)

                emitted_image_entities = [
                    emitted
                    for emitted in self.get_emitted()
                    if "mimeType" in emitted.properties
                    and "image" in emitted.first("mimeType")
                ]

                # Have entities been emitted with a mime type that contains "image"?
                self.assertTrue(emitted_image_entities, failure_msg)

                # Take the last matching entity in get_emitted() order.
                # NOTE(review): presumably this is the entity for the fixture
                # ingested just above - confirm against get_emitted().
                image_entity = emitted_image_entities.pop()

                # Is the mimeType correct?
                self.assertEqual(
                    image_entity.first("mimeType"),
                    expected["mime_type"],
                    failure_msg,
                )

                # Is the processing status of the entity == SUCCESS?
                self.assertEqual(
                    image_entity.first("processingStatus"),
                    self.manager.STATUS_SUCCESS,
                    failure_msg,
                )

                # Does either the bodyText prop or the indexText prop contain
                # the text resulting from OCR? Fall back to indexText only
                # when bodyText is unset.
                # NOTE(review): assumes first() returns None for an unset
                # property - confirm.
                ocr_text = image_entity.first("bodyText")
                if ocr_text is None:
                    ocr_text = image_entity.first("indexText")

                # Fail with a readable message instead of a TypeError when
                # neither property holds any text.
                self.assertIsNotNone(ocr_text, failure_msg)
                self.assertIn(expected["content"], ocr_text, failure_msg)