@@ -14,10 +14,80 @@ def test_ingest_on_svg(self):
1414 self .assertIn ("TEST" , entity .first ("bodyText" ))
1515 self .assertEqual (entity .first ("processingStatus" ), self .manager .STATUS_SUCCESS )
1616
17- def test_ingest_on_jpeg (self ):
18- fixture_path , entity = self .fixture ("jpegtest.jpg" )
19- self .manager .ingest (fixture_path , entity )
20- self .assertIn ("Debian" , entity .first ("bodyText" ))
21- self .assertEqual (entity .first ("mimeType" ), "image/jpeg" )
def test_tesseract_ocr_regression(self):
    """Guard against regressions in OCR-based ingestion of image files.

    For each supported image format, one fixture is ingested and the test
    checks that:

    * at least one emitted entity has a ``mimeType`` containing "image",
    * that entity reports the expected MIME type,
    * its ``processingStatus`` is ``STATUS_SUCCESS``,
    * the expected OCR text appears in ``bodyText`` or, when ``bodyText``
      is not set, in ``indexText``.

    Background: https://github.com/alephdata/ingest-file/pull/585
    """
    test_data = {
        "jpeg": {
            "file": "regression_jpg.jpg",
            "content": "Debian -- Packages",
            "mime_type": "image/jpeg",
        },
        "gif": {
            "file": "regression_gif.gif",
            "content": "This is text inside a GIF image",
            "mime_type": "image/gif",
        },
        "tiff": {
            "file": "regression_tiff.tiff",
            "content": "Debian -- Packages",
            "mime_type": "image/tiff",
        },
        "webp": {
            "file": "regression_webp.webp",
            "content": "Debian -- Packages",
            "mime_type": "image/webp",
        },
        "openjpeg": {
            "file": "regression_openjpeg.jp2",
            "content": "Debian -- Packages",
            "mime_type": "image/jp2",
        },
    }

    for image_type, expected in test_data.items():
        failure_msg = f"Test failed for {expected['file']}"

        # subTest keeps a failure in one format from hiding the results of
        # the remaining formats; on runners without subtest support it is a
        # no-op and the first failure aborts the test as before.
        with self.subTest(image_type=image_type):
            fixture_path, entity = self.fixture(expected["file"])
            self.manager.ingest(fixture_path, entity)

            emitted_image_entities = [
                emitted
                for emitted in self.get_emitted()
                if "mimeType" in emitted.properties
                and "image" in emitted.first("mimeType")
            ]

            # Have entities been emitted with a mime type that contains
            # "image"?
            self.assertTrue(emitted_image_entities, failure_msg)

            # NOTE(review): the last matching entity is used; if
            # get_emitted() accumulates across loop iterations this relies
            # on emission order -- confirm against the test base class.
            image_entity = emitted_image_entities.pop()

            # Is the mimeType correct?
            self.assertEqual(
                image_entity.first("mimeType"),
                expected["mime_type"],
                failure_msg,
            )

            # Is the processing status of the entity == SUCCESS?
            self.assertEqual(
                image_entity.first("processingStatus"),
                self.manager.STATUS_SUCCESS,
                failure_msg,
            )

            # The OCR result may land in bodyText or in indexText. Select
            # the property explicitly instead of relying on assertIn raising
            # TypeError for a missing (None) bodyText.
            ocr_text = image_entity.first("bodyText")
            if ocr_text is None:
                ocr_text = image_entity.first("indexText")

            # Fail with the fixture name, not a bare TypeError, when neither
            # property is set.
            self.assertIsNotNone(ocr_text, failure_msg)
            self.assertIn(expected["content"], ocr_text, failure_msg)
0 commit comments