@@ -14,10 +14,80 @@ def test_ingest_on_svg(self):
14
14
self .assertIn ("TEST" , entity .first ("bodyText" ))
15
15
self .assertEqual (entity .first ("processingStatus" ), self .manager .STATUS_SUCCESS )
16
16
17
- def test_ingest_on_jpeg (self ):
18
- fixture_path , entity = self .fixture ("jpegtest.jpg" )
19
- self .manager .ingest (fixture_path , entity )
20
- self .assertIn ("Debian" , entity .first ("bodyText" ))
21
- self .assertEqual (entity .first ("mimeType" ), "image/jpeg" )
17
def test_tesseract_ocr_regression(self):
    """Catch a regression in the OCR behaviour described in
    https://github.com/alephdata/ingest-file/pull/585.

    One fixture per image format is ingested. For each format the test
    checks that an entity whose mime type contains "image" was emitted,
    that its mime type matches the expected one, that its processing
    status is ``STATUS_SUCCESS`` and that the expected text is found in
    either its ``bodyText`` or its ``indexText`` property.
    """
    test_data = {
        "jpeg": {
            "file": "regression_jpg.jpg",
            "content": "Debian -- Packages",
            "mime_type": "image/jpeg",
        },
        "gif": {
            "file": "regression_gif.gif",
            "content": "This is text inside a GIF image",
            "mime_type": "image/gif",
        },
        "tiff": {
            "file": "regression_tiff.tiff",
            "content": "Debian -- Packages",
            "mime_type": "image/tiff",
        },
        "webp": {
            "file": "regression_webp.webp",
            "content": "Debian -- Packages",
            "mime_type": "image/webp",
        },
        "openjpeg": {
            "file": "regression_openjpeg.jp2",
            "content": "Debian -- Packages",
            "mime_type": "image/jp2",
        },
    }

    for image_type, expected in test_data.items():
        failure_hint = f"Test failed for {expected['file']}"

        # Run every format as its own sub-test so that one broken format
        # does not hide the results for the remaining ones.
        with self.subTest(image_type=image_type, file=expected["file"]):
            fixture_path, entity = self.fixture(expected["file"])
            self.manager.ingest(fixture_path, entity)

            emitted_image_entities = [
                x
                for x in self.get_emitted()
                if "mimeType" in x.properties and "image" in x.first("mimeType")
            ]

            # Have entities been emitted with a mime type that contains "image"?
            self.assertTrue(emitted_image_entities, failure_hint)

            # NOTE(review): get_emitted() may also return entities from
            # earlier loop iterations -- confirm. Taking the last match
            # keeps the original behaviour of inspecting the newest one.
            image_entity = emitted_image_entities[-1]

            # Is the mimeType correct?
            self.assertEqual(
                image_entity.first("mimeType"),
                expected["mime_type"],
                failure_hint,
            )

            # Is the processing status of the entity == SUCCESS?
            self.assertEqual(
                image_entity.first("processingStatus"),
                self.manager.STATUS_SUCCESS,
                failure_hint,
            )

            # Does either the bodyText prop or the indexText prop contain
            # the text resulted from OCR? Both are looked at explicitly
            # instead of relying on a TypeError from assertIn(..., None),
            # so a bodyText without the text no longer masks a matching
            # indexText (and unrelated TypeErrors are not swallowed).
            ocr_texts = [
                text
                for text in (
                    image_entity.first("bodyText"),
                    image_entity.first("indexText"),
                )
                if text is not None
            ]
            self.assertTrue(
                any(expected["content"] in text for text in ocr_texts),
                failure_hint,
            )
0 commit comments