Fix the TIFF to PDF conversion command. Add TIFF test.

alephdata · stchris · Feb 29, 2024 · Feb 22, 2024 · Feb 22, 2024 · Feb 22, 2024
commit d701931c1462850bb41a7d29e1da6818543ed254
diff --git a/ingestors/media/tiff.py b/ingestors/media/tiff.py
@@ -23,7 +23,8 @@ def ingest(self, file_path, entity):
         entity.schema = model.get("Pages")
         pdf_path = self.make_work_file("tiff.pdf")
         self.exec_command(
-            "tiff2pdf", file_path, "-x", "300", "-y", "300", "-o", pdf_path
+            "tiff2pdf", file_path, "-n", "-j", "-x", "300", "-y", "300", "-o", pdf_path
         )
         self.assert_outfile(pdf_path)
+
         self.pdf_alternative_extract(entity, pdf_path, self.manager)
diff --git a/tests/test_image.py b/tests/test_image.py
@@ -29,11 +29,11 @@ def test_tesseract_ocr_regression(self):
                 "content": "This is text inside a GIF image",
                 "mime_type": "image/gif",
             },
-            # "tiff": {
-            #     "file": "regression_tiff.tiff",
-            #     "content": "Debian -- Packages",
-            #     "mime_type": "image/tiff",
-            # },
+            "tiff": {
+                "file": "regression_tiff.tiff",
+                "content": "Debian -- Packages",
+                "mime_type": "image/tiff",
+            },
             "webp": {
                 "file": "regression_webp.webp",
                 "content": "Debian -- Packages",
@@ -49,18 +49,38 @@ def test_tesseract_ocr_regression(self):
         for test_image_type in test_data:
             fixture_path, entity = self.fixture(test_data[test_image_type]["file"])
             self.manager.ingest(fixture_path, entity)
-            self.assertIn(
-                test_data[test_image_type]["content"],
-                entity.first("bodyText"),
-                f"Test failed for {test_data[test_image_type]['file']}",
-            )
-            self.assertEqual(
-                entity.first("mimeType"),
-                test_data[test_image_type]["mime_type"],
+
+            emitted_image_entities = [
+                x
+                for x in self.get_emitted()
+                if "mimeType" in x.properties and "image" in x.first("mimeType")
+            ]
+
+            # Have entities been emitted with a mime type that contains "image"?
+            self.assertTrue(
+                len(emitted_image_entities) != 0,
                 f"Test failed for {test_data[test_image_type]['file']}",
             )
+            image_entity = emitted_image_entities.pop()
+
+            # Is the processing status of the entity == SUCCESS?
             self.assertEqual(
-                entity.first("processingStatus"),
+                image_entity.first("processingStatus"),
                 self.manager.STATUS_SUCCESS,
                 f"Test failed for {test_data[test_image_type]['file']}",
             )
+
+            # Does either the bodyText prop or the indexText prop contain
+            # the text that resulted from OCR?
+            try:
+                self.assertIn(
+                    test_data[test_image_type]["content"],
+                    image_entity.first("bodyText"),
+                    f"Test failed for {test_data[test_image_type]['file']}",
+                )
+            except TypeError:
+                self.assertIn(
+                    test_data[test_image_type]["content"],
+                    image_entity.first("indexText"),
+                    f"Test failed for {test_data[test_image_type]['file']}",
+                )