From 096d23bc28d96376aaf0822b6980103b120201cc Mon Sep 17 00:00:00 2001
From: Christine Straub <christinemstraub@gmail.com>
Date: Mon, 18 Dec 2023 22:21:56 -0800
Subject: [PATCH] Refactor: support layout analysis (#2273)

### Summary
This PR is the second part of the "layout analysis" refactor, which moves
layout analysis from the unstructured-inference repo to the unstructured
repo. The first part was done in
https://github.com/Unstructured-IO/unstructured-inference/pull/305. This
PR adds logic to support annotating `inferred` and `extracted` elements.

### Testing

```
PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy> <document_type>
```
e.g.
```
PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/layout-parser-paper-fast.pdf hi_res pdf
```
---
 CHANGELOG.md                                  |   4 +-
 examples/layout-analysis/visualization.py     |  36 ++++--
 unstructured/__version__.py                   |   2 +-
 unstructured/partition/pdf.py                 |  53 +++++---
 .../partition/pdf_image/pdf_image_utils.py    | 117 ++++++++++++++++++
 .../pdf_image/pdfminer_processing.py          |  96 +++++++-------
 6 files changed, 230 insertions(+), 78 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cc44acf1ff..5ee219d2d9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.11.6-dev1
+## 0.11.6-dev2
 
 ### Enhancements
 
+* **Update the layout analysis script.** The previous script only supported annotating `final` elements. The updated script also supports annotating `inferred` and `extracted` elements.
+
 ### Features
 
 ### Fixes
diff --git a/examples/layout-analysis/visualization.py b/examples/layout-analysis/visualization.py
index a13bb930cb..ae42cc8a1e 100644
--- a/examples/layout-analysis/visualization.py
+++ b/examples/layout-analysis/visualization.py
@@ -3,7 +3,8 @@
 import sys
 
 import pdf2image
-from unstructured_inference.inference.elements import Rectangle
+from PIL import Image
+from unstructured_inference.inference.elements import TextRegion
 from unstructured_inference.visualize import draw_bbox
 
 from unstructured.documents.elements import PageBreak
@@ -29,11 +30,14 @@ def extract_element_coordinates(elements):
     return elements_coordinates
 
 
-def run_partition_pdf(f_path, strategy, images, output_dir):
+def run_partition_pdf(f_path, strategy, images, output_dir, output_f_basename, is_image):
     elements = partition_pdf(
         f_path,
         strategy=strategy,
+        is_image=is_image,
         include_page_breaks=True,
+        analysis=True,
+        analyzed_image_output_dir_path=output_dir,
     )
 
     elements_coordinates = extract_element_coordinates(elements)
@@ -44,22 +48,28 @@ def run_partition_pdf(f_path, strategy, images, output_dir):
             points = coordinate.points
             x1, y1 = points[0]
             x2, y2 = points[2]
-            rect = Rectangle(x1, y1, x2, y2)
-            img = draw_bbox(img, rect, color="red")
-
-        output_image_path = os.path.join(output_dir, f"{strategy}-{idx + 1}.jpg")
-        print(f"output_image_path: {output_image_path}")
+            el = TextRegion.from_coords(x1, y1, x2, y2)
+            img = draw_bbox(img, el, color="red")
 
+        output_image_path = os.path.join(output_dir, f"{output_f_basename}_{idx + 1}_final.jpg")
         img.save(output_image_path)
+        print(f"output_image_path: {output_image_path}")
 
 
-def run(f_path, strategy):
+def run(f_path, strategy, document_type):
     f_basename = os.path.splitext(os.path.basename(f_path))[0]
     output_dir_path = os.path.join(output_basedir_path, f_basename)
     os.makedirs(output_dir_path, exist_ok=True)
 
-    images = pdf2image.convert_from_path(f_path)
-    run_partition_pdf(f_path, strategy, images, output_dir_path)
+    is_image = document_type == "image"
+    if is_image:
+        with Image.open(f_path) as img:
+            img = img.convert("RGB")
+            images = [img]
+    else:
+        images = pdf2image.convert_from_path(f_path)
+
+    run_partition_pdf(f_path, strategy, images, output_dir_path, f_basename, is_image)
 
 
 if __name__ == "__main__":
@@ -74,7 +84,11 @@ def run(f_path, strategy):
         print("Invalid strategy")
         sys.exit(1)
 
+    if sys.argv[3] not in ["pdf", "image"]:
+        print("Invalid document type")
+        sys.exit(1)
+
     output_basedir_path = os.path.join(CUR_DIR, "output")
     os.makedirs(output_basedir_path, exist_ok=True)
 
-    run(f_path=sys.argv[1], strategy=sys.argv[2])
+    run(f_path=sys.argv[1], strategy=sys.argv[2], document_type=sys.argv[3])
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 2bc40d4c52..fd7506317c 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.6-dev1"  # pragma: no cover
+__version__ = "0.11.6-dev2"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index fea31a48ef..c7fd88b0f8 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -74,9 +74,13 @@
     prepare_languages_for_tesseract,
 )
 from unstructured.partition.pdf_image.pdf_image_utils import (
+    annotate_layout_elements,
     check_element_types_to_extract,
     save_elements,
 )
+from unstructured.partition.pdf_image.pdfminer_processing import (
+    merge_inferred_with_extracted_layout,
+)
 from unstructured.partition.pdf_image.pdfminer_utils import (
     open_pdfminer_pages_generator,
     rect_to_bbox,
@@ -247,6 +251,8 @@ def _partition_pdf_or_image_local(
     extract_element_types: Optional[List[str]] = None,
     image_output_dir_path: Optional[str] = None,
     pdf_image_dpi: Optional[int] = None,
+    analysis: bool = False,
+    analyzed_image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
     """Partition using package installed locally"""
@@ -286,14 +292,27 @@ def _partition_pdf_or_image_local(
             pdf_image_dpi=pdf_image_dpi,
         )
 
-        if pdf_text_extractable is True:
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = process_file_with_pdfminer(
-                inferred_document_layout,
-                filename,
+        extracted_layout = (
+            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+            if pdf_text_extractable
+            else []
+        )
+
+        if analysis:
+            annotate_layout_elements(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+                filename=filename,
+                output_dir_path=analyzed_image_output_dir_path,
+                pdf_image_dpi=pdf_image_dpi,
+                is_image=is_image,
             )
-        else:
-            merged_document_layout = inferred_document_layout
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+        )
 
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -317,14 +336,16 @@ def _partition_pdf_or_image_local(
         )
         if hasattr(file, "seek"):
             file.seek(0)
-        if pdf_text_extractable is True:
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = process_data_with_pdfminer(
-                inferred_document_layout,
-                file,
-            )
-        else:
-            merged_document_layout = inferred_document_layout
+
+        extracted_layout = (
+            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
+        )
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+        )
 
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -655,7 +676,7 @@ def _process_pdfminer_pages(
                     urls_metadata.append(map_bbox_and_index(words, annot))
 
             if hasattr(obj, "get_text"):
-                _text_snippets: List[str | Any] = [obj.get_text()]  # type: ignore
+                _text_snippets: List = [obj.get_text()]
             else:
                 _text = _extract_text(obj)
                 _text_snippets = re.split(PARAGRAPH_PATTERN, _text)
diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
index e1514300c6..ee9c9d29ff 100644
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -13,6 +13,8 @@
 from unstructured.partition.common import convert_to_bytes
 
 if TYPE_CHECKING:
+    from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
+
     from unstructured.documents.elements import Element
 
 
@@ -159,3 +161,118 @@ def valid_text(text: str) -> bool:
     if not text:
         return False
     return "(cid:" not in text
+
+
+def annotate_layout_elements_with_image(
+    inferred_page_layout: "PageLayout",
+    extracted_page_layout: Optional["PageLayout"],
+    output_dir_path: str,
+    output_f_basename: str,
+    page_number: int,
+):
+    """
+    Annotates a page image with both inferred and extracted layout elements.
+
+    This function takes the layout elements of a single page, either extracted from or inferred
+    for the document, and annotates them on the page image. It creates two separate annotated
+    images, one for each set of layout elements: 'inferred' and 'extracted'.
+    These annotated images are saved to a specified directory.
+    """
+
+    layout_map = {"inferred": {"layout": inferred_page_layout, "color": "blue"}}
+    if extracted_page_layout:
+        layout_map["extracted"] = {"layout": extracted_page_layout, "color": "green"}
+
+    for label, layout_data in layout_map.items():
+        page_layout = layout_data.get("layout")
+        color = layout_data.get("color")
+
+        img = page_layout.annotate(colors=color)
+        output_f_path = os.path.join(
+            output_dir_path, f"{output_f_basename}_{page_number}_{label}.jpg"
+        )
+        write_image(img, output_f_path)
+        print(f"output_image_path: {output_f_path}")
+
+
+def annotate_layout_elements(
+    inferred_document_layout: "DocumentLayout",
+    extracted_layout: List["TextRegion"],
+    filename: str,
+    output_dir_path: str,
+    pdf_image_dpi: int,
+    is_image: bool = False,
+) -> None:
+    """
+    Annotates layout elements on images extracted from a PDF or an image file.
+
+    This function processes a given document (PDF or image) and annotates layout elements based
+    on the inferred and extracted layout information.
+    It handles both PDF documents and standalone image files. For PDFs, it converts each page
+    into an image, whereas for image files, it processes the single image.
+    """
+
+    from unstructured_inference.inference.layout import PageLayout
+
+    output_f_basename = os.path.splitext(os.path.basename(filename))[0]
+    images = []
+    try:
+        if is_image:
+            with Image.open(filename) as img:
+                img = img.convert("RGB")
+                images.append(img)
+
+                extracted_page_layout = None
+                if extracted_layout:
+                    extracted_page_layout = PageLayout(
+                        number=1,
+                        image=img,
+                    )
+                    extracted_page_layout.elements = extracted_layout[0]
+
+                inferred_page_layout = inferred_document_layout.pages[0]
+                inferred_page_layout.image = img
+
+                annotate_layout_elements_with_image(
+                    inferred_page_layout=inferred_document_layout.pages[0],
+                    extracted_page_layout=extracted_page_layout,
+                    output_dir_path=output_dir_path,
+                    output_f_basename=output_f_basename,
+                    page_number=1,
+                )
+        else:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                _image_paths = pdf2image.convert_from_path(
+                    filename,
+                    dpi=pdf_image_dpi,
+                    output_folder=temp_dir,
+                    paths_only=True,
+                )
+                image_paths = cast(List[str], _image_paths)
+                for i, image_path in enumerate(image_paths):
+                    with Image.open(image_path) as img:
+                        page_number = i + 1
+
+                        extracted_page_layout = None
+                        if extracted_layout:
+                            extracted_page_layout = PageLayout(
+                                number=page_number,
+                                image=img,
+                            )
+                            extracted_page_layout.elements = extracted_layout[i]
+
+                        inferred_page_layout = inferred_document_layout.pages[i]
+                        inferred_page_layout.image = img
+
+                        annotate_layout_elements_with_image(
+                            inferred_page_layout=inferred_document_layout.pages[i],
+                            extracted_page_layout=extracted_page_layout,
+                            output_dir_path=output_dir_path,
+                            output_f_basename=output_f_basename,
+                            page_number=page_number,
+                        )
+    except Exception as e:
+        if os.path.isdir(filename) or os.path.isfile(filename):
+            raise e
+        else:
+            raise FileNotFoundError(f'File "{filename}" not found!') from e
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index 8cffdbeaa7..ac186fea78 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -7,7 +7,7 @@
     TextRegion,
 )
 from unstructured_inference.inference.layoutelement import (
-    merge_inferred_layout_with_extracted_layout,
+    merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
 )
 from unstructured_inference.inference.ordering import order_layout
 from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
@@ -25,62 +25,20 @@
 
 
 def process_file_with_pdfminer(
-    inferred_document_layout: "DocumentLayout",
     filename: str = "",
-) -> "DocumentLayout":
+    dpi: int = 200,
+) -> List[List[TextRegion]]:
     with open_filename(filename, "rb") as fp:
         fp = cast(BinaryIO, fp)
-        inferred_document_layout = process_data_with_pdfminer(
-            inferred_document_layout=inferred_document_layout,
+        extracted_layout = process_data_with_pdfminer(
             file=fp,
+            dpi=dpi,
         )
-        return inferred_document_layout
+        return extracted_layout
 
 
 def process_data_with_pdfminer(
-    inferred_document_layout: "DocumentLayout",
     file: Optional[Union[bytes, BinaryIO]] = None,
-) -> "DocumentLayout":
-    """Process document data using PDFMiner to extract layout information."""
-
-    extracted_layouts = get_regions_by_pdfminer(file)
-
-    inferred_pages = inferred_document_layout.pages
-    for i, (inferred_page, extracted_layout) in enumerate(zip(inferred_pages, extracted_layouts)):
-        inferred_layout = inferred_page.elements
-        image_metadata = inferred_page.image_metadata
-        w = image_metadata.get("width")
-        h = image_metadata.get("height")
-        image_size = (w, h)
-
-        threshold_kwargs = {}
-        # NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn
-        # In other case the default values for the functions are used
-        if (
-            isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
-            and "R_50" not in inferred_page.detection_model.model_path
-        ):
-            threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
-
-        merged_layout = merge_inferred_layout_with_extracted_layout(
-            inferred_layout=inferred_layout,
-            extracted_layout=extracted_layout,
-            page_image_size=image_size,
-            **threshold_kwargs,
-        )
-
-        elements = inferred_page.get_elements_from_layout(
-            layout=cast(List[TextRegion], merged_layout),
-            pdf_objects=extracted_layout,
-        )
-
-        inferred_page.elements[:] = elements
-
-    return inferred_document_layout
-
-
-def get_regions_by_pdfminer(
-    fp: Optional[Union[bytes, BinaryIO]],
     dpi: int = 200,
 ) -> List[List[TextRegion]]:
     """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
@@ -89,7 +47,7 @@ def get_regions_by_pdfminer(
     layouts = []
     # Coefficient to rescale bounding box to be compatible with images
     coef = dpi / 72
-    for page, page_layout in open_pdfminer_pages_generator(fp):
+    for page, page_layout in open_pdfminer_pages_generator(file):
         height = page_layout.height
 
         layout: List["TextRegion"] = []
@@ -129,3 +87,43 @@ def get_regions_by_pdfminer(
         layouts.append(layout)
 
     return layouts
+
+
+def merge_inferred_with_extracted_layout(
+    inferred_document_layout: "DocumentLayout",
+    extracted_layout: List[List[TextRegion]],
+) -> "DocumentLayout":
+    inferred_pages = inferred_document_layout.pages
+    for i, (inferred_page, extracted_page_layout) in enumerate(
+        zip(inferred_pages, extracted_layout)
+    ):
+        inferred_layout = inferred_page.elements
+        image_metadata = inferred_page.image_metadata
+        w = image_metadata.get("width")
+        h = image_metadata.get("height")
+        image_size = (w, h)
+
+        threshold_kwargs = {}
+        # NOTE(Benjamin): With this the thresholds are only changed for detectron2_mask_rcnn
+        # In other cases the default values for the functions are used
+        if (
+            isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
+            and "R_50" not in inferred_page.detection_model.model_path
+        ):
+            threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
+
+        merged_layout = merge_inferred_with_extracted_page(
+            inferred_layout=inferred_layout,
+            extracted_layout=extracted_page_layout,
+            page_image_size=image_size,
+            **threshold_kwargs,
+        )
+
+        elements = inferred_page.get_elements_from_layout(
+            layout=cast(List[TextRegion], merged_layout),
+            pdf_objects=extracted_page_layout,
+        )
+
+        inferred_page.elements[:] = elements
+
+    return inferred_document_layout