Refactor: support layout analysis (#2273)

### Summary

This PR is the second part of the "layout analysis" refactor, which moves the layout analysis logic from the unstructured-inference repo to the unstructured repo; the first part was done in Unstructured-IO/unstructured-inference#305. This PR adds logic to support annotating `inferred` and `extracted` elements.

### Testing

```
PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy> <document_type>
```

e.g.

```
PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/layout-parser-paper-fast.pdf hi_res pdf
```
Unstructured-IO · Dec 19, 2023 · 096d23b · 096d23b
1 parent 09f86f2
commit 096d23b
Show file tree

Hide file tree

Showing 6 changed files with 230 additions and 78 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.11.6-dev1
+## 0.11.6-dev2
 
 ### Enhancements
 
+* **Update the layout analysis script.** The previous script only supported annotating `final` elements. The updated script also supports annotating `inferred` and `extracted` elements.
+
 ### Features
 
 ### Fixes

diff --git a/examples/layout-analysis/visualization.py b/examples/layout-analysis/visualization.py
@@ -3,7 +3,8 @@
 import sys
 
 import pdf2image
-from unstructured_inference.inference.elements import Rectangle
+from PIL import Image
+from unstructured_inference.inference.elements import TextRegion
 from unstructured_inference.visualize import draw_bbox
 
 from unstructured.documents.elements import PageBreak
@@ -29,11 +30,14 @@ def extract_element_coordinates(elements):
     return elements_coordinates
 
 
-def run_partition_pdf(f_path, strategy, images, output_dir):
+def run_partition_pdf(f_path, strategy, images, output_dir, output_f_basename, is_image):
     elements = partition_pdf(
         f_path,
         strategy=strategy,
+        is_image=is_image,
         include_page_breaks=True,
+        analysis=True,
+        analyzed_image_output_dir_path=output_dir,
     )
 
     elements_coordinates = extract_element_coordinates(elements)
@@ -44,22 +48,28 @@ def run_partition_pdf(f_path, strategy, images, output_dir):
             points = coordinate.points
             x1, y1 = points[0]
             x2, y2 = points[2]
-            rect = Rectangle(x1, y1, x2, y2)
-            img = draw_bbox(img, rect, color="red")
-
-        output_image_path = os.path.join(output_dir, f"{strategy}-{idx + 1}.jpg")
-        print(f"output_image_path: {output_image_path}")
+            el = TextRegion.from_coords(x1, y1, x2, y2)
+            img = draw_bbox(img, el, color="red")
 
+        output_image_path = os.path.join(output_dir, f"{output_f_basename}_{idx + 1}_final.jpg")
         img.save(output_image_path)
+        print(f"output_image_path: {output_image_path}")
 
 
-def run(f_path, strategy):
+def run(f_path, strategy, document_type):
     f_basename = os.path.splitext(os.path.basename(f_path))[0]
     output_dir_path = os.path.join(output_basedir_path, f_basename)
     os.makedirs(output_dir_path, exist_ok=True)
 
-    images = pdf2image.convert_from_path(f_path)
-    run_partition_pdf(f_path, strategy, images, output_dir_path)
+    is_image = document_type == "image"
+    if is_image:
+        with Image.open(f_path) as img:
+            img = img.convert("RGB")
+            images = [img]
+    else:
+        images = pdf2image.convert_from_path(f_path)
+
+    run_partition_pdf(f_path, strategy, images, output_dir_path, f_basename, is_image)
 
 
 if __name__ == "__main__":
@@ -74,7 +84,11 @@ def run(f_path, strategy):
         print("Invalid strategy")
         sys.exit(1)
 
+    if sys.argv[3] not in ["pdf", "image"]:
+        print("Invalid document type")
+        sys.exit(1)
+
     output_basedir_path = os.path.join(CUR_DIR, "output")
     os.makedirs(output_basedir_path, exist_ok=True)
 
-    run(f_path=sys.argv[1], strategy=sys.argv[2])
+    run(f_path=sys.argv[1], strategy=sys.argv[2], document_type=sys.argv[3])
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.6-dev1"  # pragma: no cover
+__version__ = "0.11.6-dev2"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -74,9 +74,13 @@
     prepare_languages_for_tesseract,
 )
 from unstructured.partition.pdf_image.pdf_image_utils import (
+    annotate_layout_elements,
     check_element_types_to_extract,
     save_elements,
 )
+from unstructured.partition.pdf_image.pdfminer_processing import (
+    merge_inferred_with_extracted_layout,
+)
 from unstructured.partition.pdf_image.pdfminer_utils import (
     open_pdfminer_pages_generator,
     rect_to_bbox,
@@ -247,6 +251,8 @@ def _partition_pdf_or_image_local(
     extract_element_types: Optional[List[str]] = None,
     image_output_dir_path: Optional[str] = None,
     pdf_image_dpi: Optional[int] = None,
+    analysis: bool = False,
+    analyzed_image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
     """Partition using package installed locally"""
@@ -286,14 +292,27 @@ def _partition_pdf_or_image_local(
             pdf_image_dpi=pdf_image_dpi,
         )
 
-        if pdf_text_extractable is True:
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = process_file_with_pdfminer(
-                inferred_document_layout,
-                filename,
+        extracted_layout = (
+            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+            if pdf_text_extractable
+            else []
+        )
+
+        if analysis:
+            annotate_layout_elements(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+                filename=filename,
+                output_dir_path=analyzed_image_output_dir_path,
+                pdf_image_dpi=pdf_image_dpi,
+                is_image=is_image,
             )
-        else:
-            merged_document_layout = inferred_document_layout
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+        )
 
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -317,14 +336,16 @@ def _partition_pdf_or_image_local(
         )
         if hasattr(file, "seek"):
             file.seek(0)
-        if pdf_text_extractable is True:
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = process_data_with_pdfminer(
-                inferred_document_layout,
-                file,
-            )
-        else:
-            merged_document_layout = inferred_document_layout
+
+        extracted_layout = (
+            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
+        )
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+        )
 
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -655,7 +676,7 @@ def _process_pdfminer_pages(
                     urls_metadata.append(map_bbox_and_index(words, annot))
 
             if hasattr(obj, "get_text"):
-                _text_snippets: List[str | Any] = [obj.get_text()]  # type: ignore
+                _text_snippets: List = [obj.get_text()]
             else:
                 _text = _extract_text(obj)
                 _text_snippets = re.split(PARAGRAPH_PATTERN, _text)

diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -13,6 +13,8 @@
 from unstructured.partition.common import convert_to_bytes
 
 if TYPE_CHECKING:
+    from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
+
     from unstructured.documents.elements import Element
 
 
@@ -159,3 +161,118 @@ def valid_text(text: str) -> bool:
     if not text:
         return False
     return "(cid:" not in text
+
+
+def annotate_layout_elements_with_image(
+    inferred_page_layout: "PageLayout",
+    extracted_page_layout: Optional["PageLayout"],
+    output_dir_path: str,
+    output_f_basename: str,
+    page_number: int,
+):
+    """
+    Annotate one page with its inferred and, when given, extracted layout elements.
+
+    Each layout is drawn with its own `annotate(colors=...)` call: "inferred" in blue, and
+    "extracted" in green (skipped when `extracted_page_layout` is falsy). Every result is written
+    with `write_image` to `<output_dir_path>/<output_f_basename>_<page_number>_<label>.jpg` and
+    the output path is printed. Nothing is returned.
+    """
+
+    layout_map = {"inferred": {"layout": inferred_page_layout, "color": "blue"}}
+    if extracted_page_layout:
+        layout_map["extracted"] = {"layout": extracted_page_layout, "color": "green"}
+
+    for label, layout_data in layout_map.items():
+        page_layout = layout_data.get("layout")
+        color = layout_data.get("color")
+
+        img = page_layout.annotate(colors=color)
+        output_f_path = os.path.join(
+            output_dir_path, f"{output_f_basename}_{page_number}_{label}.jpg"
+        )
+        write_image(img, output_f_path)
+        print(f"output_image_path: {output_f_path}")
+
+
+def annotate_layout_elements(
+    inferred_document_layout: "DocumentLayout",
+    extracted_layout: List["TextRegion"],
+    filename: str,
+    output_dir_path: str,
+    pdf_image_dpi: int,
+    is_image: bool = False,
+) -> None:
+    """
+    Save annotated page images for the inferred and extracted layouts of a PDF or image file.
+
+    With `is_image`, `filename` is opened once and converted to RGB (page 1); otherwise each page
+    is rendered by `pdf2image.convert_from_path` at `pdf_image_dpi` into a temporary directory.
+    Page `i + 1` pairs `inferred_document_layout.pages[i]` with `extracted_layout[i]` (if any).
+    On any failure, FileNotFoundError is raised if `filename` is not on disk, else it re-raises.
+    """
+
+    from unstructured_inference.inference.layout import PageLayout
+
+    output_f_basename = os.path.splitext(os.path.basename(filename))[0]
+    images = []  # NOTE(review): only appended to below; never read in this function
+    try:
+        if is_image:
+            with Image.open(filename) as img:
+                img = img.convert("RGB")
+                images.append(img)
+
+                extracted_page_layout = None
+                if extracted_layout:
+                    extracted_page_layout = PageLayout(
+                        number=1,
+                        image=img,
+                    )
+                    extracted_page_layout.elements = extracted_layout[0]
+
+                inferred_page_layout = inferred_document_layout.pages[0]
+                inferred_page_layout.image = img  # sets the image on the caller's page object
+
+                annotate_layout_elements_with_image(
+                    inferred_page_layout=inferred_document_layout.pages[0],
+                    extracted_page_layout=extracted_page_layout,
+                    output_dir_path=output_dir_path,
+                    output_f_basename=output_f_basename,
+                    page_number=1,
+                )
+        else:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                _image_paths = pdf2image.convert_from_path(
+                    filename,
+                    dpi=pdf_image_dpi,
+                    output_folder=temp_dir,
+                    paths_only=True,
+                )
+                image_paths = cast(List[str], _image_paths)
+                for i, image_path in enumerate(image_paths):
+                    with Image.open(image_path) as img:
+                        page_number = i + 1
+
+                        extracted_page_layout = None
+                        if extracted_layout:
+                            extracted_page_layout = PageLayout(
+                                number=page_number,
+                                image=img,
+                            )
+                            extracted_page_layout.elements = extracted_layout[i]
+
+                        inferred_page_layout = inferred_document_layout.pages[i]
+                        inferred_page_layout.image = img  # NOTE(review): img closes at end of `with`
+
+                        annotate_layout_elements_with_image(
+                            inferred_page_layout=inferred_document_layout.pages[i],
+                            extracted_page_layout=extracted_page_layout,
+                            output_dir_path=output_dir_path,
+                            output_f_basename=output_f_basename,
+                            page_number=page_number,
+                        )
+    except Exception as e:  # any failure: re-raise as-is, or report a missing file
+        if os.path.isdir(filename) or os.path.isfile(filename):
+            raise e
+        else:
+            raise FileNotFoundError(f'File "{filename}" not found!') from e