Skip to content

Commit

Permalink
Refactor: support layout analysis (#2273)
Browse files Browse the repository at this point in the history
### Summary
This PR is the second part of the "layout analysis" refactor, which moves the
layout analysis code from the unstructured-inference repo to the unstructured repo.
The first part was done in
Unstructured-IO/unstructured-inference#305. This
PR adds logic to support annotating `inferred` and `extracted` elements.

### Testing

```
PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy> <document_type>
```
e.g.
```
PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/layout-parser-paper-fast.pdf hi_res pdf
```
  • Loading branch information
christinestraub authored Dec 19, 2023
1 parent 09f86f2 commit 096d23b
Show file tree
Hide file tree
Showing 6 changed files with 230 additions and 78 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## 0.11.6-dev1
## 0.11.6-dev2

### Enhancements

* **Update the layout analysis script.** The previous script only supported annotating `final` elements. The updated script also supports annotating `inferred` and `extracted` elements.

### Features

### Fixes
Expand Down
36 changes: 25 additions & 11 deletions examples/layout-analysis/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import sys

import pdf2image
from unstructured_inference.inference.elements import Rectangle
from PIL import Image
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.visualize import draw_bbox

from unstructured.documents.elements import PageBreak
Expand All @@ -29,11 +30,14 @@ def extract_element_coordinates(elements):
return elements_coordinates


def run_partition_pdf(f_path, strategy, images, output_dir):
def run_partition_pdf(f_path, strategy, images, output_dir, output_f_basename, is_image):
elements = partition_pdf(
f_path,
strategy=strategy,
is_image=is_image,
include_page_breaks=True,
analysis=True,
analyzed_image_output_dir_path=output_dir,
)

elements_coordinates = extract_element_coordinates(elements)
Expand All @@ -44,22 +48,28 @@ def run_partition_pdf(f_path, strategy, images, output_dir):
points = coordinate.points
x1, y1 = points[0]
x2, y2 = points[2]
rect = Rectangle(x1, y1, x2, y2)
img = draw_bbox(img, rect, color="red")

output_image_path = os.path.join(output_dir, f"{strategy}-{idx + 1}.jpg")
print(f"output_image_path: {output_image_path}")
el = TextRegion.from_coords(x1, y1, x2, y2)
img = draw_bbox(img, el, color="red")

output_image_path = os.path.join(output_dir, f"{output_f_basename}_{idx + 1}_final.jpg")
img.save(output_image_path)
print(f"output_image_path: {output_image_path}")


def run(f_path, strategy):
def run(f_path, strategy, document_type):
f_basename = os.path.splitext(os.path.basename(f_path))[0]
output_dir_path = os.path.join(output_basedir_path, f_basename)
os.makedirs(output_dir_path, exist_ok=True)

images = pdf2image.convert_from_path(f_path)
run_partition_pdf(f_path, strategy, images, output_dir_path)
is_image = document_type == "image"
if is_image:
with Image.open(f_path) as img:
img = img.convert("RGB")
images = [img]
else:
images = pdf2image.convert_from_path(f_path)

run_partition_pdf(f_path, strategy, images, output_dir_path, f_basename, is_image)


if __name__ == "__main__":
Expand All @@ -74,7 +84,11 @@ def run(f_path, strategy):
print("Invalid strategy")
sys.exit(1)

if sys.argv[3] not in ["pdf", "image"]:
print("Invalid document type")
sys.exit(1)

output_basedir_path = os.path.join(CUR_DIR, "output")
os.makedirs(output_basedir_path, exist_ok=True)

run(f_path=sys.argv[1], strategy=sys.argv[2])
run(f_path=sys.argv[1], strategy=sys.argv[2], document_type=sys.argv[3])
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.11.6-dev1" # pragma: no cover
__version__ = "0.11.6-dev2" # pragma: no cover
53 changes: 37 additions & 16 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,13 @@
prepare_languages_for_tesseract,
)
from unstructured.partition.pdf_image.pdf_image_utils import (
annotate_layout_elements,
check_element_types_to_extract,
save_elements,
)
from unstructured.partition.pdf_image.pdfminer_processing import (
merge_inferred_with_extracted_layout,
)
from unstructured.partition.pdf_image.pdfminer_utils import (
open_pdfminer_pages_generator,
rect_to_bbox,
Expand Down Expand Up @@ -247,6 +251,8 @@ def _partition_pdf_or_image_local(
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
analysis: bool = False,
analyzed_image_output_dir_path: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Partition using package installed locally"""
Expand Down Expand Up @@ -286,14 +292,27 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
)

if pdf_text_extractable is True:
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = process_file_with_pdfminer(
inferred_document_layout,
filename,
extracted_layout = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)

if analysis:
annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
filename=filename,
output_dir_path=analyzed_image_output_dir_path,
pdf_image_dpi=pdf_image_dpi,
is_image=is_image,
)
else:
merged_document_layout = inferred_document_layout

# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
)

if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand All @@ -317,14 +336,16 @@ def _partition_pdf_or_image_local(
)
if hasattr(file, "seek"):
file.seek(0)
if pdf_text_extractable is True:
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = process_data_with_pdfminer(
inferred_document_layout,
file,
)
else:
merged_document_layout = inferred_document_layout

extracted_layout = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
)

# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
)

if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand Down Expand Up @@ -655,7 +676,7 @@ def _process_pdfminer_pages(
urls_metadata.append(map_bbox_and_index(words, annot))

if hasattr(obj, "get_text"):
_text_snippets: List[str | Any] = [obj.get_text()] # type: ignore
_text_snippets: List = [obj.get_text()]
else:
_text = _extract_text(obj)
_text_snippets = re.split(PARAGRAPH_PATTERN, _text)
Expand Down
117 changes: 117 additions & 0 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from unstructured.partition.common import convert_to_bytes

if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion

from unstructured.documents.elements import Element


Expand Down Expand Up @@ -159,3 +161,118 @@ def valid_text(text: str) -> bool:
if not text:
return False
return "(cid:" not in text


def annotate_layout_elements_with_image(
    inferred_page_layout: "PageLayout",
    extracted_page_layout: Optional["PageLayout"],
    output_dir_path: str,
    output_f_basename: str,
    page_number: int,
):
    """
    Write annotated images for the layouts of a single page.

    The inferred layout is always annotated (in blue). The extracted layout is annotated
    (in green) only when `extracted_page_layout` is truthy. Each annotated image is produced
    by the layout's own `annotate` method and saved with `write_image` as
    `<output_f_basename>_<page_number>_<label>.jpg` inside `output_dir_path`, where `<label>`
    is `inferred` or `extracted`. The path of every written image is printed.
    """

    # (label, layout, color) triples, processed in this order: inferred first, then extracted.
    annotation_jobs = [("inferred", inferred_page_layout, "blue")]
    if extracted_page_layout:
        annotation_jobs.append(("extracted", extracted_page_layout, "green"))

    for label, page_layout, color in annotation_jobs:
        annotated_img = page_layout.annotate(colors=color)
        target_name = f"{output_f_basename}_{page_number}_{label}.jpg"
        output_f_path = os.path.join(output_dir_path, target_name)
        write_image(annotated_img, output_f_path)
        print(f"output_image_path: {output_f_path}")


def annotate_layout_elements(
    inferred_document_layout: "DocumentLayout",
    extracted_layout: List["TextRegion"],
    filename: str,
    output_dir_path: str,
    pdf_image_dpi: int,
    is_image: bool = False,
) -> None:
    """
    Annotate the inferred and extracted layout elements of a PDF or image file.

    For an image file (`is_image=True`) the file is opened, converted to RGB and treated as
    page 1. For a PDF, every page is rendered to an image in a temporary directory with
    `pdf2image.convert_from_path` at `pdf_image_dpi`, and pages are numbered from 1 in the
    order the rendered paths are returned.

    For each page, the page image is assigned to the matching entry of
    `inferred_document_layout.pages` (this mutates the passed-in layout). When
    `extracted_layout` is non-empty, a `PageLayout` is also built from the page image with
    `extracted_layout[page_index]` as its elements. Both layouts are then handed to
    `annotate_layout_elements_with_image`, which writes the annotated images under
    `output_dir_path` using the base name of `filename` (extension removed).

    NOTE(review): `extracted_layout` is indexed per page and each entry is assigned to
    `PageLayout.elements`, so it presumably holds one collection of regions per page rather
    than a flat list of regions -- confirm against the caller before tightening the annotation.

    Raises:
        FileNotFoundError: if any step fails and `filename` is neither an existing file nor
            an existing directory. The original exception is chained as the cause.
        Exception: any other failure is re-raised unchanged.
    """

    from unstructured_inference.inference.layout import PageLayout

    output_f_basename = os.path.splitext(os.path.basename(filename))[0]

    def _annotate_page(page_image, page_index: int) -> None:
        """Attach `page_image` to the layouts of page `page_index` and write the annotations."""
        page_number = page_index + 1

        extracted_page_layout = None
        if extracted_layout:
            extracted_page_layout = PageLayout(
                number=page_number,
                image=page_image,
            )
            extracted_page_layout.elements = extracted_layout[page_index]

        inferred_page_layout = inferred_document_layout.pages[page_index]
        inferred_page_layout.image = page_image

        annotate_layout_elements_with_image(
            inferred_page_layout=inferred_page_layout,
            extracted_page_layout=extracted_page_layout,
            output_dir_path=output_dir_path,
            output_f_basename=output_f_basename,
            page_number=page_number,
        )

    try:
        if is_image:
            with Image.open(filename) as img:
                # A standalone image is a one-page document; annotate the RGB copy.
                _annotate_page(img.convert("RGB"), 0)
        else:
            with tempfile.TemporaryDirectory() as temp_dir:
                _image_paths = pdf2image.convert_from_path(
                    filename,
                    dpi=pdf_image_dpi,
                    output_folder=temp_dir,
                    paths_only=True,
                )
                image_paths = cast(List[str], _image_paths)
                for i, image_path in enumerate(image_paths):
                    # The rendered page files live in temp_dir, so each page must be
                    # annotated while both the directory and the opened image are alive.
                    with Image.open(image_path) as img:
                        _annotate_page(img, i)
    except Exception as e:
        if os.path.isdir(filename) or os.path.isfile(filename):
            # The input exists, so the failure is unrelated to a missing path: propagate as-is.
            raise
        raise FileNotFoundError(f'File "{filename}" not found!') from e
Loading

0 comments on commit 096d23b

Please sign in to comment.