From 096d23bc28d96376aaf0822b6980103b120201cc Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Mon, 18 Dec 2023 22:21:56 -0800 Subject: [PATCH] Refactor: support layout analysis (#2273) ### Summary This PR is the second part of the "layout analysis" refactor, which moves layout analysis from the unstructured-inference repo to the unstructured repo; the first part was done in https://github.com/Unstructured-IO/unstructured-inference/pull/305. This PR adds logic to support annotating `inferred` and `extracted` elements. ### Testing ``` PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy> <document_type> ``` e.g. ``` PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/layout-parser-paper-fast.pdf hi_res pdf ``` --- CHANGELOG.md | 4 +- examples/layout-analysis/visualization.py | 36 ++++-- unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 53 +++++--- .../partition/pdf_image/pdf_image_utils.py | 117 ++++++++++++++++++ .../pdf_image/pdfminer_processing.py | 96 +++++++------- 6 files changed, 230 insertions(+), 78 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc44acf1ff..5ee219d2d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ -## 0.11.6-dev1 +## 0.11.6-dev2 ### Enhancements +* **Update the layout analysis script.** The previous script only supported annotating `final` elements. The updated script also supports annotating `inferred` and `extracted` elements. 
+ ### Features ### Fixes diff --git a/examples/layout-analysis/visualization.py b/examples/layout-analysis/visualization.py index a13bb930cb..ae42cc8a1e 100644 --- a/examples/layout-analysis/visualization.py +++ b/examples/layout-analysis/visualization.py @@ -3,7 +3,8 @@ import sys import pdf2image -from unstructured_inference.inference.elements import Rectangle +from PIL import Image +from unstructured_inference.inference.elements import TextRegion from unstructured_inference.visualize import draw_bbox from unstructured.documents.elements import PageBreak @@ -29,11 +30,14 @@ def extract_element_coordinates(elements): return elements_coordinates -def run_partition_pdf(f_path, strategy, images, output_dir): +def run_partition_pdf(f_path, strategy, images, output_dir, output_f_basename, is_image): elements = partition_pdf( f_path, strategy=strategy, + is_image=is_image, include_page_breaks=True, + analysis=True, + analyzed_image_output_dir_path=output_dir, ) elements_coordinates = extract_element_coordinates(elements) @@ -44,22 +48,28 @@ def run_partition_pdf(f_path, strategy, images, output_dir): points = coordinate.points x1, y1 = points[0] x2, y2 = points[2] - rect = Rectangle(x1, y1, x2, y2) - img = draw_bbox(img, rect, color="red") - - output_image_path = os.path.join(output_dir, f"{strategy}-{idx + 1}.jpg") - print(f"output_image_path: {output_image_path}") + el = TextRegion.from_coords(x1, y1, x2, y2) + img = draw_bbox(img, el, color="red") + output_image_path = os.path.join(output_dir, f"{output_f_basename}_{idx + 1}_final.jpg") img.save(output_image_path) + print(f"output_image_path: {output_image_path}") -def run(f_path, strategy): +def run(f_path, strategy, document_type): f_basename = os.path.splitext(os.path.basename(f_path))[0] output_dir_path = os.path.join(output_basedir_path, f_basename) os.makedirs(output_dir_path, exist_ok=True) - images = pdf2image.convert_from_path(f_path) - run_partition_pdf(f_path, strategy, images, output_dir_path) + is_image 
= document_type == "image" + if is_image: + with Image.open(f_path) as img: + img = img.convert("RGB") + images = [img] + else: + images = pdf2image.convert_from_path(f_path) + + run_partition_pdf(f_path, strategy, images, output_dir_path, f_basename, is_image) if __name__ == "__main__": @@ -74,7 +84,11 @@ def run(f_path, strategy): print("Invalid strategy") sys.exit(1) + if sys.argv[3] not in ["pdf", "image"]: + print("Invalid document type") + sys.exit(1) + output_basedir_path = os.path.join(CUR_DIR, "output") os.makedirs(output_basedir_path, exist_ok=True) - run(f_path=sys.argv[1], strategy=sys.argv[2]) + run(f_path=sys.argv[1], strategy=sys.argv[2], document_type=sys.argv[3]) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2bc40d4c52..fd7506317c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.6-dev1" # pragma: no cover +__version__ = "0.11.6-dev2" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index fea31a48ef..c7fd88b0f8 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -74,9 +74,13 @@ prepare_languages_for_tesseract, ) from unstructured.partition.pdf_image.pdf_image_utils import ( + annotate_layout_elements, check_element_types_to_extract, save_elements, ) +from unstructured.partition.pdf_image.pdfminer_processing import ( + merge_inferred_with_extracted_layout, +) from unstructured.partition.pdf_image.pdfminer_utils import ( open_pdfminer_pages_generator, rect_to_bbox, @@ -247,6 +251,8 @@ def _partition_pdf_or_image_local( extract_element_types: Optional[List[str]] = None, image_output_dir_path: Optional[str] = None, pdf_image_dpi: Optional[int] = None, + analysis: bool = False, + analyzed_image_output_dir_path: Optional[str] = None, **kwargs, ) -> List[Element]: """Partition using package installed locally""" @@ -286,14 +292,27 @@ def _partition_pdf_or_image_local( 
pdf_image_dpi=pdf_image_dpi, ) - if pdf_text_extractable is True: - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = process_file_with_pdfminer( - inferred_document_layout, - filename, + extracted_layout = ( + process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) + if pdf_text_extractable + else [] + ) + + if analysis: + annotate_layout_elements( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + filename=filename, + output_dir_path=analyzed_image_output_dir_path, + pdf_image_dpi=pdf_image_dpi, + is_image=is_image, ) - else: - merged_document_layout = inferred_document_layout + + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = merge_inferred_with_extracted_layout( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + ) if model_name.startswith("chipper"): # NOTE(alan): We shouldn't do OCR with chipper @@ -317,14 +336,16 @@ def _partition_pdf_or_image_local( ) if hasattr(file, "seek"): file.seek(0) - if pdf_text_extractable is True: - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = process_data_with_pdfminer( - inferred_document_layout, - file, - ) - else: - merged_document_layout = inferred_document_layout + + extracted_layout = ( + process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else [] + ) + + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = merge_inferred_with_extracted_layout( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + ) if model_name.startswith("chipper"): # NOTE(alan): We shouldn't do OCR with chipper @@ -655,7 +676,7 @@ def _process_pdfminer_pages( urls_metadata.append(map_bbox_and_index(words, annot)) if hasattr(obj, "get_text"): - _text_snippets: List[str | Any] 
= [obj.get_text()] # type: ignore + _text_snippets: List = [obj.get_text()] else: _text = _extract_text(obj) _text_snippets = re.split(PARAGRAPH_PATTERN, _text) diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index e1514300c6..ee9c9d29ff 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -13,6 +13,8 @@ from unstructured.partition.common import convert_to_bytes if TYPE_CHECKING: + from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion + from unstructured.documents.elements import Element @@ -159,3 +161,118 @@ def valid_text(text: str) -> bool: if not text: return False return "(cid:" not in text + + +def annotate_layout_elements_with_image( + inferred_page_layout: "PageLayout", + extracted_page_layout: Optional["PageLayout"], + output_dir_path: str, + output_f_basename: str, + page_number: int, +): + """ + Annotates a page image with both inferred and extracted layout elements. + + This function takes the layout elements of a single page, either extracted from or inferred + for the document, and annotates them on the page image. It creates two separate annotated + images, one for each set of layout elements: 'inferred' and 'extracted'. + These annotated images are saved to a specified directory. 
+ """ + + layout_map = {"inferred": {"layout": inferred_page_layout, "color": "blue"}} + if extracted_page_layout: + layout_map["extracted"] = {"layout": extracted_page_layout, "color": "green"} + + for label, layout_data in layout_map.items(): + page_layout = layout_data.get("layout") + color = layout_data.get("color") + + img = page_layout.annotate(colors=color) + output_f_path = os.path.join( + output_dir_path, f"{output_f_basename}_{page_number}_{label}.jpg" + ) + write_image(img, output_f_path) + print(f"output_image_path: {output_f_path}") + + +def annotate_layout_elements( + inferred_document_layout: "DocumentLayout", + extracted_layout: List["TextRegion"], + filename: str, + output_dir_path: str, + pdf_image_dpi: int, + is_image: bool = False, +) -> None: + """ + Annotates layout elements on images extracted from a PDF or an image file. + + This function processes a given document (PDF or image) and annotates layout elements based + on the inferred and extracted layout information. + It handles both PDF documents and standalone image files. For PDFs, it converts each page + into an image, whereas for image files, it processes the single image. 
+ """ + + from unstructured_inference.inference.layout import PageLayout + + output_f_basename = os.path.splitext(os.path.basename(filename))[0] + images = [] + try: + if is_image: + with Image.open(filename) as img: + img = img.convert("RGB") + images.append(img) + + extracted_page_layout = None + if extracted_layout: + extracted_page_layout = PageLayout( + number=1, + image=img, + ) + extracted_page_layout.elements = extracted_layout[0] + + inferred_page_layout = inferred_document_layout.pages[0] + inferred_page_layout.image = img + + annotate_layout_elements_with_image( + inferred_page_layout=inferred_document_layout.pages[0], + extracted_page_layout=extracted_page_layout, + output_dir_path=output_dir_path, + output_f_basename=output_f_basename, + page_number=1, + ) + else: + with tempfile.TemporaryDirectory() as temp_dir: + _image_paths = pdf2image.convert_from_path( + filename, + dpi=pdf_image_dpi, + output_folder=temp_dir, + paths_only=True, + ) + image_paths = cast(List[str], _image_paths) + for i, image_path in enumerate(image_paths): + with Image.open(image_path) as img: + page_number = i + 1 + + extracted_page_layout = None + if extracted_layout: + extracted_page_layout = PageLayout( + number=page_number, + image=img, + ) + extracted_page_layout.elements = extracted_layout[i] + + inferred_page_layout = inferred_document_layout.pages[i] + inferred_page_layout.image = img + + annotate_layout_elements_with_image( + inferred_page_layout=inferred_document_layout.pages[i], + extracted_page_layout=extracted_page_layout, + output_dir_path=output_dir_path, + output_f_basename=output_f_basename, + page_number=page_number, + ) + except Exception as e: + if os.path.isdir(filename) or os.path.isfile(filename): + raise e + else: + raise FileNotFoundError(f'File "{filename}" not found!') from e diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 8cffdbeaa7..ac186fea78 100644 --- 
a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -7,7 +7,7 @@ TextRegion, ) from unstructured_inference.inference.layoutelement import ( - merge_inferred_layout_with_extracted_layout, + merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page, ) from unstructured_inference.inference.ordering import order_layout from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel @@ -25,62 +25,20 @@ def process_file_with_pdfminer( - inferred_document_layout: "DocumentLayout", filename: str = "", -) -> "DocumentLayout": + dpi: int = 200, +) -> List[List[TextRegion]]: with open_filename(filename, "rb") as fp: fp = cast(BinaryIO, fp) - inferred_document_layout = process_data_with_pdfminer( - inferred_document_layout=inferred_document_layout, + extracted_layout = process_data_with_pdfminer( file=fp, + dpi=dpi, ) - return inferred_document_layout + return extracted_layout def process_data_with_pdfminer( - inferred_document_layout: "DocumentLayout", file: Optional[Union[bytes, BinaryIO]] = None, -) -> "DocumentLayout": - """Process document data using PDFMiner to extract layout information.""" - - extracted_layouts = get_regions_by_pdfminer(file) - - inferred_pages = inferred_document_layout.pages - for i, (inferred_page, extracted_layout) in enumerate(zip(inferred_pages, extracted_layouts)): - inferred_layout = inferred_page.elements - image_metadata = inferred_page.image_metadata - w = image_metadata.get("width") - h = image_metadata.get("height") - image_size = (w, h) - - threshold_kwargs = {} - # NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn - # In other case the default values for the functions are used - if ( - isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel) - and "R_50" not in inferred_page.detection_model.model_path - ): - threshold_kwargs = {"same_region_threshold": 0.5, 
"subregion_threshold": 0.5} - - merged_layout = merge_inferred_layout_with_extracted_layout( - inferred_layout=inferred_layout, - extracted_layout=extracted_layout, - page_image_size=image_size, - **threshold_kwargs, - ) - - elements = inferred_page.get_elements_from_layout( - layout=cast(List[TextRegion], merged_layout), - pdf_objects=extracted_layout, - ) - - inferred_page.elements[:] = elements - - return inferred_document_layout - - -def get_regions_by_pdfminer( - fp: Optional[Union[bytes, BinaryIO]], dpi: int = 200, ) -> List[List[TextRegion]]: """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the @@ -89,7 +47,7 @@ def get_regions_by_pdfminer( layouts = [] # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 - for page, page_layout in open_pdfminer_pages_generator(fp): + for page, page_layout in open_pdfminer_pages_generator(file): height = page_layout.height layout: List["TextRegion"] = [] @@ -129,3 +87,43 @@ def get_regions_by_pdfminer( layouts.append(layout) return layouts + + +def merge_inferred_with_extracted_layout( + inferred_document_layout: "DocumentLayout", + extracted_layout: List[List[TextRegion]], +) -> "DocumentLayout": + inferred_pages = inferred_document_layout.pages + for i, (inferred_page, extracted_page_layout) in enumerate( + zip(inferred_pages, extracted_layout) + ): + inferred_layout = inferred_page.elements + image_metadata = inferred_page.image_metadata + w = image_metadata.get("width") + h = image_metadata.get("height") + image_size = (w, h) + + threshold_kwargs = {} + # NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn + # In other case the default values for the functions are used + if ( + isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel) + and "R_50" not in inferred_page.detection_model.model_path + ): + threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5} + + merged_layout = 
merge_inferred_with_extracted_page( + inferred_layout=inferred_layout, + extracted_layout=extracted_page_layout, + page_image_size=image_size, + **threshold_kwargs, + ) + + elements = inferred_page.get_elements_from_layout( + layout=cast(List[TextRegion], merged_layout), + pdf_objects=extracted_page_layout, + ) + + inferred_page.elements[:] = elements + + return inferred_document_layout