Addresses GitHub issue #147

* Add functionality to extract the text aggregated from the regions of the OCR layout that lie within a given block
* Add functionality to merge the inferred layout with the OCR layout
* Add functionality to populate inferred region text with OCR text when merging the inferred layout with the embedded layout
* Populate inferred region text with OCR text only for inferred regions that are not already populated with text
* Make entire-page OCR optional
* Update the evaluation script
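To picture the merge-and-populate behavior described in the bullets above, here is a minimal sketch. It is not the code added by this commit: the helper names aggregate_ocr_text_from_block and merge_inferred_with_ocr_layout are illustrative only, and it assumes both layouts are lists of TextRegion objects exposing the text, pad, and is_almost_subregion_of members used in the OCR script further below.

from typing import List

from unstructured_inference.inference.elements import TextRegion


def aggregate_ocr_text_from_block(block: TextRegion, ocr_layout: List[TextRegion]) -> str:
    # Collect the text of OCR regions that lie (almost) within the given block,
    # using the same padding and threshold values as the script below.
    texts = [
        region.text
        for region in ocr_layout
        if region.text and region.is_almost_subregion_of(block.pad(12), subregion_threshold=0.75)
    ]
    return " ".join(texts)


def merge_inferred_with_ocr_layout(
    inferred_layout: List[TextRegion],
    ocr_layout: List[TextRegion],
) -> List[TextRegion]:
    # Hypothetical merge step: only inferred regions with no text yet
    # are populated with the aggregated OCR text.
    for inferred_region in inferred_layout:
        if not inferred_region.text:
            inferred_region.text = aggregate_ocr_text_from_block(inferred_region, ocr_layout)
    return inferred_layout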
1 parent 203f7ab · commit 15bbc56
Showing 14 changed files with 499 additions and 12 deletions.
@@ -1,3 +1,7 @@
## 0.5.10

* Implement full-page OCR

## 0.5.9

* Handle exceptions from Tesseract
@@ -0,0 +1,145 @@
import os
import re
import time
from typing import List, cast

import cv2
import numpy as np
import pytesseract
from pytesseract import Output

from unstructured_inference.inference import layout
from unstructured_inference.inference.elements import Rectangle, TextRegion


def remove_non_printable(s):
    dst_str = re.sub(r'[^\x20-\x7E]', ' ', s)
    return ' '.join(dst_str.split())


def run_ocr_with_layout_detection(
    images,
    detection_model=None,
    element_extraction_model=None,
    mode="individual_blocks",
    output_dir="",
    drawable=True,
    printable=True,
):
    total_text_extraction_infer_time = 0
    total_extracted_text = {}
    for i, image in enumerate(images):
        page_num = i + 1
        page_num_str = f"page{page_num}"

        page = layout.PageLayout(
            number=i + 1,
            image=image,
            layout=None,
            detection_model=detection_model,
            element_extraction_model=element_extraction_model,
        )

        inferred_layout: List[TextRegion] = cast(List[TextRegion], page.detection_model(page.image))

        cv_img = np.array(image)

        if mode == "individual_blocks":
            # OCR individual blocks (current approach)
            text_extraction_start_time = time.time()

            elements = page.get_elements_from_layout(inferred_layout)

            text_extraction_infer_time = time.time() - text_extraction_start_time

            total_text_extraction_infer_time += text_extraction_infer_time

            page_text = ""
            for el in elements:
                page_text += el.text
            filtered_page_text = remove_non_printable(page_text)
            total_extracted_text[page_num_str] = filtered_page_text
        elif mode == "entire_page":
            # OCR the entire page (new approach to implement)
            text_extraction_start_time = time.time()

            ocr_data = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
            boxes = ocr_data['level']
            extracted_text_list = []
            for k in range(len(boxes)):
                (x, y, w, h) = ocr_data['left'][k], ocr_data['top'][k], ocr_data['width'][k], ocr_data['height'][k]
                extracted_text = ocr_data['text'][k]
                if not extracted_text:
                    continue

                extracted_region = Rectangle(x1=x, y1=y, x2=x + w, y2=y + h)

                extracted_is_subregion_of_inferred = False
                for inferred_region in inferred_layout:
                    extracted_is_subregion_of_inferred = extracted_region.is_almost_subregion_of(
                        inferred_region.pad(12),
                        subregion_threshold=0.75,
                    )
                    if extracted_is_subregion_of_inferred:
                        break

                if extracted_is_subregion_of_inferred:
                    extracted_text_list.append(extracted_text)

                if drawable:
                    if extracted_is_subregion_of_inferred:
                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
                    else:
                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (255, 0, 0), 2)

            text_extraction_infer_time = time.time() - text_extraction_start_time
            total_text_extraction_infer_time += text_extraction_infer_time

            page_text = " ".join(extracted_text_list)
            filtered_page_text = remove_non_printable(page_text)
            total_extracted_text[page_num_str] = filtered_page_text
        else:
            raise ValueError("Invalid mode")

        if drawable:
            for el in inferred_layout:
                pt1 = (int(el.x1), int(el.y1))
                pt2 = (int(el.x2), int(el.y2))
                cv2.rectangle(
                    img=cv_img,
                    pt1=pt1, pt2=pt2,
                    color=(0, 0, 255),
                    thickness=4,
                )

            f_path = os.path.join(output_dir, f"ocr_{mode}_{page_num_str}.jpg")
            cv2.imwrite(f_path, cv_img)

        if printable:
            print(f"page: {i + 1} - n_layout_elements: {len(inferred_layout)} - "
                  f"text_extraction_infer_time: {text_extraction_infer_time}")

    return total_text_extraction_infer_time, total_extracted_text


def run_ocr(
    images,
    printable=True,
):
    total_text_extraction_infer_time = 0
    total_text = ""
    for i, image in enumerate(images):
        text_extraction_start_time = time.time()

        page_text = pytesseract.image_to_string(image)

        text_extraction_infer_time = time.time() - text_extraction_start_time

        if printable:
            print(f"page: {i + 1} - text_extraction_infer_time: {text_extraction_infer_time}")

        total_text_extraction_infer_time += text_extraction_infer_time
        total_text += page_text

    return total_text_extraction_infer_time, total_text
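As a point of reference, the helpers above could be driven roughly as follows. This is a hypothetical usage sketch rather than part of the commit; it reuses pdf2image and one of the sample documents referenced by the evaluation script further below. run_ocr_with_layout_detection would additionally need a layout detection model passed as detection_model, plus mode set to either "individual_blocks" or "entire_page".

# Hypothetical usage sketch (not part of the commit).
import pdf2image

# Render a sample PDF to PIL images; the file name comes from the evaluation script below.
images = pdf2image.convert_from_path("sample-docs/loremipsum_multipage.pdf")

# Whole-document OCR with no layout detection involved.
total_time, full_text = run_ocr(images, printable=True)
print(f"total OCR time: {total_time:.2f}s, extracted {len(full_text)} characters")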
@@ -0,0 +1 @@
*
@@ -0,0 +1,2 @@
unstructured[local-inference]
nltk
@@ -0,0 +1,204 @@
import json
import os
import time
from datetime import datetime
from difflib import SequenceMatcher

import nltk
import pdf2image

from unstructured_inference.inference.layout import (
    DocumentLayout,
    create_image_output_dir,
    process_file_with_model,
)

# Download the required resources (run this once)
nltk.download('punkt')


def validate_performance(
    f_name,
    validation_mode,
    is_image_file=False,
):
    print(f">>> Start performance comparison - filename: {f_name} - validation_mode: {validation_mode}"
          f" - is_image_file: {is_image_file}")

    now_dt = datetime.utcnow()
    now_str = now_dt.strftime("%Y_%m_%d-%H_%M_%S")

    f_path = os.path.join(example_docs_dir, f_name)

    image_f_paths = []
    if validation_mode == "pdf":
        pdf_info = pdf2image.pdfinfo_from_path(f_path)
        n_pages = pdf_info["Pages"]
    elif validation_mode == "image":
        if is_image_file:
            image_f_paths.append(f_path)
        else:
            image_output_dir = create_image_output_dir(f_path)
            images = pdf2image.convert_from_path(f_path, output_folder=image_output_dir)
            image_f_paths = [image.filename for image in images]
        n_pages = len(image_f_paths)
    else:
        n_pages = 0

    processing_result = {}
    for ocr_mode in ["individual_blocks", "entire_page"]:
        start_time = time.time()

        if validation_mode == "pdf":
            layout = process_file_with_model(
                f_path,
                model_name=None,
                ocr_mode=ocr_mode,
            )
        elif validation_mode == "image":
            pages = []
            for image_f_path in image_f_paths:
                _layout = process_file_with_model(
                    image_f_path,
                    model_name=None,
                    ocr_mode=ocr_mode,
                    is_image=True,
                )
                pages += _layout.pages
            for i, page in enumerate(pages):
                page.number = i + 1
            layout = DocumentLayout.from_pages(pages)
        else:
            layout = None

        infer_time = time.time() - start_time

        if layout is None:
            print("Layout is None")
            return

        full_text = str(layout)
        page_text = {}
        for page in layout.pages:
            page_text[page.number] = str(page)

        processing_result[ocr_mode] = {
            "infer_time": infer_time,
            "full_text": full_text,
            "page_text": page_text,
        }

    individual_mode_page_text = processing_result["individual_blocks"]["page_text"]
    entire_mode_page_text = processing_result["entire_page"]["page_text"]
    individual_mode_full_text = processing_result["individual_blocks"]["full_text"]
    entire_mode_full_text = processing_result["entire_page"]["full_text"]

    compare_result = compare_processed_text(individual_mode_full_text, entire_mode_full_text)

    report = {
        "validation_mode": validation_mode,
        "file_info": {
            "filename": f_name,
            "n_pages": n_pages,
        },
        "processing_time": {
            "individual_blocks": processing_result["individual_blocks"]["infer_time"],
            "entire_page": processing_result["entire_page"]["infer_time"],
        },
        "text_similarity": compare_result,
        "extracted_text": {
            "individual_blocks": {
                "page_text": individual_mode_page_text,
                "full_text": individual_mode_full_text,
            },
            "entire_page": {
                "page_text": entire_mode_page_text,
                "full_text": entire_mode_full_text,
            },
        },
    }

    write_report(report, now_str, validation_mode)

    print("<<< End performance comparison", f_name)


def compare_processed_text(individual_mode_full_text, entire_mode_full_text, delimiter=" "):
    # Calculate similarity ratio
    similarity_ratio = SequenceMatcher(None, individual_mode_full_text, entire_mode_full_text).ratio()

    print(f"similarity_ratio: {similarity_ratio}")

    # Tokenize the text into words
    word_list_individual = nltk.word_tokenize(individual_mode_full_text)
    n_word_list_individual = len(word_list_individual)
    print("n_word_list_in_text_individual:", n_word_list_individual)
    word_sets_individual = set(word_list_individual)
    n_word_sets_individual = len(word_sets_individual)
    print(f"n_word_sets_in_text_individual: {n_word_sets_individual}")
    # print("word_sets_individual:", word_sets_individual)

    word_list_entire = nltk.word_tokenize(entire_mode_full_text)
    n_word_list_entire = len(word_list_entire)
    print("n_word_list_in_text_entire:", n_word_list_entire)
    word_sets_entire = set(word_list_entire)
    n_word_sets_entire = len(word_sets_entire)
    print(f"n_word_sets_in_text_entire: {n_word_sets_entire}")
    # print("word_sets_entire:", word_sets_entire)

    # Find unique elements using set difference
    print("diff_elements:")
    unique_words_individual = word_sets_individual - word_sets_entire
    unique_words_entire = word_sets_entire - word_sets_individual
    print(f"unique_words_in_text_individual: {unique_words_individual}\n")
    print(f"unique_words_in_text_entire: {unique_words_entire}")

    return {
        "similarity_ratio": similarity_ratio,
        "individual_blocks": {
            "n_word_list": n_word_list_individual,
            "n_word_sets": n_word_sets_individual,
            "unique_words": delimiter.join(list(unique_words_individual)),
        },
        "entire_page": {
            "n_word_list": n_word_list_entire,
            "n_word_sets": n_word_sets_entire,
            "unique_words": delimiter.join(list(unique_words_entire)),
        },
    }


def write_report(report, now_str, validation_mode):
    report_f_name = f"validate-ocr-{validation_mode}-{now_str}.json"
    report_f_path = os.path.join(output_dir, report_f_name)
    with open(report_f_path, "w", encoding="utf-8-sig") as f:
        json.dump(report, f, indent=4)


def run():
    test_files = [
        {"name": "layout-parser-paper-fast.pdf", "mode": "image", "is_image_file": False},
        {"name": "loremipsum_multipage.pdf", "mode": "image", "is_image_file": False},
        {"name": "2023-Jan-economic-outlook.pdf", "mode": "image", "is_image_file": False},
        {"name": "recalibrating-risk-report.pdf", "mode": "image", "is_image_file": False},
        {"name": "Silent-Giant.pdf", "mode": "image", "is_image_file": False},
    ]

    for test_file in test_files:
        f_name = test_file["name"]
        validation_mode = test_file["mode"]
        is_image_file = test_file["is_image_file"]

        validate_performance(f_name, validation_mode, is_image_file)


if __name__ == '__main__':
    cur_dir = os.getcwd()
    base_dir = os.path.join(cur_dir, os.pardir, os.pardir)
    example_docs_dir = os.path.join(base_dir, "sample-docs")

    # folder path to save temporary outputs
    output_dir = os.path.join(cur_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    run()
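The similarity reporting in compare_processed_text boils down to a character-level SequenceMatcher ratio plus a word-level set difference. The toy strings below are illustrative only and show the same calls in isolation.

from difflib import SequenceMatcher

import nltk

nltk.download('punkt')  # required once for nltk.word_tokenize

text_a = "The quick brown fox jumps over the lazy dog"
text_b = "The quick brown fox jumped over a lazy dog"

# Character-level similarity in [0, 1], as reported under "similarity_ratio".
ratio = SequenceMatcher(None, text_a, text_b).ratio()

# Word-level sets, mirroring the unique-word diff in compare_processed_text.
words_a = set(nltk.word_tokenize(text_a))
words_b = set(nltk.word_tokenize(text_b))

print(f"similarity_ratio: {ratio:.2f}")
print("unique to text_a:", words_a - words_b)
print("unique to text_b:", words_b - words_a)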
Binary files not shown.
@@ -1 +1 @@
__version__ = "0.5.9" # pragma: no cover
__version__ = "0.5.10" # pragma: no cover