Skip to content

Commit

Permalink
Feat/147 ocr entire page (#159)
Browse files Browse the repository at this point in the history
Addresses Github issue #147

* Add functionality to extract the text aggregated from the regions of the OCR layout that lie within a given block
* Add functionality to merge inferred layout with ocr layout
* Add functionality to populate inferred region text with ocr text when merging inferred layout with embedded layout
* Populate inferred region text with ocr text only for inferred regions that are not populated with text
* Make entire-page OCR optional
* Update the evaluation script
  • Loading branch information
christinestraub authored Aug 11, 2023
1 parent 203f7ab commit 15bbc56
Show file tree
Hide file tree
Showing 14 changed files with 499 additions and 12 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.5.10

* Implement full-page OCR

## 0.5.9

* Handle exceptions from Tesseract
Expand Down
145 changes: 145 additions & 0 deletions examples/ocr/engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import os
import re
import time
from typing import List, cast

import cv2
import numpy as np
import pytesseract
from pytesseract import Output

from unstructured_inference.inference import layout
from unstructured_inference.inference.elements import Rectangle, TextRegion


def remove_non_printable(s):
    """Return *s* restricted to printable ASCII with whitespace collapsed.

    Every character outside the printable ASCII range (0x20-0x7E) is mapped
    to a space, then all runs of whitespace are collapsed to single spaces.
    """
    ascii_only = re.sub(r'[^\x20-\x7E]', ' ', s)
    tokens = ascii_only.split()
    return ' '.join(tokens)


def run_ocr_with_layout_detection(
    images,
    detection_model=None,
    element_extraction_model=None,
    mode="individual_blocks",
    output_dir="",
    drawable=True,
    printable=True,
):
    """Run layout detection plus OCR over a sequence of page images.

    Parameters
    ----------
    images:
        Iterable of page images (one per page); each is passed to
        ``layout.PageLayout`` and converted with ``np.array`` for drawing.
    detection_model, element_extraction_model:
        Forwarded to ``layout.PageLayout``; ``page.detection_model`` is then
        called on the page image to produce the inferred layout regions.
    mode:
        "individual_blocks" extracts text per detected layout element via
        ``page.get_elements_from_layout``; "entire_page" runs one Tesseract
        pass over the whole page and keeps only words that fall (almost)
        inside an inferred region. Any other value raises ``ValueError``.
    output_dir:
        Directory where annotated debug images are written when ``drawable``.
    drawable:
        Draw OCR word boxes and inferred layout boxes on a copy of the page
        and save it as ``ocr_{mode}_page{n}.jpg``.
    printable:
        Print per-page element count and timing to stdout.

    Returns
    -------
    tuple
        ``(total_text_extraction_infer_time, total_extracted_text)`` where
        the second item maps ``"page{n}"`` to the filtered text of that page.
    """
    total_text_extraction_infer_time = 0
    total_extracted_text = {}
    for i, image in enumerate(images):
        page_num = i + 1
        page_num_str = f"page{page_num}"

        page = layout.PageLayout(
            number=i+1,
            image=image,
            layout=None,
            detection_model=detection_model,
            element_extraction_model=element_extraction_model,
        )

        # Inferred layout regions (bounding boxes of detected elements).
        inferred_layout: List[TextRegion] = cast(List[TextRegion], page.detection_model(page.image))

        # Array copy of the page used only for drawing debug rectangles.
        cv_img = np.array(image)

        if mode == "individual_blocks":
            # OCR'ing individual blocks (current approach)
            text_extraction_start_time = time.time()

            elements = page.get_elements_from_layout(inferred_layout)

            text_extraction_infer_time = time.time() - text_extraction_start_time

            total_text_extraction_infer_time += text_extraction_infer_time

            page_text = ""
            for el in elements:
                # NOTE(review): assumes el.text is always a str (never None) — confirm
                page_text += el.text
            filtered_page_text = remove_non_printable(page_text)
            total_extracted_text[page_num_str] = filtered_page_text
        elif mode == "entire_page":
            # OCR'ing entire page (new approach to implement)
            text_extraction_start_time = time.time()

            # Single Tesseract pass; DICT output yields parallel lists
            # (level/left/top/width/height/text/...) indexed per box.
            ocr_data = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
            boxes = ocr_data['level']
            extracted_text_list = []
            for k in range(len(boxes)):
                (x, y, w, h) = ocr_data['left'][k], ocr_data['top'][k], ocr_data['width'][k], ocr_data['height'][k]
                extracted_text = ocr_data['text'][k]
                if not extracted_text:
                    # Skip boxes with no text (e.g. structural page/block/line levels).
                    continue

                extracted_region = Rectangle(x1=x, y1=y, x2=x+w, y2=y+h)

                # Keep an OCR word only when it lies (almost) inside some
                # inferred region, padded by 12px, with a 0.75 overlap threshold.
                extracted_is_subregion_of_inferred = False
                for inferred_region in inferred_layout:
                    extracted_is_subregion_of_inferred = extracted_region.is_almost_subregion_of(
                        inferred_region.pad(12),
                        subregion_threshold=0.75,
                    )
                    if extracted_is_subregion_of_inferred:
                        break

                if extracted_is_subregion_of_inferred:
                    extracted_text_list.append(extracted_text)

                if drawable:
                    # Kept boxes drawn in (0, 255, 0), discarded in (255, 0, 0).
                    # NOTE(review): cv_img comes from a PIL image, so channel order
                    # is RGB rather than OpenCV's usual BGR — confirm intended colors.
                    if extracted_is_subregion_of_inferred:
                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (0, 255, 0), 2, None)
                    else:
                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (255, 0, 0), 2, None)

            text_extraction_infer_time = time.time() - text_extraction_start_time
            total_text_extraction_infer_time += text_extraction_infer_time

            page_text = " ".join(extracted_text_list)
            filtered_page_text = remove_non_printable(page_text)
            total_extracted_text[page_num_str] = filtered_page_text
        else:
            raise ValueError("Invalid mode")

        if drawable:
            # Overlay the inferred layout regions with thick (0, 0, 255) borders
            # and write the annotated page image to output_dir.
            for el in inferred_layout:
                pt1 = [int(el.x1), int(el.y1)]
                pt2 = [int(el.x2), int(el.y2)]
                cv2.rectangle(
                    img=cv_img,
                    pt1=pt1, pt2=pt2,
                    color=(0, 0, 255),
                    thickness=4,
                    lineType=None,
                )

            f_path = os.path.join(output_dir, f"ocr_{mode}_{page_num_str}.jpg")
            cv2.imwrite(f_path, cv_img)

        if printable:
            print(f"page: {i + 1} - n_layout_elements: {len(inferred_layout)} - "
                  f"text_extraction_infer_time: {text_extraction_infer_time}")

    return total_text_extraction_infer_time, total_extracted_text


def run_ocr(
    images,
    printable=True,
):
    """OCR each page image with Tesseract, with no layout detection at all.

    Returns a tuple of the total OCR inference time in seconds and the
    concatenated text of every page. When ``printable`` is True, per-page
    timings are printed to stdout.
    """
    total_text_extraction_infer_time = 0
    total_text = ""
    for page_idx, image in enumerate(images):
        started_at = time.time()
        page_text = pytesseract.image_to_string(image)
        elapsed = time.time() - started_at

        if printable:
            print(f"page: {page_idx + 1} - text_extraction_infer_time: {elapsed}")

        total_text_extraction_infer_time += elapsed
        total_text += page_text

    return total_text_extraction_infer_time, total_text
1 change: 1 addition & 0 deletions examples/ocr/output/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*
2 changes: 2 additions & 0 deletions examples/ocr/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
unstructured[local-inference]
nltk
204 changes: 204 additions & 0 deletions examples/ocr/validate_ocr_performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import json
import os
import time
from datetime import datetime
from difflib import SequenceMatcher

import nltk
import pdf2image

from unstructured_inference.inference.layout import (
DocumentLayout,
create_image_output_dir,
process_file_with_model,
)

# Download the required resources (run this once)
nltk.download('punkt')


def validate_performance(
    f_name,
    validation_mode,
    is_image_file=False,
):
    """Compare OCR extraction between "individual_blocks" and "entire_page" modes.

    Processes the sample document ``f_name`` once per OCR mode via
    ``process_file_with_model``, measures inference time, compares the two
    extracted texts with ``compare_processed_text``, and writes a JSON report
    into ``output_dir`` via ``write_report``.

    Parameters
    ----------
    f_name:
        File name of a document inside the module-level ``example_docs_dir``.
    validation_mode:
        "pdf" to process the PDF directly, or "image" to process page images
        (converting the PDF to images first unless ``is_image_file``). Any
        other value produces no layout and the function returns early.
    is_image_file:
        When ``validation_mode == "image"``, treat ``f_name`` as an image file
        instead of a PDF to convert.
    """
    print(f">>> Start performance comparison - filename: {f_name} - validation_mode: {validation_mode}"
          f" - is_image_file: {is_image_file}")

    now_dt = datetime.utcnow()
    now_str = now_dt.strftime("%Y_%m_%d-%H_%M_%S")

    f_path = os.path.join(example_docs_dir, f_name)

    # Build the list of page-image inputs and count pages up front.
    image_f_paths = []
    if validation_mode == "pdf":
        pdf_info = pdf2image.pdfinfo_from_path(f_path)
        n_pages = pdf_info["Pages"]
    elif validation_mode == "image":
        if is_image_file:
            image_f_paths.append(f_path)
        else:
            image_output_dir = create_image_output_dir(f_path)
            images = pdf2image.convert_from_path(f_path, output_folder=image_output_dir)
            image_f_paths = [image.filename for image in images]
        n_pages = len(image_f_paths)
    else:
        n_pages = 0

    processing_result = {}
    for ocr_mode in ["individual_blocks", "entire_page"]:
        start_time = time.time()

        if validation_mode == "pdf":
            layout = process_file_with_model(
                f_path,
                model_name=None,
                ocr_mode=ocr_mode,
            )
        elif validation_mode == "image":
            pages = []
            for image_f_path in image_f_paths:
                _layout = process_file_with_model(
                    image_f_path,
                    model_name=None,
                    ocr_mode=ocr_mode,
                    is_image=True,
                )
                pages += _layout.pages
            # Renumber sequentially: each image was processed as its own document.
            for i, page in enumerate(pages):
                page.number = i + 1
            layout = DocumentLayout.from_pages(pages)
        else:
            layout = None

        infer_time = time.time() - start_time

        if layout is None:
            print("Layout is None")
            return

        full_text = str(layout)
        page_text = {}
        for page in layout.pages:
            page_text[page.number] = str(page)

        processing_result[ocr_mode] = {
            "infer_time": infer_time,
            "full_text": full_text,
            "page_text": page_text,
        }

    individual_mode_page_text = processing_result["individual_blocks"]["page_text"]
    # BUG FIX: previously read "individual_blocks" here too, so the report's
    # entire_page page_text silently duplicated the individual_blocks text.
    entire_mode_page_text = processing_result["entire_page"]["page_text"]
    individual_mode_full_text = processing_result["individual_blocks"]["full_text"]
    entire_mode_full_text = processing_result["entire_page"]["full_text"]

    compare_result = compare_processed_text(individual_mode_full_text, entire_mode_full_text)

    report = {
        "validation_mode": validation_mode,
        "file_info": {
            "filename": f_name,
            "n_pages": n_pages,
        },
        "processing_time": {
            "individual_blocks": processing_result["individual_blocks"]["infer_time"],
            "entire_page": processing_result["entire_page"]["infer_time"],
        },
        "text_similarity": compare_result,
        "extracted_text": {
            "individual_blocks": {
                "page_text": individual_mode_page_text,
                "full_text": individual_mode_full_text,
            },
            "entire_page": {
                "page_text": entire_mode_page_text,
                "full_text": entire_mode_full_text,
            },
        },
    }

    write_report(report, now_str, validation_mode)

    print("<<< End performance comparison", f_name)


def compare_processed_text(individual_mode_full_text, entire_mode_full_text, delimiter=" "):
    """Compare the full texts extracted by the two OCR modes.

    Computes a character-level ``SequenceMatcher`` similarity ratio plus
    word-token statistics and the words unique to each mode, printing the
    intermediate values along the way.

    Parameters
    ----------
    individual_mode_full_text:
        Full text extracted with the "individual_blocks" OCR mode.
    entire_mode_full_text:
        Full text extracted with the "entire_page" OCR mode.
    delimiter:
        Separator used to join the unique-word sets in the returned dict.

    Returns
    -------
    dict
        ``{"similarity_ratio": float, "individual_blocks": {...}, "entire_page": {...}}``
        with per-mode token counts, distinct-token counts, and unique words.
    """
    # Character-level similarity of the two full texts (0.0 - 1.0).
    similarity_ratio = SequenceMatcher(None, individual_mode_full_text, entire_mode_full_text).ratio()

    print(f"similarity_ratio: {similarity_ratio}")

    # Tokenize the text into words
    word_list_individual = nltk.word_tokenize(individual_mode_full_text)
    n_word_list_individual = len(word_list_individual)
    print("n_word_list_in_text_individual:", n_word_list_individual)
    word_sets_individual = set(word_list_individual)
    n_word_sets_individual = len(word_sets_individual)
    print(f"n_word_sets_in_text_individual: {n_word_sets_individual}")

    word_list_entire = nltk.word_tokenize(entire_mode_full_text)
    n_word_list_entire = len(word_list_entire)
    # BUG FIX: these two labels previously said "individual" while actually
    # printing the entire_page numbers, making the console output misleading.
    print("n_word_list_in_text_entire:", n_word_list_entire)
    word_sets_entire = set(word_list_entire)
    n_word_sets_entire = len(word_sets_entire)
    print(f"n_word_sets_in_text_entire: {n_word_sets_entire}")

    # Words that appear in only one of the two extractions.
    print("diff_elements:")
    unique_words_individual = word_sets_individual - word_sets_entire
    unique_words_entire = word_sets_entire - word_sets_individual
    print(f"unique_words_in_text_individual: {unique_words_individual}\n")
    print(f"unique_words_in_text_entire: {unique_words_entire}")

    return {
        "similarity_ratio": similarity_ratio,
        "individual_blocks": {
            "n_word_list": n_word_list_individual,
            "n_word_sets": n_word_sets_individual,
            "unique_words": delimiter.join(list(unique_words_individual)),
        },
        "entire_page": {
            "n_word_list": n_word_list_entire,
            "n_word_sets": n_word_sets_entire,
            "unique_words": delimiter.join(list(unique_words_entire)),
        },
    }


def write_report(report, now_str, validation_mode):
    """Serialize *report* to a timestamped JSON file in the module-level output_dir."""
    file_name = f"validate-ocr-{validation_mode}-{now_str}.json"
    destination = os.path.join(output_dir, file_name)
    with open(destination, "w", encoding="utf-8-sig") as fp:
        json.dump(report, fp, indent=4)


def run():
    """Run the OCR performance validation for each bundled sample document."""
    test_files = [
        {"name": "layout-parser-paper-fast.pdf", "mode": "image", "is_image_file": False},
        {"name": "loremipsum_multipage.pdf", "mode": "image", "is_image_file": False},
        {"name": "2023-Jan-economic-outlook.pdf", "mode": "image", "is_image_file": False},
        {"name": "recalibrating-risk-report.pdf", "mode": "image", "is_image_file": False},
        {"name": "Silent-Giant.pdf", "mode": "image", "is_image_file": False},
    ]

    for spec in test_files:
        validate_performance(spec["name"], spec["mode"], spec["is_image_file"])


if __name__ == '__main__':
    cur_dir = os.getcwd()
    # Sample documents live two directory levels above this example folder.
    base_dir = os.path.join(cur_dir, os.pardir, os.pardir)
    example_docs_dir = os.path.join(base_dir, "sample-docs")

    # folder path to save temporary outputs
    output_dir = os.path.join(cur_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    run()
Binary file added sample-docs/2023-Jan-economic-outlook.pdf
Binary file not shown.
Binary file added sample-docs/Silent-Giant.pdf
Binary file not shown.
Binary file added sample-docs/layout-parser-paper-fast.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added sample-docs/layout-parser-paper-fast.pdf
Binary file not shown.
Binary file added sample-docs/recalibrating-risk-report.pdf
Binary file not shown.
3 changes: 2 additions & 1 deletion test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def initialize(self, *args, **kwargs):


def test_get_page_elements(monkeypatch, mock_final_layout):
image = np.random.randint(12, 24, (40, 40))
image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB")
page = layout.PageLayout(
number=0,
image=image,
Expand Down Expand Up @@ -834,6 +834,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
element_extraction_model=element_extraction_model,
ocr_strategy="auto",
ocr_languages="eng",
ocr_mode="entire_page",
fixed_layouts=None,
extract_tables=False,
pdf_image_dpi=200,
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5.9" # pragma: no cover
__version__ = "0.5.10" # pragma: no cover
Loading

0 comments on commit 15bbc56

Please sign in to comment.