Addresses GitHub issue #147

* Add functionality to extract the text aggregated from the regions of the OCR layout that lie within a given block
* Add functionality to merge the inferred layout with the OCR layout
* Add functionality to populate inferred region text with OCR text when merging the inferred layout with the embedded layout
* Populate inferred region text with OCR text only for inferred regions that are not already populated with text
* Make entire-page OCR optional
* Update the evaluation script
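To picture the merge-and-populate behavior described in the bullets above, here is a minimal sketch. It is not the code added by this commit: the helper names aggregate_ocr_text_from_block and merge_inferred_with_ocr_layout are illustrative only, and it assumes both layouts are lists of TextRegion objects exposing the text, pad, and is_almost_subregion_of members used in the OCR script further below.

from typing import List

from unstructured_inference.inference.elements import TextRegion


def aggregate_ocr_text_from_block(block: TextRegion, ocr_layout: List[TextRegion]) -> str:
    # Collect the text of OCR regions that lie (almost) within the given block,
    # using the same padding and threshold values as the script below.
    texts = [
        region.text
        for region in ocr_layout
        if region.text and region.is_almost_subregion_of(block.pad(12), subregion_threshold=0.75)
    ]
    return " ".join(texts)


def merge_inferred_with_ocr_layout(
    inferred_layout: List[TextRegion],
    ocr_layout: List[TextRegion],
) -> List[TextRegion]:
    # Hypothetical merge step: only inferred regions with no text yet
    # are populated with the aggregated OCR text.
    for inferred_region in inferred_layout:
        if not inferred_region.text:
            inferred_region.text = aggregate_ocr_text_from_block(inferred_region, ocr_layout)
    return inferred_layout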
1 parent 203f7ab · commit 15bbc56
Showing 14 changed files with 499 additions and 12 deletions.
@@ -1,3 +1,7 @@
## 0.5.10

* Implement full-page OCR

## 0.5.9

* Handle exceptions from Tesseract
@@ -0,0 +1,145 @@
import os
import re
import time
from typing import List, cast

import cv2
import numpy as np
import pytesseract
from pytesseract import Output

from unstructured_inference.inference import layout
from unstructured_inference.inference.elements import Rectangle, TextRegion


def remove_non_printable(s):
    dst_str = re.sub(r'[^\x20-\x7E]', ' ', s)
    return ' '.join(dst_str.split())


def run_ocr_with_layout_detection(
    images,
    detection_model=None,
    element_extraction_model=None,
    mode="individual_blocks",
    output_dir="",
    drawable=True,
    printable=True,
):
    total_text_extraction_infer_time = 0
    total_extracted_text = {}
    for i, image in enumerate(images):
        page_num = i + 1
        page_num_str = f"page{page_num}"

        page = layout.PageLayout(
            number=i + 1,
            image=image,
            layout=None,
            detection_model=detection_model,
            element_extraction_model=element_extraction_model,
        )

        inferred_layout: List[TextRegion] = cast(List[TextRegion], page.detection_model(page.image))

        cv_img = np.array(image)

        if mode == "individual_blocks":
            # OCR individual blocks (current approach)
            text_extraction_start_time = time.time()

            elements = page.get_elements_from_layout(inferred_layout)

            text_extraction_infer_time = time.time() - text_extraction_start_time

            total_text_extraction_infer_time += text_extraction_infer_time

            page_text = ""
            for el in elements:
                page_text += el.text
            filtered_page_text = remove_non_printable(page_text)
            total_extracted_text[page_num_str] = filtered_page_text
        elif mode == "entire_page":
            # OCR the entire page (new approach to implement)
            text_extraction_start_time = time.time()

            ocr_data = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
            boxes = ocr_data['level']
            extracted_text_list = []
            for k in range(len(boxes)):
                (x, y, w, h) = ocr_data['left'][k], ocr_data['top'][k], ocr_data['width'][k], ocr_data['height'][k]
                extracted_text = ocr_data['text'][k]
                if not extracted_text:
                    continue

                extracted_region = Rectangle(x1=x, y1=y, x2=x + w, y2=y + h)

                extracted_is_subregion_of_inferred = False
                for inferred_region in inferred_layout:
                    extracted_is_subregion_of_inferred = extracted_region.is_almost_subregion_of(
                        inferred_region.pad(12),
                        subregion_threshold=0.75,
                    )
                    if extracted_is_subregion_of_inferred:
                        break

                if extracted_is_subregion_of_inferred:
                    extracted_text_list.append(extracted_text)

                if drawable:
                    if extracted_is_subregion_of_inferred:
                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
                    else:
                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (255, 0, 0), 2)

            text_extraction_infer_time = time.time() - text_extraction_start_time
            total_text_extraction_infer_time += text_extraction_infer_time

            page_text = " ".join(extracted_text_list)
            filtered_page_text = remove_non_printable(page_text)
            total_extracted_text[page_num_str] = filtered_page_text
        else:
            raise ValueError("Invalid mode")

        if drawable:
            for el in inferred_layout:
                pt1 = (int(el.x1), int(el.y1))
                pt2 = (int(el.x2), int(el.y2))
                cv2.rectangle(
                    img=cv_img,
                    pt1=pt1, pt2=pt2,
                    color=(0, 0, 255),
                    thickness=4,
                )

            f_path = os.path.join(output_dir, f"ocr_{mode}_{page_num_str}.jpg")
            cv2.imwrite(f_path, cv_img)

        if printable:
            print(f"page: {i + 1} - n_layout_elements: {len(inferred_layout)} - "
                  f"text_extraction_infer_time: {text_extraction_infer_time}")

    return total_text_extraction_infer_time, total_extracted_text


def run_ocr(
    images,
    printable=True,
):
    total_text_extraction_infer_time = 0
    total_text = ""
    for i, image in enumerate(images):
        text_extraction_start_time = time.time()

        page_text = pytesseract.image_to_string(image)

        text_extraction_infer_time = time.time() - text_extraction_start_time

        if printable:
            print(f"page: {i + 1} - text_extraction_infer_time: {text_extraction_infer_time}")

        total_text_extraction_infer_time += text_extraction_infer_time
        total_text += page_text

    return total_text_extraction_infer_time, total_text
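As a point of reference, the helpers above could be driven roughly as follows. This is a hypothetical usage sketch rather than part of the commit; it reuses pdf2image and one of the sample documents referenced by the evaluation script further below. run_ocr_with_layout_detection would additionally need a layout detection model passed as detection_model, plus mode set to either "individual_blocks" or "entire_page".

# Hypothetical usage sketch (not part of the commit).
import pdf2image

# Render a sample PDF to PIL images; the file name comes from the evaluation script below.
images = pdf2image.convert_from_path("sample-docs/loremipsum_multipage.pdf")

# Whole-document OCR with no layout detection involved.
total_time, full_text = run_ocr(images, printable=True)
print(f"total OCR time: {total_time:.2f}s, extracted {len(full_text)} characters")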
@@ -0,0 +1 @@
*
@@ -0,0 +1,2 @@
unstructured[local-inference]
nltk
@@ -0,0 +1,204 @@
import json
import os
import time
from datetime import datetime
from difflib import SequenceMatcher

import nltk
import pdf2image

from unstructured_inference.inference.layout import (
    DocumentLayout,
    create_image_output_dir,
    process_file_with_model,
)

# Download the required resources (run this once)
nltk.download('punkt')


def validate_performance(
    f_name,
    validation_mode,
    is_image_file=False,
):
    print(f">>> Start performance comparison - filename: {f_name} - validation_mode: {validation_mode}"
          f" - is_image_file: {is_image_file}")

    now_dt = datetime.utcnow()
    now_str = now_dt.strftime("%Y_%m_%d-%H_%M_%S")

    f_path = os.path.join(example_docs_dir, f_name)

    image_f_paths = []
    if validation_mode == "pdf":
        pdf_info = pdf2image.pdfinfo_from_path(f_path)
        n_pages = pdf_info["Pages"]
    elif validation_mode == "image":
        if is_image_file:
            image_f_paths.append(f_path)
        else:
            image_output_dir = create_image_output_dir(f_path)
            images = pdf2image.convert_from_path(f_path, output_folder=image_output_dir)
            image_f_paths = [image.filename for image in images]
        n_pages = len(image_f_paths)
    else:
        n_pages = 0

    processing_result = {}
    for ocr_mode in ["individual_blocks", "entire_page"]:
        start_time = time.time()

        if validation_mode == "pdf":
            layout = process_file_with_model(
                f_path,
                model_name=None,
                ocr_mode=ocr_mode,
            )
        elif validation_mode == "image":
            pages = []
            for image_f_path in image_f_paths:
                _layout = process_file_with_model(
                    image_f_path,
                    model_name=None,
                    ocr_mode=ocr_mode,
                    is_image=True,
                )
                pages += _layout.pages
            for i, page in enumerate(pages):
                page.number = i + 1
            layout = DocumentLayout.from_pages(pages)
        else:
            layout = None

        infer_time = time.time() - start_time

        if layout is None:
            print("Layout is None")
            return

        full_text = str(layout)
        page_text = {}
        for page in layout.pages:
            page_text[page.number] = str(page)

        processing_result[ocr_mode] = {
            "infer_time": infer_time,
            "full_text": full_text,
            "page_text": page_text,
        }

    individual_mode_page_text = processing_result["individual_blocks"]["page_text"]
    entire_mode_page_text = processing_result["entire_page"]["page_text"]
    individual_mode_full_text = processing_result["individual_blocks"]["full_text"]
    entire_mode_full_text = processing_result["entire_page"]["full_text"]

    compare_result = compare_processed_text(individual_mode_full_text, entire_mode_full_text)

    report = {
        "validation_mode": validation_mode,
        "file_info": {
            "filename": f_name,
            "n_pages": n_pages,
        },
        "processing_time": {
            "individual_blocks": processing_result["individual_blocks"]["infer_time"],
            "entire_page": processing_result["entire_page"]["infer_time"],
        },
        "text_similarity": compare_result,
        "extracted_text": {
            "individual_blocks": {
                "page_text": individual_mode_page_text,
                "full_text": individual_mode_full_text,
            },
            "entire_page": {
                "page_text": entire_mode_page_text,
                "full_text": entire_mode_full_text,
            },
        },
    }

    write_report(report, now_str, validation_mode)

    print("<<< End performance comparison", f_name)


def compare_processed_text(individual_mode_full_text, entire_mode_full_text, delimiter=" "):
    # Calculate similarity ratio
    similarity_ratio = SequenceMatcher(None, individual_mode_full_text, entire_mode_full_text).ratio()

    print(f"similarity_ratio: {similarity_ratio}")

    # Tokenize the text into words
    word_list_individual = nltk.word_tokenize(individual_mode_full_text)
    n_word_list_individual = len(word_list_individual)
    print("n_word_list_in_text_individual:", n_word_list_individual)
    word_sets_individual = set(word_list_individual)
    n_word_sets_individual = len(word_sets_individual)
    print(f"n_word_sets_in_text_individual: {n_word_sets_individual}")
    # print("word_sets_individual:", word_sets_individual)

    word_list_entire = nltk.word_tokenize(entire_mode_full_text)
    n_word_list_entire = len(word_list_entire)
    print("n_word_list_in_text_entire:", n_word_list_entire)
    word_sets_entire = set(word_list_entire)
    n_word_sets_entire = len(word_sets_entire)
    print(f"n_word_sets_in_text_entire: {n_word_sets_entire}")
    # print("word_sets_entire:", word_sets_entire)

    # Find unique elements using set difference
    print("diff_elements:")
    unique_words_individual = word_sets_individual - word_sets_entire
    unique_words_entire = word_sets_entire - word_sets_individual
    print(f"unique_words_in_text_individual: {unique_words_individual}\n")
    print(f"unique_words_in_text_entire: {unique_words_entire}")

    return {
        "similarity_ratio": similarity_ratio,
        "individual_blocks": {
            "n_word_list": n_word_list_individual,
            "n_word_sets": n_word_sets_individual,
            "unique_words": delimiter.join(list(unique_words_individual)),
        },
        "entire_page": {
            "n_word_list": n_word_list_entire,
            "n_word_sets": n_word_sets_entire,
            "unique_words": delimiter.join(list(unique_words_entire)),
        },
    }


def write_report(report, now_str, validation_mode):
    report_f_name = f"validate-ocr-{validation_mode}-{now_str}.json"
    report_f_path = os.path.join(output_dir, report_f_name)
    with open(report_f_path, "w", encoding="utf-8-sig") as f:
        json.dump(report, f, indent=4)


def run():
    test_files = [
        {"name": "layout-parser-paper-fast.pdf", "mode": "image", "is_image_file": False},
        {"name": "loremipsum_multipage.pdf", "mode": "image", "is_image_file": False},
        {"name": "2023-Jan-economic-outlook.pdf", "mode": "image", "is_image_file": False},
        {"name": "recalibrating-risk-report.pdf", "mode": "image", "is_image_file": False},
        {"name": "Silent-Giant.pdf", "mode": "image", "is_image_file": False},
    ]

    for test_file in test_files:
        f_name = test_file["name"]
        validation_mode = test_file["mode"]
        is_image_file = test_file["is_image_file"]

        validate_performance(f_name, validation_mode, is_image_file)


if __name__ == '__main__':
    cur_dir = os.getcwd()
    base_dir = os.path.join(cur_dir, os.pardir, os.pardir)
    example_docs_dir = os.path.join(base_dir, "sample-docs")

    # folder path to save temporary outputs
    output_dir = os.path.join(cur_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    run()
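The similarity reporting in compare_processed_text boils down to a character-level SequenceMatcher ratio plus a word-level set difference. The toy strings below are illustrative only and show the same calls in isolation.

from difflib import SequenceMatcher

import nltk

nltk.download('punkt')  # required once for nltk.word_tokenize

text_a = "The quick brown fox jumps over the lazy dog"
text_b = "The quick brown fox jumped over a lazy dog"

# Character-level similarity in [0, 1], as reported under "similarity_ratio".
ratio = SequenceMatcher(None, text_a, text_b).ratio()

# Word-level sets, mirroring the unique-word diff in compare_processed_text.
words_a = set(nltk.word_tokenize(text_a))
words_b = set(nltk.word_tokenize(text_b))

print(f"similarity_ratio: {ratio:.2f}")
print("unique to text_a:", words_a - words_b)
print("unique to text_b:", words_b - words_a)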
Binary files not shown.
@@ -1 +1 @@
__version__ = "0.5.9" # pragma: no cover
__version__ = "0.5.10" # pragma: no cover