Skip to content

Commit 847228d

Browse files
Matt Carroll authored and dandhlee committed
docs(samples): add OCR, form, quality, splitter and specialized processing samples (#239)
* docs(samples): add processing samples for OCR, quality, splitter and specialized
* Update quality, specialized and splitter samples
* Fix lint issues
* Fix snippet tags
* update library from v1 to v1beta3
* restore previous processing sample to avoid sample tag breakage
1 parent fd9c1c6 commit 847228d

14 files changed

+700
-0
lines changed
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
# [START documentai_process_form_document]
17+
18+
# TODO(developer): Uncomment these variables before running the sample.
19+
# project_id= 'YOUR_PROJECT_ID'
20+
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
21+
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
22+
# file_path = '/path/to/local/pdf'
23+
24+
def process_document_form_sample(
    project_id: str, location: str, processor_id: str, file_path: str
) -> None:
    """Send a local PDF to a Document AI processor and print tables and form fields.

    Prints the full document text, the number of pages and, for every page,
    a summary of each table followed by every form field name/value pair.

    Args:
        project_id: Project segment of the processor resource name.
        location: Location segment of the processor resource name. When it is
            "eu", the EU API endpoint is used instead of the default one.
        processor_id: Processor segment of the processor resource name.
        file_path: Path of the local PDF file to process.
    """
    from google.cloud import documentai_v1beta3 as documentai

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    print("Document processing complete.")

    # Read the table and form fields output from the processor
    # The form processor also contains OCR data. For more information
    # on how to parse OCR data please see the OCR sample.
    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    document = result.document
    text = document.text
    print(f"Full document text: {repr(text)}\n")
    print(f"There are {len(document.pages)} page(s) in this document.")

    # Summarize the tables and form fields found on each page
    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        print(f"Found {len(page.tables)} table(s):")
        for table in page.tables:
            # Guard the [0] lookup so a table without a header row reports
            # zero columns instead of raising IndexError.
            if len(table.header_rows) > 0:
                num_columns = len(table.header_rows[0].cells)
            else:
                num_columns = 0
            num_rows = len(table.body_rows)
            print(f"Table with {num_columns} columns and {num_rows} rows:")
            print_table_info(table, text)

        print(f"Found {len(page.form_fields)} form fields:")
        for field in page.form_fields:
            # Distinct local names: `name` above already holds the processor
            # resource name and should not be overwritten here.
            field_name = layout_to_text(field.field_name, text)
            field_value = layout_to_text(field.field_value, text)
            print(f" * {repr(field_name.strip())}: {repr(field_value.strip())}")
79+
80+
81+
def print_table_info(table: dict, text: str) -> None:
    """Print the first header row and the first body row of a table.

    Args:
        table: Object exposing ``header_rows`` and ``body_rows``; each row has
            ``cells`` and each cell has a ``layout``. It is annotated as
            ``dict`` but is only ever accessed through attributes.
        text: Full document text that the cell layouts are resolved against.
    """
    # Print header row (skipped when there is none, since [0] would raise)
    if len(table.header_rows) > 0:
        header_row_text = " | ".join(
            repr(layout_to_text(header_cell.layout, text).strip())
            for header_cell in table.header_rows[0].cells
        )
        print(f"Columns: {header_row_text}")

    # Print first body row (skipped when the table has no body rows)
    if len(table.body_rows) > 0:
        body_row_text = " | ".join(
            repr(layout_to_text(body_cell.layout, text).strip())
            for body_cell in table.body_rows[0].cells
        )
        print(f"First row data: {body_row_text}\n")
94+
95+
96+
def layout_to_text(layout: dict, text: str) -> str:
    """Return the part of ``text`` that ``layout`` refers to.

    Document AI identifies form fields by their offsets in the entirety of the
    document's text. This function converts offsets to a string.

    Args:
        layout: Object exposing ``text_anchor.text_segments``, where every
            segment carries ``start_index`` and ``end_index``. It is annotated
            as ``dict`` but is only ever accessed through attributes.
        text: Full text of the document the offsets point into.

    Returns:
        The slices ``text[start_index:end_index]`` of all segments,
        concatenated in segment order; an empty string when there are none.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )
113+
114+
115+
# [END documentai_process_form_document]
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
import os
17+
18+
from samples.snippets import process_document_form_sample
19+
20+
21+
# Settings shared by the tests in this module.
location = "us"
# Read from the environment; raises KeyError at import time when unset.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
# Relative path, resolved against the working directory of the test run.
file_path = "resources/invoice.pdf"
25+
26+
27+
def test_process_documents(capsys):
    """Run the form sample and check the printed summary for expected snippets."""
    process_document_form_sample.process_document_form_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )
    out, _ = capsys.readouterr()

    # Substrings the sample is expected to print for the configured file
    expected_strings = [
        "There are 1 page(s) in this document.",
        "Table with 4 columns and 6 rows",
        "Found 13 form fields",
        "'BALANCE DUE': '$2140.00'",
    ]
    for expected_string in expected_strings:
        assert expected_string in out
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
# [START documentai_process_ocr_document]
17+
18+
# TODO(developer): Uncomment these variables before running the sample.
19+
# project_id= 'YOUR_PROJECT_ID'
20+
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
21+
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
22+
# file_path = '/path/to/local/pdf'
23+
24+
def process_document_ocr_sample(
    project_id: str, location: str, processor_id: str, file_path: str
) -> None:
    """Send a local PDF to a Document AI processor and print its OCR output.

    Prints the full document text, the number of pages and, for every page,
    its dimensions, detected languages, paragraphs, blocks, lines and tokens.

    Args:
        project_id: Project segment of the processor resource name.
        location: Location segment of the processor resource name. When it is
            "eu", the EU API endpoint is used instead of the default one.
        processor_id: Processor segment of the processor resource name.
        file_path: Path of the local PDF file to process.
    """
    from google.cloud import documentai_v1beta3 as documentai

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    print("Document processing complete.")

    # Read the text recognition output from the processor
    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    document = result.document
    text = document.text
    print(f"Full document text: {repr(text)}\n")
    print(f"There are {len(document.pages)} page(s) in this document.\n")

    # Each helper prints one aspect of the page
    for page in document.pages:
        print(f"Page {page.page_number}:")
        print_page_dimensions(page.dimension)
        print_detected_langauges(page.detected_languages)
        print_paragraphs(page.paragraphs, text)
        print_blocks(page.blocks, text)
        print_lines(page.lines, text)
        print_tokens(page.tokens, text)
70+
71+
72+
def print_page_dimensions(dimension: dict) -> None:
    """Print the width and height of a page.

    Args:
        dimension: Object exposing ``width`` and ``height``. It is annotated
            as ``dict`` but is only ever accessed through attributes.
    """
    # f-strings already apply str() to the value, so no explicit call is needed
    print(f" Width: {dimension.width}")
    print(f" Height: {dimension.height}")
75+
76+
77+
def print_detected_langauges(detected_languages: dict) -> None:
    """Print every detected language with its confidence as a percentage.

    The misspelled function name is kept on purpose because callers refer to
    it by this name.

    Args:
        detected_languages: Iterable of objects exposing ``language_code`` and
            ``confidence``. It is annotated as ``dict`` but is iterated
            directly and its items are accessed through attributes.
    """
    print(" Detected languages:")
    for lang in detected_languages:
        # ".1%" multiplies by 100 and keeps one decimal place
        print(f" {lang.language_code} ({lang.confidence:.1%} confidence)")
83+
84+
85+
def print_paragraphs(paragraphs: dict, text: str) -> None:
    """Print the paragraph count and the text of the first and last paragraph.

    Args:
        paragraphs: Sequence of objects exposing ``layout``. It is annotated
            as ``dict`` but is indexed by position.
        text: Full document text that the layouts are resolved against.
    """
    paragraph_count = len(paragraphs)
    print(f" {paragraph_count} paragraphs detected:")
    # Nothing to show for an empty sequence; [0] / [-1] would raise IndexError
    if paragraph_count == 0:
        return
    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)
    print(f" First paragraph text: {repr(first_paragraph_text)}")
    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)
    print(f" Last paragraph text: {repr(last_paragraph_text)}")
91+
92+
93+
def print_blocks(blocks: dict, text: str) -> None:
    """Print the block count and the text of the first and last block.

    Args:
        blocks: Sequence of objects exposing ``layout``. It is annotated as
            ``dict`` but is indexed by position.
        text: Full document text that the layouts are resolved against.
    """
    block_count = len(blocks)
    print(f" {block_count} blocks detected:")
    # Nothing to show for an empty sequence; [0] / [-1] would raise IndexError
    if block_count == 0:
        return
    first_block_text = layout_to_text(blocks[0].layout, text)
    print(f" First text block: {repr(first_block_text)}")
    last_block_text = layout_to_text(blocks[-1].layout, text)
    print(f" Last text block: {repr(last_block_text)}")
99+
100+
101+
def print_lines(lines: dict, text: str) -> None:
    """Print the line count and the text of the first and last line.

    Args:
        lines: Sequence of objects exposing ``layout``. It is annotated as
            ``dict`` but is indexed by position.
        text: Full document text that the layouts are resolved against.
    """
    line_count = len(lines)
    print(f" {line_count} lines detected:")
    # Nothing to show for an empty sequence; [0] / [-1] would raise IndexError
    if line_count == 0:
        return
    first_line_text = layout_to_text(lines[0].layout, text)
    print(f" First line text: {repr(first_line_text)}")
    last_line_text = layout_to_text(lines[-1].layout, text)
    print(f" Last line text: {repr(last_line_text)}")
107+
108+
109+
def print_tokens(tokens: dict, text: str) -> None:
    """Print the token count plus text and break type of the first and last token.

    Args:
        tokens: Sequence of objects exposing ``layout`` and
            ``detected_break.type_.name``. It is annotated as ``dict`` but is
            indexed by position.
        text: Full document text that the layouts are resolved against.
    """
    token_count = len(tokens)
    print(f" {token_count} tokens detected:")
    # Nothing to show for an empty sequence; [0] / [-1] would raise IndexError
    if token_count == 0:
        return
    first_token_text = layout_to_text(tokens[0].layout, text)
    first_token_break_type = tokens[0].detected_break.type_.name
    print(f" First token text: {repr(first_token_text)}")
    print(f" First token break type: {repr(first_token_break_type)}")
    last_token_text = layout_to_text(tokens[-1].layout, text)
    last_token_break_type = tokens[-1].detected_break.type_.name
    print(f" Last token text: {repr(last_token_text)}")
    print(f" Last token break type: {repr(last_token_break_type)}")
119+
120+
121+
def layout_to_text(layout: dict, text: str) -> str:
    """Return the part of ``text`` that ``layout`` refers to.

    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    offsets to a string.

    Args:
        layout: Object exposing ``text_anchor.text_segments``, where every
            segment carries ``start_index`` and ``end_index``. It is annotated
            as ``dict`` but is only ever accessed through attributes.
        text: Full text of the document the offsets point into.

    Returns:
        The slices ``text[start_index:end_index]`` of all segments,
        concatenated in segment order; an empty string when there are none.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )
139+
140+
141+
# [END documentai_process_ocr_document]
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
import os
17+
18+
from samples.snippets import process_document_ocr_sample
19+
20+
# Settings shared by the tests in this module.
location = "us"
# Read from the environment; raises KeyError at import time when unset.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "91e072f8626a76b7"
# Relative path, resolved against the working directory of the test run.
file_path = "resources/handwritten_form.pdf"
24+
25+
26+
def test_process_documents(capsys):
    """Run the OCR sample and check the printed output for expected snippets."""
    process_document_ocr_sample.process_document_ocr_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )
    out, _ = capsys.readouterr()

    assert "Page 1" in out
    # NOTE(review): "en" is a substring of the always-printed
    # "Document processing complete." line, so this check cannot fail on its
    # own; consider asserting on the full detected-language line instead.
    assert "en" in out
    assert "FakeDoc" in out

0 commit comments

Comments
 (0)