Skip to content

Commit 7f07b90

Browse files
holtskinner and galz10 authored
feat: Add entities_to_dict() and entities_to_bigquery() to Document wrapper (#50)
* feat: Add `entities_to_dict()` and `entities_to_bigquery()` to Document wrapper - Uploads entities to an existing dataset, creates new table if it doesn't already exist. ## Example Output Table | supplier_iban | purchase_order | supplier_email | freight_amount | supplier_address | receiver_address | total_amount | supplier_name | total_tax_amount | payment_terms | line_item | receiver_name | receiver_email | due_date | invoice_date | invoice_id | currency | receiver_tax_id | net_amount | vat | |---------------|----------------|---------------------|----------------|------------------------------|------------------------------|--------------|---------------|------------------|------------------|-------------------------------------------------------------------------------|---------------|-------------------|------------|--------------|------------|----------|-----------------|------------|-----| | 50 | 1 | user@companyabc.com | 600 | 111 Main Street Anytown, USA | 222 Main Street Anytown, USA | 2140 | Company ABC | 140 | 6 month contract | [Tool A 500 1.00 500.00,Service B 1 900.00 900.00,Resource C 50 12.00 600.00] | John Doe | johndoe@email.com | 2025-01-01 | 1970-01-01 | NO. 001 | $ | 1 | 2000 | 140 | * Removed unneeded test code * Added bigquery library to setup.py * Updated Docstrings * Fixed Test import linter error * Added bigQuery Library to Testing Constraints * Added handling of Nested Entities (properties) * Dependency Update for Tests * Update Dependencies * Fixed Test Output * Updated DatasetReference based on Deprecation Warning * samples: Added Entities to BigQuery Sample Code * Added Required tag to `entities_to_bigquery()` arguments * Fixed Issues from merge conflict * Fixed numpy import --------- Co-authored-by: Gal Zahavi <38544478+galz10@users.noreply.github.com>
1 parent f114624 commit 7f07b90

File tree

15 files changed

+255
-12
lines changed

15 files changed

+255
-12
lines changed

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from typing import Dict, List, Optional
2222

2323
from google.api_core import client_info
24+
from google.cloud import bigquery
2425
from google.cloud import documentai
2526
from google.cloud import storage
2627
from google.cloud import documentai_toolbox
@@ -50,6 +51,8 @@ def _entities_from_shards(
5051
for shard in shards:
5152
for entity in shard.entities:
5253
result.append(Entity(documentai_entity=entity))
54+
for prop in entity.properties:
55+
result.append(Entity(documentai_entity=prop))
5356
return result
5457

5558

@@ -368,6 +371,69 @@ def get_entity_by_type(self, target_type: str) -> List[Entity]:
368371
"""
369372
return [entity for entity in self.entities if entity.type_ == target_type]
370373

374+
def entities_to_dict(self) -> Dict:
    r"""Returns a dictionary of the document's entities, keyed by entity type.

    Entity types containing ``/`` are normalized to use ``_`` so the keys are
    usable as BigQuery column names. A type that occurs once maps to that
    entity's ``mention_text``; a type that occurs multiple times (e.g.
    ``line_item``) maps to a list of mention texts in document order.

    Returns:
        Dict:
            The Dict of the entities indexed by type.

    """
    entities_dict: Dict = {}
    for entity in self.entities:
        entity_type = entity.type_.replace("/", "_")

        # Membership test instead of truthiness: an entity whose
        # mention_text is empty ("" is falsy) must still count as
        # "already seen", so a repeated type is collected into a list
        # rather than silently overwritten.
        if entity_type not in entities_dict:
            entities_dict[entity_type] = entity.mention_text
            continue

        existing_entity = entities_dict[entity_type]
        # For entities that can have multiple values (e.g. line_item),
        # promote the stored scalar to a list on the second occurrence.
        if not isinstance(existing_entity, list):
            existing_entity = [existing_entity]

        existing_entity.append(entity.mention_text)
        entities_dict[entity_type] = existing_entity

    return entities_dict
401+
def entities_to_bigquery(
    self, dataset_name: str, table_name: str, project_id: Optional[str] = None
) -> bigquery.job.LoadJob:
    r"""Adds extracted entities to a BigQuery table.

    Args:
        dataset_name (str):
            Required. Name of the BigQuery dataset.
        table_name (str):
            Required. Name of the BigQuery table.
        project_id (Optional[str]):
            Optional. Project ID containing the BigQuery table. If not passed, falls back to the default inferred from the environment.
    Returns:
        bigquery.job.LoadJob:
            The BigQuery LoadJob for adding the entities.

    """
    # Allow new and relaxed columns so documents containing
    # previously-unseen entity types can still be appended
    # to an existing table.
    load_config = bigquery.LoadJobConfig(
        schema_update_options=[
            bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
            bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
        ],
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )

    destination = bigquery.DatasetReference(
        project=project_id, dataset_id=dataset_name
    ).table(table_name)

    bq_client = bigquery.Client(project=project_id)
    return bq_client.load_table_from_json(
        json_rows=[self.entities_to_dict()],
        destination=destination,
        job_config=load_config,
    )
371437
def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
372438
r"""Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.
373439

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/entity.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,20 @@ class Entity:
3737
# Underlying Document AI entity proto this wrapper is built from
# (excluded from repr to keep output readable).
documentai_entity: documentai.Document.Entity = dataclasses.field(repr=False)
# Entity type string, copied from the proto in __post_init__.
type_: str = dataclasses.field(init=False)
# Raw text of the entity as it appears in the document.
mention_text: str = dataclasses.field(init=False, default="")
# Normalized value text when the processor provided one; "" otherwise.
normalized_text: str = dataclasses.field(init=False, default="")
# Only Populated for Splitter/Classifier Output
start_page: int = dataclasses.field(init=False)
end_page: int = dataclasses.field(init=False)
4344

4445
def __post_init__(self):
    """Populate the wrapper fields from the underlying Document AI entity proto."""
    proto = self.documentai_entity

    self.type_ = proto.type_
    self.mention_text = proto.mention_text

    normalized = proto.normalized_value
    if normalized and normalized.text:
        self.normalized_text = normalized.text

    page_refs = proto.page_anchor.page_refs
    if page_refs:
        # Page range is only populated for Splitter/Classifier output.
        self.start_page = int(page_refs[0].page)
        self.end_page = int(page_refs[-1].page)
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
17+
# [START documentai_toolbox_entities_to_bigquery]
18+
19+
from google.cloud.documentai_toolbox import document
20+
21+
# TODO(developer): Uncomment these variables before running the sample.
22+
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
23+
# gcs_bucket_name = "bucket"
24+
# gcs_prefix = "path/to/folder"
25+
# dataset_name = "test_dataset"
26+
# table_name = "test_table"
27+
# project_id = "YOUR_PROJECT_ID"
28+
29+
30+
def entities_to_bigquery_sample(
    gcs_bucket_name: str,
    gcs_prefix: str,
    dataset_name: str,
    table_name: str,
    project_id: str,
) -> None:
    """Load the entities of a GCS-stored document into a BigQuery table."""
    # Wrap the (possibly sharded) document.proto files found under the prefix.
    wrapped_doc = document.Document.from_gcs(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
    )

    load_job = wrapped_doc.entities_to_bigquery(
        dataset_name=dataset_name, table_name=table_name, project_id=project_id
    )

    for line in (
        "Document entities loaded into BigQuery",
        f"Job ID: {load_job.job_id}",
        f"Table: {load_job.destination.path}",
    ):
        print(line)
48+
49+
50+
# [END documentai_toolbox_entities_to_bigquery]
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pytest==7.2.1
2-
mock==5.0.1
2+
mock==5.0.1
3+
google-cloud-bigquery==3.5.0
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
google-cloud-bigquery==3.5.0
12
google-cloud-documentai==2.12.0
23
google-cloud-storage==2.7.0
34
google-cloud-documentai-toolbox==0.1.1a0
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
import os
17+
import uuid
18+
19+
import pytest
20+
from samples.snippets import entities_to_bigquery_sample
21+
22+
from google.cloud import bigquery
23+
24+
location = "us"  # NOTE(review): appears unused in this test module — confirm before removing
# Project to run against; the test requires GOOGLE_CLOUD_PROJECT to be set.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
# Bucket and prefix of a pre-generated (sharded) document.proto fixture.
gcs_bucket_name = "documentai_toolbox_samples"
gcs_input_uri = "output/123456789/0"
# Randomized dataset/table names so concurrent test runs don't collide.
dataset_name = f"document_ai_toolbox_test_{uuid.uuid4().hex}"
table_name = f"test_table_{uuid.uuid4().hex}"
30+
31+
32+
def test_entities_to_bigquery_sample(capsys: pytest.CaptureFixture) -> None:
    """End-to-end check that document entities load into BigQuery.

    Creates a throwaway dataset, runs the sample, verifies its printed
    output, and always deletes the dataset afterwards.
    """
    client = bigquery.Client(project=project_id)
    dataset = bigquery.Dataset(f"{project_id}.{dataset_name}")
    dataset.location = "US"
    dataset = client.create_dataset(dataset, timeout=30, exists_ok=True)

    try:
        entities_to_bigquery_sample.entities_to_bigquery_sample(
            gcs_bucket_name=gcs_bucket_name,
            gcs_prefix=gcs_input_uri,
            dataset_name=dataset_name,
            table_name=table_name,
            project_id=project_id,
        )
        out, _ = capsys.readouterr()

        assert "Document entities loaded into BigQuery" in out
        assert "Job ID:" in out
        assert (
            f"Table: /projects/{project_id}/datasets/{dataset_name}/tables/{table_name}"
            in out
        )
    finally:
        # Clean up even when the sample or an assertion fails, so failed runs
        # don't leak datasets. delete_contents=True is required because the
        # sample creates a table inside the dataset (deleting a non-empty
        # dataset otherwise raises); not_found_ok guards double-deletion.
        client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)

packages/google-cloud-documentai-toolbox/samples/snippets/test_quickstart_sample.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2020 Google LLC
1+
# Copyright 2023 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -31,4 +31,4 @@ def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None:
3131
out, _ = capsys.readouterr()
3232

3333
assert "Number of Pages: 1" in out
34-
assert "Number of Entities: 22" in out
34+
assert "Number of Entities: 35" in out

packages/google-cloud-documentai-toolbox/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
"proto-plus >= 1.22.0, <2.0.0dev",
5050
"proto-plus >= 1.22.2, <2.0.0dev; python_version>='3.11'",
5151
"grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
52+
"google-cloud-bigquery >= 3.5.0, < 4.0.0dev",
5253
"google-cloud-documentai >= 1.2.1, < 3.0.0dev",
5354
"google-cloud-storage >= 1.31.0, < 3.0.0dev",
5455
"numpy >= 1.18.1",

packages/google-cloud-documentai-toolbox/testing/constraints-3.10.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ libcst
66
pandas
77
proto-plus
88
grpc-google-iam-v1
9+
google-cloud-bigquery
910
google-cloud-documentai
1011
google-cloud-storage
12+
numpy
1113
pikepdf

packages/google-cloud-documentai-toolbox/testing/constraints-3.11.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ libcst
66
pandas
77
proto-plus
88
grpc-google-iam-v1
9+
google-cloud-bigquery
910
google-cloud-documentai
1011
google-cloud-storage
12+
numpy
1113
pikepdf

0 commit comments

Comments
 (0)