docs(samples): Added Processor Version Samples (#382)
* docs(samples): Added Processor Version Samples

To be published in the documentation: https://cloud.google.com/document-ai/docs/manage-processor

- `get_processor_version`
- `list_processor_versions`
- `set_default_processor_version`

* docs(samples): Adjusted Bad Batch Input test to match the updated error message

* docs(samples): Added Deploy/Undeploy Samples

* docs(samples): Added process & batchProcess examples for processorVersions

- Removed Processor Version from basic process and batchProcess examples
- Removed the note saying processors must be created in the Cloud Console first
- Added a note that the processor must be created before running the sample, where it was missing

* docs(samples): Adjusted Enable/Disable Processor Test to avoid Race Conditions

* docs(samples): Added Delete Processor Version Sample
  - Also fixed a spelling error in the undeploy comments

* docs(samples): Updated non-idempotent unit tests to use mocks
  - Also replaced test ocr processor id after making a breaking change to the project
  - Added `field_mask` to process_documents tests
holtskinner authored Sep 26, 2022
1 parent 7cb0b36 commit f7b6110
Showing 30 changed files with 935 additions and 36 deletions.
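The processor-version management samples named in the commit message (`get_processor_version`, `list_processor_versions`, `set_default_processor_version`) are not among the files excerpted below. As a rough sketch of the client surface they exercise (assuming the same v1 `documentai` client and placeholder IDs used throughout these samples, not the exact published code), listing versions and promoting one to the default might look like this:

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Placeholder values for illustration only
project_id = "YOUR_PROJECT_ID"
location = "us"  # Format is 'us' or 'eu'
processor_id = "YOUR_PROCESSOR_ID"
processor_version_id = "YOUR_PROCESSOR_VERSION_ID"

opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)

# The full resource name of the parent processor
parent = client.processor_path(project_id, location, processor_id)

# List every version attached to the processor
for version in client.list_processor_versions(parent=parent):
    print(version.name, version.display_name, version.state)

# Promote one version to be the processor's default
default_version = client.processor_version_path(
    project_id, location, processor_id, processor_version_id
)
operation = client.set_default_processor_version(
    request=documentai.SetDefaultProcessorVersionRequest(
        processor=parent,
        default_processor_version=default_version,
    )
)
operation.result()

As with the batch sample below, `set_default_processor_version` returns a long-running operation, so the sketch blocks on `operation.result()`.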
@@ -0,0 +1,153 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START documentai_batch_process_documents_processor_version]
import re

from google.api_core.client_options import ClientOptions
from google.cloud import documentai, storage

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Example: aeb8cea219b7c272
# processor_version_id = "YOUR_PROCESSOR_VERSION_ID" # Example: pretrained-ocr-v1.0-2020-09-23
# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
# input_mime_type = "application/pdf"
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/


def batch_process_documents_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version_id: str,
    gcs_input_uri: str,
    input_mime_type: str,
    gcs_output_bucket: str,
    gcs_output_uri_prefix: str,
    timeout: int = 300,
):

    # You must set the `api_endpoint` if you use a location other than "us"
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    gcs_document = documentai.GcsDocument(
        gcs_uri=gcs_input_uri, mime_type=input_mime_type
    )

    # Load GCS Input URI into a List of document files
    gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
    input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)

    # NOTE: Alternatively, specify a GCS URI Prefix to process an entire directory
    #
    # gcs_input_uri = "gs://bucket/directory/"
    # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
    # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
    #

    # Cloud Storage URI for the Output Directory
    destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"

    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=destination_uri
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

    # The full resource name of the processor version, e.g.:
    # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # BatchProcess returns a Long Running Operation (LRO)
    operation = client.batch_process_documents(request)

    # Continually polls the operation until it is complete.
    # This could take some time for larger files
    # Format: projects/PROJECT_NUMBER/locations/LOCATION/operations/OPERATION_ID
    print(f"Waiting for operation {operation.operation.name} to complete...")
    operation.result(timeout=timeout)

    # NOTE: Can also use callbacks for asynchronous processing
    #
    # def my_callback(future):
    #     result = future.result()
    #
    # operation.add_done_callback(my_callback)

    # Once the operation is complete,
    # get output document information from operation metadata
    metadata = documentai.BatchProcessMetadata(operation.metadata)

    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Batch Process Failed: {metadata.state_message}")

    storage_client = storage.Client()

    print("Output files:")
    # One process per Input Document
    for process in metadata.individual_process_statuses:
        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
        # The Cloud Storage API requires the bucket name and URI prefix separately
        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
        if not matches:
            print(
                "Could not parse output GCS destination:",
                process.output_gcs_destination,
            )
            continue

        output_bucket, output_prefix = matches.groups()

        # Get List of Document Objects from the Output Bucket
        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

        # Document AI may output multiple JSON files per source file
        for blob in output_blobs:
            # Document AI should only output JSON files to GCS
            if ".json" not in blob.name:
                print(
                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
                )
                continue

            # Download JSON File as bytes object and convert to Document Object
            print(f"Fetching {blob.name}")
            document = documentai.Document.from_json(
                blob.download_as_bytes(), ignore_unknown_fields=True
            )

            # For a full list of Document object attributes, please reference this page:
            # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document

            # Read the text recognition output from the processor
            print("The document contains the following text:")
            print(document.text)
# [END documentai_batch_process_documents_processor_version]
@@ -0,0 +1,64 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from uuid import uuid4

from google.cloud import storage
from google.cloud.exceptions import NotFound
import pytest
from samples.snippets import batch_process_documents_processor_version_sample

location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
processor_version_id = "pretrained-form-parser-v1.0-2020-09-23"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
input_mime_type = "application/pdf"
gcs_output_uri_prefix = str(uuid4())
BUCKET_NAME = f"document-ai-python-{uuid4()}"


@pytest.fixture(scope="module")
def test_bucket():
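    # Create a scratch output bucket once for this test module;
    # the code after the yield deletes it when the tests finish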
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(BUCKET_NAME)
    yield bucket.name

    try:
        blobs = list(bucket.list_blobs())
        for blob in blobs:
            blob.delete()
        bucket.delete()
    except NotFound:
        print("Bucket already deleted.")


def test_batch_process_documents_processor_version(capsys, test_bucket):
    batch_process_documents_processor_version_sample.batch_process_documents_processor_version(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        processor_version_id=processor_version_id,
        gcs_input_uri=gcs_input_uri,
        input_mime_type=input_mime_type,
        gcs_output_bucket=f"gs://{test_bucket}",
        gcs_output_uri_prefix=gcs_output_uri_prefix,
    )
    out, _ = capsys.readouterr()

    assert "operation" in out
    assert "Fetching" in out
    assert "text:" in out
@@ -23,7 +23,6 @@
 # project_id = 'YOUR_PROJECT_ID'
 # location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
 # processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
-# processor_version = "pretrained" # Optional. Processor version to use
 # gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
 # input_mime_type = "application/pdf"
 # gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
@@ -73,17 +72,8 @@ def batch_process_documents(
 
     # The full resource name of the processor, e.g.:
     # projects/project_id/locations/location/processor/processor_id
-    # You must create new processors in the Cloud Console first
     name = client.processor_path(project_id, location, processor_id)
 
-    # NOTE: Alternatively, specify the processor_version to specify a particular version of the processor to use
-    # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processorVersion}
-    #
-    # name = client.processor_version_path(
-    #     project_id, location, processor_id, processor_version
-    # )
-    #
-
     request = documentai.BatchProcessRequest(
         name=name,
         input_documents=input_config,
@@ -44,4 +44,4 @@ def test_batch_process_documents_with_bad_input(capsys):
         out, _ = capsys.readouterr()
         assert "Failed" in out
     except Exception as e:
-        assert "Internal error" in e.message
+        assert "Failed" in e.message
@@ -0,0 +1,58 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_delete_processor_version]

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import FailedPrecondition, InvalidArgument
from google.cloud import documentai

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
# processor_version_id = 'YOUR_PROCESSOR_VERSION_ID'


def delete_processor_version_sample(
    project_id: str, location: str, processor_id: str, processor_version_id: str
):
    # You must set the `api_endpoint` if you use a location other than "us"
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor version, e.g.:
    # projects/project_id/locations/location/processors/processor_id/processorVersions/processor_version_id
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )

    # Make DeleteProcessorVersion request
    try:
        operation = client.delete_processor_version(name=name)
        # Print operation details
        print(operation.operation.name)
        # Wait for operation to complete
        operation.result()
    # The delete request will fail if the processor version doesn't exist,
    # or if it targets a pretrained version or the default processor version
    except (FailedPrecondition, InvalidArgument) as e:
        print(e.message)


# [END documentai_delete_processor_version]
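The deploy and undeploy samples this commit also adds are not shown in this excerpt. A minimal sketch of the two calls, assuming the same client setup and the same error-handling pattern as the delete sample above (not the exact published code):

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import FailedPrecondition
from google.cloud import documentai

# Placeholder values for illustration only
project_id = "YOUR_PROJECT_ID"
location = "us"
processor_id = "YOUR_PROCESSOR_ID"
processor_version_id = "YOUR_PROCESSOR_VERSION_ID"

opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)

name = client.processor_version_path(
    project_id, location, processor_id, processor_version_id
)

# Both calls return long-running operations; each can raise
# FailedPrecondition, e.g. when deploying an already-deployed version
try:
    client.deploy_processor_version(name=name).result()
except FailedPrecondition as e:
    print(e.message)

try:
    client.undeploy_processor_version(name=name).result()
except FailedPrecondition as e:
    print(e.message)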
@@ -0,0 +1,47 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

import mock
from samples.snippets import delete_processor_version_sample

location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "aaaaaaaaa"
processor_version_id = "xxxxxxxxxx"


@mock.patch(
    "google.cloud.documentai.DocumentProcessorServiceClient.delete_processor_version"
)
@mock.patch("google.api_core.operation.Operation")
def test_delete_processor_version(
    operation_mock, delete_processor_version_mock, capsys
):
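    # Have the patched client method hand back the mocked long-running operation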
    delete_processor_version_mock.return_value = operation_mock

    delete_processor_version_sample.delete_processor_version_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        processor_version_id=processor_version_id,
    )

    delete_processor_version_mock.assert_called_once()

    out, _ = capsys.readouterr()

    assert "operation" in out