Skip to content

Commit 778421f

Browse files
authored
feat: Added gcs_uri parameter to Document.from_gcs() to allow importing of a single Document JSON (#261)
* feat: Added `gcs_uri` parameter to `Document.from_gcs()` to allow importing of a single Document JSON
* Change `get_blob()` to use `storage.Blob.from_string()`
1 parent 2dfcdf6 commit 778421f

File tree

3 files changed

+70
-9
lines changed

3 files changed

+70
-9
lines changed

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/utilities/gcs_utilities.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -136,17 +136,13 @@ def get_blob(
136136
module (Optional[str]):
137137
Optional. The module for a custom user agent header.
138138
Returns:
139-
List[storage.blob.Blob]:
140-
A list of the blobs in the Cloud Storage path.
139+
storage.blob.Blob:
140+
The blob in the Cloud Storage path.
141141
"""
142-
gcs_bucket_name, gcs_file_name = split_gcs_uri(gcs_uri)
143-
144-
if not re.match(constants.FILE_CHECK_REGEX, gcs_file_name):
142+
if not re.match(constants.FILE_CHECK_REGEX, gcs_uri):
145143
raise ValueError("gcs_uri must link to a single file.")
146144

147-
storage_client = _get_storage_client(module=module)
148-
bucket = storage_client.bucket(bucket_name=gcs_bucket_name)
149-
return bucket.get_blob(gcs_file_name)
145+
return storage.Blob.from_string(gcs_uri, _get_storage_client(module=module))
150146

151147

152148
def split_gcs_uri(gcs_uri: str) -> Tuple[str, str]:

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ class Document:
366366
shards: List[documentai.Document] = dataclasses.field(repr=False)
367367
gcs_bucket_name: Optional[str] = dataclasses.field(default=None, repr=False)
368368
gcs_prefix: Optional[str] = dataclasses.field(default=None, repr=False)
369+
gcs_uri: Optional[str] = dataclasses.field(default=None, repr=False)
369370
gcs_input_uri: Optional[str] = dataclasses.field(default=None, repr=False)
370371

371372
_pages: Optional[List[Page]] = dataclasses.field(
@@ -463,7 +464,7 @@ def from_gcs(
463464
gcs_prefix: str,
464465
gcs_input_uri: Optional[str] = None,
465466
) -> "Document":
466-
r"""Loads Document from Cloud Storage.
467+
r"""Loads a Document from a Cloud Storage directory.
467468
468469
Args:
469470
gcs_bucket_name (str):
@@ -490,6 +491,40 @@ def from_gcs(
490491
gcs_input_uri=gcs_input_uri,
491492
)
492493

494+
@classmethod
495+
def from_gcs_uri(
496+
cls: Type["Document"],
497+
gcs_uri: str,
498+
gcs_input_uri: Optional[str] = None,
499+
) -> "Document":
500+
r"""Loads a Document from a Cloud Storage uri.
501+
502+
Args:
503+
gcs_uri (str):
504+
Required. The full GCS uri to a Document JSON file.
505+
506+
Example: `gs://{bucket_name}/{optional_folder}/{target_file}.json`.
507+
gcs_input_uri (str):
508+
Optional. The gcs uri to the original input file.
509+
510+
Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/{file_name}.pdf`
511+
Returns:
512+
Document:
513+
A document from gcs.
514+
"""
515+
blob = gcs_utilities.get_blob(gcs_uri=gcs_uri, module="get-document")
516+
shards = [
517+
documentai.Document.from_json(
518+
blob.download_as_bytes(),
519+
ignore_unknown_fields=True,
520+
)
521+
]
522+
return cls(
523+
shards=shards,
524+
gcs_uri=gcs_uri,
525+
gcs_input_uri=gcs_input_uri,
526+
)
527+
493528
@classmethod
494529
def from_batch_process_metadata(
495530
cls: Type["Document"], metadata: documentai.BatchProcessMetadata

packages/google-cloud-documentai-toolbox/tests/unit/test_document.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,17 @@ def get_bytes_missing_shard_mock():
105105
yield byte_factory
106106

107107

108+
@pytest.fixture
109+
def get_blob_mock():
110+
with mock.patch.object(gcs_utilities, "get_blob") as blob_factory:
111+
mock_blob = mock.Mock()
112+
mock_blob.download_as_bytes.return_value = get_bytes("tests/unit/resources/0")[
113+
0
114+
]
115+
blob_factory.return_value = mock_blob
116+
yield blob_factory
117+
118+
108119
def create_document_with_images_without_bbox(get_bytes_images_mock):
109120
doc = document.Document.from_gcs(
110121
gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
@@ -394,6 +405,25 @@ def test_document_from_gcs_with_unordered_shards(get_bytes_unordered_files_mock)
394405
assert page.page_number == page_index + 1
395406

396407

408+
def test_document_from_gcs_uri(get_blob_mock):
409+
actual = document.Document.from_gcs_uri(
410+
gcs_uri="gs://test-directory/documentai/output/123456789/0/document.json"
411+
)
412+
413+
get_blob_mock.assert_called_once()
414+
415+
assert (
416+
actual.gcs_uri
417+
== "gs://test-directory/documentai/output/123456789/0/document.json"
418+
)
419+
assert len(actual.pages) == 1
420+
# checking cached value
421+
assert len(actual.pages) == 1
422+
423+
assert len(actual.text) > 0
424+
assert len(actual.text) > 0
425+
426+
397427
def test_document_from_batch_process_metadata_with_multiple_input_files(
398428
get_bytes_multiple_directories_mock,
399429
):

0 commit comments

Comments (0)