Skip to content

Commit b4762e8

Browse files
authored
fix: Add trailing slash if not present for gcs_prefix in Document.from_gcs() to cover matching prefixes edge case. (#274)
* fix: Add trailing slash if not present for `gcs_prefix` in `Document.from_gcs()` to cover matching prefixes edge case.
* Added Tests for GCS Matching Prefixes
1 parent 7248fe1 commit b4762e8

File tree

4 files changed: +66 -6 lines changed

google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -504,6 +504,8 @@ def from_gcs(
504504
Document:
505505
A document from gcs.
506506
"""
507+
# Add trailing slash if not present.
508+
gcs_prefix = gcs_prefix.rstrip("/") + "/"
507509
shards = _get_shards(gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix)
508510
return cls(
509511
shards=shards,

samples/snippets/quickstart_sample.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def quickstart_sample(
5252
documentai_document: Optional[documentai.Document] = None,
5353
batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None,
5454
batch_process_operation: Optional[str] = None,
55-
) -> None:
55+
) -> document.Document:
5656
if gcs_bucket_name and gcs_prefix:
5757
# Load from Google Cloud Storage Directory
5858
print("Document structure in Cloud Storage")
@@ -128,5 +128,6 @@ def quickstart_sample(
128128
if entity.normalized_text:
129129
print(f"\tNormalized Text: {entity.normalized_text}")
130130

131+
# [END documentai_toolbox_quickstart]
131132

132-
# [END documentai_toolbox_quickstart]
133+
return wrapped_document

samples/snippets/test_quickstart_sample.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,32 @@ def test_quickstart_sample_batch_process_metadata(
9696
assert "Document Successfully Loaded!" in out
9797

9898

99+
def test_quickstart_sample_batch_process_metadata_matching_prefixes(
100+
capsys: pytest.CaptureFixture,
101+
) -> None:
102+
batch_process_metadata = documentai.BatchProcessMetadata(
103+
state=documentai.BatchProcessMetadata.State.SUCCEEDED,
104+
individual_process_statuses=[
105+
documentai.BatchProcessMetadata.IndividualProcessStatus(
106+
input_gcs_source="gs://test-directory/documentai/input.pdf",
107+
output_gcs_destination="gs://documentai_toolbox_samples/output/matching-prefixes/1",
108+
),
109+
documentai.BatchProcessMetadata.IndividualProcessStatus(
110+
input_gcs_source="gs://test-directory/documentai/input.pdf",
111+
output_gcs_destination="gs://documentai_toolbox_samples/output/matching-prefixes/11",
112+
),
113+
],
114+
)
115+
wrapped_document = quickstart_sample.quickstart_sample(
116+
batch_process_metadata=batch_process_metadata
117+
)
118+
119+
assert wrapped_document.gcs_prefix == "output/matching-prefixes/1/"
120+
out, _ = capsys.readouterr()
121+
122+
assert "Document Successfully Loaded!" in out
123+
124+
99125
def test_quickstart_sample_batch_process_operation(
100126
capsys: pytest.CaptureFixture,
101127
) -> None:

tests/unit/test_document.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ def test_get_batch_process_metadata_with_valid_operation(
222222
individual_process_statuses=[
223223
documentai.BatchProcessMetadata.IndividualProcessStatus(
224224
input_gcs_source="gs://test-directory/documentai/input.pdf",
225-
output_gcs_destination="gs://test-directory/documentai/output/123456789/1/",
225+
output_gcs_destination="gs://test-directory/documentai/output/123456789/1",
226226
)
227227
],
228228
)
@@ -256,7 +256,7 @@ def test_get_batch_process_metadata_with_running_operation(
256256
individual_process_statuses=[
257257
documentai.BatchProcessMetadata.IndividualProcessStatus(
258258
input_gcs_source="gs://test-directory/documentai/input.pdf",
259-
output_gcs_destination="gs://test-directory/documentai/output/123456789/1/",
259+
output_gcs_destination="gs://test-directory/documentai/output/123456789/1",
260260
)
261261
],
262262
)
@@ -442,11 +442,11 @@ def test_document_from_batch_process_metadata_with_multiple_input_files(
442442
individual_process_statuses=[
443443
mock.Mock(
444444
input_gcs_source="gs://test-directory/documentai/input.pdf",
445-
output_gcs_destination="gs://test-directory/documentai/output/123456789/1/",
445+
output_gcs_destination="gs://test-directory/documentai/output/123456789/1",
446446
),
447447
mock.Mock(
448448
input_gcs_source="gs://test-directory/documentai/input2.pdf",
449-
output_gcs_destination="gs://test-directory/documentai/output/123456789/2/",
449+
output_gcs_destination="gs://test-directory/documentai/output/123456789/2",
450450
),
451451
],
452452
)
@@ -465,6 +465,37 @@ def test_document_from_batch_process_metadata_with_multiple_input_files(
465465
assert documents[1].gcs_input_uri == "gs://test-directory/documentai/input2.pdf"
466466

467467

468+
def test_document_from_batch_process_metadata_with_multiple_input_files_matching_prefix(
469+
get_bytes_multiple_directories_mock,
470+
):
471+
mock_metadata = mock.Mock(
472+
state=documentai.BatchProcessMetadata.State.SUCCEEDED,
473+
individual_process_statuses=[
474+
mock.Mock(
475+
input_gcs_source="gs://test-directory/documentai/input.pdf",
476+
output_gcs_destination="gs://test-directory/documentai/output/123456789/1",
477+
),
478+
mock.Mock(
479+
input_gcs_source="gs://test-directory/documentai/input2.pdf",
480+
output_gcs_destination="gs://test-directory/documentai/output/123456789/11",
481+
),
482+
],
483+
)
484+
documents = document.Document.from_batch_process_metadata(mock_metadata)
485+
486+
get_bytes_multiple_directories_mock.assert_called()
487+
assert get_bytes_multiple_directories_mock.call_count == 2
488+
assert len(documents) == 2
489+
490+
assert documents[0].gcs_bucket_name == "test-directory"
491+
assert documents[0].gcs_prefix == "documentai/output/123456789/1/"
492+
assert documents[0].gcs_input_uri == "gs://test-directory/documentai/input.pdf"
493+
494+
assert documents[1].gcs_bucket_name == "test-directory"
495+
assert documents[1].gcs_prefix == "documentai/output/123456789/11/"
496+
assert documents[1].gcs_input_uri == "gs://test-directory/documentai/input2.pdf"
497+
498+
468499
def test_document_from_batch_process_metadata_with_failed_operation():
469500
with pytest.raises(
470501
ValueError,

0 commit comments

Comments
 (0)