Skip to content

Commit

Permalink
Support GCS.
Browse files Browse the repository at this point in the history
  • Loading branch information
jochenchrist committed Aug 11, 2024
1 parent 90e80d1 commit 17b2912
Show file tree
Hide file tree
Showing 14 changed files with 7,146 additions and 1 deletion.
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,12 @@ Feel free to create an [issue](https://github.com/datacontract/datacontract-cli/

Data Contract CLI can test data that is stored in S3 buckets or any S3-compliant endpoints in various formats.

- CSV
- JSON
- Delta
- Parquet
- Iceberg (coming soon)

#### Examples

##### JSON
Expand Down Expand Up @@ -366,6 +372,32 @@ servers:



### Google Cloud Storage (GCS)

The [S3](#S3) integration also works with files on Google Cloud Storage through its [interoperability](https://cloud.google.com/storage/docs/interoperability).
Use `https://storage.googleapis.com` as the endpoint URL.

#### Example

datacontract.yaml
```yaml
servers:
production:
type: s3
endpointUrl: https://storage.googleapis.com
    location: s3://bucket-name/path/*/*.json # use the s3:// scheme instead of gs://
format: json
delimiter: new_line # new_line, array, or none
```

#### Environment Variables

| Environment Variable | Example | Description |
|-------------------------------------|----------------|------------------------------------------------------------------------------------------|
| `DATACONTRACT_S3_ACCESS_KEY_ID`     | `GOOG1EZZZ...` | The access ID of the GCS [HMAC Key](https://cloud.google.com/storage/docs/authentication/hmackeys)  |
| `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `PDWWpb...`    | The secret of the GCS [HMAC Key](https://cloud.google.com/storage/docs/authentication/hmackeys)     |


### BigQuery

We support authentication to BigQuery using Service Account Key. The used Service Account should include the roles:
Expand Down
11 changes: 11 additions & 0 deletions datacontract/engines/fastjsonschema/check_jsonschema.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,17 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
process_local_file(run, server, model_name, validate)
elif server.type == "s3":
process_s3_file(server, model_name, validate)
elif server.type == "gcs":
run.checks.append(
Check(
type="schema",
name="Check that JSON has valid schema",
model=model_name,
result="info",
reason="JSON Schema check skipped for GCS, as GCS is currently not supported",
engine="jsonschema",
)
)
else:
run.checks.append(
Check(
Expand Down
2 changes: 1 addition & 1 deletion datacontract/engines/soda/check_soda_execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def check_soda_execute(
run.log_info("Running engine soda-core")
scan = Scan()

if server.type in ["s3", "azure", "local"]:
if server.type in ["s3", "gcs", "azure", "local"]:
if server.format in ["json", "parquet", "csv", "delta"]:
con = get_duckdb_connection(data_contract, server, run)
scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
Expand Down
21 changes: 21 additions & 0 deletions datacontract/engines/soda/connections/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ def get_duckdb_connection(data_contract, server, run: Run):
if server.type == "s3":
path = server.location
setup_s3_connection(con, server)
if server.type == "gcs":
path = server.location
setup_gcs_connection(con, server)
if server.type == "azure":
path = server.location
setup_azure_connection(con, server)
Expand Down Expand Up @@ -120,6 +123,24 @@ def setup_s3_connection(con, server):
# print(con.sql("SELECT * FROM duckdb_settings() WHERE name like 's3%'"))


def setup_gcs_connection(con, server):
    """Authenticate the DuckDB connection against Google Cloud Storage.

    Reads the HMAC key pair from the environment variables
    ``DATACONTRACT_GCS_KEY_ID`` and ``DATACONTRACT_GCS_SECRET`` and registers
    a DuckDB secret of type GCS so that subsequent reads of ``gs://`` paths
    are authenticated.

    Args:
        con: An open DuckDB connection.
        server: The server definition from the data contract
            (currently unused; credentials come from the environment).

    Raises:
        ValueError: If either environment variable is not set.
    """
    key_id = os.getenv("DATACONTRACT_GCS_KEY_ID")
    secret = os.getenv("DATACONTRACT_GCS_SECRET")

    if key_id is None:
        raise ValueError("Error: Environment variable DATACONTRACT_GCS_KEY_ID is not set")
    if secret is None:
        raise ValueError("Error: Environment variable DATACONTRACT_GCS_SECRET is not set")

    # HMAC secrets may contain arbitrary characters; double any single quote
    # so the interpolated SQL string literals stay valid (and cannot break
    # out of the literal).
    key_id_sql = key_id.replace("'", "''")
    secret_sql = secret.replace("'", "''")

    con.sql(f"""
                CREATE SECRET gcs_secret (
                    TYPE GCS,
                    KEY_ID '{key_id_sql}',
                    SECRET '{secret_sql}'
                );
                """)


def setup_azure_connection(con, server):
tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/gcs-json-remote/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This folder is uploaded to a GCS bucket.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions tests/fixtures/gcs-json-remote/datacontract.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
dataContractSpecification: 0.9.2
id: inventory-events
info:
title: Inventory Events
version: 0.0.1
owner: my-domain-team
contact:
email: jochen.christ@innoq.com
servers:
gcs-url:
type: gcs
location: gs://datacontract-test-inventory/inventory/*/*/*/*/*.json
delimiter: new_line
format: json
s3-style:
type: s3
endpointUrl: https://storage.googleapis.com
location: s3://datacontract-test-inventory/inventory/*/*/*/*/*.json
delimiter: new_line
format: json
models:
inventory:
type: table
fields:
updated_at:
type: string
available:
type: numeric
location:
type: string
sku:
type: string
48 changes: 48 additions & 0 deletions tests/test_test_gcs_json_remote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os

import pytest
from dotenv import load_dotenv

from datacontract.data_contract import DataContract

datacontract = "fixtures/gcs-json-remote/datacontract.yaml"
load_dotenv(override=True)


@pytest.mark.skipif(
    any(os.environ.get(var) is None for var in ("DATACONTRACT_GCS_KEY_ID", "DATACONTRACT_GCS_SECRET")),
    reason="Requires DATACONTRACT_GCS_KEY_ID, and DATACONTRACT_GCS_SECRET to be set",
)
def test_test_gcs_json_remote_gcs_url():
    """
    Run the contract against the "gcs" server: gs:// locations are readable
    via DuckDB, while JSON Schema testing is not yet supported for GCS.
    """
    contract = DataContract(data_contract_file=datacontract, server="gcs-url")

    run = contract.test()
    print(run)

    assert run.result == "passed"


@pytest.mark.skipif(
    any(os.environ.get(var) is None for var in ("DATACONTRACT_GCS_KEY_ID", "DATACONTRACT_GCS_SECRET")),
    reason="Requires DATACONTRACT_GCS_KEY_ID, and DATACONTRACT_GCS_SECRET to be set",
)
def test_test_gcs_json_remote_s3_style(monkeypatch):
    """
    Run the contract against the S3-compatible GCS endpoint by forwarding the
    GCS HMAC credentials into the S3 environment variables.
    """
    # Map the GCS HMAC key pair onto the S3 credential variables for this test.
    env_mapping = (
        ("DATACONTRACT_S3_ACCESS_KEY_ID", "DATACONTRACT_GCS_KEY_ID"),
        ("DATACONTRACT_S3_SECRET_ACCESS_KEY", "DATACONTRACT_GCS_SECRET"),
    )
    for s3_var, gcs_var in env_mapping:
        monkeypatch.setenv(s3_var, os.environ.get(gcs_var))

    contract = DataContract(data_contract_file=datacontract, server="s3-style")

    run = contract.test()
    print(run)

    assert run.result == "passed"
    assert all(check.result == "passed" for check in run.checks)

0 comments on commit 17b2912

Please sign in to comment.