DuckDB native Deltalake (#370)
Resolves #258
jochenchrist authored Aug 10, 2024
1 parent a790517 commit 3368d85
Showing 4 changed files with 6 additions and 30 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Support of varchar max length in Glue import (#351)
 
+### Changed
+- Deltalake is now using DuckDB's native deltalake support (#258). Extra deltalake removed.
+
 ### Fixed
 - Fix an issue where Glue database without a location creates invalid data contract (#351)
 - Fix bigint -> long data type mapping (#351)
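The changelog entry above relies on DuckDB's `delta` extension, which reads Delta Lake tables directly through the `delta_scan` table function instead of materializing them with the `deltalake` Python package. A minimal sketch of that native path, assuming a local Delta table; the path and view name are illustrative only:

```python
import duckdb

con = duckdb.connect(database=":memory:")
# Install and load the delta extension explicitly; recent DuckDB builds can also
# auto-load it on the first call to delta_scan.
con.sql("INSTALL delta;")
con.sql("LOAD delta;")
# Expose the Delta table as a view; no deltalake/pyarrow round-trip is needed.
con.sql("""CREATE VIEW "orders" AS SELECT * FROM delta_scan('./data/orders_delta');""")
print(con.sql('SELECT COUNT(*) FROM "orders"').fetchall())
```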
1 change: 0 additions & 1 deletion README.md
@@ -190,7 +190,6 @@ A list of available extras:
 | Avro Support | `pip install datacontract-cli[avro]` |
 | Google BigQuery | `pip install datacontract-cli[bigquery]` |
 | Databricks Integration | `pip install datacontract-cli[databricks]` |
-| Deltalake Integration | `pip install datacontract-cli[deltalake]` |
 | Kafka Integration | `pip install datacontract-cli[kafka]` |
 | PostgreSQL Integration | `pip install datacontract-cli[postgres]` |
 | S3 Integration | `pip install datacontract-cli[s3]` |
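With the dedicated `deltalake` extra removed from this table, Delta Lake reads ride on DuckDB itself. A quick way to check whether a given DuckDB build has the extension available is DuckDB's `duckdb_extensions()` catalog function; the filter below is just one way to inspect it:

```python
import duckdb

# Show the install/load status of the delta extension in the current DuckDB build.
print(
    duckdb.sql(
        "SELECT extension_name, installed, loaded "
        "FROM duckdb_extensions() WHERE extension_name = 'delta'"
    )
)
```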
26 changes: 2 additions & 24 deletions datacontract/engines/soda/connections/duckdb.py
@@ -1,7 +1,5 @@
 import os
 
-from deltalake import DeltaTable
-
 import duckdb
 from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
 from datacontract.model.run import Run
@@ -49,28 +47,8 @@ def get_duckdb_connection(data_contract, server, run: Run):
                 f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
             )
         elif server.format == "delta":
-            if server.type == "local":
-                delta_table_arrow = DeltaTable(model_path).to_pyarrow_dataset()
-                con.register(model_name, delta_table_arrow)
-
-            if server.type == "azure":
-                # After switching to native delta table support
-                # in https://github.com/datacontract/datacontract-cli/issues/258,
-                # azure storage should also work
-                # https://github.com/duckdb/duckdb_delta/issues/21
-                raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
-            if server.type == "s3":
-                storage_options = {
-                    "AWS_ENDPOINT_URL": server.endpointUrl,
-                    "AWS_ACCESS_KEY_ID": os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID"),
-                    "AWS_SECRET_ACCESS_KEY": os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY"),
-                    "AWS_REGION": os.getenv("DATACONTRACT_S3_REGION", "us-east-1"),
-                    "AWS_ALLOW_HTTP": "True" if server.endpointUrl.startswith("http://") else "False",
-                }
-
-                delta_table_arrow = DeltaTable(model_path, storage_options=storage_options).to_pyarrow_dataset()
-
-                con.register(model_name, delta_table_arrow)
+            con.sql("update extensions;")  # Make sure we have the latest delta extension
+            con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
     return con
 
 
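The removed branch passed S3 credentials to the `deltalake` package via `storage_options`; with `delta_scan`, those credentials have to reach DuckDB instead. A hedged sketch of one way to do that with DuckDB's secrets manager (`CREATE SECRET`), reusing the environment variable names from the deleted code purely for illustration; this is not necessarily how the CLI wires up S3 elsewhere:

```python
import os

import duckdb

con = duckdb.connect(database=":memory:")
con.sql("INSTALL delta;")
con.sql("LOAD delta;")
# Hand the S3 credentials to DuckDB so delta_scan can read from the bucket.
con.sql(f"""
    CREATE SECRET datacontract_s3 (
        TYPE S3,
        KEY_ID '{os.getenv('DATACONTRACT_S3_ACCESS_KEY_ID', '')}',
        SECRET '{os.getenv('DATACONTRACT_S3_SECRET_ACCESS_KEY', '')}',
        REGION '{os.getenv('DATACONTRACT_S3_REGION', 'us-east-1')}'
    );
""")
# Illustrative bucket and prefix; delta_scan reads the Delta log and data files directly.
con.sql("""CREATE VIEW "orders" AS SELECT * FROM delta_scan('s3://example-bucket/orders_delta/');""")
```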
6 changes: 1 addition & 5 deletions pyproject.toml
@@ -55,10 +55,6 @@ databricks = [
     "soda-core-spark[databricks]>=3.3.1,<3.4.0"
 ]
 
-deltalake = [
-    "deltalake>=0.17,<0.19"
-]
-
 kafka = [
     "datacontract-cli[avro]",
     "soda-core-spark-df>=3.3.1,<3.4.0"
@@ -86,7 +82,7 @@ trino = [
 ]
 
 all = [
-    "datacontract-cli[kafka,bigquery,snowflake,postgres,databricks,sqlserver,s3,deltalake,trino]"
+    "datacontract-cli[kafka,bigquery,snowflake,postgres,databricks,sqlserver,s3,trino]"
 ]
 
 dev = [
