diff --git a/CHANGELOG.md b/CHANGELOG.md
index 240f62ca..8fcd89a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Support of varchar max length in Glue import (#351)
 
+### Changed
+- Deltalake is now using DuckDB's native deltalake support (#258). Extra deltalake removed.
+
 ### Fixed
 - Fix an issue where Glue database without a location creates invalid data contract (#351)
 - Fix bigint -> long data type mapping (#351)
diff --git a/README.md b/README.md
index 9018f701..bdfa9aa4 100644
--- a/README.md
+++ b/README.md
@@ -190,7 +190,6 @@ A list of available extras:
 | Avro Support | `pip install datacontract-cli[avro]` |
 | Google BigQuery | `pip install datacontract-cli[bigquery]` |
 | Databricks Integration | `pip install datacontract-cli[databricks]` |
-| Deltalake Integration | `pip install datacontract-cli[deltalake]` |
 | Kafka Integration | `pip install datacontract-cli[kafka]` |
 | PostgreSQL Integration | `pip install datacontract-cli[postgres]` |
 | S3 Integration | `pip install datacontract-cli[s3]` |
diff --git a/datacontract/engines/soda/connections/duckdb.py b/datacontract/engines/soda/connections/duckdb.py
index 4d36c427..f78e8640 100644
--- a/datacontract/engines/soda/connections/duckdb.py
+++ b/datacontract/engines/soda/connections/duckdb.py
@@ -1,7 +1,5 @@
 import os
 
-from deltalake import DeltaTable
-
 import duckdb
 from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
 from datacontract.model.run import Run
@@ -49,28 +47,8 @@ def get_duckdb_connection(data_contract, server, run: Run):
                 f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
             )
         elif server.format == "delta":
-            if server.type == "local":
-                delta_table_arrow = DeltaTable(model_path).to_pyarrow_dataset()
-                con.register(model_name, delta_table_arrow)
-
-            if server.type == "azure":
-                # After switching to native delta table support
-                # in https://github.com/datacontract/datacontract-cli/issues/258,
-                # azure storage should also work
-                # https://github.com/duckdb/duckdb_delta/issues/21
-                raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
-            if server.type == "s3":
-                storage_options = {
-                    "AWS_ENDPOINT_URL": server.endpointUrl,
-                    "AWS_ACCESS_KEY_ID": os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID"),
-                    "AWS_SECRET_ACCESS_KEY": os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY"),
-                    "AWS_REGION": os.getenv("DATACONTRACT_S3_REGION", "us-east-1"),
-                    "AWS_ALLOW_HTTP": "True" if server.endpointUrl.startswith("http://") else "False",
-                }
-
-                delta_table_arrow = DeltaTable(model_path, storage_options=storage_options).to_pyarrow_dataset()
-
-                con.register(model_name, delta_table_arrow)
+            con.sql("update extensions;")  # Make sure we have the latest delta extension
+            con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
 
     return con
 
diff --git a/pyproject.toml b/pyproject.toml
index d7171e57..3edadd4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,10 +55,6 @@ databricks = [
     "soda-core-spark[databricks]>=3.3.1,<3.4.0"
 ]
-deltalake = [
-    "deltalake>=0.17,<0.19"
-]
-
 kafka = [
     "datacontract-cli[avro]",
     "soda-core-spark-df>=3.3.1,<3.4.0"
 ]
@@ -86,7 +82,7 @@ trino = [
 ]
 
 all = [
-    "datacontract-cli[kafka,bigquery,snowflake,postgres,databricks,sqlserver,s3,deltalake,trino]"
+    "datacontract-cli[kafka,bigquery,snowflake,postgres,databricks,sqlserver,s3,trino]"
 ]
 
 dev = [