DuckDB native Deltalake (#370)
Resolves #258
jochenchrist authored Aug 10, 2024
1 parent a790517 commit 3368d85
Showing 4 changed files with 6 additions and 30 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Support of varchar max length in Glue import (#351)
 
+### Changed
+- Deltalake is now using DuckDB's native deltalake support (#258). Extra deltalake removed.
+
 ### Fixed
 - Fix an issue where Glue database without a location creates invalid data contract (#351)
 - Fix bigint -> long data type mapping (#351)
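The changelog entry above relies on DuckDB's `delta` extension, which reads Delta Lake tables directly through the `delta_scan` table function instead of materializing them with the `deltalake` Python package. A minimal sketch of that native path, assuming a local Delta table; the path and view name are illustrative only:

```python
import duckdb

con = duckdb.connect(database=":memory:")
# Install and load the delta extension explicitly; recent DuckDB builds can also
# auto-load it on the first call to delta_scan.
con.sql("INSTALL delta;")
con.sql("LOAD delta;")
# Expose the Delta table as a view; no deltalake/pyarrow round-trip is needed.
con.sql("""CREATE VIEW "orders" AS SELECT * FROM delta_scan('./data/orders_delta');""")
print(con.sql('SELECT COUNT(*) FROM "orders"').fetchall())
```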
1 change: 0 additions & 1 deletion README.md
@@ -190,7 +190,6 @@ A list of available extras:
 | Avro Support | `pip install datacontract-cli[avro]` |
 | Google BigQuery | `pip install datacontract-cli[bigquery]` |
 | Databricks Integration | `pip install datacontract-cli[databricks]` |
-| Deltalake Integration | `pip install datacontract-cli[deltalake]` |
 | Kafka Integration | `pip install datacontract-cli[kafka]` |
 | PostgreSQL Integration | `pip install datacontract-cli[postgres]` |
 | S3 Integration | `pip install datacontract-cli[s3]` |
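With the dedicated `deltalake` extra removed from this table, Delta Lake reads ride on DuckDB itself. A quick way to check whether a given DuckDB build has the extension available is DuckDB's `duckdb_extensions()` catalog function; the filter below is just one way to inspect it:

```python
import duckdb

# Show the install/load status of the delta extension in the current DuckDB build.
print(
    duckdb.sql(
        "SELECT extension_name, installed, loaded "
        "FROM duckdb_extensions() WHERE extension_name = 'delta'"
    )
)
```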
26 changes: 2 additions & 24 deletions datacontract/engines/soda/connections/duckdb.py
@@ -1,7 +1,5 @@
 import os
 
-from deltalake import DeltaTable
-
 import duckdb
 from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
 from datacontract.model.run import Run
@@ -49,28 +47,8 @@ def get_duckdb_connection(data_contract, server, run: Run):
                 f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
             )
         elif server.format == "delta":
-            if server.type == "local":
-                delta_table_arrow = DeltaTable(model_path).to_pyarrow_dataset()
-                con.register(model_name, delta_table_arrow)
-
-            if server.type == "azure":
-                # After switching to native delta table support
-                # in https://github.com/datacontract/datacontract-cli/issues/258,
-                # azure storage should also work
-                # https://github.com/duckdb/duckdb_delta/issues/21
-                raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
-            if server.type == "s3":
-                storage_options = {
-                    "AWS_ENDPOINT_URL": server.endpointUrl,
-                    "AWS_ACCESS_KEY_ID": os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID"),
-                    "AWS_SECRET_ACCESS_KEY": os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY"),
-                    "AWS_REGION": os.getenv("DATACONTRACT_S3_REGION", "us-east-1"),
-                    "AWS_ALLOW_HTTP": "True" if server.endpointUrl.startswith("http://") else "False",
-                }
-
-                delta_table_arrow = DeltaTable(model_path, storage_options=storage_options).to_pyarrow_dataset()
-
-                con.register(model_name, delta_table_arrow)
+            con.sql("update extensions;")  # Make sure we have the latest delta extension
+            con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
     return con
 
 
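The removed branch passed S3 credentials to the `deltalake` package via `storage_options`; with `delta_scan`, those credentials have to reach DuckDB instead. A hedged sketch of one way to do that with DuckDB's secrets manager (`CREATE SECRET`), reusing the environment variable names from the deleted code purely for illustration; this is not necessarily how the CLI wires up S3 elsewhere:

```python
import os

import duckdb

con = duckdb.connect(database=":memory:")
con.sql("INSTALL delta;")
con.sql("LOAD delta;")
# Hand the S3 credentials to DuckDB so delta_scan can read from the bucket.
con.sql(f"""
    CREATE SECRET datacontract_s3 (
        TYPE S3,
        KEY_ID '{os.getenv('DATACONTRACT_S3_ACCESS_KEY_ID', '')}',
        SECRET '{os.getenv('DATACONTRACT_S3_SECRET_ACCESS_KEY', '')}',
        REGION '{os.getenv('DATACONTRACT_S3_REGION', 'us-east-1')}'
    );
""")
# Illustrative bucket and prefix; delta_scan reads the Delta log and data files directly.
con.sql("""CREATE VIEW "orders" AS SELECT * FROM delta_scan('s3://example-bucket/orders_delta/');""")
```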
6 changes: 1 addition & 5 deletions pyproject.toml
@@ -55,10 +55,6 @@ databricks = [
     "soda-core-spark[databricks]>=3.3.1,<3.4.0"
 ]
 
-deltalake = [
-    "deltalake>=0.17,<0.19"
-]
-
 kafka = [
     "datacontract-cli[avro]",
     "soda-core-spark-df>=3.3.1,<3.4.0"
@@ -86,7 +82,7 @@ trino = [
 ]
 
 all = [
-    "datacontract-cli[kafka,bigquery,snowflake,postgres,databricks,sqlserver,s3,deltalake,trino]"
+    "datacontract-cli[kafka,bigquery,snowflake,postgres,databricks,sqlserver,s3,trino]"
 ]
 
 dev = [
