diff --git a/CHANGELOG.md b/CHANGELOG.md
index c0d6f656..22e49e17 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
 - Update default scope/redirect Url for OAuth U2M, so with default OAuth app user can run python models ([776](https://github.com/databricks/dbt-databricks/pull/776))
 - Fix foreign key constraints by switching from `parent` to `to` and `parent_columns` to `to_columns` ([789](https://github.com/databricks/dbt-databricks/pull/789))
 - Now handles external shallow clones without blowing up ([795](https://github.com/databricks/dbt-databricks/pull/795))
+- Use information_schema to get column types when possible, since describe extended truncates complex types ([796](https://github.com/databricks/dbt-databricks/pull/796))
 
 ## dbt-databricks 1.8.5 (August 6, 2024)
 
diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py
index 46cb11dd..a973a5b4 100644
--- a/dbt/adapters/databricks/impl.py
+++ b/dbt/adapters/databricks/impl.py
@@ -73,6 +73,7 @@
 from dbt.adapters.spark.impl import LIST_SCHEMAS_MACRO_NAME
 from dbt.adapters.spark.impl import SparkAdapter
 from dbt.adapters.spark.impl import TABLE_OR_VIEW_NOT_FOUND_MESSAGES
+from dbt_common.behavior_flags import BehaviorFlag
 from dbt_common.exceptions import DbtRuntimeError
 from dbt_common.utils import executor
 from dbt_common.utils.dict import AttrDict
@@ -88,6 +89,7 @@
 SHOW_TABLES_MACRO_NAME = "show_tables"
 SHOW_VIEWS_MACRO_NAME = "show_views"
 GET_COLUMNS_COMMENTS_MACRO_NAME = "get_columns_comments"
+GET_COLUMNS_BY_INFO_MACRO_NAME = "get_columns_comments_via_information_schema"
 
 
 @dataclass
@@ -164,6 +166,12 @@ class DatabricksAdapter(SparkAdapter):
         }
     )
 
+    # This will begin working once we have 1.9 of dbt-core.
+    # For now it does nothing.
+    @property
+    def _behavior_flags(self) -> List[BehaviorFlag]:
+        return [{"name": "column_types_from_information_schema", "default": False}]  # type: ignore
+
     # override/overload
     def acquire_connection(
         self, name: Optional[str] = None, query_header_context: Any = None
@@ -376,6 +384,36 @@ def parse_describe_extended(  # type: ignore[override]
 
     def get_columns_in_relation(  # type: ignore[override]
         self, relation: DatabricksRelation
+    ) -> List[DatabricksColumn]:
+        if (
+            # We can uncomment this once behavior flags are available to adapters
+            # self.behavior.column_types_from_information_schema and  # type: ignore
+            not relation.is_hive_metastore()
+        ):
+            return self._get_columns_in_relation_by_information_schema(relation)
+        else:
+            return self._get_columns_in_relation_by_describe(relation)
+
+    def _get_columns_in_relation_by_information_schema(
+        self, relation: DatabricksRelation
+    ) -> List[DatabricksColumn]:
+        rows = list(
+            handle_missing_objects(
+                lambda: self.execute_macro(
+                    GET_COLUMNS_BY_INFO_MACRO_NAME, kwargs={"relation": relation}
+                ),
+                AttrDict(),
+            )
+        )
+
+        columns = []
+        for row in rows:
+            columns.append(DatabricksColumn(column=row[0], dtype=row[1], comment=row[2]))
+
+        return columns
+
+    def _get_columns_in_relation_by_describe(
+        self, relation: DatabricksRelation
     ) -> List[DatabricksColumn]:
         rows = list(
             handle_missing_objects(
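Note on the gating above: until dbt-core 1.9 wires `_behavior_flags` into a usable `self.behavior`, the property is inert and the information_schema path is gated only on the relation not being in the hive metastore. For clarity, this is the gate the commented-out line is expected to become once behavior flags reach adapters — a sketch of intent, not yet functional code:

    # Inside DatabricksAdapter.get_columns_in_relation, once self.behavior
    # is populated from the _behavior_flags property (dbt-core 1.9+):
    if (
        self.behavior.column_types_from_information_schema  # project-level opt-in
        and not relation.is_hive_metastore()
    ):
        return self._get_columns_in_relation_by_information_schema(relation)
    else:
        return self._get_columns_in_relation_by_describe(relation)
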
diff --git a/dbt/include/databricks/macros/adapters/persist_docs.sql b/dbt/include/databricks/macros/adapters/persist_docs.sql
index 5c7a358d..46cfff7a 100644
--- a/dbt/include/databricks/macros/adapters/persist_docs.sql
+++ b/dbt/include/databricks/macros/adapters/persist_docs.sql
@@ -28,6 +28,22 @@
   {% do return(load_result('get_columns_comments').table) %}
 {% endmacro %}
 
+{% macro get_columns_comments_via_information_schema(relation) -%}
+  {% call statement('get_columns_comments_via_information_schema', fetch_result=True) -%}
+    select
+      column_name,
+      full_data_type,
+      comment
+    from `system`.`information_schema`.`columns`
+    where
+      table_catalog = '{{ relation.database|lower }}' and
+      table_schema = '{{ relation.schema|lower }}' and
+      table_name = '{{ relation.identifier|lower }}'
+  {% endcall %}
+
+  {% do return(load_result('get_columns_comments_via_information_schema').table) %}
+{% endmacro %}
+
 {% macro databricks__persist_docs(relation, model, for_relation, for_columns) -%}
   {%- if for_relation and config.persist_relation_docs() and model.description %}
     {% do alter_table_comment(relation, model) %}
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 46c95b6e..c0345f9a 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -15,4 +15,4 @@
 types-requests
 types-mock
 pre-commit
-dbt-tests-adapter~=1.8.0
+dbt-tests-adapter~=1.9.0
diff --git a/requirements.txt b/requirements.txt
index 4398f0d6..a262fedb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 databricks-sql-connector>=3.1.0, <3.2.0
 dbt-spark~=1.8.0
 dbt-core>=1.8.0, <2.0
-dbt-adapters>=1.3.0, <2.0
+dbt-adapters>=1.6.0, <2.0
 databricks-sdk==0.17.0
 keyring>=23.13.0
 pandas<2.2.0
diff --git a/tests/functional/adapter/columns/fixtures.py b/tests/functional/adapter/columns/fixtures.py
new file mode 100644
index 00000000..12e2ed60
--- /dev/null
+++ b/tests/functional/adapter/columns/fixtures.py
@@ -0,0 +1,13 @@
+base_model = """
+select struct('a', 1, 'b', 'b', 'c', ARRAY(1,2,3)) as struct_col, 'hello' as str_col
+"""
+
+schema = """
+version: 2
+
+models:
+  - name: base_model
+    columns:
+      - name: struct_col
+      - name: str_col
+"""
diff --git a/tests/functional/adapter/columns/test_get_columns.py b/tests/functional/adapter/columns/test_get_columns.py
new file mode 100644
index 00000000..ef5feb8d
--- /dev/null
+++ b/tests/functional/adapter/columns/test_get_columns.py
@@ -0,0 +1,55 @@
+import pytest
+
+from dbt.adapters.databricks.column import DatabricksColumn
+from dbt.adapters.databricks.relation import DatabricksRelation
+from tests.functional.adapter.columns import fixtures
+from dbt.tests import util
+
+
+class ColumnsInRelation:
+
+    @pytest.fixture(scope="class")
+    def models(self):
+        return {"base_model.sql": fixtures.base_model, "schema.yml": fixtures.schema}
+
+    @pytest.fixture(scope="class", autouse=True)
+    def setup(self, project):
+        util.run_dbt(["run"])
+
+    @pytest.fixture(scope="class")
+    def expected_columns(self):
+
+        return [
+            DatabricksColumn(
+                column="struct_col",
+                dtype=(
+                    "struct<col1:string,col2:int,col3:string,"
+                    "col4:string,col5:string,col6:array<int>>"
+                ),
+            ),
+            DatabricksColumn(column="str_col", dtype="string"),
+        ]
+
+    def test_columns_in_relation(self, project, expected_columns):
+        my_relation = DatabricksRelation.create(
+            database=project.database,
+            schema=project.test_schema,
+            identifier="base_model",
+            type=DatabricksRelation.Table,
+        )
+
+        with project.adapter.connection_named("_test"):
+            actual_columns = project.adapter.get_columns_in_relation(my_relation)
+        assert actual_columns == expected_columns
+
+
+class TestColumnsInRelationBehaviorFlagOff(ColumnsInRelation):
+    @pytest.fixture(scope="class")
+    def project_config_update(self):
+        return {"flags": {}}
+
+
+class TestColumnsInRelationBehaviorFlagOn(ColumnsInRelation):
+    @pytest.fixture(scope="class")
+    def project_config_update(self):
+        return {"flags": {"column_types_from_information_schema": True}}
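For context on what the new code path consumes: `get_columns_comments_via_information_schema` returns one row per column with `column_name`, `full_data_type`, and `comment`, which `_get_columns_in_relation_by_information_schema` maps positionally onto `DatabricksColumn`. A minimal illustration using the fixture model above — the row values are hypothetical but follow the shape the test expects:

    from dbt.adapters.databricks.column import DatabricksColumn

    # Each row mirrors (column_name, full_data_type, comment) from
    # system.information_schema.columns; comments are None unless persisted.
    rows = [
        (
            "struct_col",
            "struct<col1:string,col2:int,col3:string,col4:string,col5:string,col6:array<int>>",
            None,
        ),
        ("str_col", "string", None),
    ]
    columns = [DatabricksColumn(column=r[0], dtype=r[1], comment=r[2]) for r in rows]

Unlike `describe extended`, which truncates long complex types, `full_data_type` carries the complete nested type, which is why the struct type survives intact. The functional tests toggle the opt-in through project flags (`column_types_from_information_schema: true`), exercising both the describe-based and information_schema-based paths.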