Skip to content

Commit 3796047

Browse files
Add Databricks support
1 parent 18af04d commit 3796047

File tree

4 files changed

+162
-24
lines changed

4 files changed

+162
-24
lines changed

cardinal_pythonlib/sql/validation.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@
2424
2525
**Functions to check table/column names etc. for validity in SQL.**
2626
27-
This is a slight
28-
2927
"""
3028

3129
import re
@@ -41,13 +39,29 @@
4139
# ... SQL Server is very liberal!
4240

4341

44-
# - ANSI: http://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#predefined-type # noqa: E501
42+
# - ANSI:
43+
# - http://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#predefined-type # noqa: E501
44+
#
4545
# - SQL Server:
4646
# - https://support.microsoft.com/en-us/office/equivalent-ansi-sql-data-types-7a0a6bef-ef25-45f9-8a9a-3c5f21b5c65d # noqa: E501
4747
# - https://docs.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql?view=sql-server-ver15 # noqa: E501
48+
# - https://learn.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql?view=sql-server-ver16 # noqa: E501
4849
# - Note that ANSI "BIT" is SQL Server "BINARY".
49-
# - MySQL: https://dev.mysql.com/doc/refman/8.0/en/data-types.html
50-
# - PostgreSQL: https://www.postgresql.org/docs/9.5/datatype.html
50+
#
51+
# - MySQL:
52+
# - https://dev.mysql.com/doc/refman/8.0/en/data-types.html
53+
# - https://dev.mysql.com/doc/refman/9.1/en/data-types.html
54+
#
55+
# - PostgreSQL:
56+
# - https://www.postgresql.org/docs/9.5/datatype.html
57+
#
58+
# - SQLite:
59+
# - https://www.sqlite.org/datatype3.html
60+
#
61+
# - Databricks:
62+
# - https://github.com/databricks/databricks-sqlalchemy
63+
64+
SQLTYPE_DATE = "DATE" # ANSI
5165

5266
SQLTYPES_INTEGER = (
5367
"BIGINT", # ANSI
@@ -71,6 +85,12 @@
7185
"SMALLSERIAL", # PostgreSQL
7286
"TINYINT", # SQL Server, MySQL
7387
)
88+
SQLTYPES_BIT = (
89+
"BIT VARYING", # ANSI
90+
"BIT", # ANSI
91+
"BOOL", # MySQL synonym for BOOLEAN or TINYINT(1)
92+
"BOOLEAN", # ANSI
93+
)
7494
SQLTYPES_FLOAT = (
7595
"DOUBLE PRECISION", # ANSI (8 bytes)
7696
"DOUBLE", # SQL Server, MySQL; synonym for DOUBLE PRECISION
@@ -84,16 +104,13 @@
84104
"SINGLE", # SQL Server
85105
)
86106
SQLTYPES_OTHER_NUMERIC = (
87-
"BIT VARYING", # ANSI
88-
"BIT", # ANSI
89-
"BOOL", # MySQL synonym for BOOLEAN or TINYINT(1)
90-
"BOOLEAN", # ANSI
91107
"DEC", # ANSI; synonym for DECIMAL
92108
"DECIMAL", # ANSI
93109
"FIXED", # MySQL; synonym for DECIMAL
94110
"LOGICAL", # SQL Server
95111
"LOGICAL1", # SQL Server
96112
"NUMERIC", # ANSI; synonym for DECIMAL
113+
"SMALLMONEY", # SQL Server
97114
"ROWVERSION", # SQL Server
98115
"VARBIT", # PostgreSQL synonym for BIT VARYING
99116
"YESNO", # SQL Server
@@ -125,8 +142,8 @@
125142
"NTEXT", # SQL Server
126143
"NVARCHAR", # SQL Server
127144
"SET", # MySQL
128-
"STRING", # SQL Server
129-
"TEXT", # SQL Server, MySQL
145+
"STRING", # SQL Server, Databricks
146+
"TEXT", # SQL Server, MySQL, SQLite
130147
"TINYTEXT", # MySQL
131148
"VARCHAR", # ANSI
132149
)
@@ -146,12 +163,13 @@
146163
"VARBINARY", # ANSI
147164
)
148165
SQLTYPES_WITH_DATE = (
149-
"DATE", # ANSI
150-
"DATETIME", # SQL Server, MySQL
166+
SQLTYPE_DATE, # ANSI
167+
"DATETIME", # SQL Server, MySQL, most
151168
"DATETIME2", # SQL Server
152169
"DATETIMEOFFSET", # SQL Server (date + time + time zone)
153170
"SMALLDATETIME", # SQL Server
154171
"TIMESTAMP", # ANSI
172+
"TIMESTAMP_NTZ", # Databricks
155173
)
156174
SQLTYPES_DATETIME_OTHER = (
157175
"INTERVAL", # ANSI (not always supported); PostgreSQL

cardinal_pythonlib/sqlalchemy/dialect.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ class SqlaDialectName(object):
5656
SYBASE = "sybase"
5757

5858
# Additional third-party dialects:
59+
# - https://docs.sqlalchemy.org/en/20/dialects/
60+
# Interface:
61+
# - https://docs.sqlalchemy.org/en/20/core/internals.html#sqlalchemy.engine.Dialect # noqa: E501
5962

6063
DATABRICKS = "databricks"
6164
# ... https://github.com/databricks/databricks-sqlalchemy

cardinal_pythonlib/sqlalchemy/schema.py

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
**Functions to work with SQLAlchemy schemas (schemata) directly, via SQLAlchemy
2626
Core.**
2727
28+
Functions that have to work with specific dialect information are marked
29+
DIALECT-AWARE.
30+
2831
"""
2932

3033
import ast
@@ -60,7 +63,19 @@
6063
)
6164
from sqlalchemy.sql import sqltypes, text
6265
from sqlalchemy.sql.ddl import DDLElement
63-
from sqlalchemy.sql.sqltypes import BigInteger, TypeEngine
66+
from sqlalchemy.sql.sqltypes import (
67+
BigInteger,
68+
Boolean,
69+
Date,
70+
DateTime,
71+
Double,
72+
Float,
73+
Integer,
74+
Numeric,
75+
SmallInteger,
76+
Text,
77+
TypeEngine,
78+
)
6479
from sqlalchemy.sql.visitors import Visitable
6580

6681
from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler
@@ -86,6 +101,23 @@
86101
MSSQL_DEFAULT_SCHEMA = "dbo"
87102
POSTGRES_DEFAULT_SCHEMA = "public"
88103

104+
DATABRICKS_SQLCOLTYPE_TO_SQLALCHEMY_GENERIC = {
105+
# A bit nasty: a hand-maintained mapping, needed because the Databricks dialect (https://github.com/databricks/databricks-sqlalchemy) has no ischema_names.
106+
# Part of the reverse mapping is via
107+
# from databricks.sqlalchemy import DatabricksDialect
108+
# print(DatabricksDialect.colspecs)
109+
"BIGINT": BigInteger,
110+
"BOOLEAN": Boolean,
111+
"DATE": Date,
112+
"TIMESTAMP_NTZ": DateTime,
113+
"DOUBLE": Double,
114+
"FLOAT": Float,
115+
"INT": Integer,
116+
"DECIMAL": Numeric,
117+
"SMALLINT": SmallInteger,
118+
"STRING": Text,
119+
}
120+
89121

90122
# =============================================================================
91123
# Inspect tables (SQLAlchemy Core)
@@ -498,6 +530,8 @@ def add_index(
498530
499531
The table name is worked out from the :class:`Column` object.
500532
533+
DIALECT-AWARE.
534+
501535
Args:
502536
engine: SQLAlchemy :class:`Engine` object
503537
sqla_column: single column to index
@@ -733,6 +767,8 @@ def giant_text_sqltype(dialect: Dialect) -> str:
733767
Returns the SQL column type used to make very large text columns for a
734768
given dialect.
735769
770+
DIALECT-AWARE.
771+
736772
Args:
737773
dialect: a SQLAlchemy :class:`Dialect`
738774
Returns:
@@ -755,6 +791,9 @@ def giant_text_sqltype(dialect: Dialect) -> str:
755791
elif dname == SqlaDialectName.SQLITE:
756792
return "TEXT"
757793
# https://www.sqlite.org/datatype3.html
794+
elif dname == SqlaDialectName.DATABRICKS:
795+
return "STRING"
796+
# https://github.com/databricks/databricks-sqlalchemy
758797
else:
759798
raise ValueError(f"Unknown dialect: {dname}")
760799

@@ -787,16 +826,40 @@ def _get_sqla_coltype_class_from_str(
787826
Returns the SQLAlchemy class corresponding to a particular SQL column
788827
type in a given dialect.
789828
829+
DIALECT-AWARE.
830+
790831
Performs an upper- and lower-case search.
791832
For example, the SQLite dialect uses upper case, and the
792833
MySQL dialect uses lower case.
834+
835+
For exploratory thinking, see
836+
dev_notes/convert_sql_string_coltype_to_sqlalchemy_type.py.
837+
838+
DISCUSSION AT: https://github.com/sqlalchemy/sqlalchemy/discussions/12230
793839
"""
794-
# noinspection PyUnresolvedReferences
795-
ischema_names = dialect.ischema_names
796-
try:
797-
return ischema_names[coltype.upper()]
798-
except KeyError:
799-
return ischema_names[coltype.lower()]
840+
if hasattr(dialect, "ischema_names"):
841+
# The built-in dialects all have this, even though it's an internal
842+
# detail.
843+
ischema_names = dialect.ischema_names
844+
try:
845+
return ischema_names[coltype.upper()]
846+
except KeyError:
847+
return ischema_names[coltype.lower()]
848+
elif dialect.name == SqlaDialectName.DATABRICKS:
849+
# Ugly hack.
850+
# Databricks is an example that doesn't have ischema_names.
851+
try:
852+
return DATABRICKS_SQLCOLTYPE_TO_SQLALCHEMY_GENERIC[coltype.upper()]
853+
except KeyError:
854+
raise ValueError(
855+
f"Don't know how to convert SQL column type {coltype!r} "
856+
f"to SQLAlchemy dialect {dialect!r}"
857+
)
858+
else:
859+
raise ValueError(
860+
f"Don't know a generic way to convert SQL column types "
861+
f"(in text format) to SQLAlchemy dialect {dialect.name!r}. "
862+
)
800863

801864

802865
def get_list_of_sql_string_literals_from_quoted_csv(x: str) -> List[str]:
@@ -830,6 +893,8 @@ def get_sqla_coltype_from_dialect_str(
830893
``coltype.compile()`` or ``coltype.compile(dialect)``; see
831894
:class:`TypeEngine`.
832895
896+
DIALECT-AWARE.
897+
833898
Args:
834899
dialect: a SQLAlchemy :class:`Dialect` class
835900
@@ -999,6 +1064,8 @@ def convert_sqla_type_for_dialect(
9991064
"""
10001065
Converts an SQLAlchemy column type from one SQL dialect to another.
10011066
1067+
DIALECT-AWARE.
1068+
10021069
Args:
10031070
coltype: SQLAlchemy column type in the source dialect
10041071
@@ -1024,9 +1091,7 @@ def convert_sqla_type_for_dialect(
10241091
"""
10251092
assert coltype is not None
10261093

1027-
# noinspection PyUnresolvedReferences
10281094
to_mysql = dialect.name == SqlaDialectName.MYSQL
1029-
# noinspection PyUnresolvedReferences
10301095
to_mssql = dialect.name == SqlaDialectName.MSSQL
10311096
typeclass = type(coltype)
10321097

@@ -1201,10 +1266,10 @@ def does_sqlatype_require_index_len(
12011266

12021267

12031268
# =============================================================================
1204-
# hack_in_mssql_xml_type:
1269+
# hack_in_mssql_xml_type
1270+
# =============================================================================
12051271
#
12061272
# Removed, as mssql.base.ischema_names["xml"] is now defined.
1207-
# =============================================================================
12081273

12091274

12101275
# =============================================================================
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# EXPLORATORY CODE ONLY.
2+
#
3+
# PROBLEM: Take a SQL string fragment representing a column type (e.g.
4+
# "VARCHAR(32)", "STRING") and an SQLAlchemy dialect (a core one like mysql or
5+
# sqlite, or a third-party one like databricks), and return the appropriate
6+
# SQLAlchemy type as a TypeEngine class/instance.
7+
#
8+
# CURRENT IMPLEMENTATION:
9+
# cardinal_pythonlib.sqlalchemy.schema.get_sqla_coltype_from_dialect_str()
10+
# ... with its sub-function, _get_sqla_coltype_class_from_str()
11+
#
12+
# DISCUSSION AT: https://github.com/sqlalchemy/sqlalchemy/discussions/12230
13+
14+
15+
# For exploring some files directly:
16+
from sqlalchemy.inspection import inspect # noqa: F401
17+
import sqlalchemy.dialects.sqlite.base # noqa: F401
18+
import sqlalchemy.dialects.sqlite.pysqlite # noqa: F401
19+
20+
# Test code for dialects:
21+
from sqlalchemy.engine.default import DefaultDialect
22+
from sqlalchemy.dialects.mssql import dialect as MSSQLDialect
23+
from sqlalchemy.dialects.mysql import dialect as MySQLDialect
24+
from sqlalchemy.dialects.postgresql import dialect as PostgreSQLDialect
25+
from sqlalchemy.dialects.sqlite import dialect as SQLiteDialect
26+
27+
# Third-party dialect
28+
from databricks.sqlalchemy import DatabricksDialect
29+
30+
# Create instances to explore:
31+
default_dialect = DefaultDialect()
32+
postgresql_dialect = PostgreSQLDialect()
33+
mssql_dialect = MSSQLDialect()
34+
mysql_dialect = MySQLDialect()
35+
sqlite_dialect = SQLiteDialect()
36+
databricks_dialect = DatabricksDialect()
37+
38+
print(sqlite_dialect.ischema_names)
39+
40+
# The native ones all have an "ischema_names" dictionary, apart from
41+
# DefaultDialect. The Databricks one doesn't.
42+
43+
# The way SQLAlchemy does this for real is via an Inspector, which passes on
44+
# to the Dialect.
45+
# Inspector: https://docs.sqlalchemy.org/en/20/core/reflection.html#sqlalchemy.engine.reflection.Inspector # noqa: E501
46+
# Engine: https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine # noqa: E501
47+
# Dialect: https://docs.sqlalchemy.org/en/20/core/internals.html#sqlalchemy.engine.Dialect  # noqa: E501
48+
# ... get_columns()
49+
# ... type_descriptor(), converts a generic SQLA type to a dialect-specific type.
50+
# DefaultDialect: https://docs.sqlalchemy.org/en/20/core/internals.html#sqlalchemy.engine.default.DefaultDialect  # noqa: E501
51+
52+
# I can't find a generic method. See discussion above: there isn't one.

0 commit comments

Comments
 (0)