Skip to content

Commit 413cd24

Browse files
chalmerlowerenovate-botgcf-owl-bot[bot]parthea
authored
Feat: Update partitioning by DATE, DATETIME, TIMESTAMP, _PARTITIONDATE (#1113)
* adds additional functionality to cover more partitioning capability * Updates the partitioning algorithm and tests * Updates special case and tests * Updates test in possible effort to increase coverage. * Tweaks the conditionals in time partitioning process * Updates linting * chore(deps): update all dependencies (#1136) * chore(deps): update all dependencies * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Update protobuf to 5.28.3 * pin google-crc32c for python 3.7/3.8 * Pin mako for python 3.8 * Pin markupsafe for Python 3.8 * Pin pyparsing for python 3.8 * Pin pyparsing for Python 3.8 --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Anthonios Partheniou <partheniou@google.com> * chore(deps): update all dependencies (#1140) * chore(deps): update all dependencies * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> * Removes duplicate test --------- Co-authored-by: Mend Renovate <bot@renovateapp.com> Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Anthonios Partheniou <partheniou@google.com>
1 parent fb8f009 commit 413cd24

File tree

3 files changed

+210
-91
lines changed

3 files changed

+210
-91
lines changed

sqlalchemy_bigquery/base.py

Lines changed: 77 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -812,42 +812,99 @@ def _raise_for_type(self, option, value, expected_type):
812812
)
813813

814814
def _process_time_partitioning(
815-
self, table: Table, time_partitioning: TimePartitioning
815+
self,
816+
table: Table,
817+
time_partitioning: TimePartitioning,
816818
):
817819
"""
818-
Generates a SQL 'PARTITION BY' clause for partitioning a table by a date or timestamp.
820+
Generates a SQL 'PARTITION BY' clause for partitioning a table,
819821
820822
Args:
821-
- table (Table): The SQLAlchemy table object representing the BigQuery table to be partitioned.
823+
- table (Table): The SQLAlchemy table object representing the BigQuery
824+
table to be partitioned.
822825
- time_partitioning (TimePartitioning): The time partitioning details,
823826
including the field to be used for partitioning.
824827
825828
Returns:
826-
- str: A SQL 'PARTITION BY' clause that uses either TIMESTAMP_TRUNC or DATE_TRUNC to
827-
partition data on the specified field.
829+
- str: A SQL 'PARTITION BY' clause.
828830
829831
Example:
830-
- Given a table with a TIMESTAMP type column 'event_timestamp' and setting
831-
'time_partitioning.field' to 'event_timestamp', the function returns
832+
- Given a table with an 'event_timestamp' and setting time_partitioning.type
833+
as DAY and by setting 'time_partitioning.field' as 'event_timestamp', the
834+
function returns:
832835
"PARTITION BY TIMESTAMP_TRUNC(event_timestamp, DAY)".
836+
837+
Current inputs allowed by BQ and covered by this function include:
838+
* _PARTITIONDATE
839+
* DATETIME_TRUNC(<datetime_column>, DAY/HOUR/MONTH/YEAR)
840+
* TIMESTAMP_TRUNC(<timestamp_column>, DAY/HOUR/MONTH/YEAR)
841+
* DATE_TRUNC(<date_column>, MONTH/YEAR)
842+
843+
Additional options allowed by BQ but not explicitly covered by this
844+
function include:
845+
* DATE(_PARTITIONTIME)
846+
* DATE(<timestamp_column>)
847+
* DATE(<datetime_column>)
848+
* DATE column
833849
"""
834-
field = "_PARTITIONDATE"
835-
trunc_fn = "DATE_TRUNC"
836850

851+
sqltypes = {
852+
"_PARTITIONDATE": ("_PARTITIONDATE", None),
853+
"TIMESTAMP": ("TIMESTAMP_TRUNC", {"DAY", "HOUR", "MONTH", "YEAR"}),
854+
"DATETIME": ("DATETIME_TRUNC", {"DAY", "HOUR", "MONTH", "YEAR"}),
855+
"DATE": ("DATE_TRUNC", {"MONTH", "YEAR"}),
856+
}
857+
858+
# Extract field (i.e <column_name> or _PARTITIONDATE)
859+
# AND extract the name of the column_type (i.e. "TIMESTAMP", "DATE",
860+
# "DATETIME", "_PARTITIONDATE")
837861
if time_partitioning.field is not None:
838862
field = time_partitioning.field
839-
if isinstance(
840-
table.columns[time_partitioning.field].type,
841-
sqlalchemy.sql.sqltypes.DATE,
842-
):
843-
return f"PARTITION BY {field}"
844-
elif isinstance(
845-
table.columns[time_partitioning.field].type,
846-
sqlalchemy.sql.sqltypes.TIMESTAMP,
847-
):
848-
trunc_fn = "TIMESTAMP_TRUNC"
863+
column_type = table.columns[field].type.__visit_name__.upper()
864+
865+
else:
866+
field = "_PARTITIONDATE"
867+
column_type = "_PARTITIONDATE"
868+
869+
# Extract time_partitioning.type_ (DAY, HOUR, MONTH, YEAR)
870+
# i.e. generates one partition per type (1/DAY, 1/HOUR)
871+
# NOTE: if time_partitioning.type_ == None, it gets
872+
# immediately overwritten by python-bigquery to a default of DAY.
873+
partitioning_period = time_partitioning.type_
874+
875+
# Extract the truncation_function (i.e. DATE_TRUNC)
876+
# and the set of allowable partition_periods
877+
# that can be used in that function
878+
trunc_fn, allowed_partitions = sqltypes[column_type]
879+
880+
# Create output:
881+
# Special Case: _PARTITIONDATE does NOT use a function or partitioning_period
882+
if trunc_fn == "_PARTITIONDATE":
883+
return f"PARTITION BY {field}"
884+
885+
# Special Case: BigQuery will not accept DAY as partitioning_period for
886+
# DATE_TRUNC.
887+
# However, the default argument in python-bigquery for TimePartioning
888+
# is DAY. This case overwrites that to avoid making a breaking change in
889+
# python-bigquery.
890+
# https://github.com/googleapis/python-bigquery/blob/a4d9534a900f13ae7355904cda05097d781f27e3/google/cloud/bigquery/table.py#L2916
891+
if trunc_fn == "DATE_TRUNC" and partitioning_period == "DAY":
892+
raise ValueError(
893+
"The TimePartitioning.type_ must be one of: "
894+
f"{allowed_partitions}, received {partitioning_period}."
895+
"NOTE: the `default` value for TimePartioning.type_ as set in "
896+
"python-bigquery is 'DAY', if you wish to use 'DATE_TRUNC' "
897+
"ensure that you overwrite the default TimePartitioning.type_. "
898+
)
899+
900+
# Generic Case
901+
if partitioning_period not in allowed_partitions:
902+
raise ValueError(
903+
"The TimePartitioning.type_ must be one of: "
904+
f"{allowed_partitions}, received {partitioning_period}."
905+
)
849906

850-
return f"PARTITION BY {trunc_fn}({field}, {time_partitioning.type_})"
907+
return f"PARTITION BY {trunc_fn}({field}, {partitioning_period})"
851908

852909
def _process_range_partitioning(
853910
self, table: Table, range_partitioning: RangePartitioning

tests/system/test_sqlalchemy_bigquery.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,16 @@ def test_dml(engine, session, table_dml):
561561
assert len(result) == 0
562562

563563

564-
@pytest.mark.parametrize("time_partitioning_field", ["timestamp_c", "date_c"])
564+
@pytest.mark.parametrize(
565+
"time_partitioning_field",
566+
[
567+
("timestamp_c"),
568+
("datetime_c"),
569+
# Fails because python-bigquery TimePartitioning.type_ defaults to "DAY", but
570+
# the DATE_TRUNC() function only allows "MONTH"/"YEAR"
571+
pytest.param("date_c", marks=[pytest.mark.xfail]),
572+
],
573+
)
565574
def test_create_table(engine, bigquery_dataset, time_partitioning_field):
566575
meta = MetaData()
567576
Table(

tests/unit/test_table_options.py

Lines changed: 123 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -104,111 +104,161 @@ def test_table_clustering_fields_dialect_option_type_error(faux_conn):
104104
)
105105

106106

107-
def test_table_time_partitioning_dialect_option(faux_conn):
108-
# expect table creation to fail as SQLite does not support partitioned tables
109-
with pytest.raises(sqlite3.OperationalError):
110-
setup_table(
111-
faux_conn,
112-
"some_table",
113-
sqlalchemy.Column("id", sqlalchemy.Integer),
114-
sqlalchemy.Column("createdAt", sqlalchemy.DateTime),
115-
bigquery_time_partitioning=TimePartitioning(),
116-
)
107+
@pytest.mark.parametrize(
108+
"column_dtype,time_partitioning_type,func_name",
109+
[
110+
# DATE dtype
111+
pytest.param(
112+
sqlalchemy.DATE,
113+
TimePartitioningType.HOUR, # Only MONTH/YEAR are permitted in BigQuery
114+
"DATE_TRUNC",
115+
marks=pytest.mark.xfail,
116+
),
117+
pytest.param(
118+
sqlalchemy.DATE,
119+
TimePartitioningType.DAY, # Only MONTH/YEAR are permitted in BigQuery
120+
"DATE_TRUNC",
121+
marks=pytest.mark.xfail,
122+
),
123+
(sqlalchemy.DATE, TimePartitioningType.MONTH, "DATE_TRUNC"),
124+
(sqlalchemy.DATE, TimePartitioningType.YEAR, "DATE_TRUNC"),
125+
# TIMESTAMP dtype
126+
(sqlalchemy.TIMESTAMP, TimePartitioningType.HOUR, "TIMESTAMP_TRUNC"),
127+
(sqlalchemy.TIMESTAMP, TimePartitioningType.DAY, "TIMESTAMP_TRUNC"),
128+
(sqlalchemy.TIMESTAMP, TimePartitioningType.MONTH, "TIMESTAMP_TRUNC"),
129+
(sqlalchemy.TIMESTAMP, TimePartitioningType.YEAR, "TIMESTAMP_TRUNC"),
130+
# DATETIME dtype
131+
(sqlalchemy.DATETIME, TimePartitioningType.HOUR, "DATETIME_TRUNC"),
132+
(sqlalchemy.DATETIME, TimePartitioningType.DAY, "DATETIME_TRUNC"),
133+
(sqlalchemy.DATETIME, TimePartitioningType.MONTH, "DATETIME_TRUNC"),
134+
(sqlalchemy.DATETIME, TimePartitioningType.YEAR, "DATETIME_TRUNC"),
135+
# TimePartitioning.type_ == None
136+
(sqlalchemy.DATETIME, None, "DATETIME_TRUNC"),
137+
],
138+
)
139+
def test_table_time_partitioning_given_field_and_type__dialect_options(
140+
faux_conn, column_dtype, time_partitioning_type, func_name
141+
):
142+
"""NOTE: Expect table creation to fail as SQLite does not support
143+
partitioned tables, despite that, we are still able to test the generation
144+
of SQL statements.
117145
118-
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == (
119-
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )"
120-
" PARTITION BY DATE_TRUNC(_PARTITIONDATE, DAY)"
121-
)
146+
Each parametrization ensures that the appropriate function is generated
147+
depending on whether the column datatype is DATE, TIMESTAMP, DATETIME and
148+
whether the TimePartitioningType is HOUR, DAY, MONTH, YEAR.
122149
150+
`DATE_TRUNC` only returns a result if TimePartitioningType is DAY, MONTH,
151+
YEAR. BigQuery cannot partition on DATE by HOUR, so that is expected to
152+
xfail.
123153
124-
def test_table_require_partition_filter_dialect_option(faux_conn):
125-
# expect table creation to fail as SQLite does not support partitioned tables
126-
with pytest.raises(sqlite3.OperationalError):
127-
setup_table(
128-
faux_conn,
129-
"some_table",
130-
sqlalchemy.Column("createdAt", sqlalchemy.DateTime),
131-
bigquery_time_partitioning=TimePartitioning(field="createdAt"),
132-
bigquery_require_partition_filter=True,
133-
)
154+
A distinguishing characteristic of this test is we provide an argument to
155+
the TimePartitioning class for both field and type_.
134156
135-
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == (
136-
"CREATE TABLE `some_table` ( `createdAt` DATETIME )"
137-
" PARTITION BY DATE_TRUNC(createdAt, DAY)"
138-
" OPTIONS(require_partition_filter=true)"
139-
)
157+
Special case: IF time_partitioning_type is None, the __init__() in the
158+
TimePartitioning class will overwrite it with TimePartitioningType.DAY as
159+
the default.
160+
"""
140161

162+
if time_partitioning_type is None:
163+
time_partitioning_type = TimePartitioningType.DAY
141164

142-
def test_table_time_partitioning_with_field_dialect_option(faux_conn):
143-
# expect table creation to fail as SQLite does not support partitioned tables
144165
with pytest.raises(sqlite3.OperationalError):
145166
setup_table(
146167
faux_conn,
147168
"some_table",
148169
sqlalchemy.Column("id", sqlalchemy.Integer),
149-
sqlalchemy.Column("createdAt", sqlalchemy.DateTime),
150-
bigquery_time_partitioning=TimePartitioning(field="createdAt"),
170+
sqlalchemy.Column("createdAt", column_dtype),
171+
bigquery_time_partitioning=TimePartitioning(
172+
field="createdAt", type_=time_partitioning_type
173+
),
151174
)
152175

153-
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == (
154-
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )"
155-
" PARTITION BY DATE_TRUNC(createdAt, DAY)"
176+
result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split())
177+
expected = (
178+
f"CREATE TABLE `some_table` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )"
179+
f" PARTITION BY {func_name}(createdAt, {time_partitioning_type})"
156180
)
181+
assert result == expected
157182

158183

159-
def test_table_time_partitioning_by_month_dialect_option(faux_conn):
160-
# expect table creation to fail as SQLite does not support partitioned tables
161-
with pytest.raises(sqlite3.OperationalError):
162-
setup_table(
163-
faux_conn,
164-
"some_table",
165-
sqlalchemy.Column("id", sqlalchemy.Integer),
166-
sqlalchemy.Column("createdAt", sqlalchemy.DateTime),
167-
bigquery_time_partitioning=TimePartitioning(
168-
field="createdAt",
169-
type_=TimePartitioningType.MONTH,
170-
),
171-
)
184+
def test_table_time_partitioning_given_field_but_no_type__dialect_option(faux_conn):
185+
"""Expect table creation to fail as SQLite does not support partitioned tables
172186
173-
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == (
174-
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )"
175-
" PARTITION BY DATE_TRUNC(createdAt, MONTH)"
176-
)
187+
Confirms that if the column datatype is DATETIME but no TimePartitioning.type_
188+
has been supplied, the system will default to DAY.
177189
190+
A distinguishing characteristic of this test is we provide an argument to
191+
the TimePartitioning class for field but not type_.
192+
"""
178193

179-
def test_table_time_partitioning_with_timestamp_dialect_option(faux_conn):
180-
# expect table creation to fail as SQLite does not support partitioned tables
181194
with pytest.raises(sqlite3.OperationalError):
182195
setup_table(
183196
faux_conn,
184197
"some_table",
185198
sqlalchemy.Column("id", sqlalchemy.Integer),
186-
sqlalchemy.Column("createdAt", sqlalchemy.TIMESTAMP),
199+
sqlalchemy.Column("createdAt", sqlalchemy.DateTime),
187200
bigquery_time_partitioning=TimePartitioning(field="createdAt"),
188201
)
189-
190-
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == (
191-
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` TIMESTAMP )"
192-
" PARTITION BY TIMESTAMP_TRUNC(createdAt, DAY)"
202+
result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split())
203+
expected = (
204+
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )"
205+
" PARTITION BY DATETIME_TRUNC(createdAt, DAY)"
193206
)
207+
assert result == expected
194208

195209

196-
def test_table_time_partitioning_with_date_dialect_option(faux_conn):
197-
# expect table creation to fail as SQLite does not support partitioned tables
210+
@pytest.mark.parametrize(
211+
"column_dtype,time_partitioning_type",
212+
[
213+
pytest.param(
214+
sqlalchemy.DATE,
215+
TimePartitioningType.HOUR,
216+
marks=pytest.mark.xfail,
217+
),
218+
(sqlalchemy.DATE, TimePartitioningType.DAY),
219+
(sqlalchemy.DATE, TimePartitioningType.MONTH),
220+
(sqlalchemy.DATE, TimePartitioningType.YEAR),
221+
],
222+
)
223+
def test_table_time_partitioning_given_type__but_no_field_dialect_option(
224+
faux_conn,
225+
column_dtype,
226+
time_partitioning_type,
227+
):
228+
"""NOTE: Expect table creation to fail as SQLite does not support
229+
partitioned tables, despite that, we are still able to test the generation
230+
of SQL statements
231+
232+
If the `field` argument to TimePartitioning() is not provided, it defaults to
233+
None. That causes the pseudocolumn "_PARTITIONDATE" to be used by default as
234+
the column to partition by.
235+
236+
_PARTITIONTIME only returns a result if TimePartitioningType is DAY, MONTH,
237+
YEAR. BigQuery cannot partition on _PARTITIONDATE by HOUR, so that is
238+
expected to xfail.
239+
240+
A distinguishing characteristic of this test is we provide an argument to
241+
the TimePartitioning class for type_ but not field.
242+
"""
243+
198244
with pytest.raises(sqlite3.OperationalError):
199245
setup_table(
200246
faux_conn,
201247
"some_table_2",
202248
sqlalchemy.Column("id", sqlalchemy.Integer),
203-
sqlalchemy.Column("createdAt", sqlalchemy.DATE),
204-
bigquery_time_partitioning=TimePartitioning(field="createdAt"),
249+
sqlalchemy.Column("createdAt", column_dtype),
250+
bigquery_time_partitioning=TimePartitioning(type_=time_partitioning_type),
205251
)
206252

207253
# confirm that the following code creates the correct SQL string
208-
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == (
209-
"CREATE TABLE `some_table_2` ( `id` INT64, `createdAt` DATE )"
210-
" PARTITION BY createdAt"
254+
result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split())
255+
256+
# We need two versions of expected depending on whether we use _PARTITIONDATE
257+
expected = (
258+
f"CREATE TABLE `some_table_2` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )"
259+
f" PARTITION BY _PARTITIONDATE"
211260
)
261+
assert result == expected
212262

213263

214264
def test_table_time_partitioning_dialect_option_partition_expiration_days(faux_conn):
@@ -227,7 +277,7 @@ def test_table_time_partitioning_dialect_option_partition_expiration_days(faux_c
227277

228278
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == (
229279
"CREATE TABLE `some_table` ( `createdAt` DATETIME )"
230-
" PARTITION BY DATE_TRUNC(createdAt, DAY)"
280+
" PARTITION BY DATETIME_TRUNC(createdAt, DAY)"
231281
" OPTIONS(partition_expiration_days=0.25)"
232282
)
233283

@@ -400,13 +450,16 @@ def test_table_all_dialect_option(faux_conn):
400450
),
401451
)
402452

403-
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == (
453+
result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split())
454+
expected = (
404455
"CREATE TABLE `some_table` ( `id` INT64, `country` STRING, `town` STRING, `createdAt` DATETIME )"
405-
" PARTITION BY DATE_TRUNC(createdAt, DAY)"
456+
" PARTITION BY DATETIME_TRUNC(createdAt, DAY)"
406457
" CLUSTER BY country, town"
407458
" OPTIONS(partition_expiration_days=30.0, expiration_timestamp=TIMESTAMP '2038-01-01 00:00:00+00:00', require_partition_filter=true, default_rounding_mode='ROUND_HALF_EVEN')"
408459
)
409460

461+
assert result == expected
462+
410463

411464
def test_validate_friendly_name_value_type(ddl_compiler):
412465
# expect option value to be transformed as a string expression

0 commit comments

Comments
 (0)