-
Notifications
You must be signed in to change notification settings - Fork 139
Feat: Update partitioning by DATE, DATETIME, TIMESTAMP, _PARTITIONDATE #1113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
42f9778
12c0b83
d64e0cf
bb84d95
dffd707
6eb2656
a1375a8
f88e472
611922d
0e9c3b8
2d25e61
a989a74
2229784
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -104,111 +104,161 @@ def test_table_clustering_fields_dialect_option_type_error(faux_conn): | |
) | ||
|
||
|
||
def test_table_time_partitioning_dialect_option(faux_conn): | ||
# expect table creation to fail as SQLite does not support partitioned tables | ||
with pytest.raises(sqlite3.OperationalError): | ||
setup_table( | ||
faux_conn, | ||
"some_table", | ||
sqlalchemy.Column("id", sqlalchemy.Integer), | ||
sqlalchemy.Column("createdAt", sqlalchemy.DateTime), | ||
bigquery_time_partitioning=TimePartitioning(), | ||
) | ||
@pytest.mark.parametrize( | ||
"column_dtype,time_partitioning_type,func_name", | ||
[ | ||
# DATE dtype | ||
pytest.param( | ||
sqlalchemy.DATE, | ||
TimePartitioningType.HOUR, # Only MONTH/YEAR are permitted in BigQuery | ||
"DATE_TRUNC", | ||
marks=pytest.mark.xfail, | ||
), | ||
pytest.param( | ||
sqlalchemy.DATE, | ||
TimePartitioningType.DAY, # Only MONTH/YEAR are permitted in BigQuery | ||
"DATE_TRUNC", | ||
marks=pytest.mark.xfail, | ||
), | ||
(sqlalchemy.DATE, TimePartitioningType.MONTH, "DATE_TRUNC"), | ||
(sqlalchemy.DATE, TimePartitioningType.YEAR, "DATE_TRUNC"), | ||
# TIMESTAMP dtype | ||
(sqlalchemy.TIMESTAMP, TimePartitioningType.HOUR, "TIMESTAMP_TRUNC"), | ||
(sqlalchemy.TIMESTAMP, TimePartitioningType.DAY, "TIMESTAMP_TRUNC"), | ||
(sqlalchemy.TIMESTAMP, TimePartitioningType.MONTH, "TIMESTAMP_TRUNC"), | ||
(sqlalchemy.TIMESTAMP, TimePartitioningType.YEAR, "TIMESTAMP_TRUNC"), | ||
# DATETIME dtype | ||
(sqlalchemy.DATETIME, TimePartitioningType.HOUR, "DATETIME_TRUNC"), | ||
(sqlalchemy.DATETIME, TimePartitioningType.DAY, "DATETIME_TRUNC"), | ||
(sqlalchemy.DATETIME, TimePartitioningType.MONTH, "DATETIME_TRUNC"), | ||
(sqlalchemy.DATETIME, TimePartitioningType.YEAR, "DATETIME_TRUNC"), | ||
# TimePartitioning.type_ == None | ||
(sqlalchemy.DATETIME, None, "DATETIME_TRUNC"), | ||
], | ||
) | ||
def test_table_time_partitioning_given_field_and_type__dialect_options( | ||
faux_conn, column_dtype, time_partitioning_type, func_name | ||
): | ||
"""NOTE: Expect table creation to fail as SQLite does not support | ||
partitioned tables, despite that, we are still able to test the generation | ||
of SQL statements. | ||
|
||
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( | ||
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" | ||
" PARTITION BY DATE_TRUNC(_PARTITIONDATE, DAY)" | ||
) | ||
Each parametrization ensures that the appropriate function is generated | ||
depending on whether the column datatype is DATE, TIMESTAMP, DATETIME and | ||
whether the TimePartitioningType is HOUR, DAY, MONTH, YEAR. | ||
|
||
`DATE_TRUNC` only returns a result if TimePartitioningType is DAY, MONTH, | ||
YEAR. BigQuery cannot partition on DATE by HOUR, so that is expected to | ||
xfail. The DATE/DAY case is also marked xfail (see the parametrization). | ||
|
||
def test_table_require_partition_filter_dialect_option(faux_conn): | ||
# expect table creation to fail as SQLite does not support partitioned tables | ||
with pytest.raises(sqlite3.OperationalError): | ||
setup_table( | ||
faux_conn, | ||
"some_table", | ||
sqlalchemy.Column("createdAt", sqlalchemy.DateTime), | ||
bigquery_time_partitioning=TimePartitioning(field="createdAt"), | ||
bigquery_require_partition_filter=True, | ||
) | ||
A distinguishing characteristic of this test is we provide an argument to | ||
the TimePartitioning class for both field and type_. | ||
|
||
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( | ||
"CREATE TABLE `some_table` ( `createdAt` DATETIME )" | ||
" PARTITION BY DATE_TRUNC(createdAt, DAY)" | ||
" OPTIONS(require_partition_filter=true)" | ||
) | ||
Special case: IF time_partitioning_type is None, the __init__() in the | ||
TimePartitioning class will overwrite it with TimePartitioningType.DAY as | ||
the default. | ||
""" | ||
|
||
if time_partitioning_type is None: | ||
time_partitioning_type = TimePartitioningType.DAY | ||
|
||
def test_table_time_partitioning_with_field_dialect_option(faux_conn): | ||
# expect table creation to fail as SQLite does not support partitioned tables | ||
with pytest.raises(sqlite3.OperationalError): | ||
setup_table( | ||
faux_conn, | ||
"some_table", | ||
sqlalchemy.Column("id", sqlalchemy.Integer), | ||
sqlalchemy.Column("createdAt", sqlalchemy.DateTime), | ||
bigquery_time_partitioning=TimePartitioning(field="createdAt"), | ||
sqlalchemy.Column("createdAt", column_dtype), | ||
bigquery_time_partitioning=TimePartitioning( | ||
field="createdAt", type_=time_partitioning_type | ||
), | ||
) | ||
|
||
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( | ||
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" | ||
" PARTITION BY DATE_TRUNC(createdAt, DAY)" | ||
result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) | ||
expected = ( | ||
f"CREATE TABLE `some_table` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )" | ||
f" PARTITION BY {func_name}(createdAt, {time_partitioning_type})" | ||
) | ||
assert result == expected | ||
|
||
|
||
def test_table_time_partitioning_by_month_dialect_option(faux_conn): | ||
# expect table creation to fail as SQLite does not support partitioned tables | ||
with pytest.raises(sqlite3.OperationalError): | ||
setup_table( | ||
faux_conn, | ||
"some_table", | ||
sqlalchemy.Column("id", sqlalchemy.Integer), | ||
sqlalchemy.Column("createdAt", sqlalchemy.DateTime), | ||
bigquery_time_partitioning=TimePartitioning( | ||
field="createdAt", | ||
type_=TimePartitioningType.MONTH, | ||
), | ||
) | ||
def test_table_time_partitioning_given_field_but_no_type__dialect_option(faux_conn): | ||
"""Expect table creation to fail as SQLite does not support partitioned tables | ||
|
||
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( | ||
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" | ||
" PARTITION BY DATE_TRUNC(createdAt, MONTH)" | ||
) | ||
Confirms that if the column datatype is DATETIME but no TimePartitioning.type_ | ||
has been supplied, the system will default to DAY. | ||
|
||
A distinguishing characteristic of this test is we provide an argument to | ||
the TimePartitioning class for field but not type_. | ||
""" | ||
|
||
def test_table_time_partitioning_with_timestamp_dialect_option(faux_conn): | ||
# expect table creation to fail as SQLite does not support partitioned tables | ||
with pytest.raises(sqlite3.OperationalError): | ||
setup_table( | ||
faux_conn, | ||
"some_table", | ||
sqlalchemy.Column("id", sqlalchemy.Integer), | ||
sqlalchemy.Column("createdAt", sqlalchemy.TIMESTAMP), | ||
sqlalchemy.Column("createdAt", sqlalchemy.DateTime), | ||
bigquery_time_partitioning=TimePartitioning(field="createdAt"), | ||
) | ||
|
||
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( | ||
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` TIMESTAMP )" | ||
" PARTITION BY TIMESTAMP_TRUNC(createdAt, DAY)" | ||
result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It seems the second half of this test does the following, as the docstring suggests:
Maybe I'm missing something, but I think it's a duplicate of the test case at line 136. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The difference between this test and the parametrized test above is minor. Essentially, in the parameters on line 136 we explicitly provide a value of `None`. We know the default is `DAY`. In line 193, we do not set the value of `type_`. |
||
expected = ( | ||
"CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" | ||
" PARTITION BY DATETIME_TRUNC(createdAt, DAY)" | ||
) | ||
assert result == expected | ||
|
||
|
||
def test_table_time_partitioning_with_date_dialect_option(faux_conn): | ||
# expect table creation to fail as SQLite does not support partitioned tables | ||
@pytest.mark.parametrize( | ||
"column_dtype,time_partitioning_type", | ||
[ | ||
pytest.param( | ||
sqlalchemy.DATE, | ||
TimePartitioningType.HOUR, | ||
marks=pytest.mark.xfail, | ||
), | ||
(sqlalchemy.DATE, TimePartitioningType.DAY), | ||
(sqlalchemy.DATE, TimePartitioningType.MONTH), | ||
(sqlalchemy.DATE, TimePartitioningType.YEAR), | ||
], | ||
) | ||
def test_table_time_partitioning_given_type__but_no_field_dialect_option( | ||
faux_conn, | ||
column_dtype, | ||
time_partitioning_type, | ||
): | ||
"""NOTE: Expect table creation to fail as SQLite does not support | ||
partitioned tables, despite that, we are still able to test the generation | ||
of SQL statements | ||
|
||
If the `field` argument to TimePartitioning() is not provided, it defaults to | ||
None. That causes the pseudocolumn "_PARTITIONDATE" to be used by default as | ||
the column to partition by. | ||
|
||
_PARTITIONDATE only returns a result if TimePartitioningType is DAY, MONTH, | ||
YEAR. BigQuery cannot partition on _PARTITIONDATE by HOUR, so that is | ||
expected to xfail. | ||
|
||
A distinguishing characteristic of this test is we provide an argument to | ||
the TimePartitioning class for type_ but not field. | ||
""" | ||
|
||
with pytest.raises(sqlite3.OperationalError): | ||
setup_table( | ||
faux_conn, | ||
"some_table_2", | ||
sqlalchemy.Column("id", sqlalchemy.Integer), | ||
sqlalchemy.Column("createdAt", sqlalchemy.DATE), | ||
bigquery_time_partitioning=TimePartitioning(field="createdAt"), | ||
sqlalchemy.Column("createdAt", column_dtype), | ||
bigquery_time_partitioning=TimePartitioning(type_=time_partitioning_type), | ||
) | ||
|
||
# confirm that the following code creates the correct SQL string | ||
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( | ||
"CREATE TABLE `some_table_2` ( `id` INT64, `createdAt` DATE )" | ||
" PARTITION BY createdAt" | ||
result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In this test, we do NOT provide a `field` argument. This is needed to confirm that in the absence of a `field`, the pseudocolumn `_PARTITIONDATE` is used as the column to partition by. Both the tests above end up including a truncation function but this one does not (and should not). |
||
|
||
# We need two versions of expected depending on whether we use _PARTITIONDATE | ||
expected = ( | ||
f"CREATE TABLE `some_table_2` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )" | ||
f" PARTITION BY _PARTITIONDATE" | ||
) | ||
assert result == expected | ||
|
||
|
||
def test_table_time_partitioning_dialect_option_partition_expiration_days(faux_conn): | ||
|
@@ -227,7 +277,7 @@ def test_table_time_partitioning_dialect_option_partition_expiration_days(faux_c | |
|
||
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( | ||
"CREATE TABLE `some_table` ( `createdAt` DATETIME )" | ||
" PARTITION BY DATE_TRUNC(createdAt, DAY)" | ||
" PARTITION BY DATETIME_TRUNC(createdAt, DAY)" | ||
" OPTIONS(partition_expiration_days=0.25)" | ||
) | ||
|
||
|
@@ -400,13 +450,16 @@ def test_table_all_dialect_option(faux_conn): | |
), | ||
) | ||
|
||
assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( | ||
result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) | ||
expected = ( | ||
"CREATE TABLE `some_table` ( `id` INT64, `country` STRING, `town` STRING, `createdAt` DATETIME )" | ||
" PARTITION BY DATE_TRUNC(createdAt, DAY)" | ||
" PARTITION BY DATETIME_TRUNC(createdAt, DAY)" | ||
" CLUSTER BY country, town" | ||
" OPTIONS(partition_expiration_days=30.0, expiration_timestamp=TIMESTAMP '2038-01-01 00:00:00+00:00', require_partition_filter=true, default_rounding_mode='ROUND_HALF_EVEN')" | ||
) | ||
|
||
assert result == expected | ||
|
||
|
||
def test_validate_friendly_name_value_type(ddl_compiler): | ||
# expect option value to be transformed as a string expression | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess this would be added when you complete the draft, I think we need to set the default value of `partitioning_period` somewhere.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Linchin
I am unclear on why we might need to create a default value of `partitioning_period`. It is set based on the value of `time_partitioning.type_` on line 873.
`time_partitioning.type_` is set in the `python-bigquery` library in the TimePartitioning class.
When that class is instantiated, the default for `type_` in the `__init__` is `None` but it is then processed and immediately overwritten by `DAY` as shown at the link above.
So by the time `.type_` gets to us, it will either be `DAY` or will be set to something else that we use to set the variable named `partitioning_period`.
NOTE: I rename the variable to `partitioning_period` simply to make this code more readable. `type_` seemed less intuitive than `partitioning_period`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you, I missed the part in `__init__()` that sets the default value.