Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Fix date and time formatting #854

Merged
merged 3 commits into from
Jan 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions data_diff/abcs/database_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ class Date(TemporalType):
pass


@attrs.define(frozen=True)
class Time(TemporalType):
    """Temporal column type for time-of-day columns.

    Database dialects map their native time types (e.g. ``TIME``) to this
    class. It adds no fields or behavior of its own beyond ``TemporalType``;
    it exists so that callers can single it out with ``isinstance`` checks.
    """


@attrs.define(frozen=True)
class NumericType(ColType):
# 'precision' signifies how many fractional digits (after the dot) we want to compare
Expand Down
19 changes: 19 additions & 0 deletions data_diff/databases/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
TemporalType,
Boolean,
UnknownColType,
Time,
Date,
)
from data_diff.databases.base import (
BaseDialect,
Expand Down Expand Up @@ -63,6 +65,8 @@ class Dialect(BaseDialect):
# Dates
"TIMESTAMP": Timestamp,
"DATETIME": Datetime,
"DATE": Date,
"TIME": Time,
# Numbers
"INT64": Integer,
"INT32": Integer,
Expand Down Expand Up @@ -160,6 +164,21 @@ def md5_as_hex(self, s: str) -> str:
return f"md5({s})"

def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
try:
is_date = coltype.is_date
is_time = coltype.is_time
except:
is_date = False
is_time = False
if isinstance(coltype, Date) or is_date:
return f"FORMAT_DATE('%F', {value})"
if isinstance(coltype, Time) or is_time:
microseconds = f"TIME_DIFF( {value}, cast('00:00:00' as time), microsecond)"
rounded = f"ROUND({microseconds}, -6 + {coltype.precision})"
time_value = f"TIME_ADD(cast('00:00:00' as time), interval cast({rounded} as int64) microsecond)"
converted = f"FORMAT_TIME('%H:%M:%E6S', {time_value})"
return converted

if coltype.rounds:
timestamp = f"timestamp_micros(cast(round(unix_micros(cast({value} as timestamp))/1000000, {coltype.precision})*1000000 as int))"
return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {timestamp})"
Expand Down
7 changes: 6 additions & 1 deletion data_diff/databases/databricks.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,12 @@ def md5_as_hex(self, s: str) -> str:

def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
"""Databricks timestamp contains no more than 6 digits in precision"""

try:
is_date = coltype.is_date
except:
is_date = False
if isinstance(coltype, Date) or is_date:
return f"date_format({value}, 'yyyy-MM-dd')"
if coltype.rounds:
# cast to timestamp due to unix_micros() requiring timestamp
timestamp = f"cast(round(unix_micros(cast({value} as timestamp)) / 1000000, {coltype.precision}) * 1000000 as bigint)"
Expand Down
4 changes: 3 additions & 1 deletion data_diff/databases/mssql.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
)
from data_diff.abcs.database_types import (
JSON,
Date,
NumericType,
Timestamp,
TimestampTZ,
Expand All @@ -25,6 +24,8 @@
Native_UUID,
Text,
Boolean,
Date,
Time
)


Expand All @@ -48,6 +49,7 @@ class Dialect(BaseDialect):
"datetime2": Timestamp,
"smalldatetime": Timestamp,
"date": Date,
"time": Time,
# Numbers
"float": Float,
"real": Float,
Expand Down
20 changes: 20 additions & 0 deletions data_diff/databases/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
FractionalType,
Boolean,
Date,
Time
)
from data_diff.databases.base import BaseDialect, ThreadedDatabase, import_helper, ConnectError
from data_diff.databases.base import (
Expand Down Expand Up @@ -57,6 +58,8 @@ class PostgresqlDialect(BaseDialect):
"timestamp without time zone": Timestamp,
"timestamp": Timestamp,
"date": Date,
"time with time zone": Time,
"time without time zone": Time,
# Numbers
"double precision": Float,
"real": Float,
Expand Down Expand Up @@ -111,6 +114,23 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
def _add_padding(coltype: TemporalType, timestamp6: str):
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"

try:
is_date = coltype.is_date
is_time = coltype.is_time
except:
is_date = False
is_time = False

if isinstance(coltype, Date) or is_date:
return f"cast({value} as varchar)"

if isinstance(coltype, Time) or is_time:
seconds = f"EXTRACT( epoch from {value})"
rounded = f"ROUND({seconds}, {coltype.precision})"
time_value = f"CAST('00:00:00' as time) + make_interval(0, 0, 0, 0, 0, 0, {rounded})" # 7th arg (secs) = seconds; years..mins are all 0
converted = f"to_char({time_value}, 'hh24:mi:ss.ff6')"
return converted

if coltype.rounds:
# NULL value expected to return NULL after normalization
null_case_begin = f"CASE WHEN {value} IS NULL THEN NULL ELSE "
Expand Down
17 changes: 17 additions & 0 deletions data_diff/databases/snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
DbPath,
Boolean,
Date,
Time,
)
from data_diff.databases.base import (
BaseDialect,
Expand Down Expand Up @@ -45,6 +46,7 @@ class Dialect(BaseDialect):
"TIMESTAMP_LTZ": Timestamp,
"TIMESTAMP_TZ": TimestampTZ,
"DATE": Date,
"TIME": Time,
# Numbers
"NUMBER": Decimal,
"FLOAT": Float,
Expand Down Expand Up @@ -81,6 +83,21 @@ def md5_as_hex(self, s: str) -> str:
return f"md5({s})"

def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
try:
is_date = coltype.is_date
is_time = coltype.is_time
except:
is_date = False
is_time = False
if isinstance(coltype, Date) or is_date:
return f"({value}::varchar)"
elif isinstance(coltype, Time) or is_time:
microseconds = f"TIMEDIFF(microsecond, cast('00:00:00' as time), {value})"
rounded = f"round({microseconds}, -6 + {coltype.precision})"
time_value = f"TIMEADD(microsecond, {rounded}, cast('00:00:00' as time))"
converted = f"TO_VARCHAR({time_value}, 'HH24:MI:SS.FF6')"
return converted

if coltype.rounds:
timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, convert_timezone('UTC', {value})::timestamp(9))/1000000000, {coltype.precision}))"
else:
Expand Down