Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fix date/time formatting for Snowflake, Postgres and BigQuery
  • Loading branch information
teraamp committed Jan 15, 2024
commit 54d6f2c069c4ab64e8fab8f58a05ba16ef89528f
5 changes: 5 additions & 0 deletions data_diff/abcs/database_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ class Date(TemporalType):
pass


@attrs.define(frozen=True)
class Time(TemporalType):
    """Temporal column type used for SQL ``TIME`` columns.

    Declares no fields of its own; it is a distinct subclass of
    ``TemporalType`` so that code can single out time columns with an
    ``isinstance`` check.
    """


@attrs.define(frozen=True)
class NumericType(ColType):
# 'precision' signifies how many fractional digits (after the dot) we want to compare
Expand Down
19 changes: 19 additions & 0 deletions data_diff/databases/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
TemporalType,
Boolean,
UnknownColType,
Time,
Date,
)
from data_diff.databases.base import (
BaseDialect,
Expand Down Expand Up @@ -63,6 +65,8 @@ class Dialect(BaseDialect):
# Dates
"TIMESTAMP": Timestamp,
"DATETIME": Datetime,
"DATE": Date,
"TIME": Time,
# Numbers
"INT64": Integer,
"INT32": Integer,
Expand Down Expand Up @@ -160,6 +164,21 @@ def md5_as_hex(self, s: str) -> str:
return f"md5({s})"

def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
try:
is_date = coltype.is_date
is_time = coltype.is_time
except:
is_date = False
is_time = False
if isinstance(coltype, Date) or is_date:
return f"FORMAT_DATE('%F', {value})"
if isinstance(coltype, Time) or is_time:
microseconds = f"TIME_DIFF( {value}, cast('00:00:00' as time), microsecond)"
rounded = f"ROUND({microseconds}, -6 + {coltype.precision})"
time_value = f"TIME_ADD(cast('00:00:00' as time), interval cast({rounded} as int64) microsecond)"
converted = f"FORMAT_TIME('%H:%M:%E6S', {time_value})"
return converted

if coltype.rounds:
timestamp = f"timestamp_micros(cast(round(unix_micros(cast({value} as timestamp))/1000000, {coltype.precision})*1000000 as int))"
return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {timestamp})"
Expand Down
4 changes: 3 additions & 1 deletion data_diff/databases/mssql.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
)
from data_diff.abcs.database_types import (
JSON,
Date,
NumericType,
Timestamp,
TimestampTZ,
Expand All @@ -25,6 +24,8 @@
Native_UUID,
Text,
Boolean,
Date,
Time
)


Expand All @@ -48,6 +49,7 @@ class Dialect(BaseDialect):
"datetime2": Timestamp,
"smalldatetime": Timestamp,
"date": Date,
"time": Time,
# Numbers
"float": Float,
"real": Float,
Expand Down
20 changes: 20 additions & 0 deletions data_diff/databases/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
FractionalType,
Boolean,
Date,
Time
)
from data_diff.databases.base import BaseDialect, ThreadedDatabase, import_helper, ConnectError
from data_diff.databases.base import (
Expand Down Expand Up @@ -57,6 +58,8 @@ class PostgresqlDialect(BaseDialect):
"timestamp without time zone": Timestamp,
"timestamp": Timestamp,
"date": Date,
"time with time zone": Time,
"time without time zone": Time,
# Numbers
"double precision": Float,
"real": Float,
Expand Down Expand Up @@ -111,6 +114,23 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
def _add_padding(coltype: TemporalType, timestamp6: str):
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"

try:
is_date = coltype.is_date
is_time = coltype.is_time
except:
is_date = False
is_time = False

if isinstance(coltype, Date) or is_date:
return f"cast({value} as varchar)"

if isinstance(coltype, Time) or is_time:
seconds = f"EXTRACT( epoch from {value})"
rounded = f"ROUND({seconds}, {coltype.precision})"
time_value = f"CAST('00:00:00' as time) + make_interval(0, 0, 0, 0, 0, 0, {rounded})" # 6th arg = seconds
converted = f"to_char({time_value}, 'hh24:mi:ss.ff6')"
return converted

if coltype.rounds:
# NULL value expected to return NULL after normalization
null_case_begin = f"CASE WHEN {value} IS NULL THEN NULL ELSE "
Expand Down
10 changes: 10 additions & 0 deletions data_diff/databases/snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
DbPath,
Boolean,
Date,
Time,
)
from data_diff.databases.base import (
BaseDialect,
Expand Down Expand Up @@ -45,6 +46,7 @@ class Dialect(BaseDialect):
"TIMESTAMP_LTZ": Timestamp,
"TIMESTAMP_TZ": TimestampTZ,
"DATE": Date,
"TIME": Time,
# Numbers
"NUMBER": Decimal,
"FLOAT": Float,
Expand Down Expand Up @@ -83,10 +85,18 @@ def md5_as_hex(self, s: str) -> str:
def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
try:
is_date = coltype.is_date
is_time = coltype.is_time
except:
is_date = False
is_time = False
if isinstance(coltype, Date) or is_date:
return f"({value}::varchar)"
elif isinstance(coltype, Time) or is_time:
microseconds = f"TIMEDIFF(microsecond, cast('00:00:00' as time), {value})"
rounded = f"round({microseconds}, -6 + {coltype.precision})"
time_value = f"TIMEADD(microsecond, {rounded}, cast('00:00:00' as time))"
converted = f"TO_VARCHAR({time_value}, 'HH24:MI:SS.FF6')"
return converted

if coltype.rounds:
timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, convert_timezone('UTC', {value})::timestamp(9))/1000000000, {coltype.precision}))"
Expand Down