Skip to content

Commit 8cb8ff2

Browse files
committed
add: timestamp validation
add: timestamp validation add: timestamp validation fix: module error
1 parent 65c1105 commit 8cb8ff2

File tree

6 files changed

+642
-8
lines changed

6 files changed

+642
-8
lines changed

dcs_core/core/datasource/sql_datasource.py

+280
Original file line numberDiff line numberDiff line change
@@ -724,3 +724,283 @@ def query_get_null_keyword_count(
724724
return round((result[0] / result[1]) * 100, 2) if result[1] > 0 else 0
725725

726726
return result[0] if result else 0
727+
728+
def query_timestamp_metric(
729+
self,
730+
table: str,
731+
field: str,
732+
operation: str,
733+
predefined_regex: str,
734+
filters: str = None,
735+
) -> Union[float, int]:
736+
"""
737+
:param table: Table name
738+
:param field: Column name
739+
:param operation: Metric operation ("count" or "percent")
740+
:param predefined_regex: regex pattern
741+
:param filters: filter condition
742+
:return: Tuple containing valid count and total count (or percentage)
743+
"""
744+
745+
qualified_table_name = self.qualified_table_name(table)
746+
747+
timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"
748+
749+
if predefined_regex == "timestamp_iso":
750+
regex_condition = f"{field} ~ '{timestamp_iso_regex}'"
751+
else:
752+
raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")
753+
754+
filters_clause = f"WHERE {filters}" if filters else ""
755+
756+
query = f"""
757+
WITH extracted_timestamps AS (
758+
SELECT
759+
{field},
760+
SUBSTRING({field} FROM '^(\d{{4}})') AS year, -- Extract year
761+
SUBSTRING({field} FROM '^\d{{4}}-(\d{{2}})') AS month, -- Extract month
762+
SUBSTRING({field} FROM '^\d{{4}}-\d{{2}}-(\d{{2}})') AS day, -- Extract day
763+
SUBSTRING({field} FROM 'T(\d{{2}})') AS hour, -- Extract hour
764+
SUBSTRING({field} FROM 'T\d{{2}}:(\d{{2}})') AS minute, -- Extract minute
765+
SUBSTRING({field} FROM 'T\d{{2}}:\d{{2}}:(\d{{2}})') AS second, -- Extract second
766+
SUBSTRING({field} FROM '([+-]\d{{2}}:\d{{2}}|Z)$') AS timezone -- Extract timezone
767+
FROM {qualified_table_name}
768+
{filters_clause}
769+
),
770+
validated_timestamps AS (
771+
SELECT
772+
{field},
773+
CASE
774+
WHEN
775+
-- Validate each component with its specific rules
776+
year ~ '^\d{{4}}$' AND
777+
month ~ '^(0[1-9]|1[0-2])$' AND
778+
day ~ '^((0[1-9]|[12][0-9])|(30|31))$' AND
779+
hour ~ '^([01][0-9]|2[0-3])$' AND
780+
minute ~ '^[0-5][0-9]$' AND
781+
second ~ '^[0-5][0-9]$' AND
782+
(timezone IS NULL OR timezone ~ '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$') AND
783+
-- Additional check for days in months (e.g., February)
784+
(
785+
(month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
786+
(month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
787+
(month = '02' AND day BETWEEN '01' AND
788+
CASE
789+
-- Handle leap years
790+
WHEN (year::int % 400 = 0 OR (year::int % 100 != 0 AND year::int % 4 = 0)) THEN '29'
791+
ELSE '28'
792+
END
793+
)
794+
)
795+
THEN 1
796+
ELSE 0
797+
END AS is_valid
798+
FROM extracted_timestamps
799+
)
800+
SELECT COUNT(*) AS valid_count, COUNT(*) AS total_count
801+
FROM validated_timestamps
802+
WHERE is_valid = 1;
803+
"""
804+
805+
try:
806+
valid_count = self.fetchone(query)[0]
807+
total_count_query = (
808+
f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
809+
)
810+
total_count = self.fetchone(total_count_query)[0]
811+
812+
if operation == "count":
813+
return valid_count, total_count
814+
elif operation == "percent":
815+
return valid_count, total_count
816+
else:
817+
raise ValueError(f"Unknown operation: {operation}")
818+
819+
except Exception as e:
820+
print(f"Error occurred: {e}")
821+
return 0, 0
822+
823+
def query_timestamp_not_in_future_metric(
824+
self,
825+
table: str,
826+
field: str,
827+
operation: str,
828+
predefined_regex: str,
829+
filters: str = None,
830+
) -> Union[float, int]:
831+
"""
832+
:param table: Table name
833+
:param field: Column name
834+
:param operation: Metric operation ("count" or "percent")
835+
:param predefined_regex: regex pattern
836+
:param filters: filter condition
837+
:return: Tuple containing count of valid timestamps not in the future and total count
838+
"""
839+
840+
qualified_table_name = self.qualified_table_name(table)
841+
842+
timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"
843+
844+
if predefined_regex == "timestamp_iso":
845+
regex_condition = f"{field} ~ '{timestamp_iso_regex}'"
846+
else:
847+
raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")
848+
849+
filters_clause = f"WHERE {filters}" if filters else ""
850+
851+
query = f"""
852+
WITH extracted_timestamps AS (
853+
SELECT
854+
{field},
855+
SUBSTRING({field} FROM '^(\d{{4}})') AS year, -- Extract year
856+
SUBSTRING({field} FROM '^\d{{4}}-(\d{{2}})') AS month, -- Extract month
857+
SUBSTRING({field} FROM '^\d{{4}}-\d{{2}}-(\d{{2}})') AS day, -- Extract day
858+
SUBSTRING({field} FROM 'T(\d{{2}})') AS hour, -- Extract hour
859+
SUBSTRING({field} FROM 'T\d{{2}}:(\d{{2}})') AS minute, -- Extract minute
860+
SUBSTRING({field} FROM 'T\d{{2}}:\d{{2}}:(\d{{2}})') AS second, -- Extract second
861+
SUBSTRING({field} FROM '([+-]\d{{2}}:\d{{2}}|Z)$') AS timezone -- Extract timezone
862+
FROM {qualified_table_name}
863+
{filters_clause}
864+
),
865+
validated_timestamps AS (
866+
SELECT
867+
{field},
868+
CASE
869+
WHEN
870+
year ~ '^\d{{4}}$' AND
871+
month ~ '^(0[1-9]|1[0-2])$' AND
872+
day ~ '^((0[1-9]|[12][0-9])|(30|31))$' AND
873+
hour ~ '^([01][0-9]|2[0-3])$' AND
874+
minute ~ '^[0-5][0-9]$' AND
875+
second ~ '^[0-5][0-9]$' AND
876+
(timezone IS NULL OR timezone ~ '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$') AND
877+
(
878+
(month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
879+
(month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
880+
(month = '02' AND day BETWEEN '01' AND
881+
CASE
882+
WHEN (year::int % 400 = 0 OR (year::int % 100 != 0 AND year::int % 4 = 0)) THEN '29'
883+
ELSE '28'
884+
END
885+
)
886+
)
887+
THEN 1
888+
ELSE 0
889+
END AS is_valid
890+
FROM extracted_timestamps
891+
),
892+
timestamps_not_in_future AS (
893+
SELECT *
894+
FROM validated_timestamps
895+
WHERE is_valid = 1 AND ({field} ~ '{timestamp_iso_regex}') AND {field}::timestamp <= CURRENT_TIMESTAMP
896+
)
897+
SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
898+
FROM timestamps_not_in_future;
899+
"""
900+
try:
901+
valid_count = self.fetchone(query)[0]
902+
total_count_query = (
903+
f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
904+
)
905+
total_count = self.fetchone(total_count_query)[0]
906+
907+
if operation == "count":
908+
return valid_count, total_count
909+
elif operation == "percent":
910+
return valid_count, total_count
911+
else:
912+
raise ValueError(f"Unknown operation: {operation}")
913+
914+
except Exception as e:
915+
print(f"Error occurred: {e}")
916+
return 0, 0
917+
918+
def query_timestamp_date_not_in_future_metric(
919+
self,
920+
table: str,
921+
field: str,
922+
operation: str,
923+
predefined_regex: str,
924+
filters: str = None,
925+
) -> Union[float, int]:
926+
"""
927+
:param table: Table name
928+
:param field: Column name
929+
:param operation: Metric operation ("count" or "percent")
930+
:param predefined_regex: The regex pattern to use (e.g., "timestamp_iso")
931+
:param filters: Optional filter condition
932+
:return: Tuple containing count of valid dates not in the future and total count
933+
"""
934+
935+
qualified_table_name = self.qualified_table_name(table)
936+
937+
timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"
938+
939+
if predefined_regex == "timestamp_iso":
940+
regex_condition = f"{field} ~ '{timestamp_iso_regex}'"
941+
else:
942+
raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")
943+
944+
filters_clause = f"WHERE {filters}" if filters else ""
945+
946+
query = f"""
947+
WITH extracted_timestamps AS (
948+
SELECT
949+
{field},
950+
SUBSTRING({field} FROM '^(\d{{4}})') AS year, -- Extract year
951+
SUBSTRING({field} FROM '^\d{{4}}-(\d{{2}})') AS month, -- Extract month
952+
SUBSTRING({field} FROM '^\d{{4}}-\d{{2}}-(\d{{2}})') AS day -- Extract day
953+
FROM {qualified_table_name}
954+
{filters_clause}
955+
),
956+
validated_dates AS (
957+
SELECT
958+
{field},
959+
CASE
960+
WHEN
961+
year ~ '^\d{{4}}$' AND
962+
month ~ '^(0[1-9]|1[0-2])$' AND
963+
day ~ '^((0[1-9]|[12][0-9])|(30|31))$' AND
964+
(
965+
(month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
966+
(month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
967+
(month = '02' AND day BETWEEN '01' AND
968+
CASE
969+
WHEN (year::int % 400 = 0 OR (year::int % 100 != 0 AND year::int % 4 = 0)) THEN '29'
970+
ELSE '28'
971+
END
972+
)
973+
)
974+
THEN 1
975+
ELSE 0
976+
END AS is_valid
977+
FROM extracted_timestamps
978+
),
979+
dates_not_in_future AS (
980+
SELECT *
981+
FROM validated_dates
982+
WHERE is_valid = 1
983+
AND ({field} ~ '{timestamp_iso_regex}')
984+
AND ({field})::date <= CURRENT_DATE -- Compare only the date part against the current date
985+
)
986+
SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
987+
FROM dates_not_in_future;
988+
"""
989+
990+
try:
991+
valid_count = self.fetchone(query)[0]
992+
total_count_query = (
993+
f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
994+
)
995+
total_count = self.fetchone(total_count_query)[0]
996+
997+
if operation == "count":
998+
return valid_count, total_count
999+
elif operation == "percent":
1000+
return valid_count, total_count
1001+
else:
1002+
raise ValueError(f"Unknown operation: {operation}")
1003+
1004+
except Exception as e:
1005+
print(f"Error occurred: {e}")
1006+
return 0, 0

dcs_core/core/validation/manager.py

+12
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
)
6363
from dcs_core.core.validation.validity_validation import ( # noqa F401 this is used in globals
6464
CountCUSIPValidation,
65+
CountDateNotInFutureValidation,
6566
CountEmailValidation,
6667
CountFIGIValidation,
6768
CountInvalidRegex,
@@ -70,16 +71,19 @@
7071
CountLatitudeValidation,
7172
CountLEIValidation,
7273
CountLongitudeValidation,
74+
CountNotInFutureValidation,
7375
CountPermIDValidation,
7476
CountSEDOLValidation,
7577
CountSSNValidation,
78+
CountTimeStampValidation,
7679
CountUSAPhoneValidation,
7780
CountUSAStateCodeValidation,
7881
CountUSAZipCodeValidation,
7982
CountUUIDValidation,
8083
CountValidRegex,
8184
CountValidValues,
8285
PercentCUSIPValidation,
86+
PercentDateNotInFutureValidation,
8387
PercentEmailValidation,
8488
PercentFIGIValidation,
8589
PercentInvalidRegex,
@@ -88,9 +92,11 @@
8892
PercentLatitudeValidation,
8993
PercentLEIValidation,
9094
PercentLongitudeValidation,
95+
PercentNotInFutureValidation,
9196
PercentPermIDValidation,
9297
PercentSEDOLValidation,
9398
PercentSSNValidation,
99+
PercentTimeStampValidation,
94100
PercentUSAPhoneValidation,
95101
PercentUSAStateCodeValidation,
96102
PercentUSAZipCodeValidation,
@@ -173,6 +179,12 @@ class ValidationManager:
173179
ValidationFunction.PERCENT_ALL_SPACE.value: "PercentageAllSpaceValidation",
174180
ValidationFunction.COUNT_NULL_KEYWORD.value: "CountNullKeywordValidation",
175181
ValidationFunction.PERCENT_NULL_KEYWORD.value: "PercentageNullKeywordValidation",
182+
ValidationFunction.COUNT_TIMESTAMP_STRING.value: "CountTimeStampValidation",
183+
ValidationFunction.PERCENT_TIMESTAMP_STRING.value: "PercentTimeStampValidation",
184+
ValidationFunction.COUNT_NOT_IN_FUTURE.value: "CountNotInFutureValidation",
185+
ValidationFunction.PERCENT_NOT_IN_FUTURE.value: "PercentNotInFutureValidation",
186+
ValidationFunction.COUNT_DATE_NOT_IN_FUTURE.value: "CountDateNotInFutureValidation",
187+
ValidationFunction.PERCENT_DATE_NOT_IN_FUTURE.value: "PercentDateNotInFutureValidation",
176188
}
177189

178190
def __init__(

0 commit comments

Comments
 (0)