Skip to content

Fix freshness if no data, handle corner cases better #2309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,13 @@ def evaluate(self, measurement_values: MeasurementValues, contract: Contract) ->
threshold_metric_name: str = f"freshness_in_{self.unit}s"

threshold_value: Optional[float] = None
if data_timestamp_utc and max_timestamp_utc:
if max_timestamp_utc is None:
outcome = CheckOutcome.FAILED

elif data_timestamp_utc is not None:
logger.debug(
f"Calculating freshness using '{max_timestamp}' as 'max' and '{data_timestamp}' as 'now' values"
)
freshness = data_timestamp_utc - max_timestamp_utc
freshness_in_seconds = freshness.total_seconds()

Expand All @@ -111,6 +117,8 @@ def evaluate(self, measurement_values: MeasurementValues, contract: Contract) ->
else:
outcome = CheckOutcome.FAILED

freshness_str: Optional[str] = str(freshness) if freshness is not None else None

return FreshnessCheckResult(
contract=contract,
check=self._build_check_info(),
Expand All @@ -121,27 +129,32 @@ def evaluate(self, measurement_values: MeasurementValues, contract: Contract) ->
max_timestamp_utc=max_timestamp_utc,
data_timestamp=data_timestamp,
data_timestamp_utc=data_timestamp_utc,
freshness=str(freshness),
freshness=freshness_str,
freshness_in_seconds=freshness_in_seconds,
unit=self.unit,
)

def _get_max_timestamp(self, measurement_values: MeasurementValues) -> Optional[datetime]:
max_timestamp: Optional[datetime] = measurement_values.get_value(self.max_timestamp_metric)
if not isinstance(max_timestamp, datetime):
if max_timestamp is None:
logger.warning(
f"Freshness metric '{self.max_timestamp_metric.type}' for column '{self.column}' "
f"returned no value. Does the table or partition have rows?"
)
return None
elif not isinstance(max_timestamp, datetime):
logger.debug(
f"Attempting to convert freshness value '{max_timestamp}' of data type '{type(max_timestamp).__name__}' to datetime"
)
if isinstance(max_timestamp, date):
max_timestamp = datetime.combine(max_timestamp, datetime.min.time())
elif isinstance(max_timestamp, str):
max_timestamp = convert_str_to_datetime(max_timestamp)
else:
logger.error(
f"Freshness metric '{self.max_timestamp_metric.type}' for column '{self.column}' "
f"has an invalid data type '({type(max_timestamp).__name__})'. "
f"Is the column a timestamp or a timestamp-compatible type?"
)

if not isinstance(max_timestamp, datetime):
logger.error(f"Freshness column '{self.column}' does not have timestamp values: '{max_timestamp}'")
logger.error(
f"Freshness column '{self.column}' returned value '{max_timestamp}' of data type '{type(max_timestamp).__name__}' which is not a datetime or datetime-compatible type."
)
max_timestamp = None

return max_timestamp
Expand Down
32 changes: 32 additions & 0 deletions soda-tests/tests/features/test_freshness_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,35 @@ def test_freshness_now_variable(data_source_test_helper: DataSourceTestHelper):
assert str(check_result.freshness_in_seconds) == "3600.0"
assert str(check_result.unit) == "hour"
assert 0.99 < get_diagnostic_value(check_result, "freshness_in_hours") < 1.01


def test_freshness_no_rows(data_source_test_helper: DataSourceTestHelper):
test_table = data_source_test_helper.ensure_test_table(test_table_specification)

id_quoted = data_source_test_helper.quote_column("id")

contract_yaml_str: str = f"""
filter: |
{id_quoted} > 10
checks:
- freshness:
column: created_at
threshold:
must_be_less_than: 2
"""

with freeze_time(datetime(year=2025, month=1, day=4, hour=10, minute=0, second=0)):
contract_verification_result_t1: ContractVerificationResult = data_source_test_helper.assert_contract_fail(
test_table=test_table, contract_yaml_str=contract_yaml_str
)
check_result: FreshnessCheckResult = contract_verification_result_t1.check_results[0]
assert check_result.max_timestamp is None
assert check_result.max_timestamp_utc is None
assert str(check_result.data_timestamp) == "2025-01-04 10:00:00+00:00"
assert str(check_result.data_timestamp_utc) == "2025-01-04 10:00:00+00:00"
assert check_result.freshness is None
assert check_result.freshness_in_seconds is None
assert str(check_result.unit) == "hour"
assert len(check_result.diagnostic_metric_values) == 0

assert not contract_verification_result_t1.has_errors()