Skip to content

Commit

Permalink
Fall back to string stats if datetime stats computation fails
Browse files Browse the repository at this point in the history
+ test
  • Loading branch information
polinaeterna committed Jan 16, 2025
1 parent 0ee76bf commit b7fee0b
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 7 deletions.
18 changes: 12 additions & 6 deletions services/worker/src/worker/statistics_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,12 +505,18 @@ def _compute_statistics(
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
n_unique = data[column_name].n_unique()
if cls.is_datetime(data, column_name):
datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
data,
column_name=column_name,
n_samples=n_samples,
)
return datetime_stats
try:
stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
data,
column_name=column_name,
n_samples=n_samples,
)
return stats
except Exception as error:
logging.info(
f"Column {column_name} is datetime, but datetime stats compute failed ({error}), "
f"compute string stats instead. "
)

if cls.is_class(n_unique, n_samples):
labels2counts: dict[str, int] = value_counts(data, column_name) if nan_count != n_samples else {}
Expand Down
14 changes: 14 additions & 0 deletions services/worker/tests/fixtures/statistics_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1755,6 +1755,19 @@ def null_column(n_samples: int) -> list[None]:
"2024-01-10 00:00:00+0200",
"2024-01-11 00:00:00+0200",
],
"datetime_string_error": [
"16/01/2023",
"17/01/2023",
"18/01/2023",
"19/01/2023",
"01/2023",
"02/2023",
"20/01/2023",
"21/01/2023",
"03/2023",
"25/01/2023",
"26/01/2023",
],
"datetime": [
datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
Expand Down Expand Up @@ -1802,6 +1815,7 @@ def null_column(n_samples: int) -> list[None]:
"datetime_string_z": Value("string"),
"datetime_string_t_z": Value("string"),
"datetime_string_tz": Value("string"),
"datetime_string_error": Value("string"),
"datetime": Value("timestamp[s]"),
"datetime_tz": Value("timestamp[s, tz=+02:00]"),
"datetime_null": Value("timestamp[s]"),
Expand Down
8 changes: 7 additions & 1 deletion services/worker/tests/test_statistics_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,10 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
"histogram": None,
}

# this test case contains multiple datetime formats, so we expect string length statistics instead of an error
if column_name == "datetime_string_error":
return count_expected_statistics_for_string_column(column)

# hardcode expected values
minv = "2024-01-01 00:00:00"
maxv = "2024-01-11 00:00:00"
Expand Down Expand Up @@ -546,6 +550,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
"datetime_string_z",
"datetime_string_t_z",
"datetime_string_tz",
"datetime_string_error",
"datetime_tz",
"datetime_null",
"datetime_all_null",
Expand All @@ -569,8 +574,9 @@ def test_datetime_statistics(
column_name=column_name,
n_samples=len(data[column_name]),
)

computed_std, expected_std = computed.pop("std"), expected.pop("std")
if computed_std:
if computed_std and column_name != "datetime_string_error":
assert computed_std.split(".")[0] == expected_std.split(".")[0] # check with precision up to seconds
else:
assert computed_std == expected_std
Expand Down

0 comments on commit b7fee0b

Please sign in to comment.