Fall back to string stats if datetime stats computation fails

+ test
huggingface · Jan 16, 2025 · b7fee0b · b7fee0b
1 parent 0ee76bf
commit b7fee0b
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 7 deletions.
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
@@ -505,12 +505,18 @@ def _compute_statistics(
         nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
         n_unique = data[column_name].n_unique()
         if cls.is_datetime(data, column_name):
-            datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
-                data,
-                column_name=column_name,
-                n_samples=n_samples,
-            )
-            return datetime_stats
+            try:
+                stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
+                    data,
+                    column_name=column_name,
+                    n_samples=n_samples,
+                )
+                return stats
+            except Exception as error:
+                logging.info(
+                    f"Column {column_name} is datetime, but datetime stats compute failed ({error}), "
+                    f"compute string stats instead. "
+                )
 
         if cls.is_class(n_unique, n_samples):
             labels2counts: dict[str, int] = value_counts(data, column_name) if nan_count != n_samples else {}

diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1755,6 +1755,19 @@ def null_column(n_samples: int) -> list[None]:
             "2024-01-10 00:00:00+0200",
             "2024-01-11 00:00:00+0200",
         ],
+        "datetime_string_error": [
+            "16/01/2023",
+            "17/01/2023",
+            "18/01/2023",
+            "19/01/2023",
+            "01/2023",
+            "02/2023",
+            "20/01/2023",
+            "21/01/2023",
+            "03/2023",
+            "25/01/2023",
+            "26/01/2023",
+        ],
         "datetime": [
             datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
             datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
@@ -1802,6 +1815,7 @@ def null_column(n_samples: int) -> list[None]:
             "datetime_string_z": Value("string"),
             "datetime_string_t_z": Value("string"),
             "datetime_string_tz": Value("string"),
+            "datetime_string_error": Value("string"),
             "datetime": Value("timestamp[s]"),
             "datetime_tz": Value("timestamp[s, tz=+02:00]"),
             "datetime_null": Value("timestamp[s]"),

diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
@@ -489,6 +489,10 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
             "histogram": None,
         }
 
+    # this test case mixes multiple datetime formats, so string length statistics are expected instead of an error
+    if column_name == "datetime_string_error":
+        return count_expected_statistics_for_string_column(column)
+
     # hardcode expected values
     minv = "2024-01-01 00:00:00"
     maxv = "2024-01-11 00:00:00"
@@ -546,6 +550,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
         "datetime_string_z",
         "datetime_string_t_z",
         "datetime_string_tz",
+        "datetime_string_error",
         "datetime_tz",
         "datetime_null",
         "datetime_all_null",
@@ -569,8 +574,9 @@ def test_datetime_statistics(
             column_name=column_name,
             n_samples=len(data[column_name]),
         )
+
     computed_std, expected_std = computed.pop("std"), expected.pop("std")
-    if computed_std:
+    if computed_std and column_name != "datetime_string_error":
         assert computed_std.split(".")[0] == expected_std.split(".")[0]  # check with precision up to seconds
     else:
         assert computed_std == expected_std